Compare commits

...

83 Commits

Author SHA1 Message Date
bad5269194 Change python doc push script to print the undocumented modules 2025-10-08 09:28:48 -07:00
7bc13c802b Update 2025-10-08 09:04:44 -07:00
d872529792 Test 2025-10-08 09:04:43 -07:00
eqy
0d39ecb2ce [cuDNN][RNN] cuDNN RNN supports BFloat16 inputs since 9.13 (#164411)
seems to work

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164411
Approved by: https://github.com/Skylion007
2025-10-08 15:26:50 +00:00
90c0825e2d [GHF] Allow reverts from pytorch-auto-revert app (#164911)
This is a bit weird: author_login is not a unique field, but author_url is.

Explicitly allow https://github.com/apps/pytorch-auto-revert to issue revert commands

Update mocks by running
```
sed -i -e s/8e262b0495bd934d39dda198d4c09144311c5ddd6cca6a227194bd48dbfe7201/47860a8f57a214a426d1150c29893cbc2aa49507f12b731483b1a1254bca3428/ gql_mocks.json
```

Test plan: Run
```python
from trymerge import GitHubPR
pr=GitHubPR("pytorch", "pytorch", 164660)
print(pr.get_last_comment().author_url, pr.get_comment_by_id(3375785595).author_url)
```
that should produce
```
https://github.com/pytorch-auto-revert https://github.com/apps/pytorch-auto-revert
```
Plus added a regression test that checks two particular comments for revert validity

`pytorch-auto-revert` user is my alter ego :)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164911
Approved by: https://github.com/jeanschmidt
2025-10-08 15:15:45 +00:00
fd4bde430a Revert "list_stored_sd_metadata API. (#160610)"
This reverts commit da903b6a8be422529d47649e89c0d50bb95c37ca.

Reverted https://github.com/pytorch/pytorch/pull/160610 on behalf of https://github.com/jeffdaily due to broke ROCm CI, but flaky also on CUDA CI https://hud.pytorch.org/failure?name=periodic%20%2F%20linux-jammy-rocm-py3.10%20%2F%20test%20(distributed%2C%202%2C%203%2C%20linux.rocm.gpu.mi250.4%2C%20module%3Arocm%2C%20oncall%3Adistributed)&jobName=undefined&failureCaptures=distributed%2Fcheckpoint%2Ftest_list_stored_state_dict.py%3A%3ATestListStateDict%3A%3Atest_list_stored_sd_metadata ([comment](https://github.com/pytorch/pytorch/pull/160610#issuecomment-3382023022))
2025-10-08 15:10:38 +00:00
b5e93ffdcf Revert "Limit path search within range (#164581)"
This reverts commit 415e641572473479fc9d9eaea12762e1a223a9e0.

Reverted https://github.com/pytorch/pytorch/pull/164581 on behalf of https://github.com/eellison due to merge sets makes this trickier ([comment](https://github.com/pytorch/pytorch/pull/164581#issuecomment-3381955240))
2025-10-08 14:56:21 +00:00
f8d0d65ddc Revert "Add memory estimator (#164738)"
This reverts commit ab01a0d7d352e7fd07989b8d6bf035bf82aea74e.

Reverted https://github.com/pytorch/pytorch/pull/164738 on behalf of https://github.com/eellison due to merge sets makes this trickier ([comment](https://github.com/pytorch/pytorch/pull/164581#issuecomment-3381955240))
2025-10-08 14:56:21 +00:00
f46ddb1e65 [ROCm][CI] add gfx1150 gfx1151 to docker images for binary builds (#164854)
Fixes #164346.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164854
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-08 14:34:22 +00:00
20082d7136 Revert "fix flex attention eager bwd: more rounding (#164317)"
This reverts commit 41808b2ba9a61ab2f4c7af394c1668d09a4a0331.

Reverted https://github.com/pytorch/pytorch/pull/164317 on behalf of https://github.com/jeffdaily due to inductor/test_flex_attention.py::TestFlexAttentionCUDA::test_builtin_score_mods_seqlen_lt_custom_sparse_block_size_score_mod4_cuda_float16 [GH job link](https://github.com/pytorch/pytorch/actions/runs/18330774537/job/52207370954) [HUD commit link](41808b2ba9) ([comment](https://github.com/pytorch/pytorch/pull/164317#issuecomment-3381812090))
2025-10-08 14:29:10 +00:00
7158aa22e8 remove more (#164753)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164753
Approved by: https://github.com/aorenste, https://github.com/mlazos
ghstack dependencies: #164664, #164665, #164667, #164668
2025-10-08 14:23:38 +00:00
2035f6b2e6 use check_size instead of check_is_size in ops.py (#164668)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164668
Approved by: https://github.com/angelayi
ghstack dependencies: #164664, #164665, #164667
2025-10-08 14:23:38 +00:00
2b58adc3bd [inductor][templates] Distinguish between kernel input nodes and codegen input nodes (#163752)
If there is a single autotuner choice, the wrong type of input node is used to instantiate `TritonTemplateBuffer` through `TritonTemplateCaller.output_node`. This PR distinguishes the input nodes used in `AlgorithmSelectorCache.__call__` between the actual inputs passed to the kernel at runtime, vs the possibly viewed inputs that influence scheduling behaviour (e.g. `MemoryDeps`) and codegen. See the added unit test for more detail.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163752
Approved by: https://github.com/eellison
2025-10-08 14:12:14 +00:00
322091d8d8 [opaque_obj] Add make_fx tracing support (#163278)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163278
Approved by: https://github.com/zou3519
ghstack dependencies: #163279, #163277
2025-10-08 09:09:16 +00:00
2bb4e6876c [opaque obj] Error for torch.library.custom_op infer_schema (#163277)
Unsure how we can get infer_schema to infer the ScriptObject type from just the type annotation, so for now we will just error clearly and ask users to specify a schema.
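For context, a hedged sketch of the suggested workaround, passing an explicit schema to `torch.library.custom_op` (the example op and names are ours, not the PR's):
```python
import torch

# Hedged illustration: when the schema cannot be inferred from the type
# annotations, pass one explicitly via `schema=`.
@torch.library.custom_op(
    "mylib::scale",
    mutates_args=(),
    schema="(Tensor x, float s) -> Tensor",
)
def scale(x: torch.Tensor, s: float) -> torch.Tensor:
    return x * s

print(scale(torch.ones(2), 3.0))  # tensor([3., 3.])
```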

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163277
Approved by: https://github.com/zou3519
ghstack dependencies: #163279
2025-10-08 09:09:16 +00:00
56ef7743fc [opaque_obj] Add __eq__ and __deepcopy__ (#163279)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163279
Approved by: https://github.com/zou3519
2025-10-08 09:09:16 +00:00
64108bdbed [BC-Breaking] Remove long-deprecated casting functions from native_functions.yaml (#164641)
This PR removes `torch._cast_XXX` from generated OPs. They were deprecated in PyTorch 1.
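For reference, a hedged before/after sketch (`_cast_Float` stands in for the removed family):
```python
import torch

x = torch.randint(0, 10, (3,))

# Removed: y = torch._cast_Float(x)
# Modern replacement: the standard Tensor casting APIs.
y = x.to(torch.float32)  # or x.float()
```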

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164641
Approved by: https://github.com/albanD, https://github.com/justinchuby
2025-10-08 08:27:58 +00:00
c855f8632e Pyrefly suppressions 7/n (#164913)
Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283

Almost there!

Test plan:
dmypy restart && python3 scripts/lintrunner.py -a
pyrefly check

step 1: delete lines in the pyrefly.toml file from the project-excludes field
step 2: run pyrefly check
step 3: add suppressions, clean up unused suppressions
before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199

after:
 INFO 0 errors (6,884 ignored)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164913
Approved by: https://github.com/oulgen
2025-10-08 07:27:17 +00:00
12d2ef557f Update round size with 1 division behavior (#162203)
Have round size return the nearest power of 2 greater than or equal to size, with 1 division.
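A minimal sketch of the rounding rule as we read it (the actual change lives in the caching allocator's rounding logic, so this is only an illustration):
```python
def round_size(size: int) -> int:
    # Nearest power of 2 greater than or equal to size.
    if size <= 1:
        return 1
    return 1 << (size - 1).bit_length()

assert round_size(1000) == 1024
assert round_size(1024) == 1024
```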

Fixes #161139

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162203
Approved by: https://github.com/ezyang
2025-10-08 06:41:46 +00:00
65aa62d50d Use codegen for the boxed interpreters (#164573)
Authored with claude code.  The arg parsing is kind of horrible, open
to more suggestions.

Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164573
Approved by: https://github.com/albanD, https://github.com/jansel
2025-10-08 06:27:44 +00:00
6a09f9306c Fix #164742, all header-impl'd userfacing functions should be inline (#164871)
It is as @mxmpl pointed out; we are missing an inline.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164871
Approved by: https://github.com/mikaylagawarecki
2025-10-08 05:57:19 +00:00
19bf67be32 multimem reduce (#164517)
Modified `multimem_one_shot_all_reduce_out` function to accept a `root` argument, making it a `multimem_reduce` op.

The original `multimem_one_shot_all_reduce` op becomes a caller of the `multimem_reduce`, with each rank providing its own rank id as root.
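A toy Python simulation (ours, not the actual C++ op) of that relationship:
```python
# An all-reduce expressed as a reduce-to-root where every rank names itself
# as the root.
def multimem_reduce(rank_buffers, root):
    # all ranks contribute; the caller treats the return value as what is
    # delivered to `root`
    return sum(rank_buffers)

def one_shot_all_reduce(rank_buffers):
    # each rank r issues the reduce with root=r, so every rank gets the sum
    return [multimem_reduce(rank_buffers, root=r) for r in range(len(rank_buffers))]

print(one_shot_all_reduce([1, 2, 3, 4]))  # [10, 10, 10, 10]
```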

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164517
Approved by: https://github.com/ngimel
2025-10-08 05:25:16 +00:00
1927783aa3 Revert "Reland vision pinned commit hash update (#164492)"
This reverts commit 6861a270624b44954826688f8dad668eb0154452.

Reverted https://github.com/pytorch/pytorch/pull/164492 on behalf of https://github.com/izaitsevfb due to see autorevert msg above, inductor breakage is legit ([comment](https://github.com/pytorch/pytorch/pull/164492#issuecomment-3379537888))
2025-10-08 04:38:26 +00:00
184817c7a8 locks + unit tests (#164636)
Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```

Reviewed By: aorenste

D83714690

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164636
Approved by: https://github.com/aorenste
2025-10-08 04:34:22 +00:00
da903b6a8b list_stored_sd_metadata API. (#160610)
Summary:
1. Certain checkpoint load use cases are not aware of the properties of the data/tensors they want to load.
2. These use cases include data loader checkpoints and reading data for post-processing (when the original model definition is not available).
3. There, we have to use the saved checkpoint (metadata) as our source of truth.
4. This RFC proposal exposes the checkpoint metadata using a public API.

In this proposal we expose the stored state-dict metadata (minus associated storage/chunk metadata).

Chunk/storage details should not be exposed to users; they are an implementation detail of the storage writer/reader.

Test Plan:
UT.

Rollback Plan:

Differential Revision: D80231457

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160610
Approved by: https://github.com/saumishr
2025-10-08 04:33:51 +00:00
f76fdcaaf8 [Benchmark] cleanup huggingface models (#164815)
Prune models from the TorchInductor dashboard to reduce CI cost. This PR prunes Hugging Face models according to the [doc](https://docs.google.com/document/d/1nLPNNAU-_M9Clx9FMrJ1ycdPxe-xRA54olPnsFzdpoU/edit?tab=t.0), reducing the count from 46 to 27 models.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164815
Approved by: https://github.com/anijain2305, https://github.com/seemethere, https://github.com/huydhn, https://github.com/malfet
2025-10-08 03:21:04 +00:00
608792153f [inductor][codecache] Print bytes in codecache debug output (#164898)
Summary: We have an internal request to help understand why the hash of `post_grad_custom_post_pass` is changing between attempts. We don't get useful info from the debug output, because we just print "<bytes>". Instead, attempt to print at least _some_ of the value in case it contains readable characters.
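The gist of the change, as a hedged sketch (the helper name is ours):
```python
# Decode a readable prefix of a bytes value instead of printing an opaque
# "<bytes>" placeholder.
def debug_str(value, max_len=100):
    if isinstance(value, bytes):
        # errors="replace" renders unprintable bytes as U+FFFD, which is why
        # the sample output below looks garbled after the readable prefix
        return value.decode("utf-8", errors="replace")[:max_len] + "..."
    return repr(value)

print(debug_str(b"HelloWorld!" + b"\xff" * 8))
```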

Test Plan:
Registered a dummy post_grad_custom_pass and printed codecache debug output
`TORCH_LOGS=+torch._inductor.codecache python ~/foo.py`

Yields something like:
```
V1007 16:41:19.024000 3546009 /data/users/slarsen/pytorch-3.10_4/torch/_inductor/codecache.py:989] [0/0] [law2ujt2wzjb5tyiu6jh64r2lxpvl62yvxcsmdouhg3qyelhhdv] post_grad_custom_post_pass: HelloWorld!����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������...
```

Differential Revision: [D84108770](https://our.internmc.facebook.com/intern/diff/D84108770)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164898
Approved by: https://github.com/oulgen
2025-10-08 02:45:20 +00:00
086dec3235 Pyrefly suppressions 6/n (#164877)
Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283

Almost there!

Test plan:
dmypy restart && python3 scripts/lintrunner.py -a
pyrefly check

step 1: delete lines in the pyrefly.toml file from the project-excludes field
step 2: run pyrefly check
step 3: add suppressions, clean up unused suppressions
before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199

after:

INFO 0 errors (5,064 ignored)

Only four directories left to enable

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164877
Approved by: https://github.com/oulgen
2025-10-08 02:30:57 +00:00
ad7b2bebc6 Use tuples to have a deterministic ordering. (#164851)
When debugging I noticed some non-deterministic behavior and tracked it down to this literal set. Changed it to a tuple for determinism. Also changed two other small literal sets, because using a set for a small lookup like that is slow.
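An illustration of the change (our toy example, not the actual sets in the PR): iteration order of a `str` set varies across interpreter runs because of hash randomization, while a tuple is deterministic and is also cheap to scan at this size.
```python
# was: _HANDLED = {"add", "mul", "sub"}   # set iteration order varies across runs
_HANDLED = ("add", "mul", "sub")          # tuple: deterministic order, fast small lookup

def is_handled(op: str) -> bool:
    return op in _HANDLED
```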

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164851
Approved by: https://github.com/bobrenjc93, https://github.com/bdhirsh
2025-10-08 02:12:03 +00:00
d444384003 [SymmMem] Tiled reduce (#162243)
Added op: `tile_reduce(Tensor input, Tensor(a!) out, int root, str group_name)`

For now supports only:
- NVSHMEM backed symmetric tensor;
- 2D tensor and tile;
- torch.float.

Testing on the right-bottom quadrant:
```
rank 0:
tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1., 1., 1.]], device='cuda:0')
PASSED
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162243
Approved by: https://github.com/ngimel
2025-10-08 02:03:04 +00:00
3040a5d294 Revert "[dynamo] Support torch.fx.traceback.annotate (#164678)"
This reverts commit 801e282f39e9ef4424dfd3ecfd2b550a44595229.

Reverted https://github.com/pytorch/pytorch/pull/164678 on behalf of https://github.com/izaitsevfb due to breaks executorch internally, see [D84068062](https://www.internalfb.com/diff/D84068062?entry_point=16) ([comment](https://github.com/pytorch/pytorch/pull/164678#issuecomment-3379281844))
2025-10-08 01:49:34 +00:00
97463d4cf3 Revert "Fix double dispatch to Python for detach (#163671)"
This reverts commit c32118dc3e50505fd285e6e448a90883fce11535.

Reverted https://github.com/pytorch/pytorch/pull/163671 on behalf of https://github.com/izaitsevfb due to breaks export tests ([comment](https://github.com/pytorch/pytorch/pull/163671#issuecomment-3379281422))
2025-10-08 01:46:45 +00:00
c813617c53 [PP] Migrate other schedules to use PipelineScheduleRuntime (#164777)
Second fix for https://github.com/pytorch/pytorch/issues/164756

This has been a TODO to make all the schedules execute using the same runtime. Now, after this change, schedules will use the same logic as `_PipelineScheduleRuntime`, which adds `UNSHARD` and `RESHARD` operations to the schedules and fixes the issue mentioned above.

<img width="920" height="406" alt="image" src="https://github.com/user-attachments/assets/a4d5bcd0-7dac-43cd-96f9-8ca33cfd8b91" />

A test is failing after the conversion:
- Fixed a gradient scaling issue for dWeight

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164777
Approved by: https://github.com/fegin
ghstack dependencies: #164775
2025-10-08 01:45:57 +00:00
e659661ffa [PP] Fix FSDP unshard/reshard (#164775)
First fix for https://github.com/pytorch/pytorch/issues/164756

In the pipeline IR we call `UNSHARD` and `RESHARD`, but there is a bug: when we call `module.unshard()`, it does not recurse into nested FSDP modules, which sometimes leads to calling allgather before the module forward.

Since we want the pipeline IR to explicitly handle this, we can call `group.unshard` instead, which ensures that all the modules are unsharded.
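A toy model of the bug and the fix (all classes here are stand-ins, not the real FSDP types):
```python
class FSDPModule:
    def __init__(self):
        self.sharded = True
    def unshard(self):  # non-recursive, like the buggy path
        self.sharded = False

class FSDPGroup:
    def __init__(self, modules):
        self.modules = modules
    def unshard(self):  # the fix: unshard every module before any forward
        for m in self.modules:
            m.unshard()

root, nested = FSDPModule(), FSDPModule()
root.unshard()
assert nested.sharded          # bug: nested module would allgather mid-forward
FSDPGroup([root, nested]).unshard()
assert not nested.sharded      # fix: everything is unsharded up front
```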

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164775
Approved by: https://github.com/weifengpy
2025-10-08 01:45:57 +00:00
41808b2ba9 fix flex attention eager bwd: more rounding (#164317)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164317
Approved by: https://github.com/drisspg
ghstack dependencies: #163986
2025-10-08 01:17:45 +00:00
c0510dc447 [ContextParallel] add _LoadBalancer classes, and load-balance interface to Context Parallel APIs (#161062)
**Summary**
This PR provides an interface for users to specify how to load-balance the attention
input. The load-balance is essentially a rearrangement of the input tensor(s) over the
seq_dim before sharding and can be specified via an index tensor `rearrange` such
that Q[rearrange] is the balanced Q users want (i.e. `rearrange[i] == j` where `i` is the new
index of `Q[j]` in the balanced Q). An example is the `_generate_round_robin_indices()` added
in https://github.com/pytorch/pytorch/pull/155442.
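To make the index contract concrete, a small toy example (the values and the specific permutation are ours):
```python
import torch

# Q_balanced[i] == Q[rearrange[i]]
Q = torch.arange(8)
rearrange = torch.tensor([0, 7, 1, 6, 2, 5, 3, 4])  # head-tail style pairing
Q_balanced = Q[rearrange]

# restore is the inverse permutation: Q_balanced[restore] == Q
restore = torch.empty_like(rearrange)
restore[rearrange] = torch.arange(8)
assert torch.equal(Q_balanced[restore], Q)
```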

**New `_LoadBalancer` classes**
New `_LoadBalancer` class (defined in `torch/distributed/tensor/experimental/_load_balancer.py`)
provides one interface for defining load-balance behavior: `_generate_indices(self, restore: bool = False)`.

When `restore == False`, this method should output an index Tensor (namely `rearrange_idx`) such
that QKV will be transformed into Q' K' V' in a way that `Q'[i] == Q[rearrange_idx[i]]` (same applies
to K and V).

When `restore == True`, this method outputs an index Tensor (namely `restore_idx`) such that
`Q'[restore_idx] == Q` (same applies to K and V).

**Impact**
2 public CP APIs and 1 private CP API are modified. This PR should be backward-compatible because:
- For use with SDPA, existing users must be using the `context_parallel()` API, which does not
take the extra `load_balancer` argument and decides solely from the global var
`_cp_options.enable_load_balance`.
- New users, including those who want to try `flex_attention()`, are required to use the new API
`_context_parallel_buffers` to explicitly shard the QKV input instead of using `context_parallel()`,
because we no longer rely on TorchDispatchMode or TorchFunctionMode for op replacement. We
also require users to explicitly pass in a `load_balancer` argument if load-balancing is demanded.

**Load-Balance Behavior**
`context_parallel_unshard()`, and `create_cp_block_mask()` APIs now take an extra optional argument
`load_balancer`. This argument is optional because of backward compatibility but we require new users
to explicitly pass in a `load_balancer` if load-balancing is demanded:
- if `load_balancer == None` and `_cp_options.enable_load_balance == False`, CP performs
no load-balancing on input Tensors.
- if `load_balancer == None` and `_cp_options.enable_load_balance == True`, CP performs
head-tail load-balancing (e.g. split a Tensor into 2*N chunks, where the first N are called head and
the rest are called tail; place the first head chunk along with the last tail chunk on rank 0, the second
head chunk along with the second-last tail chunk on rank 1, and so on).

`_context_parallel_buffers()` also takes the extra optional argument `load_balancer`, but the behavior
is slightly different from the other 2 APIs -- it doesn't branch on `_cp_options.enable_load_balance`:
- if `load_balancer == None`, no load-balancing will be performed
- otherwise, apply load-balancing using `load_balancer._generate_indices()` before sharding.

**Changes**
This PR moves the index Tensor generation logic into a set of LoadBalancer classes and
makes LoadBalancer the common interface for Context Parallel APIs that leverage
load-balancing:
* _context_parallel_buffers
* context_parallel_unshard
* create_cp_block_mask

The `_LoadBalancer` classes added are:
- `_LoadBalancer`: the abstract base class that provides the `_generate_indices` interface for index Tensor generation.
- `_HeadTailLoadBalancer`: Implements head-tail balancing logic.
- `_PerDocumentHeadTailLoadBalancer`: Supports per-document head-tail balancing for batched sequences.

**Test**
`pytest test/distributed/tensor/test_attention.py`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161062
Approved by: https://github.com/fegin
2025-10-08 01:09:14 +00:00
9ec10dc26a utils + unit tests (#164551)
Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```

Reviewed By: aorenste

Differential Revision: D83714691

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164551
Approved by: https://github.com/aorenste
2025-10-08 01:05:45 +00:00
43fc859625 Don't return values in void functions (#164809)
This PR fixes returning values in void C++ functions.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164809
Approved by: https://github.com/janeyx99
2025-10-08 01:04:14 +00:00
f713abab16 Revert "Enable all flake8-logging-format rules (#164655)"
This reverts commit e98c4e835b1db22092fc93b49d2cddd7b3537d1f.

Reverted https://github.com/pytorch/pytorch/pull/164655 on behalf of https://github.com/malfet due to Looks like it broke lint in trunk, see bd3b98a8a5/1 ([comment](https://github.com/pytorch/pytorch/pull/164655#issuecomment-3379209309))
2025-10-08 00:55:17 +00:00
bd3b98a8a5 [dynamic shapes] make backed_size_oblivious behavior consistent b/w symbolic_shapes/inductor (#164796)
Summary: call guard_or_ directly to enable backed_size_obl in inductor calls to guard_or

Test Plan: CI and unit test added.

Differential Revision: D84009392

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164796
Approved by: https://github.com/laithsakka
2025-10-08 00:19:06 +00:00
e98c4e835b Enable all flake8-logging-format rules (#164655)
These rules are enabled by removing existing suppressions.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164655
Approved by: https://github.com/janeyx99
2025-10-08 00:16:13 +00:00
7b15534434 [export] Fix weight sharing when there is no complete tensor (#164857)
Summary: As titled.

Test Plan: CI

Differential Revision: D84079625

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164857
Approved by: https://github.com/yushangdi
2025-10-07 23:40:13 +00:00
c32118dc3e Fix double dispatch to Python for detach (#163671)
This fixes #71725.

Differential Revision: [D83857880](https://our.internmc.facebook.com/intern/diff/D83857880)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163671
Approved by: https://github.com/ezyang, https://github.com/albanD
2025-10-07 23:34:37 +00:00
e3ae80fc03 [PP] Let PP split BlockMask into micro-BlockMask (#164111)
BlockMask has batch dimension information, so PP has to split it as well, just like all other tensors. All the tensors in BlockMask have the batch dimension, so we can split them without too many issues. However, `mask_mod` takes the batch index as an input, and that value changes after the split, so we have to wrap `mask_mod` inside a closure to adjust the batch index.
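A sketch of that closure (the function names are ours; `mask_mod` follows flex_attention's `(b, h, q_idx, kv_idx)` convention):
```python
def rebase_mask_mod(mask_mod, microbatch_start):
    def local_mask_mod(b, h, q_idx, kv_idx):
        # b is local to the microbatch after the split; shift it back to the
        # original batch index before consulting the user's mask_mod
        return mask_mod(b + microbatch_start, h, q_idx, kv_idx)
    return local_mask_mod

# usage: a microbatch covering original batch indices 8..11 would use
# rebase_mask_mod(original_mask_mod, microbatch_start=8)
```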

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164111
Approved by: https://github.com/H-Huang
2025-10-07 23:25:34 +00:00
483f4e0db9 CUDA 13.0 builds fix on Amazon Linux 2023 (#164870)
During 2.9 RC testing I am seeing an issue on Amazon Linux 2023 with CUDA 13.0 builds.

This is related to:
 https://github.com/pytorch/pytorch/issues/152756

Workflow: https://github.com/pytorch/test-infra/actions/runs/18324074610/job/52184079262

Error:
```
WARNING: There was an error checking the latest version of pip.
+ python3.11 .ci/pytorch/smoke_test/smoke_test.py --package torchonly
Traceback (most recent call last):
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 333, in _load_global_deps
    ctypes.CDLL(global_deps_lib_path, mode=ctypes.RTLD_GLOBAL)
  File "/usr/lib64/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: libcudart.so.13: cannot open shared object file: No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/pytorch/pytorch/.ci/pytorch/smoke_test/smoke_test.py", line 12, in <module>
    import torch
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 425, in <module>
    _load_global_deps()
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 383, in _load_global_deps
    _preload_cuda_deps(lib_folder, lib_name)
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 317, in _preload_cuda_deps
    raise ValueError(f"{lib_name} not found in the system path {sys.path}")
Traceback (most recent call last):
ValueError: libnvToolsExt.so.*[0-9] not found in the system path ['/pytorch/pytorch/.ci/pytorch/smoke_test', '/usr/lib64/python311.zip', '/usr/lib64/python3.11', '/usr/lib64/python3.11/lib-dynload', '/usr/local/lib64/python3.11/site-packages', '/usr/local/lib/python3.11/site-packages', '/usr/lib64/python3.11/site-packages', '/usr/lib/python3.11/site-packages']
  File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 102, in <module>
    main()
  File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 98, in main
    run_cmd_or_die(f"docker exec -t {container_name} /exec")
  File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 39, in run_cmd_or_die
    raise RuntimeError(f"Command {cmd} failed with exit code {exit_code}")
RuntimeError: Command docker exec -t 7d9c5bd403cac9a9ee824d63a1d6f6057ecce89a7daa94a81617dbf8eff0ff2e /exec failed with exit code 1
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164870
Approved by: https://github.com/Camyll

Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
2025-10-07 22:52:53 +00:00
d1a62c8036 [BE][Ez]: Enable RUF007 Prefer itertools.pairwise over zip slicing (#164856)
Now that our min version is 3.10 we can support this rule. This is more concise, readable, and efficient than the previous zip slicing.
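For reference, the shape of the rewrite RUF007 enforces (toy data, our example):
```python
from itertools import pairwise

xs = [1, 2, 3, 4]
old = list(zip(xs[:-1], xs[1:]))  # before: zip slicing allocates two slices
new = list(pairwise(xs))          # after: lazy and more readable (Python >= 3.10)
assert old == new == [(1, 2), (2, 3), (3, 4)]
```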

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164856
Approved by: https://github.com/williamwen42
2025-10-07 22:51:17 +00:00
6861a27062 Reland vision pinned commit hash update (#164492)
Redo https://github.com/pytorch/pytorch/pull/154694

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164492
Approved by: https://github.com/yangw-dev
2025-10-07 22:45:05 +00:00
955f21dc2c [ROCm][CI] Add support for gfx1100 in rocm workflow + test skips (#148355)
This PR adds infrastructure support for gfx1100 in the rocm workflow. Nodes have been allocated for this effort.
@dnikolaev-amd contributed all the test skips.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148355
Approved by: https://github.com/jeffdaily

Co-authored-by: Dmitry Nikolaev <dmitry.nikolaev@amd.com>
Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-07 22:36:25 +00:00
9f5e1beaf3 [multi-kernel] base tensor sizes for shape cache key (#164499)
to match shape key in 3ca09d65f1/torch/_inductor/select_algorithm.py (L3571)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164499
Approved by: https://github.com/ColinPeppler
2025-10-07 21:27:40 +00:00
2e027e8742 [inductor] Improve bound on the number of dims to match for the block (#163755)
- Removes redundant broadcast code when `len(kernel.range_tree_nodes)` is much larger than `len(range_tree.nodes)`. For example:
```python
# before, the broadcast is to [1, 1, XBLOCK, R0_BLOCK]
tmp0 = tl.reshape(tl.broadcast_to(tl.load(block_ptr0, boundary_check=[2], padding_option='zero', eviction_policy='evict_last')[:, None, :, :], [(511 + XBLOCK) // 512, ((1) * ((1) <= ((511 + XBLOCK) // 512)) + ((511 + XBLOCK) // 512) * (((511 + XBLOCK) // 512) < (1))), ((512) * ((512) <= (XBLOCK)) + (XBLOCK) * ((XBLOCK) < (512))), R0_BLOCK]), [XBLOCK, R0_BLOCK])
# after
tmp0 = tl.reshape(tl.load(block_ptr0, boundary_check=[2], padding_option='zero', eviction_policy='evict_last'), [XBLOCK, R0_BLOCK])
```
- Fix: also save range_tree_nodes per subgraph

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163755
Approved by: https://github.com/eellison, https://github.com/blaine-rister
2025-10-07 21:02:37 +00:00
1e42fde45e Revert "[CUDA] Add experimental green context support for SM carveout (#159104)"
This reverts commit 746fe78ecd52f3e9cfddda41f0ac82dada7bdd0b.

Reverted https://github.com/pytorch/pytorch/pull/159104 on behalf of https://github.com/malfet due to Breaks Windows CD build ([comment](https://github.com/pytorch/pytorch/pull/159104#issuecomment-3378675515))
2025-10-07 20:51:22 +00:00
f505caa71b Revert "multimem reduce (#164517)"
This reverts commit d1cbb74fb16406488a174832e1b58b7c242f418d.

Reverted https://github.com/pytorch/pytorch/pull/164517 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/164517#issuecomment-3378529654))
2025-10-07 20:12:38 +00:00
65f10becdf Support OVERLAP_F_B in schedule (#161072)
Previously, we converted the overlap_f_b into separate forward and backward operations in the plan. This is a small change that includes it in the plan and handles it in the runtime.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161072
Approved by: https://github.com/fegin, https://github.com/wconstab
2025-10-07 19:55:10 +00:00
df640df68a Revert "Reapply "C++-accessible Placements via pybind11 (#163030)" (#164519)"
This reverts commit 8c0bc879b97bc580aaa0777b2d266bdd068cb528.

Reverted https://github.com/pytorch/pytorch/pull/164519 on behalf of https://github.com/malfet due to Still breaks internal workflows ([comment](https://github.com/pytorch/pytorch/pull/164519#issuecomment-3378469432))
2025-10-07 19:46:17 +00:00
4c3c0ef2f1 [precompile] Load source cache for AOT compile as well. (#164773)
Adding source_get_cache to the AOT compile case as well. Since the guard manager loader code can be shared between AOT and caching, we added a new function, load_guard_manager, to avoid code duplication between the two workflows for loading guards.

Test Plan: test_guard_serialization.py

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164773
Approved by: https://github.com/yiming0416, https://github.com/dolpm
2025-10-07 18:47:09 +00:00
bc33b10202 fix copy_ for scalar in inductor (#164167)
Fixes #158437

### Summary

- TorchInductor was not properly handling scalar copy operations (`tensor.copy_(scalar_value)`)
- Ensured scalar sources are converted to appropriate tensor representations with correct dtype and device

### Impact

- Enables compilation of models using `tensor.copy_(scalar)` patterns
- module: inductor
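A minimal repro of the now-supported pattern (our sketch of the reported use case, not the PR's test):
```python
import torch

@torch.compile
def f(x):
    x.copy_(2.5)  # scalar source for copy_; previously mishandled by inductor
    return x + 1

print(f(torch.zeros(3)))  # tensor([3.5, 3.5, 3.5])
```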

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164167
Approved by: https://github.com/shunting314
2025-10-07 18:31:37 +00:00
2855a045b3 Use sym_eq and sym_and on symbolic shapes in common_meta_baddbmm_bmm (#164781)
Differential Revision: D84005053

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164781
Approved by: https://github.com/Skylion007
2025-10-07 18:25:00 +00:00
9ecd092bd9 Add python bindings for NCCL CTA policies (#164309)
NCCLConfig can now be constructed with non-default [cta policies][1]

```python
import torch
from torch.distributed import ProcessGroupNCCL as nccl

config = nccl.NCCLConfig()
config.cta_policy = nccl.NCCL_CTA_POLICY_ZERO  # NCCL version >= 2.28
```

[1]: https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2283/user-guide/docs/api/flags.html#nccl-communicator-cta-policy-flags

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164309
Approved by: https://github.com/eqy
2025-10-07 18:16:20 +00:00
078d475d3b move partition and compiler fns from stage 1 to stage 2 (#164765)
Differential Revision: D83995689

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164765
Approved by: https://github.com/zhxchen17
2025-10-07 18:02:03 +00:00
f37a6523ef Move version.h to torch/headeronly (#164381)
Differential Revision: [D83685392](https://our.internmc.facebook.com/intern/diff/D83685392)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164381
Approved by: https://github.com/janeyx99
2025-10-07 17:47:30 +00:00
b13cd141b3 Add pyrefly suppressions (#164748)
Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283

Test plan:
dmypy restart && python3 scripts/lintrunner.py -a
pyrefly check

step 1: delete lines in the pyrefly.toml file from the `project-excludes` field
step 2: run pyrefly check
step 3: add suppressions, clean up unused suppressions
before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199

after:

0 errors (4,263 ignored)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164748
Approved by: https://github.com/oulgen
2025-10-07 17:31:18 +00:00
5e47b4dd60 Remove device_id param from DeviceCachingAllocator::malloc (#164798)
The `malloc` call in DeviceCachingAllocator accepts a DeviceIndex param, which
can be confusing because the allocator can only allocate memory for the device
that it corresponds to. This associated device is fixed at construction time
and the runtime param can be misleading.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164798
Approved by: https://github.com/ngimel, https://github.com/cyyever, https://github.com/eqy
2025-10-07 16:42:04 +00:00
ee5389d520 Enable batch samples in sparse tests (#164677)
The test cases are enabled because the issue was fixed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164677
Approved by: https://github.com/albanD
2025-10-07 15:58:37 +00:00
ab01a0d7d3 Add memory estimator (#164738)
Original work by @ShatianWang, with lints applied. I am going to make a few changes and add tests in subsequent PRs, but I want to preserve the original commit first.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164738
Approved by: https://github.com/IvanKobzarev
ghstack dependencies: #164568, #164569, #164581
2025-10-07 15:32:27 +00:00
801e282f39 [dynamo] Support torch.fx.traceback.annotate (#164678)
Builds on top of https://github.com/pytorch/pytorch/pull/163673 and https://github.com/pytorch/pytorch/pull/164174. This will be used in the followup PRs to apply regional inductor compilation.

The existing implementation lets Dynamo trace into `torch.fx.traceback.annotate`, but that's not what we want. We want Dynamo to essentially run the torch.fx.traceback.annotate function in eager, so that every Fx node created in the Dynamo Fx graph has the custom meta node.

This does not work with graph breaks yet. But we can solve that problem, if needed, in a separate PR.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164678
Approved by: https://github.com/SherlockNoMad, https://github.com/jansel, https://github.com/xmfan
2025-10-07 14:54:26 +00:00
87c9fbda22 Follow up to PR 163980 for s390x (#164464)
Now that the same updates have been propagated to s390x, it works on s390x runners too.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164464
Approved by: https://github.com/atalman
2025-10-07 12:02:29 +00:00
3cc8af2d67 torch.topk: refactor global histogram/cumsum into a dedicated kernel to eliminate redundant memory access (#164459)
# TLDR
This PR removes the regression in torch.topk introduced in torch 2.7.0 and delivers much better performance for large inputs.

The table below reports execution times on H20 for various input sizes with float32 data, extracting the top-100 values. Results indicate that this PR restores and improves performance, especially on large inputs.
| Input Shape    | torch2.6.0 (ms) | torch2.8.0 (ms) | 2.8.0+this PR (ms) |
| -------------- | --------------- | --------------- | ------------------ |
| (1, 1B)        | 36.6            | 1564.1          | 25.6               |
| (1, 100M)      | 3.56            | 17.4            | 2.54               |
| (1, 1,000,000) | 0.135           | 0.145           | 0.098              |
| (512, 128000)  | 1.33            | 1.33            | 1.32               |
| (8192, 128000) | 19.6            | 19.6            | 19.4               |

# Background
After upgrading PyTorch from 2.6.0 to 2.7.0, we observed a significant GPU performance regression in `torch.topk` on NVIDIA GPUs. For instance, extracting the top-1000 largest values from one billion floats on an NVIDIA H20 increased from **36 ms** to **1.6 s**.

Profiling with Nsight Compute indicates that the slowdown is caused by redundant memory accesses introduced in [PR #145536](https://github.com/pytorch/pytorch/pull/145536).

# Analysis

`torch.topk` relies on **RadixSelect** to find the target values. Each radix pass requires computing a histogram of the input values. For large inputs, histogram computation is split into two stages:

1. **Local histogram**: Each CUDA block processes a subset of the input and writes its local histogram to global memory.
2. **Global reduction**: A single CUDA block reads all local histograms from global memory and reduces them into the final global histogram.

Before [PR #145536](https://github.com/pytorch/pytorch/pull/145536), both stages ran inside a single kernel (`radixFindKthValues`), using a semaphore to ensure that all local histograms were completed before reduction.

In PR #145536, the global histogram computation was merged with subsequent top-k calculations into a single kernel (`computeBlockwiseKthCounts`) to avoid the semaphore. While this simplifies synchronization, it introduces **redundant memory reads**:

- `computeBlockwiseKthCounts` launches `numInputSlices * blocks_per_slice` blocks.
- For each row (slice), `blocks_per_slice` CUDA blocks redundantly reload the same local histograms from global memory.

# This PR

To address this inefficiency, we introduce the following optimizations:

1. **Dedicated kernel**: Refactor global histogram and cumsum computation into a separate GPU kernel, `computeDigitCumSum`.
2. **Loop unrolling**: Apply loop unrolling in `computeDigitCumSum` to speed up local histogram reads.
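A schematic of the two-stage split in plain torch (shapes, bin count, and names are ours; the real kernels histogram radix digits, not value ranges):
```python
import torch

def local_histograms(chunks, bins=16):
    # stage 1: each "block" histograms its own chunk into global memory
    return torch.stack([torch.histc(c, bins=bins, min=0.0, max=1.0) for c in chunks])

def digit_cumsum(local_hists):
    # stage 2 (the dedicated kernel added here): read each local histogram
    # exactly once, reduce across blocks, then take the cumulative sum that
    # radix select consumes
    return torch.cumsum(local_hists.sum(dim=0), dim=0)

x = torch.rand(1024)
print(digit_cumsum(local_histograms(x.chunk(8))))
```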

# Performance
We benchmarked torch.topk on NVIDIA H20 with float32 inputs, extracting the top-100 values across different input sizes. The results in the table below demonstrate that this PR effectively eliminates the performance regression introduced in 2.7.0 and delivers substantial improvements on large inputs.

| Input Shape    | torch2.6.0 (ms) | torch2.8.0 (ms) | 2.8.0+this PR (ms) |
| -------------- | --------------- | --------------- | ------------------ |
| (1, 1B)        | 36.6            | 1564.1          | 25.6               |
| (1, 100M)      | 3.56            | 17.4            | 2.54               |
| (1, 1,000,000) | 0.135           | 0.145           | 0.098              |
| (512, 128000)  | 1.33            | 1.33            | 1.32               |
| (8192, 128000) | 19.6            | 19.6            | 19.4               |

Besides, I have verified the correctness of this PR with different inputs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164459
Approved by: https://github.com/ngimel, https://github.com/Skylion007
2025-10-07 11:04:03 +00:00
1fb072ac2a exceptions + unit tests (#164550)
Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```

Reviewed By: aorenste

Differential Revision: D83714688

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164550
Approved by: https://github.com/aorenste
2025-10-07 10:04:58 +00:00
cac5e13e13 [dynamo] Inline nn module calls using __call__ methods (#164817)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164817
Approved by: https://github.com/SherlockNoMad, https://github.com/mlazos
2025-10-07 08:57:20 +00:00
68350660ee Increase timeout for nightly macOS performance tests to 300 minutes (#164793)
The Test step time recently went up slightly.

hopefully this fixes https://github.com/pytorch/alerting-infra/issues/263
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164793
Approved by: https://github.com/seemethere
2025-10-07 08:44:07 +00:00
ef7e2ca77e remove check_is_size from test_misc.py (#164667)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164667
Approved by: https://github.com/angelayi
ghstack dependencies: #164664, #164665
2025-10-07 07:33:50 +00:00
cdaaf3e4a3 remove size-like based size-oblivious special max simplifications (#164665)
As we removed guard_size_oblivious, this simplification is no longer relevant. This is part of
the deprecation process for guard_size_oblivious and its dependencies.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164665
Approved by: https://github.com/aorenste
ghstack dependencies: #164664
2025-10-07 07:33:50 +00:00
0ea59c3c55 do not suggest torch._check_is_size() (#164664)
The size-like concept for data dependency is no longer relevant, as we removed all guard_size_oblivious calls.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164664
Approved by: https://github.com/angelayi, https://github.com/mlazos
2025-10-07 07:33:50 +00:00
8f705d019a context + unit tests (#164549)
Summary:
The context module provides configurable context selection + isolation key hashing.

Context selection is broken into runtime and compile context. Runtime context is decided at call time (inductor configs, precision configs, etc.) and compile context is decided at compile time (hardware type, software hashes).

Callees will be given access to SelectedRuntimeContext and SelectedCompileContext, which they can use to determine and select what context is necessary with regard to the function being cached.

These selected contexts are wrapped in an IsolationSchema, which denotes what context should be taken into consideration when producing an isolation key. The isolation key is essentially a salt of the function signature key: it says that a given function signature key result is valid under a given context (isolation schema).
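A toy rendering of the isolation-key idea (all names and the hashing scheme are our assumptions):
```python
import hashlib
import json

def isolation_key(signature_key, runtime_ctx, compile_ctx):
    # the isolation schema: which selected context salts the signature key
    schema = json.dumps({"runtime": runtime_ctx, "compile": compile_ctx}, sort_keys=True)
    return hashlib.sha256((signature_key + schema).encode()).hexdigest()

key = isolation_key(
    "fn:some_kernel:v1",
    runtime_ctx={"max_autotune": True},           # decided at call time
    compile_ctx={"hw": "H100", "triton": "3.0"},  # decided at compile time
)
print(key[:16])
```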

Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```

Reviewed By: aorenste

 D83714689

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164549
Approved by: https://github.com/aorenste
2025-10-07 06:02:10 +00:00
4bcc05777e [torchfuzz] synthesize inputs for data dependent ops (#164716)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164716
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687, #164688, #164693, #164694, #164715
2025-10-07 05:40:32 +00:00
2a6cdba6e5 [torchfuzz] various edge case fixes (#164715)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164715
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687, #164688, #164693, #164694
2025-10-07 05:30:46 +00:00
53f6cc7529 [torchfuzz] make ops_fuzzer deterministic (#164694)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164694
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687, #164688, #164693
2025-10-07 05:30:46 +00:00
ac901bf79a [torchfuzz] consolidate on a base implementation of args_codegen (#164693)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164693
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687, #164688
2025-10-07 05:20:28 +00:00
c965d6dbb2 [torchfuzz] move into experimental dir (#164688)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164688
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687
2025-10-07 05:09:08 +00:00
ac08556f67 [torchfuzz] support more unbacked functions (#164687)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164687
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649
2025-10-07 05:00:03 +00:00
5fe7f29b9e [torchfuzz] add support for operator weights (#164649)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164649
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646, #164647
2025-10-07 05:00:03 +00:00
ded099ecbf [torchfuzz] don't use the first gpu in multi process fuzzer (#164647)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164647
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514, #164646
2025-10-07 04:59:56 +00:00
63fcc3e6c4 [torchfuzz] update README.md (#164646)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164646
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434, #164514
2025-10-07 04:59:50 +00:00
552 changed files with 6911 additions and 4672 deletions

@@ -344,7 +344,7 @@ docker build \
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
--build-arg "KATEX=${KATEX:-}" \
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx1100}" \
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \

@@ -46,9 +46,9 @@ case ${DOCKER_TAG_PREFIX} in
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950 conditionally starting in ROCm 7.0
# add gfx950, gfx115x conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
fi
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
;;

@@ -115,6 +115,9 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
# cmake-3.28.0 from pip for onnxruntime
RUN python3 -mpip install cmake==3.28.0
ADD ./common/patch_libstdc.sh patch_libstdc.sh
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
# build onnxruntime 1.21.0 from sources.
# it is not possible to build it from sources using pip,
# so just build it from upstream repository.

@@ -84,9 +84,9 @@ case ${image} in
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950 conditionally starting in ROCm 7.0
# add gfx950, gfx115x conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
fi
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
;;

@@ -1,15 +1,11 @@
sphinx==5.3.0
sphinx==7.2.6
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
#Pinned versions: 7.2.6
standard-imghdr==3.13.0; python_version >= "3.13"
#Description: This is needed by Sphinx, so it needs to be added here.
# The reasons are as follows:
# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
pytorch_sphinx_theme2==0.1.0
#Description: This is needed to generate PyTorch docs
#Pinned versions: 0.1.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.
@@ -36,17 +32,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 2.13.0
breathe==4.34.0
breathe==4.36.0
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 4.34.0
#Pinned versions: 4.36.0
exhale==0.2.3
exhale==0.3.7
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.2.3
#Pinned versions: 0.3.7
docutils==0.16
docutils==0.20
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.16
#Pinned versions: 0.20
bs4==0.0.1
#Description: This is used to generate PyTorch C++ docs
@@ -56,13 +52,13 @@ IPython==8.12.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 8.12.0
myst-nb==0.17.2
myst-nb==1.3.0
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 0.17.2
#Pinned versions: 1.3.0
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinx-design==0.6.1
sphinxcontrib-mermaid==1.0.0
myst-parser==0.18.1
myst-parser==4.0.1

@@ -67,7 +67,7 @@ fi
# wheels with cxx11-abi
echo "Checking that the gcc ABI is what we expect"
if [[ "$(uname)" != 'Darwin' && "$(uname -m)" != "s390x" ]]; then
if [[ "$(uname)" != 'Darwin' ]]; then
# We also check that there are cxx11 symbols in libtorch
#
echo "Checking that symbols in libtorch.so have the right gcc abi"

@@ -102,8 +102,18 @@ if [ "$is_main_doc" = true ]; then
echo coverage output not found
exit 1
elif [ $undocumented -gt 0 ]; then
echo undocumented objects found:
echo "======================================"
echo "ERROR: $undocumented undocumented objects found!"
echo "======================================"
echo ""
echo "Full coverage report:"
cat build/coverage/python.txt
echo ""
echo "======================================"
echo "Undocumented modules/objects (lines after TOTAL):"
tail -n +$((lines - undocumented + 1)) build/coverage/python.txt
echo "======================================"
echo ""
echo "Make sure you've updated relevant .rsts in docs/source!"
echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
exit 1

@@ -886,7 +886,7 @@ test_inductor_torchbench_smoketest_perf() {
done
# Perform some "warm-start" runs for a few huggingface models.
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
python benchmarks/dynamo/check_accuracy.py \

@@ -18,6 +18,7 @@ class GitHubComment:
body_text: str
created_at: str
author_login: str
author_url: Optional[str]
author_association: str
editor_login: Optional[str]
database_id: int

@@ -38,6 +38,7 @@ def mock_get_comments() -> list[GitHubComment]:
body_text="mock_body_text",
created_at="",
author_login="",
author_url=None,
author_association="",
editor_login=None,
database_id=1,
@@ -48,6 +49,7 @@ def mock_get_comments() -> list[GitHubComment]:
body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
created_at="",
author_login=BOT_AUTHORS[1],
author_url=None,
author_association="",
editor_login=None,
database_id=2,

@@ -32,6 +32,7 @@ from trymerge import (
main as trymerge_main,
MandatoryChecksMissingError,
MergeRule,
PostCommentError,
RE_GHSTACK_DESC,
read_merge_rules,
remove_job_name_suffix,
@@ -588,6 +589,23 @@ class TestTryMerge(TestCase):
self.assertEqual(mock_merge_base, pr.get_merge_base())
mocked_gh_fetch_merge_base.assert_called_once()
def test_app_can_revert(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 164660)
repo = DummyGitRepo()
app_comment_id, impostor_comment_id = 3375785595, 3377647892
# Check that app can revert
self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id))
# But impostor can not
self.assertRaises(
PostCommentError,
lambda: validate_revert(repo, pr, comment_id=impostor_comment_id),
)
# Despite its name being the name of the bot
self.assertEqual(
pr.get_comment_by_id(impostor_comment_id).author_login,
"pytorch-auto-revert",
)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")

@@ -234,6 +234,7 @@ query ($owner: String!, $name: String!, $number: Int!) {
createdAt
author {
login
url
}
authorAssociation
editor {
@@ -1093,6 +1094,7 @@ class GitHubPR:
body_text=node["bodyText"],
created_at=node["createdAt"] if "createdAt" in node else "",
author_login=node["author"]["login"],
author_url=node["author"].get("url", None),
author_association=node["authorAssociation"],
editor_login=editor["login"] if editor else None,
database_id=node["databaseId"],
@@ -2029,6 +2031,11 @@ def validate_revert(
# For some reason, one can not be a member of private repo, only CONTRIBUTOR
if pr.is_base_repo_private():
allowed_reverters.append("CONTRIBUTOR")
# Special case the pytorch-auto-revert app, which does not have an association
# But should be able to issue revert command
if comment.author_url == "https://github.com/apps/pytorch-auto-revert":
allowed_reverters.append("NONE")
if author_association not in allowed_reverters:
raise PostCommentError(
f"Will not revert as @{author_login} is not one of "

@@ -63,6 +63,7 @@ jobs:
# Same as the build job
python-version: 3.12.7
test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
timeout-minutes: 300
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4

@@ -59,3 +59,29 @@ jobs:
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-gfx1100-test:
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10-gfx1100
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
tests-to-include: >
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune
secrets: inherit

.gitignore

@@ -88,7 +88,7 @@ torch_compile_debug/
# Listed manually because some files in this directory are not generated
torch/testing/_internal/generated/annotated_fn_args.py
torch/testing/_internal/data/*.pt
torch/csrc/api/include/torch/version.h
torch/headeronly/version.h
torch/csrc/cudnn/cuDNN.cpp
torch/csrc/generated
torch/csrc/generic/TensorMethods.cpp

@@ -28,7 +28,7 @@ exclude_patterns = [
'torch/lib/**',
'venv/**',
'**/*.pyi',
"tools/experimental/dynamic_shapes/torchfuzz/**",
"tools/experimental/torchfuzz/**",
'tools/test/test_selective_build.py',
]
command = [
@@ -198,7 +198,7 @@ exclude_patterns = [
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
'tools/experimental/dynamic_shapes/torchfuzz/**',
'tools/experimental/torchfuzz/**',
]
command = [
'python3',

@@ -13,6 +13,9 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
load("//:tools/bazel.bzl", "rules")
# Export files for use by torch/headeronly (where version.h generation now lives)
exports_files(["version.txt"])
define_targets(rules = rules)
COMMON_COPTS = [
@ -690,7 +693,9 @@ cc_library(
"torch/csrc/*/generated/*.h",
"torch/csrc/jit/serialization/mobile_bytecode_generated.h",
] + torch_cuda_headers,
) + GENERATED_AUTOGRAD_CPP + [":version_h"],
) + GENERATED_AUTOGRAD_CPP + [
"//torch/headeronly:version_h",
],
includes = [
"third_party/kineto/libkineto/include",
"torch/csrc",


@ -229,14 +229,14 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
}
void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef<int64_t> size) {
return _resize_(sparse_dim, dense_dim, size);
_resize_(sparse_dim, dense_dim, size);
}
void resize_(
int64_t sparse_dim,
int64_t dense_dim,
ArrayRef<c10::SymInt> size) {
return _resize_(sparse_dim, dense_dim, size);
_resize_(sparse_dim, dense_dim, size);
}
// NOTE: this function will resize the sparse tensor and also set `indices`


@ -59,7 +59,7 @@ static inline void set_item(const Tensor& self, ArrayRef<TensorIndex> indices, c
}
}
return set_item(self, indices, value);
set_item(self, indices, value);
}
} // namespace indexing


@ -765,7 +765,8 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) {
if (numel == 0) {
return;
} else if (numel < grain_size || at::get_num_threads() == 1) {
return serial_for_each(loop, {0, numel});
serial_for_each(loop, {0, numel});
return;
} else {
at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
serial_for_each(loop, {begin, end});


@ -49,7 +49,7 @@ static void check_unique_names(DimnameList names) {
}
void check_names_valid_for(const TensorBase& tensor, DimnameList names) {
return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
}
void check_names_valid_for(size_t tensor_dim, DimnameList names) {


@ -138,7 +138,7 @@ void Tensor::_backward(TensorList inputs,
const std::optional<Tensor>& gradient,
std::optional<bool> keep_graph,
bool create_graph) const {
return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
}
const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const {


@ -496,7 +496,7 @@ class TORCH_API OperatorHandle {
}
void checkInvariants() const {
return operatorDef_->op.checkInvariants();
operatorDef_->op.checkInvariants();
}
c10::ArrayRef<at::Tag> getTags() const {
@ -932,7 +932,7 @@ inline void Dispatcher::redispatchBoxed(
}
#endif
const auto& kernel = entry.lookup(dispatchKeySet);
return kernel.callBoxed(op, dispatchKeySet, stack);
kernel.callBoxed(op, dispatchKeySet, stack);
}
} // namespace c10


@ -326,6 +326,23 @@ bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
#endif
}
bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const {
#if AT_CUDNN_ENABLED() && (CUDNN_VERSION >= 91300)
if (!hasCUDA()) {
return false;
}
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
// BF16 cuDNN RNNs require compute capability 8.0 (Ampere) or newer
return prop->major >= 8;
#else
return false;
#endif
}
long CUDAHooks::versionCuDNN() const {
#if AT_CUDNN_ENABLED()
return CUDNN_VERSION;


@ -45,6 +45,7 @@ struct CUDAHooks : public at::CUDAHooksInterface {
bool supportsDilatedConvolutionWithCuDNN() const override;
bool supportsDepthwiseConvolutionWithCuDNN() const override;
bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
bool supportsBFloat16RNNWithCuDNN() const override;
bool hasCUDART() const override;
long versionCUDART() const override;
long versionCuDNN() const override;


@ -166,6 +166,10 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
return false;
}
virtual bool supportsBFloat16RNNWithCuDNN() const {
return false;
}
virtual long versionCuDNN() const {
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
}


@ -465,11 +465,11 @@ static void dynamicLayerBack(const c10::OperatorHandle& op, torch::jit::Stack* s
// used for functions that have aliasing operations but should be treated like they're out of place (i.e. lift_fresh)
static void dynamicLayerBackGradSpecialCase(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
return dynamicLayerBack(op, stack, true);
dynamicLayerBack(op, stack, true);
}
static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
return dynamicLayerBack(op, stack, false);
dynamicLayerBack(op, stack, false);
}
TORCH_LIBRARY_IMPL(_, FuncTorchDynamicLayerFrontMode, m) {


@ -375,7 +375,7 @@ static void bf16_gemv_trans(
const at::BFloat16 beta,
at::BFloat16* y,
const int incy) {
return bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
}
template <>


@ -70,7 +70,7 @@ inline void searchsorted_maybe_trim_input_tensors(
const Tensor& raw_boundaries) {
Tensor trimmed_sorter;
Tensor raw_sorter;
return searchsorted_maybe_trim_input_tensors(
searchsorted_maybe_trim_input_tensors(
trimmed_input,
trimmed_boundaries,
trimmed_sorter,


@ -93,6 +93,12 @@ inline bool cond_cudnn_grid_sampler(
const TensorBase& input,
const TensorBase& grid
) {
auto st = input.scalar_type();
if (!(st == kDouble || st == kFloat || st == kHalf))
return false;
st = grid.scalar_type();
if (!(st == kDouble || st == kFloat || st == kHalf))
return false;
return (
at::native::cudnn_is_acceptable(input) &&
at::native::cudnn_is_acceptable(grid) &&


@ -108,6 +108,13 @@ bool use_mkldnn(const Tensor& input, TensorList params, TensorList hx) {
return false;
}
bool use_cudnn(const Tensor& t) {
bool acceptable = at::cudnn_is_acceptable(t);
auto st = t.scalar_type();
bool bfloat16_cond = st == kBFloat16 && at::detail::getCUDAHooks().supportsBFloat16RNNWithCuDNN();
return acceptable && (bfloat16_cond || st == kDouble || st == kFloat || st == kHalf);
}
template<typename T>
using pair_of = std::pair<T, T>;
@ -1200,7 +1207,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
bool train, \
bool bidirectional, \
bool batch_first) { \
if (at::cudnn_is_acceptable(_input)) { \
if (use_cudnn(_input)) { \
Tensor output, hy; \
NAME##_cudnn_stub( \
_input.device().type(), \
@ -1262,7 +1269,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
double dropout_p, \
bool train, \
bool bidirectional) { \
if (at::cudnn_is_acceptable(data)) { \
if (use_cudnn(data)) { \
Tensor output, hy; \
NAME##_packed_cudnn_stub( \
data.device().type(), \
@ -1430,7 +1437,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
TensorList _params, bool has_biases,
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
if (at::cudnn_is_acceptable(_input)) {
if (use_cudnn(_input)) {
Tensor output, hy, cy;
lstm_cudnn_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases,
num_layers, dropout_p, train, bidirectional, batch_first);
@ -1491,7 +1498,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
TensorList _params, bool has_biases,
int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
if (at::cudnn_is_acceptable(data)) {
if (use_cudnn(data)) {
Tensor output, hy, cy;
lstm_packed_cudnn_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx,
_params, has_biases, num_layers, dropout_p, train, bidirectional);

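A hedged usage sketch of the new path: on a CUDA build with cuDNN >= 9.13 and an SM 8.0+ GPU, a bfloat16 LSTM should now satisfy use_cudnn() above and reach the cuDNN stubs:

import torch

# Sketch only; assumes cuDNN >= 9.13 and compute capability >= 8.0.
if torch.cuda.is_available():
    rnn = torch.nn.LSTM(input_size=64, hidden_size=128).cuda().bfloat16()
    x = torch.randn(16, 8, 64, device="cuda", dtype=torch.bfloat16)
    out, (h, c) = rnn(x)  # should now dispatch to lstm_cudnn_stub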

@ -23,14 +23,6 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_cast_Byte_native.h>
#include <ATen/ops/_cast_Char_native.h>
#include <ATen/ops/_cast_Double_native.h>
#include <ATen/ops/_cast_Float_native.h>
#include <ATen/ops/_cast_Half_native.h>
#include <ATen/ops/_cast_Int_native.h>
#include <ATen/ops/_cast_Long_native.h>
#include <ATen/ops/_cast_Short_native.h>
#include <ATen/ops/_dim_arange_native.h>
#include <ATen/ops/_efficientzerotensor_native.h>
#include <ATen/ops/_empty_affine_quantized.h>


@ -91,9 +91,6 @@ bool cudnn_is_acceptable(const TensorBase& self) {
return false;
if (!self.is_cuda())
return false;
auto st = self.scalar_type();
if (!(st == kDouble || st == kFloat || st == kHalf))
return false;
if (!detail::getCUDAHooks().compiledWithCuDNN())
return false;
// cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty


@ -25,11 +25,11 @@
namespace at::native {
void _backward(const Tensor& self, TensorList inputs, const std::optional<Tensor>& gradient_opt, std::optional<bool> keep_graph, bool create_graph) {
return self._backward(inputs, gradient_opt, keep_graph, create_graph);
self._backward(inputs, gradient_opt, keep_graph, create_graph);
}
void set_data(Tensor& self, const Tensor& new_data) {
return self.set_data(new_data);
self.set_data(new_data);
}
Tensor data(const Tensor& self) {
@ -54,7 +54,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) {
}
void retain_grad(Tensor& self) {
return self.retain_grad();
self.retain_grad();
}
bool retains_grad(const Tensor& self) {


@ -300,7 +300,8 @@ void div_floor_kernel(TensorIteratorBase& iter) {
// In the special case of unsigned integer division, floor division is
// equivalent to truncation division (since the signs of the divisor and
// dividend are always the same)
return div_trunc_kernel(iter);
div_trunc_kernel(iter);
return;
} else if (isIntegralType(dtype, /*includeBool*/ false)) {
// There's no SIMD integer division, so don't try to vectorize it.
AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cpu", [&]() {

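A quick illustration of the fast path above: for unsigned dtypes the operands' signs always match, so floor and truncation division coincide:

import torch

a = torch.tensor([7, 9], dtype=torch.uint8)
b = torch.tensor([2, 4], dtype=torch.uint8)
assert torch.equal(torch.div(a, b, rounding_mode="floor"),
                   torch.div(a, b, rounding_mode="trunc"))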

@ -749,21 +749,29 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
// });
if (iter_dtype == kByte) {
return cpu_hflip_vec<uint8_t>(iter);
cpu_hflip_vec<uint8_t>(iter);
return;
} else if (iter_dtype == kChar) {
return cpu_hflip_vec<int8_t>(iter);
cpu_hflip_vec<int8_t>(iter);
return;
} else if (iter_dtype == kInt) {
return cpu_hflip_vec<int32_t>(iter);
cpu_hflip_vec<int32_t>(iter);
return;
} else if (iter_dtype == kLong) {
return cpu_hflip_vec<int64_t>(iter);
cpu_hflip_vec<int64_t>(iter);
return;
} else if (iter_dtype == kShort) {
return cpu_hflip_vec<int16_t>(iter);
cpu_hflip_vec<int16_t>(iter);
return;
} else if (iter_dtype == kBool) {
return cpu_hflip_vec<bool>(iter);
cpu_hflip_vec<bool>(iter);
return;
} else if (iter_dtype == kFloat) {
return cpu_hflip_vec<float>(iter);
cpu_hflip_vec<float>(iter);
return;
} else if (iter_dtype == kDouble) {
return cpu_hflip_vec<double>(iter);
cpu_hflip_vec<double>(iter);
return;
}
}
// other dtypes (float16, bfloat16, complex) are handled by cpu_kernel_vec (see below)
@ -778,10 +786,12 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
c == input_strides_2[1] &&
c == iter.element_size(0) * iter.shape()[0] // checks if dim=1 is contiguous as well
) {
return cpu_hflip_channels_last_vec(iter);
cpu_hflip_channels_last_vec(iter);
return;
}
// Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec)
return cpu_vflip_memcpy(iter);
cpu_vflip_memcpy(iter);
return;
}
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu",


@ -96,11 +96,14 @@ static void pow_tensor_scalar_kernel(
dtype == kBFloat16 || isComplexType(dtype)) {
// Dispatch to fast specialization for sqrt, rsqrt and reciprocal
if (exp_scalar.equal(.5)) {
return sqrt_kernel(iter);
sqrt_kernel(iter);
return;
} else if (exp_scalar.equal(-0.5)) {
return rsqrt_kernel(iter);
rsqrt_kernel(iter);
return;
} else if (exp_scalar.equal(-1.0)) {
return reciprocal_kernel(iter);
reciprocal_kernel(iter);
return;
}
}

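The specializations above are observable from Python: scalar exponents 0.5, -0.5, and -1.0 are routed to the sqrt, rsqrt, and reciprocal kernels, so the results match those ops exactly:

import torch

x = torch.rand(1024) + 0.1  # positive, away from zero
assert torch.allclose(x.pow(0.5), x.sqrt())
assert torch.allclose(x.pow(-0.5), x.rsqrt())
assert torch.allclose(x.pow(-1.0), x.reciprocal())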

@ -256,10 +256,10 @@ static void norm_kernel_tensor_iterator_impl(
} else {
if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) {
// type promotion that does cast and reduction in a single kernel
return norm_kernel_cpu_impl<at::Half, float>(iter, val);
norm_kernel_cpu_impl<at::Half, float>(iter, val);
return;
} else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) {
// type promotion that does cast and reduction in a single kernel
return norm_kernel_cpu_impl<at::BFloat16, float>(iter, val);
norm_kernel_cpu_impl<at::BFloat16, float>(iter, val);
return;
}
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kComplexHalf, iter.input_dtype(), "norm_cpu", [&] {

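The mixed-dtype fast path above should be reachable from Python by requesting a float accumulation dtype for a half (or bfloat16) input, e.g.:

import torch

x = torch.rand(1000, dtype=torch.half)
# cast and reduction happen in a single kernel on the patched path
out = torch.linalg.vector_norm(x, ord=2, dtype=torch.float32)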

@ -428,10 +428,11 @@ void fp16_gemv_trans(
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0);
#if !defined(__aarch64__) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
if (at::globalContext().allowFP16ReductionCPU()) {
return fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
return;
}
#endif
return fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
}
float bf16_dot_with_fp32_arith(const at::BFloat16* vec1, const at::BFloat16* vec2, int64_t len) {
@ -465,7 +466,7 @@ void bf16_gemv_trans(
at::BFloat16* y,
const int incy) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0 && beta == 0.0);
return bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
}
float fp16_dot(


@ -121,7 +121,7 @@ void cufft_set_plan_cache_max_size_impl(DeviceIndex device_index, int64_t max_si
"cufft_set_plan_cache_max_size: expected 0 <= device_index < ",
at::detail::getCUDAHooks().deviceCount(), "], but got device_index=",
device_index);
return cufft_get_plan_cache(device_index).resize(max_size);
cufft_get_plan_cache(device_index).resize(max_size);
}
int64_t cufft_get_plan_cache_size_impl(DeviceIndex device_index) {
@ -137,7 +137,7 @@ void cufft_clear_plan_cache_impl(DeviceIndex device_index) {
"cufft_clear_plan_cache: expected 0 <= device_index < ",
at::detail::getCUDAHooks().deviceCount(), "], but got device_index=",
device_index);
return cufft_get_plan_cache(device_index).clear();
cufft_get_plan_cache(device_index).clear();
}
} // namespace at::native::detail


@ -230,7 +230,7 @@ constexpr int BLOCK_THREADS = 256;
constexpr int RADIX_BITS = 8;
constexpr int RADIX_DIGITS = 1 << RADIX_BITS; // 2 ^ RADIX_BITS
constexpr int RADIX_MASK = (RADIX_DIGITS - 1);
static_assert(RADIX_DIGITS <= BLOCK_THREADS, "radixFindKthValues kernel requires RADIX_DIGITS <= BLOCK_THREADS");
static_assert(RADIX_DIGITS <= BLOCK_THREADS, "RADIX_DIGITS must be <= BLOCK_THREADS");
constexpr int MIN_ITEMS_PER_THREAD = 4;
constexpr int MAX_ITEMS_PER_THREAD = 64;
@ -242,11 +242,10 @@ __global__ void fill(T* x, T value, IndexType size) {
}
}
// find the kth smallest value,
// for largest topk, k_to_find = slice_size - k + 1
// compute local histogram for each block
template <typename T, typename IndexType, typename Bitwise, int Dim>
C10_LAUNCH_BOUNDS_1(BLOCK_THREADS)
__global__ void radixFindKthValues(
__global__ void computeBlockDigitCounts(
at::cuda::detail::TensorInfo<const T, IndexType> input,
uint32_t slice_size,
uint32_t* ks_to_find, // size: num_slices, unused arg but for mysterious reasons perf is better when it's present
@ -321,12 +320,51 @@ __global__ void radixFindKthValues(
}
}
// compute global histogram and cumsum for each row
__global__ void computeDigitCumSum(
short* counts,
uint32_t* digit_cum_sum,
uint32_t blocks_per_slice) {
int tidx = threadIdx.x + blockIdx.x * blockDim.x;
int digit_idx = threadIdx.x;
uint32_t slice_idx = blockIdx.x;
typedef cub::BlockScan<uint32_t, RADIX_DIGITS> BlockScan;
__shared__ typename BlockScan::TempStorage scan_storage;
// accumulates counters from multiple blocks
uint32_t digit_count = 0;
if (threadIdx.x < RADIX_DIGITS) {
constexpr int HISTO_ACCUM_TILE = 4;
uint32_t rounds = blocks_per_slice / HISTO_ACCUM_TILE;
for (int iter = 0; iter < rounds; iter++) {
int base = HISTO_ACCUM_TILE * iter;
#pragma unroll
for (int j = 0; j < HISTO_ACCUM_TILE; j++) {
int blk = base + j;
digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + digit_idx];
}
}
for (int blk = HISTO_ACCUM_TILE * rounds; blk < blocks_per_slice; blk++) {
digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + digit_idx];
}
}
// compute the block-wide inclusive prefix sum
uint32_t digit_count_cumsum;
BlockScan(scan_storage).InclusiveSum(digit_count, digit_count_cumsum);
__syncthreads();
if (threadIdx.x < RADIX_DIGITS) {
digit_cum_sum[tidx] = digit_count_cumsum;
}
}
// Assumption: k can not be larger than UINT32_MAX
template <typename Bitwise, typename T>
C10_LAUNCH_BOUNDS_1(RADIX_DIGITS) // one thread per digit
__global__ void computeBlockwiseWithinKCounts(
Bitwise* desires_in, // size: num_slices
short* counts, // size: num_slices * blocks_per_slice * radix_digits
uint32_t* digit_cum_sum,
uint32_t* ks_to_find_in, // size: num_slices
uint32_t blocks_per_slice,
int current_bit,
@ -338,7 +376,7 @@ __global__ void computeBlockwiseWithinKCounts(
Bitwise* desires_out,
uint32_t num_blocks
) {
// This kernel should be launched with the same number of blocks as the `radixFindKthValues` kernel.
// This kernel should be launched with the same number of blocks as the `computeBlockDigitCounts` kernel.
int tidx = threadIdx.x;
uint32_t block_idx = getLinearBlockId<uint32_t>();
uint32_t slice_idx = block_idx / blocks_per_slice;
@ -351,36 +389,15 @@ __global__ void computeBlockwiseWithinKCounts(
if (block_idx >= num_blocks) {
return;
}
typedef cub::BlockScan<uint32_t, BLOCK_THREADS> BlockScan;
union __align__(16) TempStorage {
uint32_t digit_count_cumsum[RADIX_DIGITS]; // only used if this is the last block for this slice
typename BlockScan::TempStorage scan_storage;
};
__shared__ TempStorage temp_storage;
// accumulates counters from multiple blocks
uint32_t digit_count = 0;
if (tidx < RADIX_DIGITS) {
for (int blk = 0; blk < blocks_per_slice; ++blk) {
digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + tidx];
}
}
// compute the block-wide inclusive prefix sum
uint32_t digit_count_cumsum;
BlockScan(temp_storage.scan_storage).InclusiveSum(digit_count, digit_count_cumsum);
__syncthreads();
// every thread also needs the prefix_sum of the value to its left for comparison, so save a copy in shared mem
if (tidx < RADIX_DIGITS) {
temp_storage.digit_count_cumsum[tidx] = digit_count_cumsum;
}
__syncthreads();
__shared__ Bitwise desired;
uint32_t k_to_find = ks_to_find_in[slice_idx];
if (tidx < RADIX_DIGITS) {
uint32_t digit_count_cumsum_left = (tidx == 0) ? 0 : temp_storage.digit_count_cumsum[tidx - 1];
uint32_t position = slice_idx * RADIX_DIGITS + tidx;
uint32_t digit_count_cumsum = digit_cum_sum[position];
uint32_t digit_count_cumsum_left = (tidx == 0) ? 0 : digit_cum_sum[position - 1];
// if not the last pass: update desired and ks_to_find
// if last pass: write out the kth value
@ -466,7 +483,7 @@ template <typename Bitwise>
__global__ void computeBlockwiseKthCounts(
Bitwise* desires, // size: num_slices
short* counts, // size: num_slices * blocks_per_slice * radix_digits
uint32_t num_blocks, // the number of blocks used by `radixFindKthValues` kernel
uint32_t num_blocks, // the number of blocks used by `computeBlockDigitCounts` kernel
uint32_t blocks_per_slice,
// outputs:
uint32_t* kthCounts // size: num_slices * blocks_per_slice == num_blocks
@ -649,9 +666,7 @@ void launch(
T* kthValues = reinterpret_cast<T*>(kthValues_buffer.get());
TORCH_CHECK(blocks_per_slice <= std::numeric_limits<uint32_t>::max(), "blocks_per_slice larger than uint32 maximum is not supported");
auto semaphores_buffer = allocator.allocate(numInputSlices * sizeof(uint32_t));
uint32_t* semaphores = reinterpret_cast<uint32_t*>(semaphores_buffer.get());
AT_CUDA_CHECK(cudaMemsetAsync(semaphores, 0, numInputSlices * sizeof(uint32_t), stream));
auto ks_to_find_buffer = allocator.allocate(2 * numInputSlices * sizeof(uint32_t));
uint32_t* ks_to_find = reinterpret_cast<uint32_t*>(ks_to_find_buffer.get());
@ -668,6 +683,10 @@ void launch(
static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits<short>::max(),
"blockwise counter too large");
auto digit_cum_sum_buffer = allocator.allocate(numInputSlices * RADIX_DIGITS * sizeof(uint32_t));
uint32_t* digit_cum_sum = reinterpret_cast<uint32_t*>(digit_cum_sum_buffer.get());
AT_CUDA_CHECK(cudaMemsetAsync(digit_cum_sum, 0, numInputSlices * RADIX_DIGITS * sizeof(uint32_t), stream));
#if CUB_SUPPORTS_SCAN_BY_KEY()
auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
uint32_t* withinKCounts = reinterpret_cast<uint32_t*>(withinKCounts_buffer.get());
@ -691,7 +710,7 @@ void launch(
// iterate radix bits for multiple passes
for (int current_bit = sizeof(T) * 8 - RADIX_BITS; current_bit >= 0; current_bit -= RADIX_BITS) {
radixFindKthValues<T, IndexType, Bitwise, Dim><<<grid, block, 0, stream>>>(
computeBlockDigitCounts<T, IndexType, Bitwise, Dim><<<grid, block, 0, stream>>>(
input,
inputSliceSize,
ks_to_find_in, // unused arg
@ -704,10 +723,14 @@ void launch(
desired_in,
counts);
C10_CUDA_KERNEL_LAUNCH_CHECK();
computeDigitCumSum<<<numInputSlices, RADIX_DIGITS, 0, stream>>>(counts, digit_cum_sum, blocks_per_slice);
C10_CUDA_KERNEL_LAUNCH_CHECK();
// we unconditionally call this kernel to update desired/ks_to_find/kthValues
// if cub supports scan_by_key we additionally do k counts
computeBlockwiseWithinKCounts<Bitwise, T><<<grid, RADIX_DIGITS, 0, stream>>>(
desired_in, counts, ks_to_find_in, blocks_per_slice, current_bit, largest, withinKCounts, kthValues, ks_to_find_out, desired_out, num_blocks);
desired_in, counts, digit_cum_sum, ks_to_find_in, blocks_per_slice, current_bit, largest, withinKCounts, kthValues, ks_to_find_out, desired_out, num_blocks);
C10_CUDA_KERNEL_LAUNCH_CHECK();
// swap desired/ks_to_find in and out for next iter
auto tmp_desired = desired_in;

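For reference, the new computeDigitCumSum kernel computes, per slice, the per-digit totals across blocks followed by an inclusive prefix sum. A host-side sketch of the same computation (shapes assumed from the comments above):

import torch

RADIX_DIGITS = 256  # 1 << RADIX_BITS

def digit_cum_sum_reference(counts: torch.Tensor) -> torch.Tensor:
    # counts: (blocks_per_slice, RADIX_DIGITS) per-block histograms for one slice
    per_digit = counts.sum(dim=0)          # accumulate counters across blocks
    return torch.cumsum(per_digit, dim=0)  # inclusive scan, mirroring cub::BlockScan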

@ -1107,10 +1107,14 @@ void ldl_factor_kernel(
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Cusolver:
return ldl_factor_cusolver(
{
ldl_factor_cusolver(
LD, pivots, info, upper, hermitian);
return;
}
case at::LinalgBackend::Magma:
return ldl_factor_magma(LD, pivots, info, upper, hermitian);
{
ldl_factor_magma(LD, pivots, info, upper, hermitian);
return;
}
default:
// By default use cusolver if available and magma otherwise.
// If cusolver and magma 2.5.4+ are both available and hermitian=true,
@ -1122,8 +1126,10 @@ void ldl_factor_kernel(
LD, pivots, info, upper, hermitian);
}
#endif
return ldl_factor_cusolver(
LD, pivots, info, upper, hermitian);
{
ldl_factor_cusolver(
LD, pivots, info, upper, hermitian);
return;
}
#else
return ldl_factor_magma(LD, pivots, info, upper, hermitian);
#endif
@ -1839,11 +1845,14 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
// For the benchmarks see
// https://github.com/pytorch/pytorch/pull/56253#discussion_r622851107
if (input.size(-2) <= 256 && batchCount(input) >= std::max<int64_t>(2, input.size(-2) / 16)) {
return geqrf_batched_cublas(input, tau);
geqrf_batched_cublas(input, tau);
return;
} else {
return geqrf_cusolver(input, tau);
geqrf_cusolver(input, tau);
return;
}
return geqrf_batched_cublas(input, tau);
geqrf_batched_cublas(input, tau);
return;
};
auto preferred_backend = at::globalContext().linalgPreferredBackend();
@ -1856,10 +1865,14 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
// - ?geqrf_gpu allows fast computation of Q via ?orgqr_gpu, but doesn't give R properly.
// - ?geqrf2_gpu gives correct R, but doesn't allow computation of Q via ?orgqr_gpu
case at::LinalgBackend::Magma:
return geqrf_magma(input, tau);
{
geqrf_magma(input, tau);
return;
}
case at::LinalgBackend::Cusolver:
default:
return geqrf_cusolver_backend(input, tau);
{
geqrf_cusolver_backend(input, tau);
return;
}
}
#else
return geqrf_magma(input, tau);
@ -2703,13 +2716,17 @@ void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) {
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Magma:
return gels_magma(a, b, infos);
{
gels_magma(a, b, infos);
return;
}
case at::LinalgBackend::Cusolver:
default:
// linalg_lstsq_gels is a generic function that is implemented using
// geqrf_stub, ormqr_stub, and triangular_solve_stub
// It dispatches to cuSOLVER for CUDA inputs if USE_LINALG_SOLVER is defined
return linalg_lstsq_gels(a, b, infos);
{
linalg_lstsq_gels(a, b, infos);
return;
}
}
#else
return gels_magma(a, b, infos);

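To make the geqrf heuristic above concrete, a hedged restatement with example inputs (the helper name is illustrative):

# Prefer batched cuBLAS for many small matrices; otherwise use cuSOLVER.
def use_batched_cublas(m: int, batch: int) -> bool:
    return m <= 256 and batch >= max(2, m // 16)

assert use_batched_cublas(m=128, batch=64)       # many small QRs -> geqrf_batched_cublas
assert not use_batched_cublas(m=1024, batch=64)  # large matrix -> geqrf_cusolver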

@ -1222,7 +1222,7 @@ cudnnRNNAlgo_t get_algo(
}
cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) {
if (dtype == CUDNN_DATA_HALF) {
if (dtype == CUDNN_DATA_HALF || dtype == CUDNN_DATA_BFLOAT16) {
return CUDNN_DATA_FLOAT;
}
return dtype;


@ -373,59 +373,67 @@ void addmm_out_sparse_csr(
if (mat2.layout() == kSparseCsr) {
if (result.layout() == kStrided) {
// TODO: Add native CSC support via cuSPARSE if supported.
return addmm_dense_result(
addmm_dense_result(
mat2.transpose(0, 1).to_sparse_csr(),
mat1.transpose(0, 1),
beta,
alpha,
result.transpose(0, 1));
return;
}
}
if (mat2.layout() == kSparseCsc) {
if (result.layout() == kStrided) {
return addmm_dense_result(
addmm_dense_result(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
return;
}
}
if (mat2.layout() == kSparseBsc) {
if (result.layout() == kStrided) {
return addmm_dense_result(
addmm_dense_result(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
return;
}
}
}
if (mat1.layout() == kSparseCsr) {
if (mat2.layout() == kStrided) {
if (result.layout() == kStrided) {
return addmm_dense_result(mat1, mat2, beta, alpha, result);
addmm_dense_result(mat1, mat2, beta, alpha, result);
return;
}
}
if (mat2.layout() == kSparseCsr) {
if (result.layout() == kStrided) {
return addmm_sparse_input_dense_result(mat1, mat2, beta, alpha, result);
addmm_sparse_input_dense_result(mat1, mat2, beta, alpha, result);
return;
}
if (result.layout() == kSparseCsr) {
return addmm_sparse_result(mat1, mat2, beta, alpha, result);
addmm_sparse_result(mat1, mat2, beta, alpha, result);
return;
}
}
if (mat2.layout() == kSparseCsc) {
if (result.layout() == kStrided) {
// TODO: CSR @ CSC kernel would be very fast due to format alignment
return addmm_sparse_input_dense_result(
mat1, mat2.to_sparse_csr(), beta, alpha, result);
addmm_sparse_input_dense_result(
mat1, mat2.to_sparse_csr(), beta, alpha, result);
return;
}
if (result.layout() == kSparseCsr) {
// TODO: CSR @ CSC kernel would be very fast due to format alignment
return addmm_sparse_result(
mat1, mat2.to_sparse_csr(), beta, alpha, result);
addmm_sparse_result(
mat1, mat2.to_sparse_csr(), beta, alpha, result);
return;
}
}
}
@ -433,56 +441,62 @@ void addmm_out_sparse_csr(
if (mat2.layout() == kStrided) {
if (result.layout() == kStrided) {
// TODO: avoid csc->csr conversion with native csc support
return addmm_dense_result(
mat1.to_sparse_csr(), mat2, beta, alpha, result);
addmm_dense_result(
mat1.to_sparse_csr(), mat2, beta, alpha, result);
return;
}
}
if (mat2.layout() == kSparseCsr) {
if (result.layout() == kSparseCsr) {
// TODO: avoid csc->csr conversion with native csc support
return addmm_sparse_result(
mat1.to_sparse_csr(), mat2, beta, alpha, result);
addmm_sparse_result(
mat1.to_sparse_csr(), mat2, beta, alpha, result);
return;
}
}
if (mat2.layout() == kSparseCsc) {
if (result.layout() == kStrided) {
return addmm_sparse_input_dense_result(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
addmm_sparse_input_dense_result(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
return;
}
if (result.layout() == kSparseCsr) {
// TODO avoid csc->csr
return addmm_sparse_result(
mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
addmm_sparse_result(
mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
return;
}
if (result.layout() == kSparseCsc) {
return addmm_sparse_result(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
addmm_sparse_result(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
return;
}
}
}
if (mat1.layout() == kSparseBsr) {
if (mat2.layout() == kStrided) {
if (result.layout() == kStrided) {
return addmm_dense_result(mat1, mat2, beta, alpha, result);
addmm_dense_result(mat1, mat2, beta, alpha, result);
return;
}
}
}
TORCH_CHECK(
false,
"addmm: computation on CPU is not implemented for ",
result.layout(),
" + ",
mat1.layout(),
" @ ",
mat2.layout());
}
/*
@ -496,16 +510,16 @@ void addmm_out_sparse_csr(
[out] result of the operation.
*/
void addmv_out_sparse_csr(
const Tensor& mat,
const Tensor& vec,
const Scalar& beta,
const Scalar& alpha,
const Tensor& result) {
#if !AT_USE_MKL_SPARSE()
TORCH_CHECK(
false,
"Calling addmv on a sparse CPU tensor requires Linux platform. ",
"Please use PyTorch built with MKL on Linux.");
#else
c10::MaybeOwned<Tensor> result_ = prepare_dense_vector_for_mkl(result);
c10::MaybeOwned<Tensor> vec_ = prepare_dense_vector_for_mkl(vec);

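A hedged usage sketch for the CSR @ dense -> dense path handled above (assumes a PyTorch build with MKL sparse support, per the TORCH_CHECK in this file):

import torch

a = torch.eye(4).to_sparse_csr()
b = torch.rand(4, 3)
out = torch.addmm(torch.zeros(4, 3), a, b)  # should hit addmm_dense_result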

@ -5,38 +5,6 @@
# representing ScalarType's. They are now superseded by usage of
# `aten::to()`. The ops remain here for backward compatibility purposes.
# DEPRECATED. DO NOT USE
- func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# DEPRECATED. DO NOT USE
- func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor
variants: function
# Computes the gradient of current tensor w.r.t. graph leaves.
- func: _backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()
manual_cpp_binding: True

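As the retained comment notes, the removed _cast_* ops are superseded by aten::to; the replacement from Python is simply:

import torch

x = torch.arange(4)
y = x.to(torch.uint8)  # replaces the deprecated torch._cast_Byte(x)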

@ -810,7 +810,8 @@ void addmm_out_sparse_csr(
if (mat1.layout() == kSparseBsr) {
if (mat2.layout() == kStrided) {
if (result.layout() == kStrided)
return block_sparse_mm(input, mat1, mat2, beta, alpha, result);
{
block_sparse_mm(input, mat1, mat2, beta, alpha, result);
return;
}
}
}
@ -819,13 +820,13 @@ void addmm_out_sparse_csr(
if (result.layout() == kStrided) {
auto result_t = result.transpose(-2, -1);
auto input_t = (result.is_same(input) ? result_t : input.transpose(-2, -1));
return block_sparse_mm(
block_sparse_mm(
input_t,
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result_t);
result_t);
return;
}
}
}
@ -840,41 +841,41 @@ void addmm_out_sparse_csr(
if (mat2.layout() == kSparseCsr) {
if (result.layout() == kStrided) {
// TODO: Add native CSC support via cuSPARSE if supported.
return spmm(
spmm(
mat2.transpose(0, 1).to_sparse_csr(),
mat1.transpose(0, 1),
beta,
alpha,
result.transpose(0, 1));
result.transpose(0, 1));
return;
}
}
if (mat2.layout() == kSparseCsc) {
if (result.layout() == kStrided) {
return spmm(
spmm(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
result.transpose(-2, -1));
return;
}
}
}
if (mat1.layout() == kSparseCsr) {
if (mat2.layout() == kStrided) {
if (result.layout() == kStrided) {
return spmm(mat1, mat2, beta, alpha, result);
spmm(mat1, mat2, beta, alpha, result);
return;
}
}
if (mat2.layout() == kSparseCsr) {
if (result.layout() == kSparseCsr) {
return spgemm(mat1, mat2, beta, alpha, result);
spgemm(mat1, mat2, beta, alpha, result);
return;
}
}
if (mat2.layout() == kSparseCsc) {
if (result.layout() == kSparseCsr) {
// TODO: Add native CSC support via cuSPARSE if supported.
// CSR @ CSC kernel would be very fast due to format alignment
return spgemm(mat1, mat2.to_sparse_csr(), beta, alpha, result);
spgemm(mat1, mat2.to_sparse_csr(), beta, alpha, result);
return;
}
}
}
@ -882,27 +883,28 @@ void addmm_out_sparse_csr(
if (mat2.layout() == kStrided) {
if (result.layout() == kStrided) {
// TODO: Add native CSC support via cuSPARSE if supported.
return spmm(mat1.to_sparse_csr(), mat2, beta, alpha, result);
spmm(mat1.to_sparse_csr(), mat2, beta, alpha, result);
return;
}
}
if (mat2.layout() == kSparseCsr) {
if (result.layout() == kSparseCsr)
// TODO: Add native CSC support via cuSPARSE if supported.
return spgemm(mat1.to_sparse_csr(), mat2, beta, alpha, result);
{
spgemm(mat1.to_sparse_csr(), mat2, beta, alpha, result);
return;
}
}
if (mat2.layout() == kSparseCsc) {
if (result.layout() == kSparseCsr) {
// TODO: Add native CSC support via cuSPARSE if supported.
return spgemm(
mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
spgemm(
mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
return;
}
if (result.layout() == kSparseCsc) {
return spgemm(
spgemm(
mat2.transpose(-2, -1),
mat1.transpose(-2, -1),
beta,
alpha,
result.transpose(-2, -1));
result.transpose(-2, -1));
return;
}
}
}
@ -933,7 +935,7 @@ void addmv_out_sparse_csr(
const Scalar& alpha,
const Tensor& result) {
if (mat.layout() == kSparseBsr) {
return block_sparse_mv(mat, vec, beta, alpha, result);
block_sparse_mv(mat, vec, beta, alpha, result);
return;
}
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
@ -1213,9 +1215,9 @@ void triangular_solve_out_sparse_csr(
}
if (A.layout() == kSparseBsr) {
if (B.size(-1) == 1) {
return block_sparse_triangular_solve_vec(A, B, X, upper, transpose, unitriangular);
block_sparse_triangular_solve_vec(A, B, X, upper, transpose, unitriangular);
return;
} else {
return block_sparse_triangular_solve_mat(A, B, X, upper, transpose, unitriangular);
block_sparse_triangular_solve_mat(A, B, X, upper, transpose, unitriangular);
return;
}
}
#ifdef USE_ROCM


@ -117,7 +117,7 @@ class FwdKernel:
def get_all(cls) -> list["FwdKernel"]:
kernels: list[FwdKernel] = []
for aligned, dtype, (sm, sm_max) in itertools.product(
[True, False], DTYPES.keys(), zip(SM, SM[1:])
[True, False], DTYPES.keys(), itertools.pairwise(SM)
):
# Remove some kernels we don't use
if dtype == "bf16" and sm < 80:
@ -228,7 +228,7 @@ class BwdKernel:
for aligned, dtype, (sm, sm_max), apply_dropout, max_k in itertools.product(
[True, False],
DTYPES.keys(),
zip(SM, SM[1:]),
itertools.pairwise(SM),
[True, False],
[32, 64, 128, 2**16],
):

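The replacement is behavior-preserving: itertools.pairwise (Python 3.10+) yields the same adjacent pairs as zip(SM, SM[1:]) without materializing the slice:

import itertools

SM = [50, 70, 75, 80, 90]  # illustrative values only
assert list(itertools.pairwise(SM)) == list(zip(SM, SM[1:]))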

@ -0,0 +1,191 @@
#!/usr/bin/env python3
"""
Benchmark for NVSHMEM tile reduce operations.
Usage:
python benchmarks/distributed/bench_nvshmem_tile_reduce.py
This benchmark measures the performance of tile reduce operations across different
matrix sizes and tile configurations.
"""
import time
import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem
from torch.testing._internal.common_distributed import MultiProcContinuousTest
from torch.testing._internal.common_utils import (
requires_cuda_p2p_access,
skip_but_pass_in_sandcastle_if,
skipIfRocm,
)
# Decorator
def requires_nvshmem():
return skip_but_pass_in_sandcastle_if(
not symm_mem.is_nvshmem_available(),
"bench_nvshmem_tile_reduce requires NVSHMEM, skipping benchmark",
)
# So that benchmarks are written in device-agnostic way
device_type = "cuda"
device_module = torch.get_device_module(device_type)
@requires_nvshmem()
@requires_cuda_p2p_access()
class NVSHMEMTileReduceBenchmark(MultiProcContinuousTest):
def _init_device(self) -> None:
# TODO: relax this (seems to hang without it)
device_module.set_device(self.device)
# Set NVSHMEM as SymmMem backend
symm_mem.set_backend("NVSHMEM")
@property
def device(self) -> torch.device:
return torch.device(device_type, self.rank)
def _benchmark_tile_reduce_single(
self,
full_size: int,
tile_size: int,
warmup_iters: int = 5,
bench_iters: int = 10,
) -> dict:
"""
Benchmark a single configuration of tile reduce.
Args:
full_size: Size of the full matrix (full_size x full_size)
tile_size: Size of the tile to reduce (tile_size x tile_size)
warmup_iters: Number of warmup iterations
bench_iters: Number of benchmark iterations
Returns:
Dictionary with benchmark results
"""
self._init_device()
group_name = dist.group.WORLD.group_name
symm_mem.enable_symm_mem_for_group(group_name)
dtype = torch.float
# Allocate full matrices
full_inp = symm_mem.empty(
full_size, full_size, dtype=dtype, device=self.device
).fill_(self.rank)
full_out = symm_mem.empty(
full_size, full_size, dtype=dtype, device=self.device
).fill_(0)
slice_ut = slice(0, tile_size)
inp_tile = full_inp[slice_ut, slice_ut]
out_tile = full_out[slice_ut, slice_ut]
root = 0
# Warmup iterations
for _ in range(warmup_iters):
torch.ops.symm_mem.tile_reduce(inp_tile, out_tile, root, group_name)
torch.cuda.synchronize(self.device)
# Benchmark iterations, timed individually so the mean/std/min/max below are meaningful
times = []
dist.barrier()
torch.cuda.synchronize(self.device)
for _ in range(bench_iters):
start_time = time.perf_counter()
torch.ops.symm_mem.tile_reduce(inp_tile, out_tile, root, group_name)
torch.cuda.synchronize(self.device)
end_time = time.perf_counter()
times.append(end_time - start_time)
# Calculate statistics
times = torch.tensor(times, dtype=torch.float64)
tile_elements = tile_size * tile_size
tile_bytes = (
tile_elements * dtype.itemsize
if hasattr(dtype, "itemsize")
else tile_elements * 4
)
results = {
"full_size": full_size,
"tile_size": tile_size,
"tile_elements": tile_elements,
"tile_bytes": tile_bytes,
"world_size": self.world_size,
"mean_time_ms": times.mean().item() * 1000,
"std_time_ms": times.std().item() * 1000,
"min_time_ms": times.min().item() * 1000,
"max_time_ms": times.max().item() * 1000,
"throughput_gb_s": tile_bytes / (times.mean().item() * 1e9),
"elements_per_sec": tile_elements / times.mean().item(),
}
return results
@skipIfRocm
def test_benchmark_tile_reduce_various_sizes(self) -> None:
"""
Benchmark tile reduce across various matrix sizes.
"""
# Test various matrix sizes
tile_sizes = [512, 1024, 2048, 4096, 8192, 16384]
full_size = tile_sizes[-1]
warmup_iters = 5
bench_iters = 20
results = []
for tile_size in tile_sizes:
try:
result = self._benchmark_tile_reduce_single(
full_size, tile_size, warmup_iters, bench_iters
)
results.append(result)
if self.rank == 0:
print(
f"Matrix Size: {full_size}x{full_size}, Tile Size: {tile_size}x{tile_size}"
)
print(
f" Mean Time: {result['mean_time_ms']:.3f} ± {result['std_time_ms']:.3f} ms"
)
print(f" Throughput: {result['throughput_gb_s']:.2f} GB/s")
print(f" Bytes: {result['tile_bytes']:.0f}")
print()
except Exception as e:
if self.rank == 0:
print(f"Failed to benchmark matrix size {full_size}: {e}")
# Print summary
if self.rank == 0 and results:
print("=== BENCHMARK SUMMARY ===")
print(
f"{'Matrix Size':<12} {'Tile Size':<10} {'Time (ms)':<12} {'Throughput (GB/s)':<18} {'Bytes':<15}"
)
print("-" * 70)
for result in results:
print(
f"{result['full_size']}x{result['full_size']:<7} "
f"{result['tile_size']}x{result['tile_size']:<5} "
f"{result['mean_time_ms']:<12.3f} "
f"{result['throughput_gb_s']:<18.2f} "
f"{result['tile_bytes']:<15.0f}"
)
if __name__ == "__main__":
# For standalone usage, you'd need to set up distributed environment
# For now, this is meant to be run via the PyTorch test framework
from torch.testing._internal.common_utils import run_tests
run_tests()

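As a sanity check on the throughput metric reported by the benchmark (numbers below are hypothetical):

tile_size, dtype_bytes = 4096, 4                       # float32 tile
tile_bytes = tile_size * tile_size * dtype_bytes       # 64 MiB
mean_time_s = 2e-3
print(f"{tile_bytes / (mean_time_s * 1e9):.1f} GB/s")  # -> 33.6 GB/s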

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0



@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,eager_fail_to_run,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5



@ -6,58 +6,26 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,0
@ -66,10 +34,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -82,10 +46,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -94,10 +54,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -106,18 +62,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -126,26 +74,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0



@ -6,58 +6,26 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,0
@ -66,10 +34,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -82,10 +46,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -94,10 +54,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -106,18 +62,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -126,26 +74,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0



@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
GoogleFnet,pass,0
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0
T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0
mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
openai/gpt-oss-20b,pass_due_to_skip,0
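
Every file in this diff shares the same three-column schema, `name,accuracy,graph_breaks`, where `accuracy` is a status token (`pass`, `pass_due_to_skip`, `eager_fail_to_run`, `eager_1st_run_OOM`, `fail_accuracy`, `fail_to_run`) rather than a numeric score. Below is a minimal sketch of how one of these expected-accuracy files could be loaded and checked against fresh benchmark output; the path `ci_expected_accuracy.csv` and the `fresh_results` dict are hypothetical stand-ins, not part of the PyTorch tooling:

```python
import csv

def load_expected(path):
    """Read an expected-accuracy CSV into {model_name: (status, graph_breaks)}."""
    with open(path, newline="") as f:
        reader = csv.reader(f)
        next(reader)  # skip the name,accuracy,graph_breaks header row
        return {row[0]: (row[1], int(row[2])) for row in reader if row}

def report_deviations(expected, fresh_results):
    """Print models whose fresh status or graph-break count deviates from the CSV."""
    for name, want in expected.items():
        got = fresh_results.get(name)
        if got is not None and got != want:
            print(f"{name}: expected {want}, got {got}")

# Hypothetical usage: compare one fresh result against the stored expectations.
expected = load_expected("ci_expected_accuracy.csv")
report_deviations(expected, {"DistillGPT2": ("pass", 3)})
```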

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
GoogleFnet,pass,0
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0
T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0
mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
openai/gpt-oss-20b,pass_due_to_skip,0

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
GoogleFnet,pass,0
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0
T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0
mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
openai/gpt-oss-20b,pass_due_to_skip,0

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
GoogleFnet,pass,0
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0
T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
google/gemma-2-2b,pass,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0
mistralai/Mistral-7B-Instruct-v0.3,pass,0
openai/gpt-oss-20b,pass,0

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,eager_fail_to_run,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
GoogleFnet,pass,5
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5
T5Small,pass,5
TrOCRForCausalLM,pass,6
XGLMForCausalLM,pass,6
XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
GoogleFnet,pass,0
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0
T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
google/gemma-2-2b,pass,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0
mistralai/Mistral-7B-Instruct-v0.3,pass,0
openai/gpt-oss-20b,pass,0

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
GoogleFnet,pass,0
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0
T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
GoogleFnet,pass,0
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0
T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
google/gemma-2-2b,pass,0
google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0
mistralai/Mistral-7B-Instruct-v0.3,pass,0
openai/gpt-oss-20b,pass,0

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,eager_fail_to_run,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
GoogleFnet,pass,5
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5
T5Small,pass,5
TrOCRForCausalLM,pass,6
XGLMForCausalLM,pass,6
XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
GoogleFnet,pass,0
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0
T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
google/gemma-2-2b,pass,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0
mistralai/Mistral-7B-Instruct-v0.3,pass,0
openai/gpt-oss-20b,pass,0

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,eager_fail_to_run,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
GoogleFnet,pass,5
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5
T5Small,pass,5
TrOCRForCausalLM,pass,6
XGLMForCausalLM,pass,6
XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
GoogleFnet,pass,0
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0
T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
google/gemma-2-2b,pass,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0
mistralai/Mistral-7B-Instruct-v0.3,pass,0
openai/gpt-oss-20b,pass,0

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,eager_fail_to_run,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
GoogleFnet,pass,5
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5
T5Small,pass,5
TrOCRForCausalLM,pass,6
XGLMForCausalLM,pass,6
XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
GoogleFnet,pass,0
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0
T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

View File

@@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,4
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
GoogleFnet,pass,5
@@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5
T5Small,pass,5
TrOCRForCausalLM,pass,6
XGLMForCausalLM,pass,6
XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

View File

@@ -6,58 +6,26 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,0
@@ -66,10 +34,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
GoogleFnet,pass,0
@@ -82,10 +46,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@@ -94,10 +54,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@@ -106,18 +62,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@@ -126,26 +74,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0
T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_accuracy,0
openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0
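
Unlike the files above, this one pins several of the newly added Hugging Face models to failure states (`fail_accuracy`, `fail_to_run`) instead of passes or skips. A small hedged sketch for tallying how many models land in each status bucket, again assuming only the `name,accuracy,graph_breaks` schema; the file name is a hypothetical stand-in for one of the CSVs in this diff:

```python
import csv
from collections import Counter

def status_counts(path):
    """Count how many models fall into each accuracy-status bucket."""
    with open(path, newline="") as f:
        reader = csv.reader(f)
        next(reader)  # skip the name,accuracy,graph_breaks header row
        return Counter(row[1] for row in reader if row)

# Hypothetical file name standing in for the CSV reconstructed above.
print(status_counts("ci_expected_accuracy.csv"))
```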

View File

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0

T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,4
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5

T5Small,pass,5
TrOCRForCausalLM,pass,6
XGLMForCausalLM,pass,6
XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0

T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,4
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5

T5Small,pass,5
TrOCRForCausalLM,pass,6
XGLMForCausalLM,pass,6
XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0

T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,4
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5

T5Small,pass,5
TrOCRForCausalLM,pass,6
XGLMForCausalLM,pass,6
XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_failed_to_run,0
google/gemma-2-2b,eager_failed_to_run,0
google/gemma-3-4b-it,eager_failed_to_run,0
openai/whisper-tiny,eager_failed_to_run,0
Qwen/Qwen3-0.6B,eager_failed_to_run,0

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
AlbertForQuestionAnswering,pass,0
AllenaiLongformerBase,pass,4
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
BartForConditionalGeneration,pass,0
BertForMaskedLM,pass,0
BertForQuestionAnswering,pass,0
BlenderbotForCausalLM,pass_due_to_skip,0
BlenderbotSmallForCausalLM,pass,0
BlenderbotSmallForConditionalGeneration,pass,0
CamemBert,pass,0
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,pass,0
DistilBertForMaskedLM,pass,0
DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
ElectraForQuestionAnswering,pass,0
GPT2ForSequenceClassification,pass,0
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
LayoutLMForSequenceClassification,pass,0
M2M100ForConditionalGeneration,pass,0
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
MBartForConditionalGeneration,pass,0
MT5ForConditionalGeneration,pass,0
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
MegatronBertForQuestionAnswering,pass,0
MobileBertForMaskedLM,pass,0
MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
PLBartForConditionalGeneration,pass,0
PegasusForCausalLM,pass,0
PegasusForConditionalGeneration,pass,0
RobertaForCausalLM,pass,0
RobertaForQuestionAnswering,pass,0
T5ForConditionalGeneration,pass,0

T5Small,pass,0
TrOCRForCausalLM,pass,0
XGLMForCausalLM,pass,0
XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
AlbertForQuestionAnswering,pass,5
AllenaiLongformerBase,pass,9
@ -18,50 +14,22 @@ BartForCausalLM,pass,6
BartForConditionalGeneration,pass,8
BertForMaskedLM,pass,5
BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,eager_fail_to_run,0
BlenderbotSmallForCausalLM,pass,6
BlenderbotSmallForConditionalGeneration,pass,8
CamemBert,pass,5
DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
DistilBertForMaskedLM,pass,5
DistilBertForQuestionAnswering,pass,5
DistillGPT2,pass,7
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
ElectraForQuestionAnswering,pass,5
GPT2ForSequenceClassification,pass,6
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
LayoutLMForSequenceClassification,pass,6
M2M100ForConditionalGeneration,pass,4
@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
MBartForConditionalGeneration,pass,8
MT5ForConditionalGeneration,pass,5
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
MegatronBertForQuestionAnswering,pass,5
MobileBertForMaskedLM,pass,3
MobileBertForQuestionAnswering,pass,3
OPTForCausalLM,pass,8
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
PLBartForConditionalGeneration,pass,8
PegasusForCausalLM,pass,6
PegasusForConditionalGeneration,pass,7
RobertaForCausalLM,pass,5
RobertaForQuestionAnswering,pass,5
T5ForConditionalGeneration,pass,5

T5Small,pass,5
TrOCRForCausalLM,pass,6
XGLMForCausalLM,pass,6
XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@ -123,8 +123,6 @@ CI_SKIP_OPTIMIZER = {
# HF
"pnasnet5large", # Stack issue in fx
"MobileBertForMaskedLM", # Stack issue in fx
"MobileBertForQuestionAnswering", # Stack issue in fx
"PegasusForConditionalGeneration", # OOM
}
try:
@ -192,17 +190,11 @@ BENCHMARK_USE_SGD = {
# HF
"AlbertForMaskedLM",
"BartForCausalLM",
"BartForConditionalGeneration",
"BlenderbotSmallForCausalLM",
"BlenderbotSmallForConditionalGeneration",
"DebertaV2ForQuestionAnswering", # eager OOM
"ElectraForCausalLM",
"M2M100ForConditionalGeneration",
"MBartForCausalLM",
"MBartForConditionalGeneration",
"OPTForCausalLM",
"PLBartForCausalLM",
"PLBartForConditionalGeneration",
"PegasusForCausalLM",
"TrOCRForCausalLM",
"XGLMForCausalLM",
@ -3789,7 +3781,6 @@ def run(runner, args, original_dir=None):
torch.use_deterministic_algorithms(True, warn_only=True)
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
if args.only is not None and args.only in {
"DebertaForQuestionAnswering",
"nvidia_deeprecommender",
"crossvit_9_240",
}:


@ -59,7 +59,6 @@ imports = [
"BigBirdConfig",
"BlenderbotForConditionalGeneration",
"BlenderbotModel",
"BlenderbotSmallForConditionalGeneration",
"BlenderbotSmallModel",
"CLIPModel",
"CLIPVisionModel",
@ -73,7 +72,6 @@ imports = [
"MarianForCausalLM",
"MarianModel",
"MarianMTModel",
"PegasusForConditionalGeneration",
"PegasusModel",
"ReformerConfig",
"ViTForImageClassification",
@ -167,7 +165,7 @@ def get_sequence_length(model_cls, model_name):
"Bert",
"Roberta",
)
) or model_name in ("DistillGPT2", "GoogleFnet", "YituTechConvBert", "CamemBert"):
) or model_name in ("DistillGPT2", "GoogleFnet", "YituTechConvBert"):
seq_length = 512
elif model_name in ("TrOCRForCausalLM"):
seq_length = 256
@ -222,9 +220,7 @@ def generate_inputs_for_model(
BlenderbotModel,
BlenderbotSmallModel,
BlenderbotForConditionalGeneration,
BlenderbotSmallForConditionalGeneration,
PegasusModel,
PegasusForConditionalGeneration,
MarianModel,
MarianMTModel,
]:
@ -333,10 +329,6 @@ EXTRA_MODELS = {
AutoConfig.from_pretrained("YituTech/conv-bert-base"),
AutoModelForMaskedLM,
),
"CamemBert": (
AutoConfig.from_pretrained("camembert-base"),
AutoModelForMaskedLM,
),
}
@ -375,8 +367,6 @@ class HuggingfaceRunner(BenchmarkRunner):
def use_larger_multiplier_for_smaller_tensor(self, name):
return name in [
"ElectraForQuestionAnswering",
"MegatronBertForQuestionAnswering",
"GPT2ForSequenceClassification",
]


@ -31,24 +31,15 @@ batch_size:
# TODO - Fails even after fake tensors
divisors:
AlbertForMaskedLM: 2
AlbertForQuestionAnswering: 2
AllenaiLongformerBase: 2
BartForCausalLM: 2
BartForConditionalGeneration: 2
BertForMaskedLM: 2
BertForQuestionAnswering: 2
BlenderbotForCausalLM: 8
# BlenderbotForConditionalGeneration : 16
BlenderbotSmallForCausalLM: 4
BlenderbotSmallForConditionalGeneration: 2
CamemBert: 2
DebertaV2ForMaskedLM: 4
DebertaV2ForQuestionAnswering: 8
DistilBertForMaskedLM: 2
DistilBertForQuestionAnswering: 2
DistillGPT2: 2
ElectraForCausalLM: 2
ElectraForQuestionAnswering: 2
GPT2ForSequenceClassification: 2
# GPTJForCausalLM : 2
# GPTJForQuestionAnswering : 2
@ -56,22 +47,15 @@ batch_size:
# GPTNeoForSequenceClassification : 2
GoogleFnet: 2
LayoutLMForMaskedLM: 2
LayoutLMForSequenceClassification: 2
M2M100ForConditionalGeneration: 4
MBartForCausalLM: 2
MBartForConditionalGeneration: 2
MT5ForConditionalGeneration: 2
MegatronBertForCausalLM: 4
MegatronBertForQuestionAnswering: 2
MobileBertForMaskedLM: 2
MobileBertForQuestionAnswering: 2
OPTForCausalLM: 2
PLBartForCausalLM: 2
PLBartForConditionalGeneration: 2
PegasusForCausalLM: 4
PegasusForConditionalGeneration: 2
RobertaForCausalLM: 2
RobertaForQuestionAnswering: 2
T5ForConditionalGeneration: 2
T5Small: 2
TrOCRForCausalLM: 2
@ -90,20 +74,13 @@ batch_size:
tolerance:
higher_training:
- MT5ForConditionalGeneration
# AlbertForQuestionAnswering fails in CI GCP A100 but error does not seem
# harmful.
- AlbertForQuestionAnswering
higher_max_autotune_training:
# DebertaForQuestionAnswering needs higher tolerance in Max-Autotune mode
- DebertaForQuestionAnswering
higher_max_autotune_training: []
higher_inference:
- GPT2ForSequenceClassification
- RobertaForQuestionAnswering
higher_inference_cpu:
- LayoutLMForSequenceClassification
- GPT2ForSequenceClassification
cosine: []


@ -1,22 +1,13 @@
AlbertForMaskedLM,8
AlbertForQuestionAnswering,8
AllenaiLongformerBase,8
BartForCausalLM,8
BartForConditionalGeneration,4
BertForMaskedLM,32
BertForQuestionAnswering,32
BlenderbotForCausalLM,32
BlenderbotForConditionalGeneration,16
BlenderbotSmallForCausalLM,256
BlenderbotSmallForConditionalGeneration,128
CamemBert,32
DebertaV2ForMaskedLM,8
DebertaV2ForQuestionAnswering,8
DistilBertForMaskedLM,256
DistilBertForQuestionAnswering,512
DistillGPT2,32
ElectraForCausalLM,64
ElectraForQuestionAnswering,128
GPT2ForSequenceClassification,8
GPTJForCausalLM,1
GPTJForQuestionAnswering,1
@ -24,22 +15,15 @@ GPTNeoForCausalLM,32
GPTNeoForSequenceClassification,32
GoogleFnet,32
LayoutLMForMaskedLM,32
LayoutLMForSequenceClassification,32
M2M100ForConditionalGeneration,64
MBartForCausalLM,8
MBartForConditionalGeneration,4
MT5ForConditionalGeneration,32
MegatronBertForCausalLM,16
MegatronBertForQuestionAnswering,16
MobileBertForMaskedLM,256
MobileBertForQuestionAnswering,256
OPTForCausalLM,4
PLBartForCausalLM,16
PLBartForConditionalGeneration,8
PegasusForCausalLM,128
PegasusForConditionalGeneration,64
RobertaForCausalLM,32
RobertaForQuestionAnswering,32
T5ForConditionalGeneration,8
T5Small,8
TrOCRForCausalLM,64


@ -1,41 +1,25 @@
AlbertForMaskedLM,4
AlbertForQuestionAnswering,4
AllenaiLongformerBase,4
BartForCausalLM,4
BartForConditionalGeneration,2
BertForMaskedLM,16
BertForQuestionAnswering,16
BigBird,32
BlenderbotForCausalLM,32
BlenderbotSmallForCausalLM,64
BlenderbotSmallForConditionalGeneration,64
CamemBert,16
DebertaV2ForMaskedLM,16
DebertaV2ForQuestionAnswering,2
DistilBertForMaskedLM,128
DistilBertForQuestionAnswering,256
DistillGPT2,16
ElectraForCausalLM,8
ElectraForQuestionAnswering,8
GoogleFnet,16
GPT2ForSequenceClassification,4
LayoutLMForMaskedLM,16
LayoutLMForSequenceClassification,16
M2M100ForConditionalGeneration,16
MBartForCausalLM,4
MBartForConditionalGeneration,2
MegatronBertForCausalLM,4
MegatronBertForQuestionAnswering,8
MobileBertForMaskedLM,64
MobileBertForQuestionAnswering,64
MT5ForConditionalGeneration,16
OPTForCausalLM,2
PegasusForCausalLM,32
PegasusForConditionalGeneration,32
PLBartForCausalLM,8
PLBartForConditionalGeneration,4
RobertaForCausalLM,16
RobertaForQuestionAnswering,16
T5ForConditionalGeneration,4
T5Small,1
TrOCRForCausalLM,32


@ -1038,7 +1038,8 @@ def define_buck_targets(
name = "generated-version-header",
header_namespace = "torch",
exported_headers = {
"version.h": ":generate-version-header[version.h]",
"headeronly/version.h": ":generate-version-header[version.h]",
"version.h": "torch/csrc/api/include/torch/version.h"
},
labels = labels,
)
@ -1047,19 +1048,27 @@ def define_buck_targets(
fb_native.genrule(
name = "generate-version-header",
srcs = [
"torch/csrc/api/include/torch/version.h.in",
"torch/headeronly/version.h.in",
"version.txt",
],
cmd = "$(exe {}tools:gen-version-header) ".format(ROOT_PATH) + " ".join([
cmd = "mkdir -p $OUT/torch/headeronly && $(exe {}tools:gen-version-header) ".format(ROOT_PATH) + " ".join([
"--template-path",
"torch/csrc/api/include/torch/version.h.in",
"torch/headeronly/version.h.in",
"--version-path",
"version.txt",
"--output-path",
"$OUT/version.h",
"$OUT/torch/headeronly/version.h",
]),
cmd_exe = "md $OUT\\torch\\headeronly 2>nul & $(exe {}tools:gen-version-header) ".format(ROOT_PATH) + " ".join([
"--template-path",
"torch/headeronly/version.h.in",
"--version-path",
"version.txt",
"--output-path",
"$OUT\\torch\\headeronly\\version.h",
]),
outs = {
"version.h": ["version.h"],
"version.h": ["torch/headeronly/version.h"],
},
default_outs = ["."],
)


@ -142,18 +142,6 @@ def define_targets(rules):
visibility = ["//visibility:public"],
)
rules.genrule(
name = "version_h",
srcs = [
":torch/csrc/api/include/torch/version.h.in",
":version.txt",
],
outs = ["torch/csrc/api/include/torch/version.h"],
cmd = "$(execpath //tools/setup_helpers:gen_version_header) " +
"--template-path $(location :torch/csrc/api/include/torch/version.h.in) " +
"--version-path $(location :version.txt) --output-path $@ ",
tools = ["//tools/setup_helpers:gen_version_header"],
)
#
# ATen generated code


@ -913,7 +913,6 @@ libtorch_python_core_sources = [
"torch/csrc/autograd/python_torch_functions_manual.cpp",
"torch/csrc/autograd/python_variable.cpp",
"torch/csrc/autograd/python_variable_indexing.cpp",
"torch/csrc/distributed/python_placement.cpp",
"torch/csrc/dynamo/python_compiled_autograd.cpp",
"torch/csrc/dynamo/cache_entry.cpp",
"torch/csrc/dynamo/cpp_shim.cpp",


@ -127,7 +127,7 @@ struct Event final {
}
void synchronize() const {
return impl_.synchronize();
impl_.synchronize();
}
private:
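
The hunk above, and the same one-line change repeated across the next several files, applies a single mechanical cleanup: dropping a redundant `return` in front of calls whose result type is `void`. A minimal standalone sketch of the rule, with illustrative names rather than the PyTorch sources:

#include <iostream>

struct Impl {
  void synchronize() const { std::cout << "synchronized\n"; }
};

struct Event {
  // `return impl_.synchronize();` compiles, because the returned
  // expression itself has type void, but the plain call statement is
  // clearer and is what these diffs standardize on. Behavior is identical.
  void synchronize() const {
    impl_.synchronize();
  }
  Impl impl_;
};

int main() {
  Event e;
  e.synchronize(); // prints "synchronized"
  return 0;
}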


@ -149,7 +149,7 @@ struct C10_API Storage {
}
void set_data_ptr_noswap(at::DataPtr&& data_ptr) const {
return storage_impl_->set_data_ptr_noswap(std::move(data_ptr));
storage_impl_->set_data_ptr_noswap(std::move(data_ptr));
}
DeviceType device_type() const {


@ -94,11 +94,11 @@ class VirtualGuardImpl final : public DeviceGuardImplInterface {
}
void synchronizeEvent(void* event) const override {
return impl_->synchronizeEvent(event);
impl_->synchronizeEvent(event);
}
void synchronizeDevice(const DeviceIndex device_index) const override {
return impl_->synchronizeDevice(device_index);
impl_->synchronizeDevice(device_index);
}
private:


@ -1183,6 +1183,8 @@ class DeviceCachingAllocator {
// device statistics
DeviceStats stats;
c10::DeviceIndex device_id;
// unallocated cached blocks larger than 1 MB
BlockPool large_blocks;
@ -1271,8 +1273,10 @@ class DeviceCachingAllocator {
public:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
DeviceCachingAllocator()
: large_blocks(/*small=*/false), small_blocks(/*small=*/true) {
explicit DeviceCachingAllocator(c10::DeviceIndex id)
: device_id(id),
large_blocks(/*small=*/false),
small_blocks(/*small=*/true) {
stats.max_split_size =
static_cast<int64_t>(CUDAAllocatorConfig::max_split_size());
context_recorder_.store(nullptr);
@ -1358,10 +1362,7 @@ class DeviceCachingAllocator {
// All public methods (except the above) acquire the allocator mutex.
// Thus, do not call a public method from another public method.
Block* malloc(
c10::DeviceIndex device,
size_t orig_size,
cudaStream_t stream) {
Block* malloc(size_t orig_size, cudaStream_t stream) {
// done outside the lock because we don't know what locks the recorder needs
// to have...
auto context = maybeGatherContext(RecordContext::STATE);
@ -1389,7 +1390,7 @@ class DeviceCachingAllocator {
size_t size = round_size(orig_size);
auto& pool = get_pool(size, stream);
const size_t alloc_size = get_allocation_size(size);
AllocParams params(device, size, stream, &pool, alloc_size);
AllocParams params(device_id, size, stream, &pool, alloc_size);
params.stat_types = get_stat_types_for_pool(pool);
// First, try to get a block from the existing pool.
@ -1436,7 +1437,7 @@ class DeviceCachingAllocator {
beginAllocateToPool(mempool_id, filter);
auto& mempool = get_pool(size, stream);
AllocParams mempool_params(
device, size, stream, &mempool, alloc_size);
device_id, size, stream, &mempool, alloc_size);
mempool_params.stat_types = get_stat_types_for_pool(mempool);
block_found = get_free_block(mempool_params);
endAllocateToPool(mempool_id);
@ -1463,7 +1464,7 @@ class DeviceCachingAllocator {
allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
}
std::string proc_info = reportProcessMemoryInfo(device);
std::string proc_info = reportProcessMemoryInfo(device_id);
record_trace(
TraceEntry::OOM,
@ -1481,7 +1482,7 @@ class DeviceCachingAllocator {
.current,
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
.current,
c10::Device(c10::DeviceType::CUDA, device));
c10::Device(c10::DeviceType::CUDA, device_id));
auto allocated_bytes =
stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
@ -1519,7 +1520,7 @@ class DeviceCachingAllocator {
lock.unlock();
for (const auto& obs : observers_local) {
obs(device,
obs(device_id,
alloc_size,
set_fraction ? allowed_memory_maximum : device_total,
device_free);
@ -1549,7 +1550,7 @@ class DeviceCachingAllocator {
"CUDA out of memory. Tried to allocate ",
format_size(alloc_size),
". GPU ",
static_cast<int>(device),
static_cast<int>(device_id),
" has a total capacity of ",
format_size(device_total),
" of which ",
@ -2501,6 +2502,8 @@ class DeviceCachingAllocator {
auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
if (divisions > 1 && size > (kMinBlockSize * divisions)) {
return roundup_power2_next_division(size, divisions);
} else if (divisions == 1) {
return llvm::PowerOf2Ceil(size);
} else {
return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
}
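
Assuming `llvm::PowerOf2Ceil` returns the smallest power of two greater than or equal to its argument (a plain reading of its name), the new `divisions == 1` branch above rounds a request up to a power-of-two allocation size instead of the default `kMinBlockSize` multiple. A hedged sketch with a local stand-in for the LLVM helper:

#include <cstdint>
#include <cstdio>

// Local stand-in for llvm::PowerOf2Ceil: smallest power of two >= v.
static std::uint64_t power_of_2_ceil(std::uint64_t v) {
  std::uint64_t p = 1;
  while (p < v) p <<= 1;
  return p;
}

int main() {
  // Under this branch a 1200-byte request becomes a 2048-byte block.
  const std::uint64_t sizes[] = {512, 1200, 4097};
  for (std::uint64_t s : sizes)
    std::printf("%llu -> %llu\n", (unsigned long long)s,
                (unsigned long long)power_of_2_ceil(s));
  return 0;
}
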
@ -3809,7 +3812,8 @@ class NativeCachingAllocator : public CUDAAllocator {
if (size < device_count) {
device_allocator.resize(device_count);
for (const auto i : c10::irange(size, device_count)) {
device_allocator[i] = std::make_unique<DeviceCachingAllocator>();
device_allocator[i] =
std::make_unique<DeviceCachingAllocator>(c10::DeviceIndex(i));
}
}
}
@ -3829,7 +3833,7 @@ class NativeCachingAllocator : public CUDAAllocator {
"Allocator not initialized for device ",
device,
": did you call init?");
Block* block = device_allocator[device]->malloc(device, size, stream);
Block* block = device_allocator[device]->malloc(size, stream);
add_allocated_block(block);
*devPtr = block->ptr;
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
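
The surrounding hunks all follow from one refactor: the device index is now fixed at construction time, so `malloc` no longer takes a `device` parameter and every use site (stats, OOM messages, observer callbacks) reads the `device_id` member instead. A simplified sketch of the pattern, with invented names rather than the real allocator:

#include <cstddef>
#include <cstdio>
#include <memory>
#include <vector>

class PerDeviceAllocator {
 public:
  explicit PerDeviceAllocator(int device_id) : device_id_(device_id) {}

  // Callers used to repeat the device index on every call; after the
  // refactor it is available as a member wherever it is needed.
  void* malloc(std::size_t bytes) {
    std::printf("device %d: allocating %zu bytes\n", device_id_, bytes);
    return ::operator new(bytes);
  }

 private:
  int device_id_;
};

int main() {
  // One allocator per device, each constructed with its own index.
  std::vector<std::unique_ptr<PerDeviceAllocator>> allocators;
  for (int i = 0; i < 2; ++i)
    allocators.push_back(std::make_unique<PerDeviceAllocator>(i));
  void* p = allocators[1]->malloc(256);
  ::operator delete(p);
  return 0;
}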


@ -360,11 +360,11 @@ inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) {
}
inline void raw_delete(void* ptr) {
return get()->raw_delete(ptr);
get()->raw_delete(ptr);
}
inline void init(int device_count) {
return get()->init(device_count);
get()->init(device_count);
}
inline double getMemoryFraction(c10::DeviceIndex device) {
@ -372,7 +372,7 @@ inline double getMemoryFraction(c10::DeviceIndex device) {
}
inline void setMemoryFraction(double fraction, c10::DeviceIndex device) {
return get()->setMemoryFraction(fraction, device);
get()->setMemoryFraction(fraction, device);
}
inline std::vector<StreamSegmentSize> getExpandableSegmentSizes(
@ -381,11 +381,11 @@ inline std::vector<StreamSegmentSize> getExpandableSegmentSizes(
}
inline void emptyCache(MempoolId_t mempool_id = {0, 0}) {
return get()->emptyCache(mempool_id);
get()->emptyCache(mempool_id);
}
inline void enable(bool value) {
return get()->enable(value);
get()->enable(value);
}
inline bool isEnabled() {
@ -393,7 +393,7 @@ inline bool isEnabled() {
}
inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) {
return get()->cacheInfo(device, largestBlock);
get()->cacheInfo(device, largestBlock);
}
inline void* getBaseAllocation(void* ptr, size_t* size) {
@ -401,7 +401,7 @@ inline void* getBaseAllocation(void* ptr, size_t* size) {
}
inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) {
return get()->recordStream(dataPtr, stream);
get()->recordStream(dataPtr, stream);
}
inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
@ -410,11 +410,11 @@ inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
}
inline void resetAccumulatedStats(c10::DeviceIndex device) {
return get()->resetAccumulatedStats(device);
get()->resetAccumulatedStats(device);
}
inline void resetPeakStats(c10::DeviceIndex device) {
return get()->resetPeakStats(device);
get()->resetPeakStats(device);
}
inline SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) {
@ -451,21 +451,21 @@ inline void recordHistory(
size_t alloc_trace_max_entries,
RecordContext when,
bool clearHistory) {
return get()->recordHistory(
get()->recordHistory(
enabled, context_recorder, alloc_trace_max_entries, when, clearHistory);
}
inline void recordAnnotation(
const std::vector<std::pair<std::string, std::string>>& md) {
return get()->recordAnnotation(md);
get()->recordAnnotation(md);
}
inline void pushCompileContext(std::string& md) {
return get()->pushCompileContext(md);
get()->pushCompileContext(md);
}
inline void popCompileContext() {
return get()->popCompileContext();
get()->popCompileContext();
}
inline bool isHistoryEnabled() {
@ -481,15 +481,15 @@ inline bool checkPoolLiveAllocations(
}
inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
return get()->attachOutOfMemoryObserver(std::move(observer));
get()->attachOutOfMemoryObserver(std::move(observer));
}
inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) {
return get()->attachAllocatorTraceTracker(std::move(tracker));
get()->attachAllocatorTraceTracker(std::move(tracker));
}
inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) {
return get()->releasePool(device, mempool_id);
get()->releasePool(device, mempool_id);
}
inline void createOrIncrefPool(
c10::DeviceIndex device,
@ -533,7 +533,7 @@ inline cudaError_t memcpyAsync(
inline void enablePeerAccess(
c10::DeviceIndex dev,
c10::DeviceIndex dev_to_access) {
return get()->enablePeerAccess(dev, dev_to_access);
get()->enablePeerAccess(dev, dev_to_access);
}
} // namespace c10::cuda::CUDACachingAllocator


@ -51,17 +51,6 @@
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030)
#define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) \
_(cuCtxFromGreenCtx, 12080) \
_(cuCtxGetCurrent, 12080) \
_(cuCtxPopCurrent, 12080) \
_(cuCtxPushCurrent, 12080) \
_(cuCtxSetCurrent, 12080) \
_(cuGreenCtxCreate, 12080) \
_(cuGreenCtxDestroy, 12080) \
_(cuDevSmResourceSplitByCount, 12080) \
_(cuDeviceGet, 12080) \
_(cuDeviceGetDevResource, 12080) \
_(cuDevResourceGenerateDesc, 12080) \
_(cuMulticastAddDevice, 12030) \
_(cuMulticastBindMem, 12030) \
_(cuMulticastCreate, 12030) \


@ -49,7 +49,7 @@ class DynamicBackendWrapper : public WaitCounterBackendIf {
void stop(std::chrono::steady_clock::time_point now, intptr_t ctx) noexcept
override {
return impl_.stop(
impl_.stop(
impl_.self,
std::chrono::duration_cast<std::chrono::microseconds>(
now.time_since_epoch())
@ -162,6 +162,6 @@ WaitCounterHandle::WaitGuard WaitCounterHandle::start() {
}
void WaitCounterHandle::stop(const SmallVector<intptr_t>& ctxs) {
return impl_.stop(ctxs);
impl_.stop(ctxs);
}
} // namespace c10::monitor


@ -243,8 +243,8 @@ configure_file("${TORCH_SRC_DIR}/_utils_internal.py"
COPYONLY)
# Generate header with version info
configure_file("${TORCH_SRC_DIR}/csrc/api/include/torch/version.h.in"
"${TORCH_SRC_DIR}/csrc/api/include/torch/version.h"
configure_file("${TORCH_SRC_DIR}/headeronly/version.h.in"
"${TORCH_SRC_DIR}/headeronly/version.h"
@ONLY)
set(GENERATED_CXX_TORCH


@ -207,6 +207,42 @@ templates_path = [
]
# TODO: document these and remove them from here.
# Fixes duplicated autosummary page names that differ only in case
# (e.g. torch.optim.adamw.adamw vs. torch.optim.adamw.AdamW), which
# collide on case-insensitive filesystems.
autosummary_filename_map = {
"torch.nn.utils.prune.identity": "torch.nn.utils.prune.identity_function",
"torch.nn.utils.prune.Identity": "torch.nn.utils.prune.Identity_class",
"torch.optim.adamw.adamw": "torch.optim.adamw.adamw_function",
"torch.optim.adamw.AdamW": "torch.optim.adamw.AdamW_class",
"torch.optim.asgd.asgd": "torch.optim.asgd.asgd_function",
"torch.optim.asgd.ASGD": "torch.optim.asgd.ASGD_class",
"torch.optim.nadam.nadam": "torch.optim.nadam.nadam_function",
"torch.optim.nadam.NAdam": "torch.optim.nadam.NAdam_class",
"torch.optim.radam.radam": "torch.optim.radam.radam_function",
"torch.optim.radam.RAdam": "torch.optim.radam.RAdam_class",
"torch.optim.rmsprop.rmsprop": "torch.optim.rmsprop.rmsprop_function",
"torch.optim.rmsprop.RMSprop": "torch.optim.rmsprop.RMSprop_class",
"torch.optim.rprop.rprop": "torch.optim.rprop.rprop_function",
"torch.optim.rprop.Rprop": "torch.optim.rprop.Rprop_class",
"torch.optim.sgd.sgd": "torch.optim.sgd.sgd_function",
"torch.optim.sgd.SGD": "torch.optim.sgd.SGD_class",
"torch.optim.adadelta.adadelta": "torch.optim.adadelta.adadelta_function",
"torch.optim.adadelta.Adadelta": "torch.optim.adadelta.Adadelta_class",
"torch.optim.adagrad.adagrad": "torch.optim.adagrad.adagrad_function",
"torch.optim.adagrad.Adagrad": "torch.optim.adagrad.Adagrad_class",
"torch.optim.adam.adam": "torch.optim.adam.adam_function",
"torch.optim.adam.Adam": "torch.optim.adam.Adam_class",
"torch.optim.adamax.adamax": "torch.optim.adamax.adamax_function",
"torch.optim.adamax.Adamax": "torch.optim.adamax.Adamax_class",
"torch.mtia.stream": "torch.mtia.stream_function",
"torch.mtia.Stream": "torch.mtia.Stream_class",
"torch.cpu.stream": "torch.cpu.stream_function",
"torch.cpu.Stream": "torch.cpu.Stream_class",
"torch.cuda.stream": "torch.cuda.stream_function",
"torch.cuda.Stream": "torch.cuda.Stream_class",
"torch.xpu.stream": "torch.xpu.stream_function",
"torch.xpu.Stream": "torch.xpu.Stream_class",
}
coverage_ignore_functions = [
# torch
"typename",
@ -3253,6 +3289,11 @@ autodoc_type_aliases = {
# Enable overriding of function signatures in the first line of the docstring.
autodoc_docstring_signature = True
# Exclude inherited IntEnum methods that have RST formatting issues in their docstrings
autodoc_default_options = {
"exclude-members": "from_bytes, to_bytes",
}
# -- katex javascript in header
#
# def setup(app):


@ -262,28 +262,6 @@ See the docs for {class}`~torch.cuda.gds.GdsFile` for an example of how to use t
```
## Green Contexts (experimental)
`torch.cuda.green_contexts` provides thin wrappers around the CUDA Green Context APIs
to enable more general carveout of SM resources for CUDA kernels.
These APIs can be used in PyTorch with CUDA versions greater than or equal to 12.8.
See the docs for {class}`~torch.cuda.green_contexts.GreenContext` for an example of how to use these.
```{eval-rst}
.. currentmodule:: torch.cuda.green_contexts
```
```{eval-rst}
.. autosummary::
:toctree: generated
:nosignatures:
GreenContext
```
% This module needs to be documented. Adding here in the meantime
% for tracking purposes
@ -296,10 +274,6 @@ See the docs for {class}`~torch.cuda.green_contexts.GreenContext` for an example
.. py:module:: torch.cuda.gds
```
```{eval-rst}
.. py:module:: torch.cuda.green_contexts
```
```{eval-rst}
.. py:module:: torch.cuda.jiterator
```
@ -325,4 +299,4 @@ See the docs for {class}`~torch.cuda.green_contexts.GreenContext` for an example
:hidden:
cuda.aliases.md
```
```


@ -233,7 +233,6 @@ regular full-precision tensor.
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
view
as_strided


@ -242,6 +242,7 @@ select = [
"Q003", # avoidable escaped quote
"Q004", # unnecessary escaped quote
"RSE",
"RUF007", # pairwise over zip
"RUF008", # mutable dataclass default
"RUF013", # ban implicit optional
"RUF015", # access first ele in constant time


@ -22,18 +22,16 @@ project-excludes = [
# ==== to test Pyrefly on a specific directory, simply comment it out ====
"torch/_inductor/**",
"torch/distributed/**",
"torch/nn/**",
"torch/_dynamo/**",
"torch/utils/**",
"torch/ao/**",
"torch/fx/**",
"torch/distributions/**",
"torch/onnx/**",
# formatting issues
"torch/linalg/__init__.py",
"torch/package/importer.py",
"torch/package/_package_pickler.py",
"torch/jit/annotations.py",
"torch/utils/data/datapipes/_typing.py",
"torch/nn/functional.py",
"torch/_export/utils.py",
"torch/fx/experimental/unification/multipledispatch/__init__.py",
"torch/nn/modules/__init__.py",
# ====
"benchmarks/instruction_counts/main.py",
"benchmarks/instruction_counts/definitions/setup.py",


@ -1111,14 +1111,6 @@
"_amp_update_scale_",
"_assert_async",
"_batch_norm_impl_index",
"_cast_Byte",
"_cast_Char",
"_cast_Double",
"_cast_Float",
"_cast_Half",
"_cast_Int",
"_cast_Long",
"_cast_Short",
"_choose_qparams_per_tensor",
"_coalesce",
"_compute_linear_combination",

Some files were not shown because too many files have changed in this diff.