Compare commits


82 Commits

Author SHA1 Message Date
417788a113 [export] Turn on install_free_tensors flag
The final step in removing the discrepancy between
torch.compile(fullgraph=True) and torch.export(strict=True).

ghstack-source-id: 22998e0bc950685ba76a27d0bd172e3336dc4e82
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164305
2025-10-04 23:51:44 -07:00
cf0a00d4f3 Enable ruff FURB161 rule (#164654)
This PR enables FURB161 in ruff.
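For context, FURB161 flags manual popcount patterns in favor of the built-in `int.bit_count()` (Python 3.10+); a minimal before/after sketch:

```python
# The pattern FURB161 targets: counting set bits via bin()/count()
# versus the built-in int.bit_count().
n = 0b1011
legacy = bin(n).count("1")  # flagged by FURB161
modern = n.bit_count()      # suggested replacement
assert legacy == modern == 3
```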
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164654
Approved by: https://github.com/Skylion007
2025-10-04 23:26:28 +00:00
5ed4270440 remove more no longer needed torch._check_is_size calls 1 (#164630)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164630
Approved by: https://github.com/Skylion007
ghstack dependencies: #164627
2025-10-04 22:06:04 +00:00
8c728e129d remove no longer needed torch._check_is_size calls from test_dynamic_shapes (#164627)
These calls are no longer needed in those tests to prevent data-dependent errors (DDE).
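For reference, a sketch of the kind of call being removed (illustrative function, not taken from the tests):

```python
import torch

def f(x):
    u = x.item()             # unbacked SymInt under compile/export
    torch._check_is_size(u)  # asserts u is a valid size to avoid a DDE; no longer needed here
    return torch.zeros(u)
```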

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164627
Approved by: https://github.com/ezyang
2025-10-04 22:06:04 +00:00
9fc2c6446d remove guard_size_oblivious from is_contiguous python eager eval path. (#164622)
Summary: this should not be needed anymore; we should already have explicit is_contiguous_or_false calls where appropriate!

Test Plan: run existing tests.

Differential Revision: D83884977

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164622
Approved by: https://github.com/bobrenjc93
2025-10-04 21:02:39 +00:00
409aece3f9 [dynamo, 3.14] prevent StackRef compilation in 3.14 Windows (#164400)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164400
Approved by: https://github.com/Camyll, https://github.com/atalman
2025-10-04 18:38:08 +00:00
b116c51330 torch.cond on DTensor triggers an internal assert, add xfail for this. (#164389)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164389
Approved by: https://github.com/albanD
2025-10-04 18:12:06 +00:00
2e1742dd63 Revert "Add device argument to torch.random.get_rng_state (#163034)"
This reverts commit 9580539e2f73d68e89544c713ff460bea3038701.

Reverted https://github.com/pytorch/pytorch/pull/163034 on behalf of https://github.com/cyyever due to it caused a partially initialised torch module ([comment](https://github.com/pytorch/pytorch/pull/163034#issuecomment-3368349209))
2025-10-04 15:25:45 +00:00
f7ad6dbad6 Numpy zerotensor handling (#164487)
Fixes #89034

Updated tensor_to_numpy() function in tensor_numpy.cpp to handle ZeroTensors by throwing an error if force=False and returning an array full of zeros if force=True.
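A rough illustration of that behavior from the Python side (`torch._efficientzerotensor` is a private helper, used here only to construct a ZeroTensor):

```python
import torch

zt = torch._efficientzerotensor((3,))  # construct a ZeroTensor (private helper)
arr = zt.numpy(force=True)             # returns an all-zeros ndarray
# zt.numpy()                           # with force=False this now raises an error
```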

@ngimel, I just saw that you mentioned PyTorch is not too concerned with this issue but I had already worked on it so I figured I would push it anyways and see what you thought. Feel free to close the PR if you think it is not worth merging.

@albanD

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164487
Approved by: https://github.com/ngimel, https://github.com/albanD
2025-10-04 12:03:48 +00:00
f46bb04dcc Revert "Add pure view support in autograd Function (#164467)"
This reverts commit 10335ffb2cce26c99958d055f415a16c1d14bc35.

Reverted https://github.com/pytorch/pytorch/pull/164467 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/164467#issuecomment-3368152304))
2025-10-04 11:42:46 +00:00
6f6a919366 Revert "Make custom op alias check consistent (#164576)"
This reverts commit e438db254602cf39ba536aed0590b4144c019ee8.

Reverted https://github.com/pytorch/pytorch/pull/164576 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/164467#issuecomment-3368152304))
2025-10-04 11:42:45 +00:00
83d71dfb2f Fix mesh.get_local_rank when it is > 1d (#164473)
Previously, we would not take the arguments passed by get_local_rank into account. This means that we wouldn't be able to trace this call if we had a device_mesh > 1d

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164473
Approved by: https://github.com/xmfan, https://github.com/Skylion007
2025-10-04 11:27:55 +00:00
5103ecc5d8 [1/N] Fix clang-tidy readability checks (#164561)
Check all `.cpp` files except `jit` files for readability thoroughly.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164561
Approved by: https://github.com/Skylion007
2025-10-04 09:40:38 +00:00
9580539e2f Add device argument to torch.random.get_rng_state (#163034)
Fixes #162812

Adds support for either passing a device directly into get_rng_state, or passing in a string or int (which is then wrapped into a device inside, as in torch.cuda.get_rng_state).

I wasn't exactly sure where tests for this should go, please let me know. I used this script for testing:
```python
import torch

# note: when running with CUDA GPU, first three tests will give the same result,
# as will the last two

# test with no device specified
print(torch.get_rng_state())

# test with CPU
cpu_device = torch.device("cpu")
print(torch.get_rng_state(cpu_device))

# test with direct name
print(torch.get_rng_state("cpu"))

# test with CUDA
cuda_device = torch.device("cuda:0")
print(torch.get_rng_state(cuda_device))

# test with integer
print(torch.get_rng_state(0))
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163034
Approved by: https://github.com/ezyang, https://github.com/cyyever
2025-10-04 06:48:39 +00:00
a11a66ef32 Remove CUDA 11 branches for sparse code (#164531)
This PR removes outdated CUDA version checks from sparse code in aten/src/ATen/cuda.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164531
Approved by: https://github.com/eqy
2025-10-04 06:07:49 +00:00
6b768e1890 Support propagating custom meta field to backward graph nodes (#164174)
# Propagate custom meta data to backward

Support propagating the user annotation tags to the backward graph by extending the `copy_fwd_metadata_to_bw_nodes` utils (recommended by @xmfan, thanks!).

Example annotation API (added in https://github.com/pytorch/pytorch/pull/163673):

```
class M(torch.nn.Module):
    def forward(self, x):
        with fx_traceback.annotate({"pp_stage": 0}):
            with fx_traceback.annotate({"fdsp_bucket": 0}):
                x = x + 1
            x = x - 2
            with fx_traceback.annotate({"cuda_stream": 2, "fsdp_bucket": 1}):
                x = x * 2
        x = x / 3
        return x
```

Assumptions (some inherited from https://github.com/pytorch/pytorch/pull/126573):

- I am trusting the seq_nr mapping introduced to aot_autograd nodes in https://github.com/pytorch/pytorch/pull/103129
- I am also trusting that the forward is single threaded, since seq_nr is thread local.  If this isn't always true, we'll need to also plumb thread_id through the same machinery which is populating seq_nr.
- **(This is changed in this PR!) I assume all backward graph nodes have "is_backward" for 'partitioner_tag', and all other nodes are forward graph nodes**.  If we don't run export before `aot_export_join_with_descriptors`, then none of the nodes has "nn_module_stack" in node meta. If we do run export first, then we don't need this change.
- I copy "custom" node meta from forward to backward graph nodes.

Question:
- Is it a good idea to copy all "custom" node meta? Or should we create a dedicated key in custom node meta to be copied? @SherlockNoMad
- Do we expect people to run export before using `aot_export_join_with_descriptors`?
- Can we assume the following for the graph produced by `aot_export_join_with_descriptors`: "all backward graph nodes have "is_backward" for 'partitioner_tag', and all other nodes are forward graph nodes"? Maybe this is a question for @ezyang

```
python test/functorch/test_aot_joint_with_descriptors.py -k test_preserve_
python test/export/test_export.py -k preserve_anno
python test/distributed/tensor/test_dtensor_export.py
```
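A minimal sketch of the propagation described above, keyed on the seq_nr and partitioner_tag metadata listed in the assumptions (hypothetical helper, not the actual utility):

```python
def copy_custom_meta_to_backward(graph):
    # Collect "custom" meta from forward nodes, keyed by seq_nr.
    custom_by_seq_nr = {}
    for node in graph.nodes:
        if node.meta.get("partitioner_tag") != "is_backward":  # forward node
            seq_nr = node.meta.get("seq_nr")
            if seq_nr is not None and "custom" in node.meta:
                custom_by_seq_nr[seq_nr] = node.meta["custom"]
    # Copy it onto backward nodes that share the same seq_nr.
    for node in graph.nodes:
        if node.meta.get("partitioner_tag") == "is_backward":
            custom = custom_by_seq_nr.get(node.meta.get("seq_nr"))
            if custom is not None:
                node.meta["custom"] = dict(custom)
```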

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164174
Approved by: https://github.com/xmfan, https://github.com/SherlockNoMad
2025-10-04 05:03:32 +00:00
35c4130fd1 [2/N] Fix ruff warnings (#164460)
Apply ruff `SIM` rules.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164460
Approved by: https://github.com/ezyang
2025-10-04 03:40:32 +00:00
34042a9145 Change intra-graph offset dtype to uint64_t (#164515)
Even though `offset_intragraph_` only tracks RNG consumption within a single graph replay, we have observed that the 32-bit storage for these offsets is easy to overshoot, especially for cases with big CUDA graph captures that include kernels generating a large number of random numbers.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164515
Approved by: https://github.com/eee4017, https://github.com/eqy
2025-10-04 03:39:09 +00:00
Ken
9d1ab4f4bb [CI] Limit Numba CUDA-13 patch to CUDA environments only (#164607)
The patch introduced in https://github.com/pytorch/pytorch/pull/163111 caused issues in ROCm environments. This change guards the patching logic to CUDA environments only, thus ameliorating test failures in ROCm environments.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164607
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-04 02:39:07 +00:00
3e0826c9d7 Update disabling fast-path for strict-export inside MultiheadAttention (#164544)
For some reason, executorch needs the slow path. But the original flag doesn't work for the new export because we inline torch modules even before getting into make_fx. We still have to keep the old flag because a lot of code assumes it exists... grr

Differential Revision: [D83810733](https://our.internmc.facebook.com/intern/diff/D83810733)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164544
Approved by: https://github.com/anijain2305, https://github.com/mikaylagawarecki
2025-10-04 02:20:55 +00:00
86c789849e [fr] Re-order mismatch check in fr analysis script (#164606)
In reality we found the current mismatch order does not match the actual error distribution, so we reorder it a bit as follows (see the sketch after this list):
1. We do the collective type check first
2. Then the size check (excluding all2all)
3. dtype check
4. state check
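A schematic of that ordering (hypothetical helper and field names; the real analysis script uses its own entry structures):

```python
def classify_mismatch(a, b):
    # 1. collective type, 2. sizes (excluding all2all), 3. dtype, 4. state
    if a["collective"] != b["collective"]:
        return "collective_type_mismatch"
    if a["collective"] != "all_to_all" and a["sizes"] != b["sizes"]:
        return "size_mismatch"
    if a["dtype"] != b["dtype"]:
        return "dtype_mismatch"
    if a["state"] != b["state"]:
        return "state_mismatch"
    return "match"
```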

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164606
Approved by: https://github.com/VieEeEw
2025-10-04 01:16:15 +00:00
f3afbcf340 [ONNX] Bump tested onnxruntime to 1.23.0 and onnxscript to 0.5.2 (#164440)
Performs tests on the latest ONNX environment.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164440
Approved by: https://github.com/justinchuby, https://github.com/albanD
2025-10-04 01:10:47 +00:00
40b25578e4 [Inductor] deterministic mode (#163589)
Add a deterministic mode to skip the on-device benchmarking that we know can affect numerics. This includes:
- pad-mm
- dynamic rblock scaling
- template autotuning
- coordinate descent tuning for reduction
- reduction config autotuning in CachingAutotuner. For reductions, both RBLOCK and num_warps affect numerics; XBLOCK does not, so we can still autotune XBLOCK for reductions.
- benchmarking for computation communication reordering pass

The mode definitely has a perf hit.
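A hedged usage sketch, assuming the mode is surfaced as an inductor config flag named `deterministic` (name taken from the PR, not verified here):

```python
import torch
import torch._inductor.config as inductor_config

inductor_config.deterministic = True  # assumption: exact flag name/spelling

@torch.compile
def reduce_fn(x):
    return (x * x).sum(dim=-1)

out = reduce_fn(torch.randn(64, 128, device="cuda"))
```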

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163589
Approved by: https://github.com/v0i0
2025-10-04 01:05:08 +00:00
412c6d28ec [ROCm][CI] additional dynamo benchmarks for inductor-periodic (#164279)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164279
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-04 00:55:17 +00:00
7d570129e0 Fix custom autograd Function memory leak when saving mutated view (#164407)
Fixes https://github.com/pytorch/pytorch/issues/160317
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164407
Approved by: https://github.com/albanD
2025-10-04 00:47:12 +00:00
97ca21106d move fw|bw compiler args in aot joint with descriptors (#164584)
Summary: Minor refactor where we push some args in the aot joint with descriptors workflow that are not used in the export stage to the compile stage, where they are actually used.

Test Plan: existing tests should pass

Differential Revision: D83850316

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164584
Approved by: https://github.com/tugsbayasgalan
2025-10-04 00:24:46 +00:00
27234792ad Fix refine_ranges corner case (#164075)
address https://github.com/pytorch/pytorch/issues/161360

u0 > 0 should update the range of u0 to start from [1, ...]; it was not doing that before, and this PR fixes it.
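A small illustration of the refinement in question (sketch only; the actual ranges live in the symbolic shape machinery):

```python
import torch

def f(x):
    u0 = x.item()           # unbacked symbol u0
    torch._check(u0 > 0)    # this guard should refine u0's range to [1, ...]
    return torch.zeros(u0)  # the refined range proves the size is valid

out = torch.compile(f, fullgraph=True)(torch.tensor(5))
```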

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164075
Approved by: https://github.com/ColinPeppler
2025-10-03 23:30:46 +00:00
b6b7a44dec Fix common typos and misspellings (#164413)
Summary:
This commit fixes numerous typos and misspellings found throughout the codebase. The fixes improve code readability and documentation consistency across C++, Python, CUDA, and documentation files.

## Typos Fixed

| Before | After | Occurrences |
|--------|-------|-------------|
| occured | occurred | 14 |
| accross | across | 9 |
| lenght/lenghts | length/lengths | 8 |
| unneccessary | unnecessary | 5 |
| Peform | Perform | 4 |
| furture | future | 3 |
| paritioned | partitioned | 2 |
| desireable | desirable | 2 |
| registerations | registrations | 2 |
| seperated | separated | 2 |
| intialized | initialized | 2 |
| capatibility | compatibility | 2 |
| peformed | performed | 2 |
| Exmple | Example | 2 |
| comma_seperated | comma_separated | 2 |
| cumsuming | consuming | 2 |
| neccessary | necessary | 1 |
| ParamterMetadataTable | ParameterMetadataTable | 1 |
| matached | matched | 1 |
| conaitner | container | 1 |
| reivew | review | 1 |
| prioriry | priority | 1 |
| Alocated | Allocated | 1 |
| opportunixtically | opportunistically | 1 |
| peformance | performance | 1 |
| equavalent | equivalent | 1 |
| asssumed | assumed | 1 |
| valdiation | validation | 1 |
| apprear | appear | 1 |
| consectuve | consecutive | 1 |
| dependending | depending | 1 |
| copnversion | conversion | 1 |
| weigted | weighted | 1 |
| repreesenting | representing | 1 |
| finialize | finalize | 1 |
| unintialized | uninitialized | 1 |
| conbined | combined | 1 |
| tesnor | tensor | 1 |
| desugared | discarded | 1 |
| behaviour | behavior | 1 |
| paramerizaitons | parametrizations | 1 |
| compute_output_lenghths_kernel | compute_output_lengths_kernel | 1 |

Test Plan: N/A - mostly comments - waiting on CI

Differential Revision: D83695665

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164413
Approved by: https://github.com/eqy, https://github.com/larryliu0820
2025-10-03 23:19:41 +00:00
3ddf2018d0 Revert "Support setting grad_dtype on leaf tensors (#162815)"
This reverts commit dca73982c53e9f99f96246b5d9ed9bab83c7423f.

Reverted https://github.com/pytorch/pytorch/pull/162815 on behalf of https://github.com/yangw-dev due to break internal test D83850533, see more details below ([comment](https://github.com/pytorch/pytorch/pull/162815#issuecomment-3367498501))
2025-10-03 23:14:28 +00:00
fac6f20ae3 [CI] Add another win shard (#164605)
Since it's timing out: 0b4f2b46d9/1

The first shard is disproportionately long because of cpp tests; I'm trying to figure that out, but for now we can do this or increase the timeout.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164605
Approved by: https://github.com/seemethere, https://github.com/malfet
2025-10-03 22:51:09 +00:00
1894082000 UT/Examples for resharding checkpoint save/loads for distributed tensors with uneven shards. (#160533)
1\ The DTensor abstraction on its own does not support arbitrary-length shards in its distributed tensor representation. It supports a single uneven shard, but it has to be the last shard in the sharding dimension.

2\ However, DCP supports an API called Checkpointable. This API allows you to define your custom shardable tensor structure. I have given a UT example (look for CheckpointableDistTensor). Therefore, one option is to use CheckpointableDistTensor to save/load uneven shards.

3\ While exploring this path, I also noticed that the torch.rec module encountered a similar problem while working with DTensor. They work around it by implementing the Checkpointable API in DTensor and introducing an auxiliary structure called LocalShardsWrapper. This is the second option we can use to unblock the data loader resharding work.

In summary:
Use LocalShardsWrapper + DTensor as the first option to unblock.
The second preference is to use a new implementation of the Checkpointable API (similar to the CheckpointableDistTensor I have introduced in this example).

Differential Revision: D80182564

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160533
Approved by: https://github.com/saumishr
2025-10-03 22:15:02 +00:00
5a66ff4915 [dynamo, 3.14] fix _detect_and_normalize_assert_statement for 3.14 (#164005)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164005
Approved by: https://github.com/anijain2305, https://github.com/atalman
2025-10-03 22:07:54 +00:00
abadea70f3 [inductor] thread hint_override in more kernel args (#164494)
Ensure hint_override is threaded through the benchmarking args.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164494
Approved by: https://github.com/bobrenjc93
2025-10-03 22:07:12 +00:00
f414aa8e0d Add pyrefly suppressions (3/n) (#164588)
Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283

Test plan:
dmypy restart && python3 scripts/lintrunner.py -a
pyrefly check

step 1: uncomment lines in the pyrefly.toml file
step 2: run pyrefly check
step 3: add suppressions, clean up unused suppressions
before: https://gist.github.com/maggiemoss/bb31574ac8a59893c9cf52189e67bb2d

after:

 0 errors (1,970 ignored)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164588
Approved by: https://github.com/oulgen
2025-10-03 22:03:03 +00:00
e438db2546 Make custom op alias check consistent (#164576)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164576
Approved by: https://github.com/soulitzer
ghstack dependencies: #164467
2025-10-03 21:42:11 +00:00
10335ffb2c Add pure view support in autograd Function (#164467)
Fix https://github.com/pytorch/pytorch/issues/73604

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164467
Approved by: https://github.com/ezyang, https://github.com/soulitzer
2025-10-03 21:42:11 +00:00
f006aee601 Speed up FP precision lookup (#164044)
This commit simplifies the precision lookup and setting logic
by reducing the number of branches and using a custom hash
function. Fixes #161822. The issue described in #163709 still
persists. This is meant as a short term fix.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164044
Approved by: https://github.com/ngimel, https://github.com/eqy
2025-10-03 21:35:20 +00:00
8d53d788fe lint: add .pyi to changed files on .pyi.in changes (#164603)
We were observing issues where the lint on trunk vs. PRs would be different
due to missing .pyi files. This change adds the .pyi files to the changed files
list when .pyi.in files are changed.

Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164603
Approved by: https://github.com/atalman, https://github.com/malfet, https://github.com/Skylion007
2025-10-03 21:30:54 +00:00
0b4f2b46d9 Revert "[inductor] require shape in TritonCSEVariable (#162275)"
This reverts commit f465ea6752c91498de63eb57439a74f4836e568a.

Reverted https://github.com/pytorch/pytorch/pull/162275 on behalf of https://github.com/yangw-dev due to break interal test, see more details in next comment ([comment](https://github.com/pytorch/pytorch/pull/162275#issuecomment-3367213941))
2025-10-03 21:07:00 +00:00
960c4b9937 [inductor] Enable triton kernels with unbacked inputs (#164509)
Summary:
We need to pass in a fallback value to avoid converting symbols to int.

original failure log in onefeed Slimper MB - P1973406565
`raise TypeError("Cannot convert symbols to int")`

Test Plan:
if not passing in fallback value -
https://www.internalfb.com/intern/everpaste/?handle=GGeAoh_M11kEGOECAFELOaq8ooRCbswMAAAz
`raise TypeError("Cannot convert symbols to int")`

```
buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:unbacked_symints -- test_triton_kernel_with_unbacked_symint_fallback --print-passing-details --env TORCHDYNAMO_EXTENDED_DEBUG_CPP=1 --env TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(u0, 0)"
```
Buck UI: https://www.internalfb.com/buck2/4d27cd49-770b-40de-8c65-9ee04c5dd687
Test UI: https://www.internalfb.com/intern/testinfra/testrun/9570149324695031
Network: Up: 0B  Down: 16MiB  (reSessionID-8e8b07a2-e31c-402d-bf6a-ebb92253e654)
Executing actions. Remaining     0/6                                                              5.0s exec time total
Command: test.     Finished 2 cache (100% hit)                                                    5.0s exec time cached (100%)
Time elapsed: 33.8s
Tests finished: Pass 2. Fail 0. Fatal 0. Skip 0. Build failure 0

Differential Revision: D83684260

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164509
Approved by: https://github.com/ColinPeppler
2025-10-03 21:05:18 +00:00
1f8ee5da11 [TorchGen] Remove unused variables and function imports (#164538)
This PR removes unused code in torchgen.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164538
Approved by: https://github.com/Skylion007, https://github.com/albanD
2025-10-03 20:49:36 +00:00
da49a57d34 [ROCm] Enabled JIT UTs on ROCm (#164582)
This PR enables the following tests on ROCm:

test/test_jit.py::TestBackends::test_save_load
test/test_jit.py::TestBackends::test_execution
test/test_jit.py::TestBackends::test_errors
test/test_jit.py::TestCUDA::test_current_stream

Verified that the tests pass on the AMD gfx90a and gfx942 architectures.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164582
Approved by: https://github.com/jeffdaily
2025-10-03 20:16:41 +00:00
8ec8c14ace Revert "[CUDA] Add experimental green context support for SM carveout (#159104)"
This reverts commit 3c59351c6ea2fc29d346903e28e95c5f4d0ccdbb.

Reverted https://github.com/pytorch/pytorch/pull/159104 on behalf of https://github.com/clee2000 due to failed lint, pyfmt did not catch the pyi file; I think they need special handling since they're not in the changed files list? ([comment](https://github.com/pytorch/pytorch/pull/159104#issuecomment-3367077208))
2025-10-03 20:15:56 +00:00
2d50678dcc Fix -Wno-duplicate-decl-specifier is valid for C/ObjC but not for C++ (#164552)
Fixes #99715
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164552
Approved by: https://github.com/Skylion007
2025-10-03 20:12:49 +00:00
3ca09d65f1 [ROCm] Enable several distributed UTs (#164390)
Increase the tolerance for the following UTs as there was a slight mismatch seen on MI200.
    - test_data_parallel.py:test_strided_grad_layout
    - test_c10d_nccl.py:test_grad_layout_1devicemodule_1replicaperprocess

Skip for MI200:
    - test_fully_shard_training.py:test_2d_mlp_with_nd_mesh
    - test_2d_composability.py:test_train_parity_2d_mlp
    - test_fully_shard_overlap.py:test_fully_shard_training_overlap

Fixes #159489
Fixes #159488
Fixes #152700
Fixes #125555
Fixes #134139

Working as is on both MI200 and MI300:
Fixes #125991
Fixes #125918

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164390
Approved by: https://github.com/jeffdaily
2025-10-03 19:52:51 +00:00
1bb68271b7 Stop building nativert in OSS (#164463)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164463
Approved by: https://github.com/albanD, https://github.com/Skylion007
2025-10-03 19:41:15 +00:00
9eb89a4ad5 Add missing TypeIs to torch/_inductor/ir.py (#164489)
This should be a TypeIs here
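For context, a minimal sketch of why `TypeIs` fits an isinstance-style check better than `TypeGuard` (hypothetical class; `TypeIs` also narrows the negative branch):

```python
from typing_extensions import TypeIs

class Buffer: ...  # stand-in for the IR class

def is_buffer(x: object) -> TypeIs[Buffer]:
    return isinstance(x, Buffer)

def use(x: "Buffer | str") -> None:
    if is_buffer(x):
        pass  # x narrowed to Buffer
    else:
        pass  # x narrowed to str (TypeGuard would leave it as Buffer | str)
```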

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164489
Approved by: https://github.com/mlazos
2025-10-03 19:34:20 +00:00
15d726005d Enable several unit tests on ROCm (#163087)
Code change enables:
test_nn::TestNNDeviceTypeCUDA::test_transformerencoderlayer_cuda_float16
test_nn::TestNNDeviceTypeCUDA::test_transformerencoderlayer_cuda_float32
test_nn::TestNNDeviceTypeCUDA::test_transformerencoderlayer_cuda_float64
test_nn::TestNNDeviceTypeCUDA::test_transformerencoderlayer_gelu_cuda_float16
test_linalg::TestLinalgCUDA::test_eigh_svd_illcondition_matrix_input_should_not_crash_cuda_float32
test_linalg::TestLinalgCUDA::test_eigh_svd_illcondition_matrix_input_should_not_crash_cuda_float64
test_ops::TestCommonCUDA::test_complex_half_reference_testing_as_strided_scatter_cuda_complex32

Fixes https://github.com/pytorch/pytorch/issues/134687
Fixes https://github.com/pytorch/pytorch/issues/78205

Closing github issues:
inductor/test_gpu_cpp_wrapper unit tests:
Fixes https://github.com/pytorch/pytorch/issues/157084

test_nn unit tests:
Fixes https://github.com/pytorch/pytorch/issues/157167
Fixes https://github.com/pytorch/pytorch/issues/157119
Fixes https://github.com/pytorch/pytorch/issues/157118
Fixes https://github.com/pytorch/pytorch/issues/157115
Fixes https://github.com/pytorch/pytorch/issues/157081
Fixes https://github.com/pytorch/pytorch/issues/155216
Fixes https://github.com/pytorch/pytorch/issues/157259
Fixes https://github.com/pytorch/pytorch/issues/157166
Fixes https://github.com/pytorch/pytorch/issues/157165
Fixes https://github.com/pytorch/pytorch/issues/157164
Fixes https://github.com/pytorch/pytorch/issues/157117
Fixes https://github.com/pytorch/pytorch/issues/157116
Fixes https://github.com/pytorch/pytorch/issues/157114
Fixes https://github.com/pytorch/pytorch/issues/157113
Fixes https://github.com/pytorch/pytorch/issues/157082
Fixes https://github.com/pytorch/pytorch/issues/157080
Fixes https://github.com/pytorch/pytorch/issues/157079
Fixes https://github.com/pytorch/pytorch/issues/157078

test_linalg unit tests:
Fixes https://github.com/pytorch/pytorch/issues/157427
Fixes https://github.com/pytorch/pytorch/issues/157414
Fixes https://github.com/pytorch/pytorch/issues/157369
Fixes https://github.com/pytorch/pytorch/issues/157349
Fixes https://github.com/pytorch/pytorch/issues/157348
Fixes https://github.com/pytorch/pytorch/issues/157337
Fixes https://github.com/pytorch/pytorch/issues/157336
Fixes https://github.com/pytorch/pytorch/issues/157297
Fixes https://github.com/pytorch/pytorch/issues/157281
Fixes https://github.com/pytorch/pytorch/issues/157260
Fixes https://github.com/pytorch/pytorch/issues/157171
Fixes https://github.com/pytorch/pytorch/issues/157169
Fixes https://github.com/pytorch/pytorch/issues/157168
Fixes https://github.com/pytorch/pytorch/issues/157125
Fixes https://github.com/pytorch/pytorch/issues/157124
Fixes https://github.com/pytorch/pytorch/issues/157123
Fixes https://github.com/pytorch/pytorch/issues/157089
Fixes https://github.com/pytorch/pytorch/issues/157088
Fixes https://github.com/pytorch/pytorch/issues/157087
Fixes https://github.com/pytorch/pytorch/issues/157068
Fixes https://github.com/pytorch/pytorch/issues/157067
Fixes https://github.com/pytorch/pytorch/issues/157066
Fixes https://github.com/pytorch/pytorch/issues/157047
Fixes https://github.com/pytorch/pytorch/issues/157046
Fixes https://github.com/pytorch/pytorch/issues/157045
Fixes https://github.com/pytorch/pytorch/issues/157044
Fixes https://github.com/pytorch/pytorch/issues/156997
Fixes https://github.com/pytorch/pytorch/issues/156996
Fixes https://github.com/pytorch/pytorch/issues/156995
Fixes https://github.com/pytorch/pytorch/issues/156994
Fixes https://github.com/pytorch/pytorch/issues/156993
Fixes https://github.com/pytorch/pytorch/issues/156991
Fixes https://github.com/pytorch/pytorch/issues/156990
Fixes https://github.com/pytorch/pytorch/issues/156989
Fixes https://github.com/pytorch/pytorch/issues/105118
Fixes https://github.com/pytorch/pytorch/issues/157415
Fixes https://github.com/pytorch/pytorch/issues/157282
Fixes https://github.com/pytorch/pytorch/issues/157261
Fixes https://github.com/pytorch/pytorch/issues/157170
Fixes https://github.com/pytorch/pytorch/issues/157126

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163087
Approved by: https://github.com/jeffdaily, https://github.com/pruthvistony
2025-10-03 19:30:59 +00:00
16f9bef642 [precompile] Fix guard serialization loading bugs. (#164490)
Summary: Added a set of fixes triggered by fm training job. Overall the theme here is that we should get rid of saved objects as much as possible when they are not used in guard reconstruction. Sometimes for objects that cannot be saved (like local functions) we still try our best to save their closures.

Test Plan:
test_guard_serialization.py
test_lazy_awaitable.py

Differential Revision: D83766926

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164490
Approved by: https://github.com/jamesjwu
2025-10-03 19:20:07 +00:00
3c59351c6e [CUDA] Add experimental green context support for SM carveout (#159104)
Low-level PyTorch APIs should be usable/stable enough at this point but we might move the underlying driver API usage a bit from here...

Built on top of @drisspg 's branch

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159104
Approved by: https://github.com/ngimel

Co-authored-by: drisspg <drisspguessous@gmail.com>
2025-10-03 18:59:12 +00:00
7eb1eb4313 ci: Removing ROCm tests from trunk. (#164585)
Had a conversation with the AMD team today and I think we are all in
agreement that the current state of queueing for AMD is beyond where
we'd like to be for there to be blocking CI for ROCm.

Moving the representative testing jobs for this into the ciflow/rocm
workflow.

I'd love for these to be back in trunk if we can get to a state where
our queueing metrics are below an hour for ROCm infrastructure.

Dashboards:
* ROCm Queueing (>60mins) ([link](https://hud.pytorch.org/queue_time_analysis?dateRange=30&startDate=2025-09-03T16%3A06%3A45.025Z&endDate=2025-10-03T16%3A06%3A45.025Z&granularity=week&chartType=bar&repos=pytorch%2Fpytorch&category=machine_type&machineTypes=linux.rocm.gpu.2&machineTypes=linux.rocm.gpu.4&machineTypes=linux.rocm.gpu.mi250&machineTypes=linux.rocm.gpu.gfx942.1&machineTypes=linux.rocm.gpu.mi250.4&machineTypes=linux.rocm.gpu.gfx942.4&machineTypes=linux.rocm.gpu.mi355.2&machineTypes=linux.rocm.gpu.gfx942.4.test&machineTypes=linux.rocm.gpu.mi250.1&machineTypes=linux.rocm.gpu.gfx942.1.test&machineTypes=linux.rocm.gpu.gfx90a.1&machineTypes=linux.rocm.gpu.gfx90a.4&items=linux.rocm.gpu.2&items=linux.rocm.gpu.4&items=linux.rocm.gpu.mi250&items=linux.rocm.gpu.gfx942.1&items=linux.rocm.gpu.mi250.4&items=linux.rocm.gpu.gfx942.4&items=linux.rocm.gpu.mi355.2&items=linux.rocm.gpu.gfx942.4.test&items=linux.rocm.gpu.mi250.1&items=linux.rocm.gpu.gfx942.1.test&items=linux.rocm.gpu.gfx90a.1&items=linux.rocm.gpu.gfx90a.4))
* NVIDIA queueing (<5mins) ([link](https://hud.pytorch.org/queue_time_analysis?dateRange=30&startDate=2025-09-03T16%3A05%3A08.000Z&endDate=2025-10-03T16%3A05%3A08.000Z&granularity=week&chartType=bar&repos=pytorch%2Fpytorch&category=machine_type&machineTypes=lf.linux.g4dn.4xlarge.nvidia.gpu&machineTypes=linux.g4dn.12xlarge.nvidia.gpu&machineTypes=linux.g4dn.metal.nvidia.gpu&machineTypes=linux.g5.4xlarge.nvidia.gpu&machineTypes=lf.linux.g4dn.12xlarge.nvidia.gpu&machineTypes=lf.linux.g5.12xlarge.nvidia.gpu&machineTypes=lf.linux.g5.4xlarge.nvidia.gpu&machineTypes=lf.linux.g6.4xlarge.experimental.nvidia.gpu&machineTypes=linux.g6.4xlarge.experimental.nvidia.gpu&machineTypes=linux.4xlarge.nvidia.gpu&machineTypes=linux.g5.12xlarge.nvidia.gpu&machineTypes=linux.g4dn.4xlarge.nvidia.gpu&machineTypes=lf.linux.4xlarge.nvidia.gpu&machineTypes=linux.g6.12xlarge.nvidia.gpu&items=lf.linux.g4dn.4xlarge.nvidia.gpu&items=linux.g4dn.12xlarge.nvidia.gpu&items=linux.g4dn.metal.nvidia.gpu&items=linux.g5.4xlarge.nvidia.gpu&items=lf.linux.g4dn.12xlarge.nvidia.gpu&items=lf.linux.g5.12xlarge.nvidia.gpu&items=lf.linux.g5.4xlarge.nvidia.gpu&items=lf.linux.g6.4xlarge.experimental.nvidia.gpu&items=linux.g6.4xlarge.experimental.nvidia.gpu&items=linux.4xlarge.nvidia.gpu&items=linux.g5.12xlarge.nvidia.gpu&items=linux.g4dn.4xlarge.nvidia.gpu&items=lf.linux.4xlarge.nvidia.gpu&items=linux.g6.12xlarge.nvidia.gpu))

Signed-off-by: Eli Uriegas <eliuriegas@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164585
Approved by: https://github.com/malfet, https://github.com/yangw-dev, https://github.com/atalman, https://github.com/jeffdaily
2025-10-03 18:19:24 +00:00
f39789cdab [PyTorch Pinned Allocator] Add support of reserved pinned memory segment to avoid slow paths (#164501)
Summary:
This diff adds the feature of allocating a large pinned memory segment upfront based on the provided config. This large segment is then used to serve all the small pinned memory requests to avoid expensive device level APIs (slow paths).

Example:

PYTORCH_CUDA_ALLOC_CONF=pinned_reserve_segment_size_mb:2048

This reserves a 2GB pinned memory segment for the process; all incoming small requests are then served from this segment, and no cudaHostAlloc/cudaHostRegister APIs are called.
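A usage sketch of the config described above; the env var has to be set before the first pinned allocation in the process:

```python
import os

# Reserve a 2GB pinned segment up front; small pinned requests are then carved
# out of it instead of hitting cudaHostAlloc/cudaHostRegister each time.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "pinned_reserve_segment_size_mb:2048"

import torch

staging = torch.empty(1024, 1024, pin_memory=True)  # small request, served from the segment
```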

Differential Revision: D83779074

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164501
Approved by: https://github.com/yangw-dev
2025-10-03 18:11:27 +00:00
3d9d41c801 Remove old workaround in launch_logcumsumexp_cuda_kernel (#164567)
Remove the workaround for CUDA 11.4.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164567
Approved by: https://github.com/Aidyn-A, https://github.com/Skylion007
2025-10-03 18:07:02 +00:00
5b0b4cda4a [dtensor] avoid shape recompilations on DTensorSpec (#163820)
skips DTensorSpec.sizes/strides in metadata guard checks

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163820
Approved by: https://github.com/azahed98
2025-10-03 17:18:18 +00:00
2a11ce2c78 Support calling torch.compile inside non-strict export (#164171)
So this fixes at least two issues:
1) When we are invoking the inductor backend, we apply pre-grad passes which try to find the correct fake mode to use. In the nested case, we run into a clash when there is a closure variable in the inductor region, because non-strict would have fakified this variable beforehand and the inner torch.compile would have created a fresh fake mode. This is not a problem in regular torch.compile because the inner torch.compile gets ignored. I don't know if we are supposed to inherit the fake mode from the parent context in this case, but we can avoid this problem if we just default to the eager backend, which is fine here because the point of export is to capture aten operators; going to inductor would mean we lose the inner torch.compile ops.
2) There are custom torch function modes in export that track the number of torch fns executed, and the inner compile itself doesn't work because of a guard failure as this mode state gets changed. I noticed torch.cond fixes this problem by carefully stashing the torch function mode and deferring it to the backend, so the correct thing to do here is just re-use the torch.cond implementation unconditionally.

So the things I did to fix the above were:
1) Always default to the eager backend when compile is invoked inside export. I needed to turn the way torch.cond sets up the fresh tracing env into a util that can be shared.
2) The previous eager backend for torch.cond was wrong because the context managers didn't actually persist until the backend is invoked.
3) torch.cond used to only disable the TorchFunctionMetadata torch function mode and stash it for later, but in fact we should do both TorchFunctionMetadata and PreDispatchTorchFunctionMode.

With the above fixes, we are able to export flex attention.
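A rough sketch of the pattern this enables (the inner torch.compile falls back to the eager backend during non-strict export, per the fixes above):

```python
import torch

class M(torch.nn.Module):
    def forward(self, x):
        inner = torch.compile(lambda t: t.sin() + 1)  # now tolerated under non-strict export
        return inner(x) * 2

ep = torch.export.export(M(), (torch.randn(4),), strict=False)
print(ep.graph)
```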

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164171
Approved by: https://github.com/ydwu4
2025-10-03 16:31:07 +00:00
3288fbf374 Change default device to current accelerator (#164399)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164399
Approved by: https://github.com/albanD
2025-10-03 16:15:09 +00:00
fa5306b4f5 Support partial _DynamoCacheEntries when not all backends available (#163521)
Differential Revision: [D82735769](https://our.internmc.facebook.com/intern/diff/D82735769/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163521
Approved by: https://github.com/zhxchen17
2025-10-03 16:14:32 +00:00
5656d45c8f forward fix #164481 (#164578)
PR #164481 added unit test test_scaled_mm_preserves_strides in test/inductor/test_fp8.py. It was missing the adjustment for ROCm's F8 types on MI300.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164578
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-03 15:44:34 +00:00
e40fe634b1 Pin conda version for Docker builds (#164575)
Mitigates https://github.com/pytorch/pytorch/issues/164574
Remove the unused CUDA_CHANNEL var; this was used before, when we installed pytorch via conda.

Please note: CUDA 13.0 failures are expected since the CI tries to build against prod and CUDA 13.0 is not available in prod yet.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164575
Approved by: https://github.com/malfet, https://github.com/Camyll
2025-10-03 15:01:35 +00:00
3db2164341 [torchfuzz] add norm operators (#164514)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164514
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432, #164434
2025-10-03 14:44:19 +00:00
5bb8f04d3e [torchfuzz] add nn functional ops (#164434)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164434
Approved by: https://github.com/pianpwk
ghstack dependencies: #164432
2025-10-03 14:44:19 +00:00
5743d731c1 Use torch.testing.assert_close instead of torch.testing.assert_allclose (#164539)
Because torch.testing.assert_allclose is deprecated.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164539
Approved by: https://github.com/mlazos
2025-10-03 14:39:10 +00:00
aed66248a0 [vllm hash update] update the pinned vllm hash (#164319)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164319
Approved by: https://github.com/pytorchbot

Co-authored-by: Huy Do <huydhn@gmail.com>
2025-10-03 12:30:33 +00:00
6c3c9414eb config for dcache + unit tests (#164512)
Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```

Reviewed By: aorenste

Differential Revision: D83714687

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164512
Approved by: https://github.com/jananisriram
2025-10-03 10:52:59 +00:00
eccf561326 Move call to output generated code in inductor (#161615)
This PR moves the call to copy the generated code from `/tmp/...` so that it is still called if attempting to compile the generated code fails. In both cases now, the generated code will be copied across to `torch_compile_debug/run_.../torchinductor/output_code.py` which makes debugging bad generated code easier.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161615
Approved by: https://github.com/eellison
2025-10-03 10:23:22 +00:00
ddf8de28c2 Add Rocm to Operator Microbenchmark CI (#164173)
This pull request adds support for running operator microbenchmarks on ROCm (AMD GPU) environments in the CI workflow. The main changes involve introducing new build and test jobs for ROCm in the `.github/workflows/operator_microbenchmark.yml` file.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164173
Approved by: https://github.com/huydhn
2025-10-03 07:35:32 +00:00
7617b113ad [torchfuzz] Support EagerVsFullGraphDynamicCompileWithNumericsCheck (#164432)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164432
Approved by: https://github.com/pianpwk
2025-10-03 06:42:20 +00:00
2a760dc51e [DeviceMesh] Simplifying internal bookkeeping with CuTe layout (#163213)
We want to refactor the internal bookkeeping of DeviceMesh so that:
Simplify the bookkeeping logic and make it generic enough that it is easy to support new transformations like flattening noncontiguous dims, reshape, and unflatten (we leveraged the CuTe layout). This new layout also makes non-contiguous slicing, flattening, and transposing possible.

Concretely, in this PR, we do the following:
1. Use the `_MeshLayout` to handle all index operations rather than using a map to record mesh dims.
2. Removed `flatten_name_to_root_dims`, because now we can directly get layout from a flattened device mesh.
3. Replaced `_get_slice_mesh_dims` with `_get_slice_mesh_layout`.
4. Use the newly added function `check_overlap` to check layout overlap.
5. Use a new function `to_remapping_tensor` to use layout ranks as indices when the mesh tensor is not representable as CuTe. The reason is that the layout acts as a backend for mesh tensor bookkeeping (indexing indices), so it needs to be used as indices to remap back to the mesh tensor for new DeviceMesh generation and backend init. For example, in the case of ranks 2K to 4K, the underlying layout is (2K, 1) but the actual values of the mesh tensor are [2K, 2K+1, ...]. While flattening or slicing, we need to remap the layout back to the new mesh tensor so it maps to the actual device allocation. For example, in the 2K-to-4K case, if the shape is (1K, 1K) with dim_names ("dp", "tp"), then when slicing "tp" the mesh tensor should be (2K, 2K+1, ..., 3K-1) or (3K, 3K+1, ..., 4K-1), not the global ranks generated from the layout (1K, 1).

Verified that the loss curve is very close for DeepSeekV3 on torchtitan; note that an exact match is challenging because even if we run the baseline twice, the loss curve does not exactly match.

[loss curve comparison image]

The PR looks big indeed but we don't change any existing behavior of DeviceMesh, so it is a pure refactor.

With this refactoring we also enabled slicing and flattening of non-contiguous dims of a device mesh, which is hard to implement without the CuTe layout.

This is a continuation of https://github.com/pytorch/pytorch/pull/161106 (the original one got messed up with EasyCLA).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163213
Approved by: https://github.com/lw, https://github.com/fegin
2025-10-03 05:51:28 +00:00
6c209bfc5c [cutlass-4][take 2] upgrade to cutlass 4.2.1 (#164159)
Test Plan: Sandcastle

Differential Revision: D83492704

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164159
Approved by: https://github.com/Skylion007, https://github.com/mlazos
2025-10-03 03:47:59 +00:00
1051c1de5c Add pyrefly suppressions 2/n (#164513)
Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283

Test plan:
dmypy restart && python3 scripts/lintrunner.py -a
pyrefly check

---
step 1: uncomment lines in the `pyrefly.toml` file
before: https://gist.github.com/maggiemoss/911b4d0bc88bf8cf3ab91f67184e9d46

after:
```
 INFO Checking project configured at `/Users/maggiemoss/python_projects/pytorch/pyrefly.toml`
 INFO 0 errors (1,152 ignored)
 ```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164513
Approved by: https://github.com/oulgen
2025-10-03 02:46:13 +00:00
d1cbb74fb1 multimem reduce (#164517)
Modified `multimem_one_shot_all_reduce_out` function to accept a `root` argument, making it a `multimem_reduce` op.

The original `multimem_one_shot_all_reduce` op becomes a caller of the `multimem_reduce`, with each rank providing its own rank id as root.
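Schematically, in Python pseudocode (hypothetical wrappers; the real ops are symmetric-memory collectives registered in C++):

```python
def multimem_reduce(tensor, root, group):
    """Reduce `tensor` across `group` into the buffer owned by `root` (hypothetical stub)."""
    ...

def multimem_one_shot_all_reduce(tensor, group):
    # One-shot all-reduce: every rank runs the reduce with itself as the root.
    return multimem_reduce(tensor, root=group.rank(), group=group)
```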

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164517
Approved by: https://github.com/ngimel
2025-10-03 02:41:10 +00:00
91c4db76cb fix flex attention eager: dont round down scores to low-precision (closes #163588) (#163986)
Fixes: https://github.com/pytorch/pytorch/issues/163588

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163986
Approved by: https://github.com/drisspg, https://github.com/mlazos
2025-10-03 01:09:59 +00:00
4691fe6070 remove unnecessary registration (#164481)
scaled_mm already had `needs_exact_strides` in its op registration. also added a test showing these strides are being respected.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164481
Approved by: https://github.com/drisspg, https://github.com/mlazos
2025-10-03 01:03:12 +00:00
ef50c6e3e3 [MPS] Add backward pass for embedding_bag (#163931)
Fixes #162270
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163931
Approved by: https://github.com/malfet
2025-10-03 00:48:38 +00:00
86474ce996 Update mask dtype (#164472)
Differential Revision: [D83781684](https://our.internmc.facebook.com/intern/diff/D83781684)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164472
Approved by: https://github.com/bdhirsh
2025-10-03 00:19:36 +00:00
18e18488e8 [6/N] Apply ruff UP035 rule (#164438)
Continued the code migration to enable ruff UP035. Most changes are about moving `Callable` imports from typing to collections.abc.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164438
Approved by: https://github.com/ezyang
2025-10-03 00:15:32 +00:00
f7082e92b3 [cuBLAS] update cuBLAS determinism docs, remove workspace requirement checks (#161749)
Since CUDA 11.x (need to update the docs for this, current PR is saying 12.2 which is incorrect) we've been allocating cuBLAS workspaces explicitly per handle/stream combination https://github.com/pytorch/pytorch/pull/85447

According to the cuBLAS documentation, this appears to be sufficient for determinism without any explicit workspace requirements to e.g., `:4096:8` or `:16:8` as was previously expressed in PyTorch docs https://docs.nvidia.com/cuda/cublas/#results-reproducibility

Planning to add an explicit determinism test as well...
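For reference, the previously documented workspace requirement and the switch that remains relevant (sketch; behavior depends on the CUDA/cuBLAS version in use):

```python
import os
import torch

# Previously the docs required one of these for deterministic cuBLAS; with
# per-handle/stream workspaces this should no longer be necessary.
os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")  # or ":16:8"

torch.use_deterministic_algorithms(True)  # still the global determinism switch
```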

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161749
Approved by: https://github.com/ngimel
2025-10-03 00:09:47 +00:00
95a053284c Fix vllm build issue (#164361)
Fixes #ISSUE_NUMBER
unstable https://github.com/pytorch/pytorch/issues/164362
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164361
Approved by: https://github.com/huydhn

Co-authored-by: Huy Do <huydhn@gmail.com>
2025-10-02 23:34:21 +00:00
c7e30ae4dd MX: Remove redundant PLATFORM_SUPPORTS_MX_GEMM constant (#164320)
Deleted the duplicate definition of PLATFORM_SUPPORTS_MX_GEMM, which was introduced in https://github.com/pytorch/pytorch/pull/162209.
Also, adjusted BLOCK_SIZE and fp4_scaling_dtype in test_matmul_cuda.py to enable test_blockwise_nvfp4_compile on ROCm.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164320
Approved by: https://github.com/jeffdaily
2025-10-02 23:30:56 +00:00
dca73982c5 Support setting grad_dtype on leaf tensors (#162815)
`grad_dtype` is a new attribute on Tensor to control gradient dtype:
- Access/setting is leaf-only.
- grad_dtype is respected when (1) when assigning to .grad, and (2) in the engine after the previous node produces incoming gradients for AccumulateGrad. (See table below for details)
- Not setting grad_dtype preserves the current behavior. Accessing it returns `t.dtype`
- `grad_dtype` cannot be set when there is already a `.grad` present and the dtypes conflict.

| `grad_dtype` setting | Setting `.grad` manually | Incoming gradient from autograd engine |
|-----------------------|--------------------------|-----------------------------------------|
| **Default (tensor’s dtype)** | `.grad` must match tensor’s dtype | Engine casts incoming grad to tensor’s dtype |
| **Set to specific dtype** | `.grad` must match that dtype | Engine casts incoming grad to the specified dtype |
| **Set to `None`** | `.grad` may be any dtype | Engine does not cast; accepts incoming grad dtype as-is |
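A hedged usage sketch of the behavior summarized in the table (note this change was later reverted; see the revert entry earlier in this log):

```python
import torch

w = torch.randn(4, requires_grad=True)  # leaf tensor
w.grad_dtype = torch.bfloat16           # engine casts incoming grads to bf16
(w ** 2).sum().backward()
assert w.grad.dtype == torch.bfloat16   # per the "set to specific dtype" row
```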

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162815
Approved by: https://github.com/albanD
2025-10-02 23:09:07 +00:00
43848b71d9 Improved support for autotuning in wrapper_fxir (#164132)
Summary:
- correct dtype propagation
- allow more options to be passed to the compiler

Test Plan: in follow up change

Differential Revision: D83367909

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164132
Approved by: https://github.com/jansel
2025-10-02 22:54:22 +00:00
15c8bdcc5e Fix FloorDiv should not generate non integer rationals (due to sympy bug) (#164398)
FloorDiv eval has this optimization:
```
  # Expands (x + y) // b into x // b + y // b.
  # This only works if floor is an identity, i.e. x / b is an integer.
 ```

 Before this PR, this optimization could generate a result like the expression below, due to a bug in sympy:
 ```
Mul(Rational(1, 22), Add(Mul(Integer(24), Symbol('s37', integer=True, positive=True)), Integer(672)), FloorDiv(Mul(Symbol('s14', integer=True, positive=True), Symbol('s46', integer=True, positive=True)), Integer(2016)))
 ```

 This is because in sympy an expression can have .is_integer = True yet still have 1/22 in it!
 This PR ensures we do not generate that by simply opting out of this optimization if we end
 up with a quotient that has such a rational.

  Fix
  https://github.com/pytorch/pytorch/issues/164385,
  https://github.com/pytorch/pytorch/issues/154996
  https://github.com/pytorch/pytorch/issues/153375
  https://github.com/pytorch/pytorch/issues/164063
and internal user issue.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164398
Approved by: https://github.com/jansel, https://github.com/isuruf
2025-10-02 22:51:03 +00:00
376 changed files with 5584 additions and 1761 deletions


@@ -1 +1 @@
e0dda9059d082537cee36be6c5e4fe3b18c880c0
deb42f2a8e48f5032b4a98ee781a15fa87a157cf


@@ -19,8 +19,8 @@ pip_install \
transformers==4.36.2
pip_install coloredlogs packaging
pip_install onnxruntime==1.22.1
pip_install onnxscript==0.4.0
pip_install onnxruntime==1.23.0
pip_install onnxscript==0.5.3
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/


@@ -341,7 +341,7 @@ onnx==1.18.0
#Pinned versions:
#test that import:
onnxscript==0.4.0
onnxscript==0.5.3
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
#Pinned versions:
#test that import:


@@ -34,12 +34,14 @@ fi
# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
fi
fi
echo "Environment variables:"


@@ -1 +1 @@
78a47f87ce259a48f0391fa9ae15add05ea7432b
0ad9951c416d33c5da4f7a504fb162cbe62386f5


@@ -202,7 +202,7 @@ ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=4
ENV NVCC_THREADS=$nvcc_threads
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
ARG USE_SCCACHE
@@ -297,16 +297,28 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Install build and runtime dependencies, this is needed for flashinfer install
COPY requirements/build.txt requirements/build.txt
COPY use_existing_torch.py use_existing_torch.py
RUN python3 use_existing_torch.py
RUN cat requirements/build.txt
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \
python3 -m pip install uv==0.8.4; \
fi
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
# Default mount file as placeholder, this just avoid the mount error
ARG TORCH_WHEELS_PATH="./requirements"
# Install torch, torchaudio and torchvision
@@ -332,13 +344,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install xformers wheel from previous stage
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system /wheels/xformers/*.whl --verbose
# Build flashinfer from source.
ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
# install package for build flashinfer
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
RUN pip install build==1.3.0
RUN pip freeze | grep -E 'setuptools|packaging|build'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}


@@ -1,9 +1,14 @@
import glob
import os
requires_files = glob.glob("requirements/*.txt")
requires_files += ["pyproject.toml"]
for file in requires_files:
if not os.path.exists(file):
print(f"!!! skipping missing {file}")
continue
print(f">>> cleaning {file}")
with open(file) as f:
lines = f.readlines()


@@ -40,6 +40,15 @@ jobs:
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
# See https://github.com/pytorch/pytorch/pull/134215#issuecomment-2332128790
PYI_FILES_TO_ADD=""
for file in ${CHANGED_FILES}; do
if [[ "${file}" == *".pyi.in" ]]; then
PYI_FILES_TO_ADD="${PYI_FILES_TO_ADD} ${file//.in/}"
fi
done
CHANGED_FILES="${CHANGED_FILES}${PYI_FILES_TO_ADD}"
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"


@@ -106,6 +106,16 @@ jobs:
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit


@@ -73,3 +73,28 @@ jobs:
test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit
# ROCM MI300 runner
opmicrobenchmark-build-rocm:
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build-rocm
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
test-matrix: |
{ include: [
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit
opmicrobenchmark-test-rocm:
name: opmicrobenchmark-test-rocm
uses: ./.github/workflows/_rocm-test.yml
needs: opmicrobenchmark-build-rocm
with:
timeout-minutes: 500
build-environment: linux-jammy-rocm-py3_10
docker-image: ${{ needs.opmicrobenchmark-build-rocm.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build-rocm.outputs.test-matrix }}
secrets: inherit


@@ -160,9 +160,10 @@ jobs:
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
]}
secrets: inherit
@ -189,41 +190,6 @@ jobs:
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
secrets: inherit
linux-jammy-rocm-py3_10-build:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
secrets: inherit
inductor-build:
name: inductor-build
uses: ./.github/workflows/_linux-build.yml

View File

@ -42,7 +42,7 @@ jobs:
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
cuda-arch-list: '8.0;8.9;9.0'
cuda-arch-list: '8.0 8.9 9.0'
runner: linux.24xlarge.memory
test-matrix: |
{ include: [

View File

@ -28,6 +28,7 @@ exclude_patterns = [
'torch/lib/**',
'venv/**',
'**/*.pyi',
"tools/experimental/dynamic_shapes/torchfuzz/**",
'tools/test/test_selective_build.py',
]
command = [

View File

@ -50,11 +50,10 @@ RUN git submodule update --init --recursive
FROM conda as conda-installs
ARG PYTHON_VERSION=3.11
ARG CUDA_PATH=cu121
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=whl/nightly
# Automatically set by buildx
RUN /opt/conda/bin/conda update -y -n base -c defaults conda
RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION}
# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=25.7.0
ARG TARGETPLATFORM

View File

@ -40,41 +40,6 @@ namespace {
->conv
->rnn
*/
const std::map<std::string, std::vector<std::string>> _fp32_precisions = {
{"generic", {{"ieee", "tf32", "bf16", "none"}}},
{"mkldnn", {{"ieee", "tf32", "bf16", "none"}}},
{"cuda", {{"ieee", "tf32", "none"}}}};
// Check whether the backend and op are legal
void check_fp32_prec_backend_and_op(
const std::string& backend,
const std::string& op) {
static std::vector<std::string> backends = {"generic", "mkldnn", "cuda"};
static std::vector<std::string> operators = {"conv", "matmul", "rnn", "all"};
TORCH_CHECK(
std::find(backends.begin(), backends.end(), backend) != backends.end(),
"Invalid backend: ",
backend);
TORCH_CHECK(
std::find(operators.begin(), operators.end(), op) != operators.end(),
"Invalid operator: ",
op);
if (backend == "generic") {
TORCH_CHECK(op == "all", "Invalid operation for generic backend: ", op);
}
}
// Return whether the precision is supported by backends
bool validate_fp32_prec(
const std::string& backend,
const std::string& precision) {
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
auto precisions = iterp->second;
bool valid = std::find(precisions.begin(), precisions.end(), precision) !=
precisions.end();
return valid;
}
C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){
TORCH_WARN_ONCE(
@ -86,6 +51,54 @@ void check_fp32_prec_backend_and_op(
}
} // namespace
Float32Backend str2backend(const std::string& name) {
if (name == "generic")
return Float32Backend::GENERIC;
else if (name == "cuda")
return Float32Backend::CUDA;
else if (name == "mkldnn")
return Float32Backend::MKLDNN;
TORCH_CHECK(false, "Unknown backend: ", name);
}
Float32Op str2op(const std::string& name) {
if (name == "all")
return Float32Op::ALL;
else if (name == "conv")
return Float32Op::CONV;
else if (name == "rnn")
return Float32Op::RNN;
else if (name == "matmul")
return Float32Op::MATMUL;
TORCH_CHECK(false, "Unknown op: ", name);
}
Float32Precision str2precision(const std::string& name) {
if (name == "none")
return Float32Precision::NONE;
else if (name == "ieee")
return Float32Precision::IEEE;
else if (name == "tf32")
return Float32Precision::TF32;
else if (name == "bf16")
return Float32Precision::BF16;
TORCH_CHECK(false, "Unknown precision: ", name);
}
std::string precision2str(Float32Precision prec) {
switch (prec) {
case Float32Precision::NONE:
return "none";
case Float32Precision::IEEE:
return "ieee";
case Float32Precision::TF32:
return "tf32";
case Float32Precision::BF16:
return "bf16";
}
TORCH_CHECK(false, "Invalid enum Float32Precision(", static_cast<int>(prec), ")");
}
Context::Context() = default;
// TODO: This could be bad juju if someone calls globalContext() in the
@ -179,10 +192,10 @@ void Context::setUserEnabledNNPACK(bool e) {
enabled_nnpack = e;
}
bool Context::allowTF32CuDNN(const std::string& op) const {
if (op.empty()){
bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
bool Context::allowTF32CuDNN(std::optional<Float32Op> op) const {
if (!op.has_value()) {
bool allow_tf32_rnn = float32Precision(Float32Backend::CUDA, Float32Op::RNN) == Float32Precision::TF32;
bool allow_tf32_conv = float32Precision(Float32Backend::CUDA, Float32Op::CONV) == Float32Precision::TF32;
TORCH_CHECK(
allow_tf32_rnn == allow_tf32_conv && allow_tf32_rnn == allow_tf32_cudnn,
"PyTorch is checking whether allow_tf32 is enabled for cuDNN without a specific operator name,",
@ -191,15 +204,15 @@ bool Context::allowTF32CuDNN(const std::string& op) const {
"We suggest only using the new API to set the TF32 flag(s). See also: ",
"https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
} else {
return float32Precision("cuda", op) == "tf32";
return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32;
}
warn_deprecated_fp32_precision_api();
return allow_tf32_cudnn;
}
void Context::setAllowTF32CuDNN(bool b) {
setFloat32Precision("cuda", "rnn", b ? "tf32" : "none");
setFloat32Precision("cuda", "conv", b ? "tf32" : "none");
setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE);
setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? Float32Precision::TF32 : Float32Precision::NONE);
allow_tf32_cudnn = b;
warn_deprecated_fp32_precision_api();
}
@ -279,42 +292,6 @@ bool Context::userEnabledOverrideableSDP() const {
return enabled_overrideable;
}
static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
// is set to deterministic setting
if (hasCUDART()) {
const auto workspace_config = c10::utils::get_env(cublas_config_var_name);
return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]);
}
return true;
}
void Context::alertCuBLASConfigNotDeterministic() const {
static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic();
if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) {
return;
}
auto msg = c10::str(
"Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ",
"`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ",
"it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ",
"case, you must set an environment variable before running your PyTorch application: ",
cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ",
cublas_config_var_name, "=", cublas_deterministic_configs[1], ". For more information, go to ",
"https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility"
);
if (deterministicAlgorithmsWarnOnly()) {
TORCH_WARN(msg);
} else {
TORCH_CHECK(false, msg);
}
}
bool Context::benchmarkCuDNN() const {
return benchmark_cudnn;
}
@ -341,7 +318,7 @@ void Context::setImmediateMiopen(bool b) {
bool Context::allowTF32CuBLAS() const {
bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
bool allow_tf32_new = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32;
TORCH_CHECK(
legacy_allow_tf32 == allow_tf32_new,
"PyTorch is checking whether allow_tf32_new is enabled for cuBlas matmul,",
@ -354,17 +331,17 @@ bool Context::allowTF32CuBLAS() const {
void Context::setAllowTF32CuBLAS(bool b) {
float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, b ? Float32Precision::TF32 : Float32Precision::IEEE);
}
Float32MatmulPrecision Context::float32MatmulPrecision() const {
bool invalid = float32Precision("cuda", "matmul") == "tf32" &&
bool invalid = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32 &&
float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST;
invalid = invalid ||
(float32Precision("mkldnn", "matmul") == "bf16" &&
(float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::BF16 &&
float32_matmul_precision != at::Float32MatmulPrecision::MEDIUM);
invalid = invalid ||
(float32Precision("mkldnn", "matmul") == "tf32" &&
(float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::TF32 &&
float32_matmul_precision != at::Float32MatmulPrecision::HIGH);
TORCH_CHECK(
!invalid,
@ -376,15 +353,26 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const {
return float32_matmul_precision;
}
std::string Context::float32Precision(const std::string& backend, const std::string& op) const {
check_fp32_prec_backend_and_op(backend, op);
auto precision = fp32_precision.find(backend)->second.find(op)->second;
if (precision == "none")
precision = fp32_precision.find(backend)->second.find("all")->second;
if (precision == "none")
precision = fp32_precision.find("generic")->second.find("all")->second;
bool valid_prec = validate_fp32_prec(backend, precision);
return valid_prec ? precision : "none";
Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op) const {
std::pair<Float32Backend, Float32Op> key{backend, op};
auto it = fp32_precision.find(key);
TORCH_CHECK(it != fp32_precision.end(), "Invalid (backend, op) pair: (", backend, ", ", op, ")");
Float32Precision precision = it->second;
if (precision == Float32Precision::NONE) {
key.second = Float32Op::ALL;
precision = fp32_precision.find(key)->second;
}
if (precision == Float32Precision::NONE) {
key.first = Float32Backend::GENERIC;
precision = fp32_precision.find(key)->second;
}
// "cuda" does not support "bf16"
if (backend == Float32Backend::CUDA && precision == Float32Precision::BF16) {
return Float32Precision::NONE;
}
return precision;
}
void Context::setFloat32MatmulPrecision(const std::string &s) {
@ -393,18 +381,18 @@ void Context::setFloat32MatmulPrecision(const std::string &s) {
// TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention
if (s_ == "highest") {
float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST;
setFloat32Precision("cuda", "matmul", "ieee");
setFloat32Precision("mkldnn", "matmul", "ieee");
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::IEEE);
setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::IEEE);
return true;
} else if (s_ == "high") {
float32_matmul_precision = at::Float32MatmulPrecision::HIGH;
setFloat32Precision("cuda", "matmul", "tf32");
setFloat32Precision("mkldnn", "matmul", "tf32");
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32);
setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::TF32);
return true;
} else if (s_ == "medium") {
float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM;
setFloat32Precision("cuda", "matmul", "tf32");
setFloat32Precision("mkldnn", "matmul", "bf16");
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32);
setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::BF16);
return true;
}
return false;
@ -418,25 +406,16 @@ void Context::setFloat32MatmulPrecision(const std::string &s) {
"setFloat32MatmulPrecision call has no effect.");
}
void Context::setFloat32Precision(const std::string& backend, const std::string& op, const std::string& p) {
check_fp32_prec_backend_and_op(backend, op);
if (validate_fp32_prec(backend, p)) {
fp32_precision[backend][op] = p;
} else {
std::string msg;
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
for (const auto& p : iterp->second) {
msg += p;
msg += " ";
}
TORCH_WARN(
"you have set wrong precision for backend:",
backend,
" setFloat32Precision call has no effect.",
"Please choose precision from: ",
msg);
}
void Context::setFloat32Precision(Float32Backend backend, Float32Op op, Float32Precision p) {
auto it = fp32_precision.find(std::make_pair(backend, op));
TORCH_CHECK(
it != fp32_precision.end(),
"Invalid (backend, op) pair: (", backend, ", ", op, ")");
TORCH_CHECK(
!(backend == Float32Backend::CUDA && p == Float32Precision::BF16),
"backend 'cuda' does not support precision 'bf16'");
it->second = p;
}
at::LinalgBackend Context::linalgPreferredBackend() const {

View File
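The Context.cpp hunks above replace the string-keyed fp32 precision table with Float32Backend/Float32Op/Float32Precision enums, and resolve a precision by falling back from the op-specific entry to the backend-wide ALL entry and finally to the GENERIC default (with CUDA+BF16 mapped back to NONE). Below is a minimal standalone sketch of that fallback order, using a plain std::map instead of PyTorch's unordered_map with c10::hash; the Backend/Op/Precision names and resolve() are illustrative, not the PyTorch API.

#include <iostream>
#include <map>
#include <utility>

enum class Backend { GENERIC, CUDA, MKLDNN };
enum class Op { ALL, CONV, RNN, MATMUL };
enum class Precision { NONE, IEEE, TF32, BF16 };

using Key = std::pair<Backend, Op>;

// Fallback order mirrored from the diff: op-specific entry, then the
// backend-wide ALL entry, then the GENERIC/ALL default.
Precision resolve(const std::map<Key, Precision>& table, Backend b, Op op) {
  Precision prec = table.at({b, op});
  if (prec == Precision::NONE) prec = table.at({b, Op::ALL});
  if (prec == Precision::NONE) prec = table.at({Backend::GENERIC, Op::ALL});
  // The real code additionally maps (CUDA, BF16) back to NONE; omitted here.
  return prec;
}

int main() {
  const std::map<Key, Precision> table = {
      {{Backend::GENERIC, Op::ALL}, Precision::NONE},
      {{Backend::CUDA, Op::ALL}, Precision::NONE},
      {{Backend::CUDA, Op::CONV}, Precision::NONE},
      {{Backend::CUDA, Op::MATMUL}, Precision::TF32},
  };
  // MATMUL has its own entry; CONV falls back to CUDA/ALL and then GENERIC/ALL.
  std::cout << (resolve(table, Backend::CUDA, Op::MATMUL) == Precision::TF32) << '\n';  // 1
  std::cout << (resolve(table, Backend::CUDA, Op::CONV) == Precision::NONE) << '\n';    // 1
}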

@ -25,17 +25,27 @@
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
#include <c10/util/hash.h>
#include <c10/util/irange.h>
#include <cstdint>
#include <map>
#include <mutex>
#include <unordered_map>
namespace at {
class Tensor;
enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };
enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN };
enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL };
enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 };
TORCH_API Float32Backend str2backend(const std::string& name);
TORCH_API Float32Op str2op(const std::string& name);
TORCH_API Float32Precision str2precision(const std::string& name);
TORCH_API std::string precision2str(Float32Precision prec);
class TORCH_API Context {
public:
@ -310,13 +320,7 @@ class TORCH_API Context {
//
// * Throw an error when `Context::deterministicAlgorithms()` is true. Most
// of the time, this should be accomplished by calling
// `at::globalContext().alertNotDeterminstic()`. However, if the
// nondeterministic behavior is caused by the CuBLAS workspace
// configuration in CUDA >= 10.2,
// `at::globalContext().alertCuBLASConfigNotDeterministic()` should be
// called instead (in this case, a comment explaining why the operation is
// nondeterministic is not necessary). See below for details on these
// methods.
// `at::globalContext().alertNotDeterminstic().
//
// * Have an entry in the list of nondeterministic PyTorch operations in the
// docstring of `use_deterministic_algorithms()` in torch/__init__.py
@ -340,27 +344,19 @@ class TORCH_API Context {
// Throws an error if `Context::deterministicAlgorithms()` is true
static void alertNotDeterministic(std::string_view const& caller);
// Throws an error if `Context::deterministicAlgorithms()` is true, CUDA
// >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or
// ":4096:8". For more details:
// https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
void alertCuBLASConfigNotDeterministic() const;
void setFloat32MatmulPrecision(const std::string& s);
void setFloat32Precision(
const std::string& backend,
const std::string& op,
const std::string& s);
bool allowTF32CuDNN(const std::string& op = std::string()) const;
Float32Backend backend,
Float32Op op,
Float32Precision p);
bool allowTF32CuDNN(std::optional<Float32Op> op = std::nullopt) const;
void setAllowTF32CuDNN(bool);
bool allowTF32OneDNN() const;
void setAllowTF32OneDNN(bool);
bool allowTF32CuBLAS() const;
void setAllowTF32CuBLAS(bool);
Float32MatmulPrecision float32MatmulPrecision() const;
std::string float32Precision(
const std::string& backend,
const std::string& op) const;
Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
bool allowFP16ReductionCuBLAS() const;
void setAllowFP16ReductionCuBLAS(bool);
bool allowBF16ReductionCuBLAS() const;
@ -429,7 +425,6 @@ class TORCH_API Context {
}
private:
static bool checkCuBLASConfigDeterministic();
std::array<c10::once_flag, at::COMPILE_TIME_MAX_DEVICE_TYPES> init_;
bool enabled_cudnn = true;
bool deterministic_cudnn = false;
@ -488,21 +483,20 @@ class TORCH_API Context {
bool enable_sparse_tensor_invariant_checks = false;
bool allow_fp16_reduction_cpu = false;
std::map<std::string, std::map<std::string, std::string>> fp32_precision = {
{"generic", {{"all", "none"}}},
{"mkldnn",
{{"matmul", "none"},
{"conv", "none"},
{"rnn", "none"},
{"all", "none"}}},
{"cuda",
{{"matmul",
float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST
? "none"
: "tf32"},
{"conv", "tf32"},
{"rnn", "tf32"},
{"all", "none"}}},
using Key = std::pair<Float32Backend, Float32Op>;
std::unordered_map<Key, Float32Precision, c10::hash<Key>> fp32_precision = {
{{Float32Backend::GENERIC, Float32Op::ALL}, Float32Precision::NONE},
{{Float32Backend::MKLDNN, Float32Op::ALL}, Float32Precision::NONE},
{{Float32Backend::MKLDNN, Float32Op::CONV}, Float32Precision::NONE},
{{Float32Backend::MKLDNN, Float32Op::RNN}, Float32Precision::NONE},
{{Float32Backend::MKLDNN, Float32Op::MATMUL}, Float32Precision::NONE},
{{Float32Backend::CUDA, Float32Op::ALL}, Float32Precision::NONE},
{{Float32Backend::CUDA, Float32Op::CONV}, Float32Precision::TF32},
{{Float32Backend::CUDA, Float32Op::RNN}, Float32Precision::TF32},
{{Float32Backend::CUDA, Float32Op::MATMUL},
float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST
? Float32Precision::NONE
: Float32Precision::TF32},
};
Allocator* prev_allocator_ptr_{nullptr};
@ -684,5 +678,4 @@ struct TORCH_API ROCmBackwardPassGuard {
~ROCmBackwardPassGuard();
static bool is_backward_pass();
};
} // namespace at

View File
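The Context.h hunk above declares the new table as an std::unordered_map keyed by std::pair<Float32Backend, Float32Op> with c10::hash<Key>, because std::unordered_map provides no default hash for pairs. A small sketch of the same idea follows, with a hand-rolled boost-style pair hash (not c10's implementation) and int stand-ins for the enums.

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

struct PairHash {
  template <typename A, typename B>
  std::size_t operator()(const std::pair<A, B>& p) const {
    std::size_t h = std::hash<A>{}(p.first);
    // Boost-style combine of the second element's hash.
    h ^= std::hash<B>{}(p.second) + 0x9e3779b9 + (h << 6) + (h >> 2);
    return h;
  }
};

int main() {
  using Key = std::pair<int, int>;  // stand-in for (Float32Backend, Float32Op)
  std::unordered_map<Key, std::string, PairHash> table = {
      {{0, 0}, "none"},
      {{1, 3}, "tf32"},
  };
  std::cout << table[{1, 3}] << '\n';  // tf32
}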

@ -179,7 +179,7 @@ void propagate_names_except(const Tensor& result, const Tensor& src, IntArrayRef
return;
}
const auto src_names = src.names();
const auto result_dim = static_cast<int64_t>(result.dim());
const auto result_dim = result.dim();
const auto src_dim = static_cast<int64_t>(src_names.size());
const auto excluded_dim = static_cast<int64_t>(excluded_idxs.size());
TORCH_INTERNAL_ASSERT(src_dim - excluded_dim == result_dim);

View File

@ -273,11 +273,11 @@ void checkLayout(CheckedFrom c, at::ArrayRef<Tensor> tensors, at::Layout layout)
}
void * maybe_data_ptr(const Tensor& tensor) {
return tensor.defined() ? (void *)tensor.data_ptr() : nullptr;
return tensor.defined() ? tensor.data_ptr() : nullptr;
}
void * maybe_data_ptr(const TensorArg& tensor) {
return tensor->defined() ? (void *)tensor->data_ptr() : nullptr;
return tensor->defined() ? tensor->data_ptr() : nullptr;
}
void check_dim_size(

View File

@ -50,6 +50,46 @@ namespace {
constexpr size_t MAX_SIZE_INDEX = 64;
}
// A large reserved pinned memory segment that is created in advance which is used
// to allocate small pinned memory requests to avoid calling into expensive APIs.
// We never free this memory and move up the pointer as we allocate new blocks
// and when blocks are freed, they are cached in the free lists.
struct PinnedReserveSegment {
PinnedReserveSegment(void *start, size_t size) : start_(start), size_(size),
current_ptr_(start_), initialized_(true) {}
PinnedReserveSegment() : start_(nullptr), size_(0), current_ptr_(nullptr), initialized_(false) {}
bool initialized() {
return initialized_;
}
void* allocate(size_t bytes) {
std::lock_guard<std::mutex> guard(mutex_);
// Round up the requested size to 4KB boundary for all including the small ones.
size_t rounded_bytes = (bytes + 4096 - 1) & ~(4096 - 1);
if (((uint8_t*)current_ptr_ + rounded_bytes) > ((uint8_t*)start_ + size_)) {
return nullptr;
}
void* ptr = current_ptr_;
current_ptr_ = (uint8_t*)current_ptr_ + rounded_bytes;
return ptr;
}
bool owns(void* ptr) {
return ptr >= start_ && ptr < (uint8_t*)start_ + size_;
}
std::mutex mutex_;
void* start_;
size_t size_;
void* current_ptr_;
bool initialized_;
};
// Struct containing memory allocator summary statistics for host.
struct TORCH_API HostStats {
// COUNT: total allocations (active)
@ -203,17 +243,6 @@ struct CachingHostAllocatorImpl {
// background.
if (!pinned_use_background_threads()) {
process_events();
} else {
// Launch the background thread and process events in a loop.
static bool background_thread_flag [[maybe_unused]] = [this] {
getBackgroundThreadPool()->run([&]() {
while (active_) {
process_events();
std::this_thread::sleep_for(std::chrono::microseconds(100));
}
});
return true;
}();
}
// Round up the allocation to the nearest power of two to improve reuse.
@ -226,6 +255,21 @@ struct CachingHostAllocatorImpl {
return {block->ptr_, reinterpret_cast<void*>(block)};
}
// Check in the recently freed blocks with pending events to see if we
// can reuse them. Call get_free_block again after processing events
if (pinned_use_background_threads()) {
// Launch the background thread and process events in a loop.
static bool background_thread_flag [[maybe_unused]] = [this] {
getBackgroundThreadPool()->run([&]() {
while (active_) {
process_events();
std::this_thread::sleep_for(std::chrono::microseconds(100));
}
});
return true;
}();
}
// Slow path: if we can't allocate from the cached free list, we need
// to create a new block.
void* ptr = nullptr;

View File
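The PinnedReserveSegment added above is a bump allocator over one large pre-pinned buffer: each request is rounded up to a 4 KiB boundary, a cursor advances, nothing is ever freed back to the segment, and a nullptr return tells the caller to fall back to the regular (slow) allocation path. Below is a toy sketch of that behavior over a plain malloc'd buffer; there is no pinning, the mutex from the real struct is omitted, and BumpSegment is an illustrative name.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

struct BumpSegment {
  std::uint8_t* start;
  std::size_t size;
  std::size_t used = 0;

  // Round each request up to a 4 KiB boundary, as the diff does, and hand out
  // memory by advancing a cursor; nothing is ever returned to the segment.
  void* allocate(std::size_t bytes) {
    std::size_t rounded = (bytes + 4096 - 1) & ~std::size_t(4096 - 1);
    if (used + rounded > size) return nullptr;  // exhausted: caller uses the slow path
    void* p = start + used;
    used += rounded;
    return p;
  }

  bool owns(const void* p) const {
    auto q = static_cast<const std::uint8_t*>(p);
    return q >= start && q < start + size;
  }
};

int main() {
  std::size_t cap = 16 * 1024;
  BumpSegment seg{static_cast<std::uint8_t*>(std::malloc(cap)), cap};
  void* a = seg.allocate(100);        // rounds up to 4 KiB
  void* b = seg.allocate(10 * 1024);  // rounds up to 12 KiB, exhausting the segment
  std::printf("owns(a)=%d owns(b)=%d next=%p\n",
              seg.owns(a), seg.owns(b), seg.allocate(1));  // next is null
  std::free(seg.start);
}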

@ -76,13 +76,7 @@ void _print_dispatch_trace(const std::string& label, const std::string& op_name,
OpRegistrationListener::~OpRegistrationListener()= default;
Dispatcher::Dispatcher()
: operators_()
, operatorLookupTable_()
, backendFallbackKernels_()
, listeners_(std::make_unique<detail::RegistrationListenerList>())
, cond_var_()
, guard_(std::make_shared<Guard>())
Dispatcher::Dispatcher(): backendFallbackKernels_(), listeners_(std::make_unique<detail::RegistrationListenerList>()), guard_(std::make_shared<Guard>())
{}
Dispatcher::~Dispatcher() {

View File

@ -62,17 +62,7 @@ static const auto& getDispatchTableIndexToKey() {
}
OperatorEntry::OperatorEntry(OperatorName&& operator_name)
: name_(std::move(operator_name))
, schema_()
#ifndef C10_MOBILE
, tags_()
#endif
, dispatchTable_()
, dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized())
, kernels_()
, cpp_signature_()
, sym_cpp_signature_()
, is_observed_(ObservedOperators::isObserved(name_))
: name_(std::move(operator_name)), dispatchTable_(), dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()), is_observed_(ObservedOperators::isObserved(name_))
{
// Pick up any backend fallbacks that were registered prior to this
// OperatorEntry being created.

View File

@ -73,7 +73,7 @@ c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_(
std::optional<FunctionSchema> inferred_schema = std::nullopt;
for (const auto& kernel : options.kernels) {
if (nullptr != kernel.inferred_function_schema.get()) {
if (nullptr != kernel.inferred_function_schema) {
if (!inferred_schema.has_value()) {
inferred_schema = *kernel.inferred_function_schema;
break;

View File

@ -905,7 +905,7 @@ class Vectorized8 : public Vectorizedi {
// Because loadu(const void* ptr, T count) requires zero initialization for
// upper 128 bits. However, by using _mm256_castsi128_si256, the upper 128
// bits of the result are undefined.
// TODO<leslie> We can use _mm256_zextsi128_si256 in the furture,
// TODO<leslie> We can use _mm256_zextsi128_si256 in the future,
// since gcc 9.3 doesn't support it now.
__m128i input_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptr));
return _mm256_castsi128_si256(input_128);
@ -1844,7 +1844,7 @@ Vectorized<int16_t> inline shift_256_16(
c0 = _mm256_srav_epi32(a0, b0);
c0 = _mm256_shuffle_epi8(c0, ctl_1_0);
// Peform shifting the same way for input array elements with
// Perform shifting the same way for input array elements with
// idx%2==1.
__m256i a1 = _mm256_and_si256(a, keep_1);
__m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
@ -2180,7 +2180,7 @@ Vectorized<T> inline shift_256_8(
c0 = _mm256_srlv_epi32(a0, b0);
c0 = _mm256_shuffle_epi8(c0, ctl_3_0);
// Peform shifting the same way for input array elements with
// Perform shifting the same way for input array elements with
// idx%4==1.
__m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3);
__m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
@ -2193,7 +2193,7 @@ Vectorized<T> inline shift_256_8(
c1 = _mm256_srlv_epi32(a1, b1);
c1 = _mm256_shuffle_epi8(c1, ctl_3_1);
// Peform shifting the same way for input array elements with
// Perform shifting the same way for input array elements with
// idx%4==2.
__m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3);
__m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0);
@ -2206,7 +2206,7 @@ Vectorized<T> inline shift_256_8(
c2 = _mm256_srlv_epi32(a2, b2);
c2 = _mm256_shuffle_epi8(c2, ctl_3_2);
// Peform shifting the same way for input array elements with
// Perform shifting the same way for input array elements with
// idx%4==3.
__m256i a3 = _mm256_and_si256(a, keep_3);
__m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0);

View File

@ -1088,7 +1088,7 @@ class Vectorized8 : public Vectorizedi {
// Because loadu(const void* ptr, T count) requires zero initialization for
// upper 384 bits. However, by using _mm512_castsi128_si512, the upper 384
// bits of the result are undefined.
// TODO<leslie> We can use _mm512_zextsi128_si512 in the furture,
// TODO<leslie> We can use _mm512_zextsi128_si512 in the future,
// since gcc 9.3 doesn't support it now.
__m128i input_128 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
return _mm512_castsi128_si512(input_128);
@ -2022,7 +2022,7 @@ Vectorized<T> inline shift_512_8(
c0 = _mm512_srlv_epi16(a0, b0);
c0 = _mm512_shuffle_epi8(c0, ctl_1_0);
// Peform shifting the same way for input array elements with
// Perform shifting the same way for input array elements with
// idx%2==1.
__m512i a1 = _mm512_and_si512(a, keep_1);
__m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0);

View File

@ -323,7 +323,7 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor<
descriptor_.reset(raw_descriptor);
}
template <typename T>
inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) {
void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) {
// NOLINTNEXTLINE(bugprone-sizeof-expression)
TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(value)));
}
@ -345,7 +345,7 @@ class CuBlasLtMatrixLayout : public CuBlasLtDescriptor<
descriptor_.reset(raw_descriptor);
}
template <typename T>
inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) {
void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) {
TORCH_CUDABLAS_CHECK(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T)));
}
};
@ -360,7 +360,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
descriptor_.reset(raw_descriptor);
}
template <typename T>
inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) {
void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) {
TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T)));
}
};
@ -395,7 +395,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_R_64F;
} else if constexpr (std::is_same_v<Dtype, float>) {
if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") {
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
}
} else if constexpr (std::is_same_v<Dtype, c10::complex<double>>) {
@ -440,7 +440,6 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
}
globalContext().alertCuBLASConfigNotDeterministic();
cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -574,8 +573,6 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D
template <>
void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -587,8 +584,6 @@ void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
template <>
void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -600,8 +595,6 @@ void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
template <>
void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -615,8 +608,6 @@ void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::co
template <>
void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<float>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -630,8 +621,6 @@ void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::com
template <typename C_Dtype>
inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -703,8 +692,6 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
template <typename C_Dtype>
inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
BGEMM_CHECK_ARGVALUES(at::BFloat16);
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
@ -1028,8 +1015,6 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty
template <>
void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1041,8 +1026,6 @@ void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
template <>
void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1054,8 +1037,6 @@ void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
template <>
void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1069,8 +1050,6 @@ void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::comp
template <>
void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1084,8 +1063,6 @@ void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::compl
template <typename C_Dtype>
inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1194,7 +1171,6 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
template <typename C_Dtype>
inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1583,7 +1559,7 @@ bool gemm_and_bias(
computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_R_64F;
} else if constexpr (std::is_same_v<Dtype, float>) {
if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") {
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
}
} else if constexpr (std::is_same_v<Dtype, at::Half>) {
@ -2408,8 +2384,6 @@ void trsmBatched<c10::complex<double>>(
template <>
void gemv<c10::complex<double>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2425,8 +2399,6 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2439,8 +2411,6 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
template <>
void gemv<double>(CUDABLAS_GEMV_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2454,8 +2424,6 @@ void gemv<float>(CUDABLAS_GEMV_ARGTYPES(float)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);

View File

@ -109,7 +109,7 @@ void CUDAGeneratorState::increase(uint64_t increment) {
offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4.");
// Ensures the increment does not cause overflow.
TORCH_INTERNAL_ASSERT(
offset_intragraph_ <= std::numeric_limits<uint32_t>::max() - increment,
offset_intragraph_ <= std::numeric_limits<uint64_t>::max() - increment,
"Increment causes overflow in the offset value.");
offset_intragraph_ += increment;
} else {
@ -461,7 +461,7 @@ void CUDAGeneratorImpl::unregister_graph(cuda::CUDAGraph* graph) {
*/
PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) {
if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) {
uint32_t offset = state_->offset_intragraph_;
uint64_t offset = state_->offset_intragraph_;
state_->increase(increment);
return PhiloxCudaState(
state_->seed_extragraph_.data_ptr<int64_t>(),

View File

@ -96,7 +96,7 @@ struct CUDAGraph;
struct CUDAGeneratorState : public c10::intrusive_ptr_target {
uint64_t seed_;
uint64_t philox_offset_per_thread_;
uint32_t offset_intragraph_;
uint64_t offset_intragraph_;
bool capturing_{};
std::unordered_set<cuda::CUDAGraph*> registered_graphs_;
at::TensorBase seed_extragraph_{};
@ -105,7 +105,7 @@ struct CUDAGeneratorState : public c10::intrusive_ptr_target {
CUDAGeneratorState(
uint64_t seed = default_rng_seed_val,
uint64_t philox_offset_per_thread = 0,
uint32_t offset_intragraph = 0)
uint64_t offset_intragraph = 0)
: seed_(seed),
philox_offset_per_thread_(philox_offset_per_thread),
offset_intragraph_(offset_intragraph) {}

View File
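The two hunks above widen offset_intragraph_ from uint32_t to uint64_t and keep the pre-addition overflow guard, which checks value <= max - increment before adding so the addition itself can never wrap. A standalone sketch of that guard is shown below; checked_add is an illustrative helper, not the CUDAGeneratorState API.

#include <cstdint>
#include <iostream>
#include <limits>
#include <stdexcept>

// Reject increments that would wrap, without ever performing the wrapping add.
std::uint64_t checked_add(std::uint64_t value, std::uint64_t increment) {
  if (value > std::numeric_limits<std::uint64_t>::max() - increment) {
    throw std::overflow_error("increment causes overflow in the offset value");
  }
  return value + increment;
}

int main() {
  std::uint64_t offset = 0;
  offset = checked_add(offset, 4);  // offsets advance in multiples of 4
  std::cout << offset << '\n';      // 4
  try {
    checked_add(std::numeric_limits<std::uint64_t>::max(), 1);
  } catch (const std::overflow_error& e) {
    std::cout << "caught: " << e.what() << '\n';
  }
}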

@ -6,43 +6,15 @@
#define HIPSPARSE_VERSION ((hipsparseVersionMajor*100000) + (hipsparseVersionMinor*100) + hipsparseVersionPatch)
#endif
// cuSparse Generic API added in CUDA 10.1
// Windows support added in CUDA 11.0
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && ((CUSPARSE_VERSION >= 10300) || (CUSPARSE_VERSION >= 11000 && defined(_WIN32)))
#define AT_USE_CUSPARSE_GENERIC_API() 1
#else
#define AT_USE_CUSPARSE_GENERIC_API() 0
#endif
// cuSparse Generic API descriptor pointers were changed to const in CUDA 12.0
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \
(CUSPARSE_VERSION < 12000)
#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 1
#else
#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 0
#endif
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \
(CUSPARSE_VERSION >= 12000)
#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 1
#else
#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 0
#endif
#if defined(USE_ROCM)
// hipSparse const API added in v2.4.0
#if HIPSPARSE_VERSION >= 200400
#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 1
#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0
#define AT_USE_HIPSPARSE_GENERIC_API() 1
#else
#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0
#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 1
#define AT_USE_HIPSPARSE_GENERIC_API() 1
#endif
#else // USE_ROCM
#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0
#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0
#define AT_USE_HIPSPARSE_GENERIC_API() 0
#endif // USE_ROCM

View File

@ -12,8 +12,6 @@ cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr) {
return cusparseDestroyDnMat(const_cast<cusparseDnMatDescr*>(dnMatDescr));
}
#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
namespace {
// If a specific GPU model does not provide native support for a given data
@ -210,6 +208,4 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6
descriptor_.reset(raw_descriptor);
}
#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
} // namespace at::cuda::sparse

View File

@ -35,7 +35,6 @@ class CuSparseDescriptor {
std::unique_ptr<T, CuSparseDescriptorDeleter<T, destructor>> descriptor_;
};
#if AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
template <typename T, cusparseStatus_t (*destructor)(const T*)>
struct ConstCuSparseDescriptorDeleter {
void operator()(T* x) {
@ -58,7 +57,6 @@ class ConstCuSparseDescriptor {
protected:
std::unique_ptr<T, ConstCuSparseDescriptorDeleter<T, destructor>> descriptor_;
};
#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS || AT_USE_HIPSPARSE_CONST_DESCRIPTORS
#if defined(USE_ROCM)
using cusparseMatDescr = std::remove_pointer_t<hipsparseMatDescr_t>;
@ -123,39 +121,8 @@ class TORCH_CUDA_CPP_API CuSparseBsrsm2Info
#endif // AT_USE_HIPSPARSE_TRIANGULAR_SOLVE
#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);
#if AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS()
class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
: public CuSparseDescriptor<cusparseDnMatDescr, &cusparseDestroyDnMat> {
public:
explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1);
};
class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor
: public CuSparseDescriptor<const cusparseDnMatDescr, &destroyConstDnMat> {
public:
explicit CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1);
cusparseDnMatDescr* unsafe_mutable_descriptor() const {
return const_cast<cusparseDnMatDescr*>(descriptor());
}
cusparseDnMatDescr* unsafe_mutable_descriptor() {
return const_cast<cusparseDnMatDescr*>(descriptor());
}
};
class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor
: public CuSparseDescriptor<cusparseDnVecDescr, &cusparseDestroyDnVec> {
public:
explicit CuSparseDnVecDescriptor(const Tensor& input);
};
class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor
: public CuSparseDescriptor<cusparseSpMatDescr, &cusparseDestroySpMat> {};
#elif AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
: public ConstCuSparseDescriptor<
cusparseDnMatDescr,
@ -194,7 +161,6 @@ class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor
: public ConstCuSparseDescriptor<
cusparseSpMatDescr,
&cusparseDestroySpMat> {};
#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor
: public CuSparseSpMatDescriptor {
@ -283,6 +249,4 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
}
};
#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
} // namespace at::cuda::sparse

View File

@ -9,7 +9,6 @@
#include <cuda_runtime_api.h>
#include <future>
#include <unordered_map>
namespace at::cuda {
namespace {
@ -72,9 +71,20 @@ using Block = HostBlock<CUDAStream>;
struct CUDACachingHostAllocatorImpl
: public CachingHostAllocatorImpl<CUDAStream, EventPool::Event> {
private:
std::unordered_map<void*, bool> use_host_register;
ska::flat_hash_map<void*, bool> use_host_register;
void allocate_host_memory(size_t size, void** ptr) override {
// try allocating from reserve segment first before calling into expensive APIs
if (get_reserve_segment().initialized()) {
*ptr = get_reserve_segment().allocate(size);
if (*ptr != nullptr) {
return;
}
}
allocate_host_memory_slowpath(size, ptr);
}
void allocate_host_memory_slowpath(size_t size, void** ptr) {
// Pinned memory pointers allocated by any device can be directly used by
// any other device, regardless of the current device at the time of
// allocation, since we assume unified addressing. So we grab any existing
@ -113,6 +123,18 @@ struct CUDACachingHostAllocatorImpl
}
void free_block(Block* block) override {
// We never free blocks from the reserve segment
if (get_reserve_segment().initialized()) {
// Check if the block is from the reserve segment
if (get_reserve_segment().owns(block->ptr_)) {
return;
}
}
free_block_slowpath(block);
}
void free_block_slowpath(Block* block) {
auto start = std::chrono::steady_clock::now();
// Users may change the allocator config at will. torch unit tests do this.
// However, allocations using cudaHostRegister should use corresponding
@ -172,6 +194,20 @@ struct CUDACachingHostAllocatorImpl
return event_pool->get(idx);
}
PinnedReserveSegment& get_reserve_segment() {
static auto reserve_segment = [&]() {
if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() > 0) {
void *ptr;
size_t sz = c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() * 1024 * 1024;
allocate_host_memory_slowpath(sz, &ptr);
return PinnedReserveSegment(ptr, sz);
} else {
return PinnedReserveSegment();
}
} ();
return reserve_segment;
}
TaskThreadPool* getThreadPool() {
static TaskThreadPool* pool = new TaskThreadPool(
static_cast<int>(c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
@ -186,15 +222,15 @@ struct CUDACachingHostAllocatorImpl
size_t numThreads,
size_t pageSize) {
uintptr_t start = (uintptr_t)ptr + (size * i / numThreads);
uintptr_t end = (uintptr_t)start + (size / numThreads);
uintptr_t end = start + (size / numThreads);
if (i == (numThreads - 1)) {
end = (uintptr_t)ptr + size;
}
// pre-fault/map the pages by setting the first byte of the page
uintptr_t alignedStart =
(((uintptr_t)start + pageSize - 1) & ~(pageSize - 1));
for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) {
((start + pageSize - 1) & ~(pageSize - 1));
for (uintptr_t p = alignedStart; p < (end); p += pageSize) {
// NOLINTNEXTLINE(performance-no-int-to-ptr)
memset((void*)p, 0, 1);
}

View File
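get_reserve_segment() above, like the relocated background_thread_flag, uses a function-local static initialized by an immediately-invoked lambda, so the setup code runs exactly once, thread-safely, on first use. A minimal sketch of that pattern follows; Config and get_config are made-up names used only for illustration.

#include <cstddef>
#include <iostream>

struct Config {
  std::size_t reserve_mb;
};

// The lambda body runs exactly once, on the first call; C++ guarantees
// thread-safe initialization of the function-local static.
const Config& get_config() {
  static Config config = [] {
    std::cout << "initializing\n";  // expensive setup would go here
    return Config{64};
  }();
  return config;
}

int main() {
  std::cout << get_config().reserve_mb << '\n';  // prints "initializing", then 64
  std::cout << get_config().reserve_mb << '\n';  // init does not run again
}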

@ -310,7 +310,7 @@ cublasHandle_t getCurrentCUDABlasHandle() {
// FP32 data type calculations based on the value of the allow_tf32 flag.
// To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH.
if (!NoTF32Guard::should_disable_tf32() &&
at::globalContext().float32Precision("cuda", "matmul") == "tf32") {
at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
} else {
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

View File

@ -19,7 +19,7 @@ struct PhiloxCudaState {
// Called if graph capture is underway
PhiloxCudaState(int64_t* seed,
int64_t* offset_extragraph,
uint32_t offset_intragraph) {
uint64_t offset_intragraph) {
seed_.ptr = seed;
offset_.ptr = offset_extragraph;
offset_intragraph_ = offset_intragraph;
@ -36,7 +36,7 @@ struct PhiloxCudaState {
Payload seed_{};
Payload offset_{};
uint32_t offset_intragraph_ = 0;
uint64_t offset_intragraph_ = 0;
bool captured_ = false;
};

View File

@ -162,7 +162,7 @@ inline std::string ComputeTypeFor() {
// ROCBLAS and hipBLASLt.
template <>
inline std::string ComputeTypeFor<float>() {
if (at::globalContext().float32Precision("cuda", "matmul") != "tf32") {
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) != at::Float32Precision::TF32) {
return "f32_r";
} else {
return "xf32_r";

View File

@ -506,7 +506,7 @@ class HipblasltGemmOp : public Callable<ParamsT> {
}
hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") {
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
}
HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F);

View File

@ -141,7 +141,7 @@ class RocblasGemmOp : public Callable<GemmParams<T>> {
TuningStatus Call(const GemmParams<T>* params) override {
auto input_output_type = RocBlasDataTypeFor<T>();
if (at::globalContext().float32Precision("cuda", "matmul") == "tf32" && input_output_type == rocblas_datatype_f32_r)
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r)
return FAIL; // no support for TF32 in rocBLAS
auto compute_type = RocBlasComputeTypeFor<T>();
auto h_a = DoCastForHalfOrBfloat16(params->alpha);
@ -209,7 +209,7 @@ class RocblasGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>>
TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
auto input_output_type = RocBlasDataTypeFor<T>();
if (at::globalContext().float32Precision("cuda", "matmul") == "tf32" && input_output_type == rocblas_datatype_f32_r)
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r)
return FAIL; // no support for TF32 in rocBLAS
auto compute_type = RocBlasComputeTypeFor<T>();
auto h_a = DoCastForHalfOrBfloat16(params->alpha);

View File

@ -404,8 +404,6 @@ TuningContext::TuningContext() :
max_warmup_iterations_{0},
icache_flush_{true},
rotating_buffer_size_{-1},
filename_{},
untuned_file_{},
results_count_from_input_file_{0},
is_shutting_down_{false}
{

View File

@ -141,7 +141,7 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
size[i] = (int) t.size(i);
}
for (const auto i : c10::irange(dim, pad)) {
size[i] = (int) 1;
size[i] = 1;
}
dim = std::max(dim, pad);
cudnnTensorFormat_t filter_format{};

View File

@ -176,7 +176,7 @@ struct LinalgCheckMatrixUnaryRuleHelper;
template <char const *op_name, typename F, F Func, typename A, typename... T>
struct LinalgCheckMatrixUnaryRuleHelper<op_name, F, Func, typelist<A, T...>> {
static inline Tensor check_and_reshape_input(const Tensor& tensor, std::optional<int64_t> batch_dim) {
static Tensor check_and_reshape_input(const Tensor& tensor, std::optional<int64_t> batch_dim) {
TORCH_CHECK(rankWithoutBatchDim(tensor, batch_dim) >= 2, op_name, ": The input tensor A must have at least 2 dimensions.");
return moveBatchDimToFront(tensor, batch_dim);
}
@ -222,7 +222,7 @@ struct LinalgCheckMatrixBinaryRuleHelper;
template <char const *op_name, typename F, F Func, typename A, typename B, typename... T>
struct LinalgCheckMatrixBinaryRuleHelper<op_name, F, Func, typelist<A, B, T...>> {
static inline std::tuple<Tensor, Tensor> check_inputs_and_reshape_inputs(
static std::tuple<Tensor, Tensor> check_inputs_and_reshape_inputs(
const Tensor& first, std::optional<int64_t> first_bdim,
const Tensor& second, std::optional<int64_t> second_bdim) {
TORCH_CHECK(rankWithoutBatchDim(first, first_bdim) >= 2,

View File

@ -58,7 +58,7 @@ scalar_t dot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y,
template<typename scalar_t>
scalar_t vdot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, int64_t incy);
static constexpr inline bool lda_cond(int64_t m, int64_t n, int64_t lda) {
static constexpr bool lda_cond(int64_t m, int64_t n, int64_t lda) {
return n == 1 || lda >= std::max<int64_t>(1L, m);
}

View File

@ -991,7 +991,7 @@ std::size_t UnsafeUkernelKeyHasher<PackKey>::operator()(const PackKey& key) cons
template <typename key_t, typename value_t>
struct KernelCache {
using kstore_t = std::unordered_map<key_t, std::shared_ptr<value_t>, UnsafeUkernelKeyHasher<key_t>>;
static inline std::shared_ptr<value_t>&& fetch_or_create(
static std::shared_ptr<value_t>&& fetch_or_create(
const key_t& key,
const std::function<std::shared_ptr<value_t>()>& callback) {
auto&& search = get_store().find(key);
@ -1003,7 +1003,7 @@ struct KernelCache {
}
}
static inline kstore_t& get_store() {
static kstore_t& get_store() {
static thread_local kstore_t cache_kernels;
return cache_kernels;
}
@ -1067,7 +1067,7 @@ struct GemmHelper {
struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
// Fetch/create GemmHelper object and execute brgemm with batch size = 1
template <typename scalar_t_a, typename scalar_t_b, typename scalar_t_c>
static inline void call(
static void call(
int64_t M,
int64_t N,
int64_t K,
@ -1118,12 +1118,12 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
.execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data());
}
static inline std::shared_ptr<GemmHelper>& get_current() {
static std::shared_ptr<GemmHelper>& get_current() {
static thread_local std::shared_ptr<GemmHelper> current;
return current;
}
static inline bool device_check(ScalarType dtype) {
static bool device_check(ScalarType dtype) {
if (!at::globalContext().userEnabledMkldnn()) {
return false;
}
@ -1153,7 +1153,7 @@ using pack_t = dnnl::ukernel::brgemm_pack_B;
using pack_t = dnnl::ukernel::transform;
#endif
struct Pack : public KernelCache <PackKey, pack_t> {
static inline void call(
static void call(
int64_t K,
int64_t N,
int64_t ld_in,
@ -1182,7 +1182,7 @@ struct Pack : public KernelCache <PackKey, pack_t> {
}
}
static inline bool could_pack(ScalarType dtype) {
static bool could_pack(ScalarType dtype) {
if (!at::globalContext().userEnabledMkldnn()) {
return false;
}

View File

@ -702,7 +702,7 @@ static void check_shape_forward(const at::Tensor& input,
// If kernel size is incorrect
std::ostringstream input_ss;
std::ostringstream kernel_ss;
std::string separator = "";
std::string separator;
for (int i = 0, len = input_shape.size(); i < len; ++i) {
input_ss << separator << input_shape[i];
@ -1019,7 +1019,7 @@ static Tensor convolution_same(
if (symmetric_padding) {
// All backends handle symmetric padding natively
SymDimVector output_padding(static_cast<size_t>(dim));
SymDimVector output_padding(dim);
return at::convolution_symint(input, weight, bias, stride, padding_l, dilation,
false, output_padding, groups);
}
@ -1039,7 +1039,7 @@ static Tensor convolution_same(
}
}
auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0);
SymDimVector output_padding(static_cast<size_t>(dim));
SymDimVector output_padding(dim);
return at::convolution_symint(padded_input, weight, bias, stride, padding_l,
dilation, false, output_padding, groups);
}
@ -1174,7 +1174,7 @@ at::Tensor convolution(
bool deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
return at::_convolution(input, weight, bias, stride, padding, dilation,
transposed, output_padding, groups,
ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN("conv"));
ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN(at::Float32Op::CONV));
}
at::Tensor convolution_overrideable(
@ -1319,7 +1319,7 @@ ConvBackend select_conv_backend(
params.benchmark = ctx.benchmarkCuDNN();
params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
params.cudnn_enabled = ctx.userEnabledCuDNN();
params.allow_tf32 = ctx.allowTF32CuDNN("conv");
params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV);
auto input = input_r;
auto weight = weight_r;
@ -1699,7 +1699,7 @@ at::Tensor _convolution(
c10::MaybeOwned<Tensor> bias_r_maybe_owned = at::borrow_from_optional_tensor(bias_r_opt);
const Tensor& bias_r = *bias_r_maybe_owned;
return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN("conv"));
return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN(at::Float32Op::CONV));
}
std::tuple<Tensor, Tensor, Tensor> convolution_backward_overrideable(
@ -1997,7 +1997,7 @@ std::tuple<Tensor, Tensor, Tensor> convolution_backward(
params.benchmark = ctx.benchmarkCuDNN();
params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
params.cudnn_enabled = ctx.userEnabledCuDNN();
params.allow_tf32 = ctx.allowTF32CuDNN("conv");
params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV);
// Validate inputs.
check_shape_backward(input, weight.sizes(), params);
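The hunks above replace the string-keyed ctx.allowTF32CuDNN("conv") calls with the typed at::Float32Op enum. For orientation, a minimal Python sketch of the user-facing flags that ultimately feed these checks (standard torch.backends toggles, not introduced by this diff):

import torch

# matmul TF32 on CUDA devices (queried via float32Precision(CUDA, MATMUL) on the C++ side)
torch.backends.cuda.matmul.allow_tf32 = True
# convolution / RNN TF32 in cuDNN (queried via allowTF32CuDNN(Float32Op::CONV / ::RNN))
torch.backends.cudnn.allow_tf32 = True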

View File

@ -1,6 +1,5 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Copy.h>
#include <ATen/native/Copy.h>
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>

View File

@ -70,7 +70,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
new_shape.emplace_back(input_sizes[i]);
}
for (const auto i : c10::irange((size_t)l_pad)) {
for (const auto i : c10::irange(l_pad)) {
auto pad_idx = pad.size() - ((i + 1) * 2);
auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",

View File

@ -107,11 +107,6 @@ void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) {
storage->set_nbytes(size_bytes);
}
// Call the sparse implementation in SparseTensor.cpp directly.
// A dynamic dispatch here is NOT necessary, so I didn't put
// this function in native_functions.yaml
const Tensor& resize_as_sparse_(const Tensor& self, const Tensor& src);
// TODO(VitalyFedyunin): Move it to HTML docs.
//
// Strides of the output tensor of `resize_as_` operator is defined by input

View File

@ -145,12 +145,6 @@
#include <utility>
#include <vector>
namespace at::native {
AdvancedIndex make_info(Tensor self, IOptTensorListRef orig);
} // namespace at::native
namespace at::meta {
TORCH_META_FUNC(gather)

View File

@ -73,7 +73,6 @@
#include <ATen/ops/where_native.h>
#include <ATen/ops/zeros_like.h>
#include <iostream>
#include <utility>
#endif

View File

@ -124,7 +124,7 @@ struct IsUnique {};
template <typename scalar_t>
struct IsUnique<scalar_t, false> {
inline bool operator() (scalar_t* data_ptr, int64_t i) {
bool operator() (scalar_t* data_ptr, int64_t i) {
if (i == 0) { return true; }
return c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]);
}
@ -132,7 +132,7 @@ struct IsUnique<scalar_t, false> {
template <typename scalar_t>
struct IsUnique<scalar_t, true> {
inline bool operator() (scalar_t* data_ptr, int64_t i) {
bool operator() (scalar_t* data_ptr, int64_t i) {
if (i == 0) { return true; }
return (c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]))
&& !(_isnan(data_ptr[i]) && _isnan(data_ptr[i - 1]));

View File

@ -17,7 +17,7 @@
namespace ao::sparse {
int register_linear_params();
#ifdef USE_FBGEMM

View File

@ -20,7 +20,7 @@
namespace ao::sparse {
int register_linear_params();
#ifdef USE_FBGEMM
namespace {

View File

@ -16,7 +16,7 @@
#endif
namespace ao::sparse {
int register_linear_params();
#ifdef USE_FBGEMM

View File

@ -1919,7 +1919,7 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
addmm_out_cuda_impl(const_cast<Tensor&>(out), out, self, mat2, 0, 1);
addmm_out_cuda_impl(out, out, self, mat2, 0, 1);
return out;
}

View File

@ -102,13 +102,7 @@ __host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::comple
}
void launch_logcumsumexp_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim) {
// Compile time for CUDA-11.4 is 3x slower than with CUDA-11.6+, specifically for complex numbers
#if defined(FBCODE_CAFFE2) || defined(OVRSOURCE)
#define _LCME_DISPATCH AT_DISPATCH_FLOATING_TYPES_AND2
#else
#define _LCME_DISPATCH AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2
#endif
_LCME_DISPATCH(ScalarType::Half, ScalarType::BFloat16,
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16,
self.scalar_type(), "logcumsumexp_cuda",
[&]() {
using opmath_t = at::opmath_type<scalar_t>;

View File

@ -127,8 +127,7 @@ void apply_ldl_solve_cusolver(
const Tensor& pivots,
const Tensor& B,
bool upper) {
#if !(defined(CUDART_VERSION) && defined(CUSOLVER_VERSION) && \
CUSOLVER_VERSION >= 11102)
#if !(defined(CUDART_VERSION) && defined(CUSOLVER_VERSION))
TORCH_CHECK(
false,
"Calling torch.linalg.ldl_solve on a CUDA tensor requires compiling ",

View File

@ -169,7 +169,10 @@ std::string repro_from_args(const ConvolutionParams& params) {
ss << "If that doesn't trigger the error, please include your original repro script when reporting this issue.\n\n";
ss << "import torch\n";
ss << "torch.backends.cuda.matmul.allow_tf32 = "
<< pybool(at::globalContext().float32Precision("cuda", "matmul") == "tf32")
<< pybool(
at::globalContext().float32Precision(
at::Float32Backend::CUDA, at::Float32Op::MATMUL) ==
at::Float32Precision::TF32)
<< "\n";
ss << "torch.backends.cudnn.benchmark = "
<< pybool(at::globalContext().benchmarkCuDNN()) << "\n";
@ -726,7 +729,7 @@ Tensor cudnn_convolution_relu(
auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
bool allow_tf32 = ctx.allowTF32CuDNN("conv");
bool allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV);
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
@ -784,7 +787,7 @@ Tensor cudnn_convolution_add_relu(
}
auto& ctx = at::globalContext();
bool allow_tf32 = ctx.allowTF32CuDNN("conv");
bool allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV);
bool benchmark = ctx.benchmarkCuDNN();
auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias_t.has_value()

View File

@ -76,7 +76,6 @@ std::tuple<Tensor, Tensor> _cudnn_ctc_loss_tensor(
#else // AT_CUDNN_ENABLED
#include <ATen/cudnn/Descriptors.h>
#include <ATen/cudnn/Types.h>
#include <ATen/cudnn/Utils.h>
@ -284,9 +283,9 @@ std::tuple<Tensor, Tensor> _cudnn_ctc_loss_tensor(
checkBackend(c, {*targets}, Backend::CUDA);
const auto batch_size = log_probs->size(1);
int64_t input_lengths_size =
input_lengths_.sizes().size() ? input_lengths_.size(0) : 1;
!input_lengths_.sizes().empty() ? input_lengths_.size(0) : 1;
int64_t target_lengths_size =
target_lengths_.sizes().size() ? target_lengths_.size(0) : 1;
!target_lengths_.sizes().empty() ? target_lengths_.size(0) : 1;
TORCH_CHECK(
input_lengths_size == batch_size,
"input_lengths needs to have size to match batch_size");

View File

@ -142,8 +142,6 @@ void run_cudnn_SDP_bprop_nestedtensor(
namespace at {
namespace native {
#include <cudnn_frontend.h>
namespace fe = cudnn_frontend;
constexpr uint8_t MAX_MHA_DIM = 4;
@ -1379,7 +1377,7 @@ void run_cudnn_SDP_fprop(
cudnnHandle_t handle = getCudnnHandle();
// NB: The key initialization will round up sequence length, stride data etc.
// if use_ragged_in_dense is enabled (to allow multiple sequence lenghths to
// if use_ragged_in_dense is enabled (to allow multiple sequence lengths to
// reuse the same cached value/graph)
auto key = MHACacheKeyWrapper(
b,

View File

@ -245,7 +245,7 @@ descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const {
datatype,
input_datatype,
algo,
at::globalContext().allowTF32CuDNN("rnn"));
at::globalContext().allowTF32CuDNN(at::Float32Op::RNN));
#else
rnn_desc.set(
handle,
@ -261,7 +261,7 @@ descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const {
datatype,
input_datatype,
algo,
at::globalContext().allowTF32CuDNN("rnn"));
at::globalContext().allowTF32CuDNN(at::Float32Op::RNN));
#endif
return rnn_desc;
}

View File

@ -38,7 +38,6 @@ REGISTER_NO_CPU_DISPATCH(mkldnn_convolution_transpose_backward_stub)
#include <ATen/native/mkldnn/MKLDNNCommon.h>
#include <ATen/native/mkldnn/Utils.h>
#include <ATen/native/ConvUtils.h>
#include <c10/util/irange.h>
namespace at::native {
@ -105,7 +104,7 @@ static void check_shape_forward(const Tensor& input,
// If kernel size is incorrect
std::ostringstream input_ss;
std::ostringstream kernel_ss;
std::string separator = "";
std::string separator;
for (int i = 0, len = input_shape.size(); i < len; ++i) {
input_ss << separator << input_shape[i];
@ -156,12 +155,12 @@ static void check_shape_forward(const Tensor& input,
//
static bool mkldnn_conv_enabled_fpmath_mode_bf16(){
return at::globalContext().float32Precision("mkldnn", "conv") == "bf16" &&
return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::CONV) == at::Float32Precision::BF16 &&
mkldnn_bf16_device_check();
}
static bool mkldnn_conv_enabled_fpmath_mode_tf32(){
return at::globalContext().float32Precision("mkldnn", "conv") == "tf32" &&
return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::CONV) == at::Float32Precision::TF32 &&
cpuinfo_has_x86_amx_fp16();
}

View File

@ -69,12 +69,12 @@ mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2,
namespace at::native {
static bool use_mkldnn_bf32_linear() {
return at::globalContext().float32Precision("mkldnn", "matmul") == "bf16" &&
return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::BF16 &&
mkldnn_bf16_device_check();
}
static bool use_mkldnn_tf32_linear() {
return at::globalContext().float32Precision("mkldnn", "matmul") == "tf32" &&
return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::TF32 &&
cpuinfo_has_x86_amx_fp16();
}

View File

@ -111,11 +111,11 @@ static bool use_mkldnn_fp16_matmul() {
}
static bool use_mkldnn_bf32_matmul() {
return use_mkldnn_bf16_matmul() && at::globalContext().float32Precision("mkldnn", "matmul") == "bf16";
return use_mkldnn_bf16_matmul() && at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::BF16;
}
static bool use_mkldnn_tf32_matmul() {
return cpuinfo_has_x86_amx_fp16() && at::globalContext().float32Precision("mkldnn", "matmul") == "tf32";
return cpuinfo_has_x86_amx_fp16() && at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::TF32;
}
// returns an ideep::tensor

View File

@ -14,6 +14,7 @@ struct EmbeddingBagParams {
::c10::metal::array<idx_type_t, 2> output_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides;
bool use_per_sample_weights;
idx_type_t per_sample_weights_stride;
idx_type_t num_indices;
@ -23,3 +24,24 @@ struct EmbeddingBagParams {
EmbeddingBagMode mode;
int64_t padding_idx;
};
template <typename idx_type_t = uint32_t>
struct EmbeddingBagBackwardParams {
::c10::metal::array<idx_type_t, 2> weight_grad_strides;
::c10::metal::array<idx_type_t, 2> output_grad_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides;
bool use_per_sample_weights;
idx_type_t per_sample_weights_stride;
idx_type_t feature_size;
EmbeddingBagMode mode;
int64_t padding_idx;
};
template <typename idx_type_t = uint32_t>
struct EmbeddingBagPerSampleWeightsBackwardParams {
::c10::metal::array<idx_type_t, 2> output_grad_strides;
::c10::metal::array<idx_type_t, 2> weight_strides;
idx_type_t per_sample_weights_grad_stride;
idx_type_t feature_size;
int64_t padding_idx;
};

View File

@ -1,4 +1,5 @@
#include <ATen/native/mps/kernels/EmbeddingBag.h>
#include <c10/metal/atomic.h>
#include <c10/metal/utils.h>
#include <metal_array>
#include <metal_stdlib>
@ -44,6 +45,7 @@ template <EmbeddingBagMode M, typename T>
struct MaybeApplyPerSampleWeight {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
bool /*use_per_sample_weights*/,
uint32_t /*per_sample_weights_index*/,
constant T* /*per_sample_weights*/,
uint32_t /*per_sample_weights_stride*/) {
@ -55,10 +57,11 @@ template <typename T>
struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
bool use_per_sample_weights,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_stride) {
if (per_sample_weights_stride) {
if (use_per_sample_weights) {
T per_sample_weight = per_sample_weights
[per_sample_weights_stride * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) * weight_val;
@ -154,6 +157,7 @@ void embedding_bag_impl(
auto num_bags = params.num_bags;
auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx;
auto use_per_sample_weights = params.use_per_sample_weights;
auto per_sample_weights_stride = params.per_sample_weights_stride;
constant auto& output_strides = params.output_strides;
constant auto& weight_strides = params.weight_strides;
@ -183,7 +187,11 @@ void embedding_bag_impl(
feature_idx * weight_strides[1]]);
weight_val = MaybeApplyPerSampleWeight<M, T>()(
weight_val, indices_idx, per_sample_weights, per_sample_weights_stride);
weight_val,
use_per_sample_weights,
indices_idx,
per_sample_weights,
per_sample_weights_stride);
auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0);
@ -239,19 +247,208 @@ kernel void embedding_bag(
}
}
#define REGISTER_EMBEDDING_BAG_OP(T, I) \
template [[host_name("embedding_bag_" #T "_" #I)]] \
kernel void embedding_bag<T, I>( \
constant T * weight [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offsets [[buffer(2)]], \
constant T * per_sample_weights [[buffer(3)]], \
device T * output [[buffer(4)]], \
device I * offset2bag [[buffer(5)]], \
device I * bag_size [[buffer(6)]], \
device I * max_indices [[buffer(7)]], \
constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
uint tid [[thread_position_in_grid]]);
template <EmbeddingBagMode M, typename T>
struct MaybeDivBagSize {
inline opmath_t<T> operator()(opmath_t<T> val, opmath_t<T> bag_size) {
return val;
}
};
template <typename T>
struct MaybeDivBagSize<EmbeddingBagMode::MEAN, T> {
inline opmath_t<T> operator()(opmath_t<T> val, opmath_t<T> bag_size) {
return val / bag_size;
}
};
template <EmbeddingBagMode M, typename T, typename I>
void embedding_bag_backward_sum_mean_impl(
constant T* output_grad,
constant I* indices,
constant I* offset2bag,
constant I* bag_size,
constant T* per_sample_weights,
device AtomicType_t<T>* weight_grad,
constant EmbeddingBagBackwardParams<uint32_t>& params,
uint tid) {
auto feature_size = params.feature_size;
auto indices_idx = tid / feature_size;
auto bag_idx = static_cast<uint32_t>(offset2bag[indices_idx]);
auto bag_size_val = bag_size[bag_idx];
auto weight_idx = indices[indices_idx];
auto padding_idx = params.padding_idx;
if (bag_size_val && weight_idx != padding_idx) {
auto feature_idx = tid % feature_size;
constant auto& weight_grad_strides = params.weight_grad_strides;
constant auto& output_grad_strides = params.output_grad_strides;
auto use_per_sample_weights = params.use_per_sample_weights;
auto per_sample_weights_stride = params.per_sample_weights_stride;
auto output_grad_val =
static_cast<opmath_t<T>>(output_grad
[bag_idx * output_grad_strides[0] +
feature_idx * output_grad_strides[1]]);
opmath_t<T> weight_grad_val = MaybeDivBagSize<M, T>()(
MaybeApplyPerSampleWeight<M, T>()(
output_grad_val,
use_per_sample_weights,
indices_idx,
per_sample_weights,
per_sample_weights_stride),
static_cast<opmath_t<T>>(bag_size_val));
AtomicType<T>::atomic_add(
weight_grad,
static_cast<int32_t>(weight_idx) * weight_grad_strides[0] +
feature_idx * weight_grad_strides[1],
static_cast<T>(weight_grad_val));
}
}
template <typename T, typename I>
void embedding_bag_backward_max_impl(
constant T* output_grad,
constant I* bag_size,
constant I* max_indices,
device AtomicType_t<T>* weight_grad,
constant EmbeddingBagBackwardParams<uint32_t>& params,
uint tid) {
auto feature_size = params.feature_size;
auto bag_idx = tid / feature_size;
auto bag_size_val = bag_size[bag_idx];
if (bag_size_val) {
auto feature_idx = tid % feature_size;
constant auto& weight_grad_strides = params.weight_grad_strides;
constant auto& output_grad_strides = params.output_grad_strides;
constant auto& max_indices_strides = params.max_indices_strides;
auto output_grad_val = output_grad
[bag_idx * output_grad_strides[0] +
feature_idx * output_grad_strides[1]];
auto max_index =
static_cast<uint32_t>(max_indices
[bag_idx * max_indices_strides[0] +
feature_idx * max_indices_strides[1]]);
AtomicType<T>::atomic_add(
weight_grad,
max_index * weight_grad_strides[0] +
feature_idx * weight_grad_strides[1],
output_grad_val);
}
}
#define DISPATCH_BACKWARD_SUM_MEAN_IMPL(MODE) \
return embedding_bag_backward_sum_mean_impl<MODE>( \
output_grad, \
indices, \
offset2bag, \
bag_size, \
per_sample_weights, \
weight_grad, \
params, \
tid)
template <typename T, typename I>
kernel void embedding_bag_backward(
constant T* output_grad [[buffer(0)]],
constant I* indices [[buffer(1)]],
constant I* offset2bag [[buffer(2)]],
constant I* bag_size [[buffer(3)]],
constant I* max_indices [[buffer(4)]],
constant T* per_sample_weights [[buffer(5)]],
device AtomicType_t<T>* weight_grad [[buffer(6)]],
constant EmbeddingBagBackwardParams<uint32_t>& params [[buffer(7)]],
uint tid [[thread_position_in_grid]]) {
switch (params.mode) {
case EmbeddingBagMode::SUM:
DISPATCH_BACKWARD_SUM_MEAN_IMPL(EmbeddingBagMode::SUM);
case EmbeddingBagMode::MEAN:
DISPATCH_BACKWARD_SUM_MEAN_IMPL(EmbeddingBagMode::MEAN);
case EmbeddingBagMode::MAX:
return embedding_bag_backward_max_impl(
output_grad, bag_size, max_indices, weight_grad, params, tid);
}
}
template <typename T, typename I>
kernel void embedding_bag_per_sample_weights_backward(
constant T* output_grad [[buffer(0)]],
constant T* weight [[buffer(1)]],
constant I* indices [[buffer(2)]],
constant I* offset2bag [[buffer(3)]],
device AtomicType_t<T>* per_sample_weights_grad [[buffer(4)]],
constant EmbeddingBagPerSampleWeightsBackwardParams<uint32_t>& params
[[buffer(5)]],
uint tid [[thread_position_in_grid]]) {
auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx;
auto indices_idx = tid / feature_size;
auto weight_idx = indices[indices_idx];
if (weight_idx != padding_idx) {
auto feature_idx = tid % feature_size;
auto bag_idx = static_cast<uint32_t>(offset2bag[indices_idx]);
constant auto& output_grad_strides = params.output_grad_strides;
constant auto& weight_strides = params.weight_strides;
auto per_sample_weights_grad_stride = params.per_sample_weights_grad_stride;
auto weight_val = weight
[static_cast<uint32_t>(weight_idx) * weight_strides[0] +
feature_idx * weight_strides[1]];
auto output_grad_val = output_grad
[bag_idx * output_grad_strides[0] +
feature_idx * output_grad_strides[1]];
auto per_sample_weights_grad_val = static_cast<opmath_t<T>>(weight_val) *
static_cast<opmath_t<T>>(output_grad_val);
AtomicType<T>::atomic_add(
per_sample_weights_grad,
indices_idx * per_sample_weights_grad_stride,
static_cast<T>(per_sample_weights_grad_val));
}
}
#define REGISTER_EMBEDDING_BAG_OP(T, I) \
template [[host_name("embedding_bag_" #T "_" #I)]] \
kernel void embedding_bag<T, I>( \
constant T * weight [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offsets [[buffer(2)]], \
constant T * per_sample_weights [[buffer(3)]], \
device T * output [[buffer(4)]], \
device I * offset2bag [[buffer(5)]], \
device I * bag_size [[buffer(6)]], \
device I * max_indices [[buffer(7)]], \
constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
uint tid [[thread_position_in_grid]]); \
\
template [[host_name("embedding_bag_backward_" #T "_" #I)]] \
kernel void embedding_bag_backward<T, I>( \
constant T * output_grad [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offset2bag [[buffer(2)]], \
constant I * bag_size [[buffer(3)]], \
constant I * max_indices [[buffer(4)]], \
constant T * per_sample_weights [[buffer(5)]], \
device AtomicType_t<T> * weight_grad [[buffer(6)]], \
constant EmbeddingBagBackwardParams<uint32_t> & params [[buffer(7)]], \
uint tid [[thread_position_in_grid]]); \
\
template \
[[host_name("embedding_bag_per_sample_weights_backward_" #T "_" #I)]] \
kernel void embedding_bag_per_sample_weights_backward<T, I>( \
constant T * output_grad [[buffer(0)]], \
constant T * weight [[buffer(1)]], \
constant I * indices [[buffer(2)]], \
constant I * offset2bag [[buffer(3)]], \
device AtomicType_t<T> * per_sample_weights_grad [[buffer(4)]], \
constant EmbeddingBagPerSampleWeightsBackwardParams<uint32_t> & \
params [[buffer(5)]], \
uint tid [[thread_position_in_grid]]);
REGISTER_EMBEDDING_BAG_OP(float, int);
REGISTER_EMBEDDING_BAG_OP(float, long);

View File

@ -13,8 +13,10 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_embedding_bag_dense_backward_native.h>
#include <ATen/ops/_embedding_bag_forward_only_native.h>
#include <ATen/ops/_embedding_bag_native.h>
#include <ATen/ops/_embedding_bag_per_sample_weights_backward_native.h>
#include <ATen/ops/empty.h>
#endif
@ -95,6 +97,7 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
}
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.use_per_sample_weights = use_per_sample_weights;
params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.num_indices = num_indices;
@ -177,4 +180,117 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps(
padding_idx);
}
Tensor _embedding_bag_dense_backward_mps(const Tensor& output_grad,
const Tensor& indices,
const Tensor& offset2bag,
const Tensor& bag_size,
const Tensor& max_indices,
int64_t num_weights,
bool scale_grad_by_freq,
int64_t mode,
const std::optional<Tensor>& per_sample_weights_opt,
int64_t padding_idx) {
// indices and offset2bag are assumed to have correct dtypes and to be
// contiguous here, due to the checks in _embedding_bag_backward in
// EmbeddingBag.cpp.
// Also see NOTE [ embedding_bag Native Functions ] in native_functions.yaml
// for more details.
int64_t feature_size = output_grad.size(1);
auto weight_grad = at::zeros({num_weights, feature_size}, output_grad.options());
EmbeddingBagBackwardParams<uint32_t> params;
for (const auto dim : c10::irange(2)) {
params.output_grad_strides[dim] = output_grad.stride(dim);
params.weight_grad_strides[dim] = weight_grad.stride(dim);
if (mode == EmbeddingBagMode::MAX) {
params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim));
}
}
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.use_per_sample_weights = use_per_sample_weights;
params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.feature_size = output_grad.size(1);
params.mode = static_cast<EmbeddingBagMode>(mode);
params.padding_idx = padding_idx;
auto num_indices = offset2bag.numel();
auto num_threads = (params.mode == EmbeddingBagMode::MAX) ? output_grad.numel() : num_indices * params.feature_size;
MPSStream* stream = getCurrentMPSStream();
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_backward_{}_{}",
mps::scalarToMetalTypeString(output_grad),
mps::scalarToMetalTypeString(indices)));
getMPSProfiler().beginProfileKernel(
pipeline_state, "embedding_bag", {output_grad, indices, offset2bag, bag_size});
[computeEncoder setComputePipelineState:pipeline_state];
mps::mtl_setArgs(computeEncoder,
output_grad,
indices,
offset2bag,
bag_size,
max_indices,
use_per_sample_weights ? per_sample_weights_opt : std::nullopt,
weight_grad,
params);
mps::mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
getMPSProfiler().endProfileKernel(pipeline_state);
}
});
return std::move(weight_grad);
}
Tensor _embedding_bag_per_sample_weights_backward_mps(const Tensor& output_grad,
const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const Tensor& offset2bag,
int64_t mode,
int64_t padding_idx) {
TORCH_INTERNAL_ASSERT(static_cast<EmbeddingBagMode>(mode) == EmbeddingBagMode::SUM);
int64_t num_indices = indices.size(0);
int64_t feature_size = output_grad.size(1);
auto per_sample_weights_grad = at::zeros({num_indices}, output_grad.options());
EmbeddingBagPerSampleWeightsBackwardParams params;
for (const auto dim : c10::irange(2)) {
params.output_grad_strides[dim] = output_grad.stride(dim);
params.weight_strides[dim] = weight.stride(dim);
}
params.per_sample_weights_grad_stride = per_sample_weights_grad.stride(0);
params.feature_size = feature_size;
params.padding_idx = padding_idx;
auto num_threads = num_indices * feature_size;
MPSStream* stream = getCurrentMPSStream();
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_per_sample_weights_backward_{}_{}",
mps::scalarToMetalTypeString(output_grad),
mps::scalarToMetalTypeString(indices)));
getMPSProfiler().beginProfileKernel(
pipeline_state, "embedding_bag_per_sample_weights_backward", {output_grad, weight, indices, offset2bag});
[computeEncoder setComputePipelineState:pipeline_state];
mps::mtl_setArgs(computeEncoder, output_grad, weight, indices, offset2bag, per_sample_weights_grad, params);
mps::mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
getMPSProfiler().endProfileKernel(pipeline_state);
}
});
return std::move(per_sample_weights_grad);
}
} // namespace at::native

View File

@ -2379,7 +2379,7 @@
- func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
dispatch:
CPU, CUDA: _embedding_bag_backward_symint
CPU, CUDA, MPS: _embedding_bag_backward_symint
- func: _embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
dispatch:
@ -2389,12 +2389,14 @@
dispatch:
CPU: _embedding_bag_dense_backward_cpu
CUDA: _embedding_bag_dense_backward_cuda
MPS: _embedding_bag_dense_backward_mps
autogen: _embedding_bag_dense_backward.out
- func: _embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1) -> Tensor
dispatch:
CPU: _embedding_bag_per_sample_weights_backward_cpu
CUDA: _embedding_bag_per_sample_weights_backward_cuda
MPS: _embedding_bag_per_sample_weights_backward_mps
autogen: _embedding_bag_per_sample_weights_backward.out
- func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
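With the MPS dispatch entries registered above, _embedding_bag_dense_backward and _embedding_bag_per_sample_weights_backward can be exercised end to end on Apple GPUs. A minimal smoke-test sketch, assuming an MPS-capable machine (not part of this diff):

import torch

if torch.backends.mps.is_available():
    emb = torch.nn.EmbeddingBag(10, 4, mode="sum").to("mps")
    indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], device="mps")
    offsets = torch.tensor([0, 4], device="mps")
    # per-sample weights are only supported in "sum" mode
    weights = torch.rand(8, device="mps", requires_grad=True)
    out = emb(indices, offsets, per_sample_weights=weights)
    out.sum().backward()  # hits both the dense and the per-sample-weights backward kernels
    print(emb.weight.grad.shape, weights.grad.shape)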

View File

@ -316,7 +316,7 @@ Tensor NestedTensor_to_padded_tensor_generic(
TORCH_CHECK(
(int64_t)output_size_.size() == ret_val.dim(),
"Length of output_size does not match NestedTensor dims. Broadcasting is not supported.");
for (int64_t i = 0; i < (int64_t)ret_val.dim(); i++) {
for (int64_t i = 0; i < ret_val.dim(); i++) {
TORCH_CHECK(
output_size_[i] >= ret_val.size(i),
"Value in output_size is less than NestedTensor padded size. Truncation is not supported.");

View File

@ -1198,7 +1198,7 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
kSpatialDim == 2 ? ideep::format_tag::nhwc : ideep::format_tag::ndhwc);
ideep::tensor src(src_desc, act_contig.data_ptr());
// weights & bias
ideep::tensor& weights = *(weight_.get());
ideep::tensor& weights = *(weight_);
bool with_bias = bias_.has_value();
const auto& kernel_size = weights.get_dims();
// dst

View File

@ -812,7 +812,7 @@ at::Tensor PackedLinearWeightsOnednn::apply_impl(
auto is_input_qint8 = input.scalar_type() == c10::ScalarType::QInt8;
auto input_contig = input.expect_contiguous();
auto& w = *(weight_.get());
auto& w = *weight_;
auto K = input.size(dim - 1), M = input.numel() / K, N = w.get_dim(1);
auto input_dims = {M, K};
auto input_data_type = is_input_qint8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::u8;

View File

@ -545,7 +545,7 @@ at::Tensor PackedLinearWeightsOnednn::apply_dynamic_impl(
/*reduce_range=*/reduce_range);
const std::vector<int32_t>& src_zero_point = std::vector<int32_t>(1, q_params.zero_point);
// weights, dst
auto w = *(weight_.get());
auto w = *weight_;
auto dst_dims = {x.get_dim(0), w.get_dim(1)};
const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0/q_params.scale);
const ideep::scale_t& weights_scales = w.get_scale();

View File

@ -12,7 +12,6 @@
#include <ATen/quantized/Quantizer.h>
#include <c10/core/QScheme.h>
#include <c10/util/irange.h>
#include <torch/library.h>
#include <utility>

View File

@ -10,7 +10,6 @@
#include <ATen/quantized/Quantizer.h>
#include <c10/core/QScheme.h>
#include <c10/util/irange.h>
#include <torch/library.h>
int register_linear_params();

View File

@ -65,7 +65,7 @@ Tensor& addmv_out_sparse_compressed(
return result.zero_();
} else {
return at::mul_out(
const_cast<Tensor&>(result),
result,
self,
at::native::scalar_tensor(
beta,

View File

@ -1330,18 +1330,18 @@ Tensor reduce_sparse_csr_cpu_template(const Tensor& sparse, IntArrayRef dims_to_
template <typename scalar_t>
struct ReductionAddOp {
inline scalar_t operator()(const scalar_t& a, const scalar_t& b) const {
scalar_t operator()(const scalar_t& a, const scalar_t& b) const {
return a + b;
}
inline scalar_t identity() const { return 0; }
scalar_t identity() const { return 0; }
};
template <typename scalar_t>
struct ReductionMulOp {
inline scalar_t operator()(const scalar_t& a, const scalar_t& b) const {
scalar_t operator()(const scalar_t& a, const scalar_t& b) const {
return a * b;
}
inline scalar_t identity() const { return 1; }
scalar_t identity() const { return 1; }
};
} // namespace

View File

@ -55,7 +55,6 @@
#include <ATen/ops/is_pinned_native.h>
#include <ATen/ops/resize_as_sparse.h>
#include <ATen/ops/resize_as_sparse_native.h>
#include <ATen/ops/sparse_coo_tensor.h>
#include <ATen/ops/sparse_coo_tensor_native.h>
#include <ATen/ops/sparse_dim_native.h>
#include <ATen/ops/sparse_mask_native.h>

View File

@ -244,7 +244,7 @@ Tensor& addmv_out_sparse_compressed_cuda(
return result.zero_();
} else {
return at::mul_out(
const_cast<Tensor&>(result),
result,
self,
at::native::scalar_tensor(
beta,

View File

@ -10,7 +10,6 @@
#include <ATen/native/cuda/MiscUtils.h>
#include <ATen/native/sparse/SparseBlasImpl.h>
#include <ATen/native/sparse/cuda/SparseBlasImpl.h>
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -94,15 +93,6 @@ void inline col_indices_and_values_resize_(const Tensor& input, int64_t nnz) {
input.sizes());
}
void inline bsrsv2_bsrsm2_may_need_to_sync() {
#if defined(CUSPARSE_VERSION) && CUSPARSE_VERSION < 11703
// cusparse bsrsv2 and bsrsm2 have a synchronization issue that may cause illegal memory access in cuda <= 11.6.x
// See https://github.com/pytorch/pytorch/issues/71297
::c10::cuda::device_synchronize();
#endif
// else: do nothing!
}
void block_sparse_triangular_solve_vec(
const at::sparse_csr::SparseCsrTensor& A,
const Tensor& B,
@ -223,7 +213,6 @@ void block_sparse_triangular_solve_vec(
CUSPARSE_SOLVE_POLICY_NO_LEVEL,
work_data.get());
bsrsv2_bsrsm2_may_need_to_sync();
});
if (!X.is_same(*X_)) {
X.copy_(*X_);
@ -364,7 +353,6 @@ void block_sparse_triangular_solve_mat(
CUSPARSE_SOLVE_POLICY_NO_LEVEL,
work_data.get());
bsrsv2_bsrsm2_may_need_to_sync();
});
if (!X.is_same(*X_)) {
X.copy_(*X_);
@ -665,12 +653,6 @@ void spgemm(
const Scalar& beta,
const Scalar& alpha,
const at::sparse_csr::SparseCsrTensor& C) {
// older versions of cusparse on Windows segfault for complex128 dtype
#if defined(_WIN32) && defined(CUSPARSE_VERSION) && CUSPARSE_VERSION < 11400
TORCH_CHECK(
!(A.scalar_type() == ScalarType::ComplexDouble),
"Sparse multiplication with complex128 dtype inputs is not supported with current CUDA version. Please upgrade to CUDA Toolkit 11.2.1+");
#endif
IntArrayRef A_sizes = A.sizes();
auto ndim = A.dim();
@ -953,13 +935,6 @@ void addmv_out_sparse_csr(
if (mat.layout() == kSparseBsr) {
return block_sparse_mv(mat, vec, beta, alpha, result);
}
#if !(AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API())
TORCH_CHECK(
false,
"Calling addmv on a sparse GPU tensor requires compiling ",
"PyTorch with CUDA 10.2+ (CUDA 11+ on Windows). ",
"Please use PyTorch built with newer CUDA version.");
#else
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
c10::MaybeOwned<Tensor> result_ = prepare_dense_vector_for_cusparse(result);
@ -970,11 +945,10 @@ void addmv_out_sparse_csr(
auto descX = at::cuda::sparse::CuSparseDnVecDescriptor(*vec_);
auto descY = at::cuda::sparse::CuSparseDnVecDescriptor(*result_);
// cusparseSpMVAlg_t was updated in cuda 11.2.1 (cusparse 11.4.0)
#if CUSPARSE_VERSION >= 11400
cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT;
#else
#ifdef USE_ROCM
cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT;
#else
cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT;
#endif
// SpMV doesn't support uniform precision computation
@ -1027,7 +1001,6 @@ void addmv_out_sparse_csr(
if (!result.is_same(*result_)) {
result.copy_(*result_);
}
#endif // !(AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API())
}
/*
@ -1245,12 +1218,8 @@ void triangular_solve_out_sparse_csr(
return block_sparse_triangular_solve_mat(A, B, X, upper, transpose, unitriangular);
}
}
#if !AT_USE_CUSPARSE_GENERIC_SPSV()
TORCH_CHECK(
false,
"Calling triangular solve on a sparse GPU tensor requires compiling ",
"PyTorch with at least CUDA 11.3. ",
"Please use PyTorch built with newer CUDA version.");
#ifdef USE_ROCM
TORCH_CHECK(false, "ROCm is not supported");
#else
c10::MaybeOwned<Tensor> X_ = prepare_dense_matrix_for_cusparse(X);
// It should be possible to use mixed memory format
@ -1317,13 +1286,6 @@ void triangular_solve_out_sparse_csr(
desc_spsv.descriptor()));
});
} else {
#if !AT_USE_CUSPARSE_GENERIC_SPSM()
TORCH_CHECK(
false,
"Calling triangular solve on a sparse GPU tensor requires compiling ",
"PyTorch with at least CUDA 11.3.1. ",
"Please use PyTorch built with newer CUDA version.");
#else
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
X.scalar_type(), "triangular_solve_out_sparse_csr_cuda_impl", [&] {
scalar_t alpha = 1;
@ -1377,12 +1339,11 @@ void triangular_solve_out_sparse_csr(
CUSPARSE_SPSM_ALG_DEFAULT,
desc_spsm.descriptor()));
});
#endif // !AT_USE_CUSPARSE_GENERIC_SPSM()
}
if (!X.is_same(*X_)) {
X.copy_(*X_);
}
#endif // !AT_USE_CUSPARSE_GENERIC_SPSV()
#endif
}
void sampled_addmm_out_sparse_csr(
@ -1391,13 +1352,6 @@ void sampled_addmm_out_sparse_csr(
const Scalar& beta,
const Scalar& alpha,
const at::sparse_csr::SparseCsrTensor& C) {
#if !(AT_USE_CUSPARSE_GENERIC_SDDMM() || AT_USE_HIPSPARSE_GENERIC_API())
TORCH_CHECK(
false,
"Calling sampled_addmm with sparse GPU tensors requires compiling ",
"PyTorch with CUDA 11.2.1+. ",
"Please use PyTorch built with newer CUDA version.");
#else
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.layout() == Layout::Strided);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(B.layout() == Layout::Strided);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(C.is_sparse_csr());
@ -1472,7 +1426,6 @@ void sampled_addmm_out_sparse_csr(
buffer.get()));
}
});
#endif
}
} // namespace at::native::sparse::impl::cuda

View File

@ -203,7 +203,7 @@ class LocalCallbackManager {
// Runtime cache.
size_t global_version_{GlobalCallbackManager::NoVersion};
std::array<CacheEntry, NumRecordScopes> active_callbacks_;
std::mt19937 generator_{};
std::mt19937 generator_;
};
// ============================================================================

View File

@ -34,19 +34,24 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
if "rocm" in expected_filename:
flaky_models.update(
{
"Background_Matting",
"alexnet",
"cait_m36_384",
"dla102",
"demucs",
"densenet121",
"detectron2_fcos_r_50_fpn",
"doctr_det_predictor",
"doctr_reco_predictor",
"dpn107",
"fbnetv3_b",
"hf_BigBird",
"hf_Longformer",
"hf_Reformer",
"hf_Roberta_base",
"hf_T5",
"hf_T5_base",
"hf_T5_generate",
"levit_128",
"llava",
"microbench_unbacked_tolist_sum",
@ -64,6 +69,7 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"squeezenet1_1",
"stable_diffusion_text_encoder",
"stable_diffusion_unet",
"swsl_resnext101_32x16d",
"timm_efficientdet",
"timm_efficientnet",
"timm_nfnet",

View File

@ -47,6 +47,8 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"levit_128",
"llava",
"microbench_unbacked_tolist_sum",
"resnet50",
"resnet152",
"sam",
"sam_fast",
"stable_diffusion_text_encoder",

View File

@ -378,7 +378,7 @@ vgg16,pass,0
vision_maskrcnn,pass,20
vision_maskrcnn,pass,18


View File

@ -286,7 +286,7 @@ vgg16,pass,6
vision_maskrcnn,pass,39
vision_maskrcnn,pass,37


View File

@ -46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,7
dla102,pass,7
dla102,pass,0


View File

@ -170,7 +170,7 @@ mobilenet_v2_quantized_qat,eager_fail_to_run,0
mobilenet_v3_large,pass,7
mobilenet_v3_large,pass,0
@ -210,7 +210,7 @@ pytorch_unet,pass_due_to_skip,7
resnet152,pass,7
resnet152,pass,0
@ -218,7 +218,7 @@ resnet18,pass,6
resnet50,pass,6
resnet50,pass,0
@ -270,7 +270,7 @@ timm_nfnet,pass,0
timm_regnet,pass,7
timm_regnet,pass,0


View File

@ -58,7 +58,7 @@ DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
DistillGPT2,pass,0


View File

@ -150,6 +150,10 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Roberta_base,pass,0
hf_T5,pass,0
@ -194,6 +198,10 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,fail_to_run,0
mnasnet1_0,pass,0
@ -310,6 +318,10 @@ timm_efficientnet,pass,0
timm_nfnet,pass,0
timm_regnet,pass,0


View File

@ -46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,7
dla102,pass,7
dla102,pass,0


View File

@ -170,7 +170,7 @@ mobilenet_v2_quantized_qat,eager_fail_to_run,0
mobilenet_v3_large,pass,7
mobilenet_v3_large,pass,0
@ -210,7 +210,7 @@ pytorch_unet,pass_due_to_skip,7
resnet152,pass,7
resnet152,pass,0
@ -266,7 +266,7 @@ timm_nfnet,pass,0
timm_regnet,pass,7
timm_regnet,pass,0


View File

@ -30,7 +30,7 @@ BertForQuestionAnswering,pass,5
BlenderbotForCausalLM,eager_fail_to_run,0
BlenderbotForCausalLM,pass_due_to_skip,0
@ -50,7 +50,7 @@ DebertaV2ForMaskedLM,pass_due_to_skip,0
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
DebertaV2ForQuestionAnswering,pass,4


View File

@ -150,7 +150,7 @@ pit_b_224,pass,0
pnasnet5large,pass,0
pnasnet5large,fail_accuracy,0
@ -158,23 +158,23 @@ poolformer_m36,pass,0
regnety_002,pass,0
regnety_002,fail_accuracy,0
repvgg_a2,pass,0
repvgg_a2,fail_accuracy,0
res2net101_26w_4s,pass,0
res2net101_26w_4s,fail_accuracy,0
res2net50_14w_8s,pass,0
res2net50_14w_8s,fail_accuracy,0
res2next50,pass,0
res2next50,fail_accuracy,0
@ -206,7 +206,7 @@ swin_base_patch4_window7_224,pass,0
swsl_resnext101_32x16d,pass,0
swsl_resnext101_32x16d,fail_accuracy,0


View File

@ -34,7 +34,7 @@ convnext_base,pass,7
crossvit_9_240,pass,7
crossvit_9_240,fail_accuracy,7
@ -46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,7
dla102,pass,7
dla102,pass,0
@ -62,7 +62,7 @@ eca_botnext26ts_256,pass,7
eca_halonext26ts,pass,7
eca_halonext26ts,fail_accuracy,7
@ -74,7 +74,7 @@ fbnetc_100,pass,7
fbnetv3_b,pass,6
fbnetv3_b,fail_accuracy,6
@ -130,7 +130,7 @@ mnasnet_100,pass,7
mobilenetv2_100,pass,7
mobilenetv2_100,fail_accuracy,7
@ -150,7 +150,7 @@ pit_b_224,pass,6
pnasnet5large,pass,5
pnasnet5large,fail_accuracy,5
@ -162,7 +162,7 @@ regnety_002,pass,6
repvgg_a2,pass,7
repvgg_a2,fail_accuracy,7
@ -186,7 +186,7 @@ resnest101e,pass,6
rexnet_100,pass,7
rexnet_100,fail_accuracy,7
@ -230,7 +230,7 @@ twins_pcpvt_base,pass,7
visformer_small,pass,7
visformer_small,fail_accuracy,7


View File

@ -162,7 +162,15 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,8
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Roberta_base,pass,0
@ -174,7 +182,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,11
hf_T5_generate,pass,7
@ -214,6 +222,10 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,2
mnasnet1_0,pass,0
@ -306,6 +318,10 @@ sam,pass,0
sam_fast,model_fail_to_load,0
shufflenet_v2_x1_0,pass,0
@ -330,10 +346,18 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientdet,pass,2
timm_efficientnet,pass,0
timm_nfnet,pass,0
timm_regnet,pass,0


View File

@ -70,7 +70,7 @@ fastNLP_Bert,pass,10
functorch_dp_cifar10,pass,7
functorch_dp_cifar10,fail_accuracy,7
@ -110,7 +110,19 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,25
hf_Longformer,pass,10
hf_Reformer,pass,20
hf_Roberta_base,pass,6
hf_T5,pass,5
@ -158,7 +170,7 @@ mobilenet_v2_quantized_qat,eager_fail_to_run,0
mobilenet_v3_large,pass,7
mobilenet_v3_large,pass,0
@ -198,7 +210,7 @@ pytorch_unet,pass_due_to_skip,7
resnet152,pass,7
resnet152,pass,0
@ -242,11 +254,19 @@ stable_diffusion_unet,pass_due_to_skip,0
timm_efficientdet,pass,8
timm_efficientnet,pass,7
timm_regnet,pass,7
timm_nfnet,pass,6
timm_regnet,pass,0
@ -278,7 +298,7 @@ vgg16,pass,0
vision_maskrcnn,pass,39
vision_maskrcnn,fail_accuracy,39


View File

@ -46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,7
dla102,pass,7
dla102,pass,0


View File

@ -170,7 +170,7 @@ mobilenet_v2_quantized_qat,eager_fail_to_run,0
mobilenet_v3_large,pass,7
mobilenet_v3_large,pass,0
@ -210,7 +210,7 @@ pytorch_unet,pass_due_to_skip,7
resnet152,pass,7
resnet152,pass,0
@ -270,7 +270,7 @@ timm_nfnet,pass,0
timm_regnet,pass,7
timm_regnet,pass,0


View File

@ -2282,7 +2282,9 @@ class BenchmarkRunner:
del model_copy
empty_gpu_cache(current_device)
# Two eager runs should have exactly same result
# Two eager runs should have exactly same result, within tolerance.
# TODO If we want the above to be true, then deterministic should be set.
# For example, MIOpen convolutions could be implemented with non-deterministic algos.
is_same = True
try:
if (
@ -2292,7 +2294,7 @@ class BenchmarkRunner:
correct_rerun_result,
fp64_ref=None,
cos_similarity=False,
tol=0,
tol=tolerance if torch.version.hip else 0,
equal_nan=self.equal_nan,
use_larger_multiplier_for_smaller_tensor=self.use_larger_multiplier_for_smaller_tensor(
name
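A condensed sketch of the rerun policy above: the two eager results are still compared, but ROCm builds are allowed a small nonzero tolerance because some MIOpen convolution algorithms are non-deterministic. torch.allclose stands in here for the runner's own `same` helper, and 1e-4 is an illustrative value, not the runner's configured tolerance:

import torch

tol = 1e-4 if torch.version.hip else 0.0  # exact match still required on non-ROCm builds
a = torch.randn(32)
b = a.clone()
assert torch.allclose(a, b, rtol=tol, atol=tol, equal_nan=True)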

View File

@ -15,6 +15,7 @@ CUDAAllocatorConfig::CUDAAllocatorConfig()
m_max_non_split_rounding_size(kLargeBuffer),
m_garbage_collection_threshold(0),
m_pinned_num_register_threads(1),
m_pinned_reserve_segment_size_mb(0),
m_expandable_segments(false),
#if CUDA_VERSION >= 12030
m_expandable_segments_handle_type(
@ -371,6 +372,9 @@ void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
} else if (config_item_view == "pinned_num_register_threads") {
i = parsePinnedNumRegisterThreads(config, i);
used_native_specific_option = true;
} else if (config_item_view == "pinned_reserve_segment_size_mb") {
i = parsePinnedReserveSegmentSize(config, i);
used_native_specific_option = true;
} else if (config_item_view == "pinned_use_background_threads") {
i = parsePinnedUseBackgroundThreads(config, i);
used_native_specific_option = true;
@ -451,6 +455,22 @@ size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
return i;
}
size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
size_t val2 = stoi(config[i]);
TORCH_CHECK(
val2 > 0, "Pinned reserve segment size has to be greater than 0 ", "");
m_pinned_reserve_segment_size_mb = val2;
} else {
TORCH_CHECK(
false, "Error, expecting pinned_reserve_segment_size_mb value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parsePinnedUseBackgroundThreads(
const std::vector<std::string>& config,
size_t i) {

View File

@ -70,6 +70,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
return instance().m_pinned_use_background_threads;
}
static size_t pinned_reserve_segment_size_mb() {
return instance().m_pinned_reserve_segment_size_mb;
}
static size_t pinned_max_register_threads() {
// Based on the benchmark results, we see better allocation performance
// with 8 threads. However on future systems, we may need more threads
@ -143,6 +147,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
size_t parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
size_t i);
size_t parsePinnedReserveSegmentSize(
const std::vector<std::string>& config,
size_t i);
size_t parsePinnedUseBackgroundThreads(
const std::vector<std::string>& config,
size_t i);
@ -155,6 +162,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
std::vector<size_t> m_roundup_power2_divisions;
std::atomic<double> m_garbage_collection_threshold;
std::atomic<size_t> m_pinned_num_register_threads;
std::atomic<size_t> m_pinned_reserve_segment_size_mb;
std::atomic<bool> m_expandable_segments;
std::atomic<Expandable_Segments_Handle_Type>
m_expandable_segments_handle_type;
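The new pinned_reserve_segment_size_mb knob is parsed from the same comma-separated allocator config string as the other pinned-memory options. A hedged usage sketch; only the option name and value format come from the parser above, while the concrete size and the runtime effect of reserving a segment up front are assumptions:

import os

# allocator config must be set before the CUDA caching allocator is initialized
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
    "pinned_reserve_segment_size_mb:256,pinned_num_register_threads:4"
)

import torch  # noqa: E402

if torch.cuda.is_available():
    # pinned host allocation served by the host allocator configured above
    host_buf = torch.empty(1 << 20, dtype=torch.uint8, pin_memory=True)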

View File

@ -816,7 +816,7 @@ struct ExpandableSegment {
struct BlockState {
c10::DeviceIndex device = 0;
cudaStream_t stream = nullptr;
stream_set stream_uses = {};
stream_set stream_uses;
size_t size = 0;
void* ptr = nullptr;
bool allocated = false;
@ -1683,7 +1683,7 @@ class DeviceCachingAllocator {
cudaStreamCaptureStatus status{cudaStreamCaptureStatusNone};
};
inline CaptureInfo stream_get_capture_info(cudaStream_t stream) {
CaptureInfo stream_get_capture_info(cudaStream_t stream) {
CaptureInfo info{};
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
C10_CUDA_CHECK(cudaStreamGetCaptureInfo(
@ -1997,7 +1997,7 @@ class DeviceCachingAllocator {
ss.put(SHAREABLE_CUDA_EXPANDABLE_SEGMENT);
auto full_range = block->expandable_segment_->share(
SegmentRange(block->ptr, block->size), ss);
offset = (char*)block->ptr - (char*)full_range.ptr;
offset = (char*)block->ptr - full_range.ptr;
}
return ShareableHandle{offset, ss.str()};
}
@ -3384,7 +3384,7 @@ class DeviceCachingAllocator {
if (pool->owner_PrivatePool && pool->owner_PrivatePool->allocator()) {
// If there is an active mempool with a given allocator,
// we use the given allocator's delete function.
pool->owner_PrivatePool->allocator()->raw_delete((void*)block->ptr);
pool->owner_PrivatePool->allocator()->raw_delete(block->ptr);
} else {
C10_CUDA_CHECK(cudaFree((void*)block->ptr));
}
@ -3423,8 +3423,7 @@ class DeviceCachingAllocator {
}
block->pool->blocks.erase(block);
ptrdiff_t before_size =
static_cast<char*>(unmapped.ptr) - static_cast<char*>(block->ptr);
ptrdiff_t before_size = unmapped.ptr - static_cast<char*>(block->ptr);
if (before_size > 0) {
// prev? -> before_free -> block
Block* before_free = new Block(
@ -3442,7 +3441,7 @@ class DeviceCachingAllocator {
block->stream,
after_size,
block->pool,
static_cast<char*>(unmapped.ptr) + unmapped.size);
unmapped.ptr + unmapped.size);
after_free->expandable_segment_ = block->expandable_segment_;
after_free->splice(block, block->next);
block->pool->insert_into_blocks(after_free);
@ -3832,7 +3831,7 @@ class NativeCachingAllocator : public CUDAAllocator {
": did you call init?");
Block* block = device_allocator[device]->malloc(device, size, stream);
add_allocated_block(block);
*devPtr = (void*)block->ptr;
*devPtr = block->ptr;
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_memory_allocation(

View File

@ -446,7 +446,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
return !devs_initialized_flags.empty();
}
static inline void assertValidDevice(c10::DeviceIndex device) {
static void assertValidDevice(c10::DeviceIndex device) {
TORCH_CHECK(
0 <= device && device < device_count, "Invalid device argument.");
}

View File

@ -556,12 +556,13 @@ if(USE_CUDA OR USE_ROCM)
append_filelist("libtorch_cuda_core_sources" Caffe2_GPU_HIP_JIT_FUSERS_SRCS)
endif()
if(USE_CUDA)
append_filelist("libtorch_nativert_cuda_sources" Caffe2_GPU_SRCS)
endif()
if(USE_ROCM)
append_filelist("libtorch_nativert_cuda_sources" Caffe2_HIP_SRCS)
endif()
# NativeRT is disabled
# if(USE_CUDA)
# append_filelist("libtorch_nativert_cuda_sources" Caffe2_GPU_SRCS)
# endif()
# if(USE_ROCM)
# append_filelist("libtorch_nativert_cuda_sources" Caffe2_HIP_SRCS)
# endif()
if(USE_CUDA)
list(APPEND Caffe2_GPU_CU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS})
@ -1360,7 +1361,8 @@ if(BUILD_TEST)
)
else()
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
# NativeRT is disabled
# add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
if(USE_DISTRIBUTED)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)

Some files were not shown because too many files have changed in this diff.