updates

Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
2025-11-01 04:54:55 +08:00 · 2025-10-01 20:12:21 -07:00 · 2025-10-01 19:46:46 -07:00 · 2025-10-01 10:46:10 -07:00 · 2025-09-30 14:07:57 -07:00 · 2025-09-30 13:55:19 -07:00
1184 changed files with 13731 additions and 21723 deletions
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -37,9 +37,9 @@ case ${DOCKER_TAG_PREFIX} in
  rocm*)
    BASE_TARGET=rocm
    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-    # add gfx950, gfx115x conditionally starting in ROCm 7.0
+    # add gfx950 conditionally starting in ROCm 7.0
    if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
-        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
+        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
    fi
    EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
    ;;
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -344,7 +344,7 @@ docker build \
       --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
       --build-arg "KATEX=${KATEX:-}" \
       --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
-       --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx1100}" \
+       --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
       --build-arg "IMAGE_NAME=${IMAGE_NAME}" \
       --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
       --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-deb42f2a8e48f5032b4a98ee781a15fa87a157cf
+e0dda9059d082537cee36be6c5e4fe3b18c880c0
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
+27664085f804afc83df26f740bb46c365854f2c4
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -19,8 +19,8 @@ pip_install \
  transformers==4.36.2

 pip_install coloredlogs packaging
-pip_install onnxruntime==1.23.0
-pip_install onnxscript==0.5.3
+pip_install onnxruntime==1.22.1
+pip_install onnxscript==0.4.0

 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -46,9 +46,9 @@ case ${DOCKER_TAG_PREFIX} in
        BASE_TARGET=rocm
        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950, gfx115x conditionally starting in ROCm 7.0
+        # add gfx950 conditionally starting in ROCm 7.0
        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
+            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
        fi
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
        ;;
--- a/.ci/docker/manywheel/Dockerfile_s390x
+++ b/.ci/docker/manywheel/Dockerfile_s390x
@ -115,9 +115,6 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
 # cmake-3.28.0 from pip for onnxruntime
 RUN python3 -mpip install cmake==3.28.0

-ADD ./common/patch_libstdc.sh patch_libstdc.sh
-RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
-
 # build onnxruntime 1.21.0 from sources.
 # it is not possible to build it from sources using pip,
 # so just build it from upstream repository.
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -84,9 +84,9 @@ case ${image} in
        DEVTOOLSET_VERSION="11"
        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950, gfx115x conditionally starting in ROCm 7.0
+        # add gfx950 conditionally starting in ROCm 7.0
        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
+            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
        fi
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -120,8 +120,9 @@ ninja==1.11.1.4
 numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
 numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #Description: Just-In-Time Compiler for Numerical Functions
-#Pinned versions: 0.55.2, 0.60.0
+#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
 #test that import: test_numba_integration.py
+#For numba issue see https://github.com/pytorch/pytorch/issues/51511
 #Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073

 #numpy
@ -241,9 +242,10 @@ pygments==2.15.0
 #Pinned versions: 14.1.0
 #test that import:

-scikit-image==0.22.0
+scikit-image==0.19.3 ; python_version < "3.10"
+scikit-image==0.22.0 ; python_version >= "3.10"
 #Description: image processing routines
-#Pinned versions: 0.22.0
+#Pinned versions:
 #test that import: test_nn.py

 #scikit-learn
@ -339,7 +341,7 @@ onnx==1.18.0
 #Pinned versions:
 #test that import:

-onnxscript==0.5.3
+onnxscript==0.4.0
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
--- a/.ci/magma-rocm/Makefile
+++ b/.ci/magma-rocm/Makefile
@ -5,7 +5,7 @@ DESIRED_ROCM ?= 7.0
 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
 PACKAGE_NAME = magma-rocm
 # inherit this from underlying docker image, do not pass this env var to docker
-#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201
+#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201

 DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@ -18,6 +18,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 .PHONY: all
 all: magma-rocm70
 all: magma-rocm64
+all: magma-rocm63

 .PHONY:
 clean:
@ -33,3 +34,8 @@ magma-rocm70:
 magma-rocm64: DESIRED_ROCM := 6.4
 magma-rocm64:
 	$(DOCKER_RUN)
+
+.PHONY: magma-rocm63
+magma-rocm63: DESIRED_ROCM := 6.3
+magma-rocm63:
+	$(DOCKER_RUN)
--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -67,7 +67,7 @@ fi
 #       wheels with cxx11-abi

 echo "Checking that the gcc ABI is what we expect"
-if [[ "$(uname)" != 'Darwin' ]]; then
+if [[ "$(uname)" != 'Darwin' &&  "$(uname -m)" != "s390x" ]]; then
  # We also check that there are cxx11 symbols in libtorch
  #
  echo "Checking that symbols in libtorch.so have the right gcc abi"
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -34,14 +34,12 @@ fi


 # Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
-  NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
-  if [ -n "$NUMBA_CUDA_DIR" ]; then
-    NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
-    pushd "$NUMBA_CUDA_DIR"
-    patch -p4 <"$NUMBA_PATCH"
-    popd
-  fi
+NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
+if [ -n "$NUMBA_CUDA_DIR" ]; then
+  NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
+  pushd "$NUMBA_CUDA_DIR"
+  patch -p4 <"$NUMBA_PATCH"
+  popd
 fi

 echo "Environment variables:"
@ -886,7 +884,7 @@ test_inductor_torchbench_smoketest_perf() {
  done

  # Perform some "warm-start" runs for a few huggingface models.
-  for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
+  for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
    python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
      --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
    python benchmarks/dynamo/check_accuracy.py \
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -38,7 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi

 # TODO: Move this to .ci/docker/requirements-ci.txt
-python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2"
+python -m pip install "psutil==5.9.1" "pynvml==11.4.1" "pytest-shard==0.1.2"

 run_tests() {
    # Run nvidia-smi if available
--- a/.ci/pytorch/windows/cuda128.bat
+++ b/.ci/pytorch/windows/cuda128.bat
@ -37,10 +37,10 @@ IF "%CUDA_PATH_V128%"=="" (
 )

 IF "%BUILD_VISION%" == "" (
-    set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0
+    set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0
    set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 ) ELSE (
-    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
+    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
 )

 set "CUDA_PATH=%CUDA_PATH_V128%"
--- a/.clang-tidy
+++ b/.clang-tidy
@ -59,14 +59,13 @@ performance-*,
 -performance-enum-size,
 readability-container-size-empty,
 readability-delete-null-pointer,
-readability-duplicate-include,
+readability-duplicate-include
 readability-misplaced-array-index,
-readability-redundant*,
+readability-redundant*
 readability-simplify-subscript-expr,
 readability-string-compare,
 -readability-redundant-access-specifiers,
 -readability-redundant-control-flow,
-readability-redundant-inline-specifier,
 '
 HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
 WarningsAsErrors: '*'
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@ -28,10 +28,6 @@ runs:
        echo "instance-type: $(get_ec2_metadata instance-type)"
        echo "system info $(uname -a)"

-    - name: Print GPU info (if present)
-      shell: bash
-      run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi
-
    - name: Check if in a container runner
      shell: bash
      id: check_container_runner
@ -86,6 +82,37 @@ runs:
        # Prune all of the docker images
        docker system prune -af

+    - name: Manually resolve download.pytorch.org
+      shell: bash
+      continue-on-error: true
+      run: |
+        set +e
+        set -x
+
+        PT_DOMAIN=download.pytorch.org
+        # TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400,
+        # cleaning this up once the issue is fixed. There are more than one resolved IP here, the last
+        # one is returned at random
+        RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)
+
+        if [ -z "${RESOLVED_IP}" ]; then
+          echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
+          RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)
+
+          if [ -z "${RESOLVED_IP}" ]; then
+            echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
+            exit 1
+          fi
+        fi
+
+        if grep -r "${PT_DOMAIN}" /etc/hosts; then
+          # Clean up any old records first
+          sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts
+        fi
+
+        echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
+        cat /etc/hosts
+
    - name: Check that the docker daemon is running
      shell: bash
      continue-on-error: true
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-0ad9951c416d33c5da4f7a504fb162cbe62386f5
+78a47f87ce259a48f0391fa9ae15add05ea7432b
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-2a9138a26ee257fef05310ad3fecf7c55fe80d73
+0fc62aa26a30ed7ca419d285f285cb5ba02c4394
--- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm
+++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm
@ -202,7 +202,7 @@ ARG max_jobs=16
 ENV MAX_JOBS=${max_jobs}
 ARG nvcc_threads=4
 ENV NVCC_THREADS=$nvcc_threads
-ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
+ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}

 ARG USE_SCCACHE
@ -297,28 +297,16 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
    echo "[INFO] Showing torch_build_versions.txt content:" && \
    cat torch_build_versions.txt

-# Install build and runtime dependencies, this is needed for flashinfer install
-COPY requirements/build.txt requirements/build.txt
-COPY use_existing_torch.py use_existing_torch.py
-RUN python3 use_existing_torch.py
-RUN cat requirements/build.txt
-
 # Install uv for faster pip installs if not existed
 RUN --mount=type=cache,target=/root/.cache/uv \
    if ! python3 -m uv --version > /dev/null 2>&1; then \
        python3 -m pip install uv==0.8.4; \
    fi
-
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy

-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt
-
-
 # Default mount file as placeholder, this just avoid the mount error
 ARG TORCH_WHEELS_PATH="./requirements"
 # Install torch, torchaudio and torchvision
@ -344,11 +332,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install xformers wheel from previous stage
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system /wheels/xformers/*.whl --verbose
+
 # Build flashinfer from source.
 ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
 # install package for build flashinfer
 # see issue: https://github.com/flashinfer-ai/flashinfer/issues/738

+RUN pip install build==1.3.0
 RUN pip freeze | grep -E 'setuptools|packaging|build'

 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
--- a/.github/ci_configs/vllm/use_existing_torch.py
+++ b/.github/ci_configs/vllm/use_existing_torch.py
@ -1,14 +1,9 @@
 import glob
-import os


 requires_files = glob.glob("requirements/*.txt")
 requires_files += ["pyproject.toml"]
-
 for file in requires_files:
-    if not os.path.exists(file):
-        print(f"!!! skipping missing {file}")
-        continue
    print(f">>> cleaning {file}")
    with open(file) as f:
        lines = f.readlines()
--- a/.github/scripts/drci_mocks.json.gz
+++ b/.github/scripts/drci_mocks.json.gz
--- a/.github/scripts/github_utils.py
+++ b/.github/scripts/github_utils.py
@ -18,7 +18,6 @@ class GitHubComment:
    body_text: str
    created_at: str
    author_login: str
-    author_url: Optional[str]
    author_association: str
    editor_login: Optional[str]
    database_id: int
--- a/.github/scripts/gql_mocks.json.gz
+++ b/.github/scripts/gql_mocks.json.gz
--- a/.github/scripts/test_check_labels.py
+++ b/.github/scripts/test_check_labels.py
@ -38,7 +38,6 @@ def mock_get_comments() -> list[GitHubComment]:
            body_text="mock_body_text",
            created_at="",
            author_login="",
-            author_url=None,
            author_association="",
            editor_login=None,
            database_id=1,
@ -49,7 +48,6 @@ def mock_get_comments() -> list[GitHubComment]:
            body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
            created_at="",
            author_login=BOT_AUTHORS[1],
-            author_url=None,
            author_association="",
            editor_login=None,
            database_id=2,
--- a/.github/scripts/test_trymerge.py
+++ b/.github/scripts/test_trymerge.py
@ -32,7 +32,6 @@ from trymerge import (
    main as trymerge_main,
    MandatoryChecksMissingError,
    MergeRule,
-    PostCommentError,
    RE_GHSTACK_DESC,
    read_merge_rules,
    remove_job_name_suffix,
@ -589,23 +588,6 @@ class TestTryMerge(TestCase):
            self.assertEqual(mock_merge_base, pr.get_merge_base())
            mocked_gh_fetch_merge_base.assert_called_once()

-    def test_app_can_revert(self, *args: Any) -> None:
-        pr = GitHubPR("pytorch", "pytorch", 164660)
-        repo = DummyGitRepo()
-        app_comment_id, impostor_comment_id = 3375785595, 3377647892
-        # Check that app can revert
-        self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id))
-        # But impostor can not
-        self.assertRaises(
-            PostCommentError,
-            lambda: validate_revert(repo, pr, comment_id=impostor_comment_id),
-        )
-        # Despite it's name being the name of the bot
-        self.assertEqual(
-            pr.get_comment_by_id(impostor_comment_id).author_login,
-            "pytorch-auto-revert",
-        )
-

@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -234,7 +234,6 @@ query ($owner: String!, $name: String!, $number: Int!) {
          createdAt
          author {
            login
-            url
          }
          authorAssociation
          editor {
@ -1094,7 +1093,6 @@ class GitHubPR:
            body_text=node["bodyText"],
            created_at=node["createdAt"] if "createdAt" in node else "",
            author_login=node["author"]["login"],
-            author_url=node["author"].get("url", None),
            author_association=node["authorAssociation"],
            editor_login=editor["login"] if editor else None,
            database_id=node["databaseId"],
@ -2031,11 +2029,6 @@ def validate_revert(
    # For some reason, one can not be a member of private repo, only CONTRIBUTOR
    if pr.is_base_repo_private():
        allowed_reverters.append("CONTRIBUTOR")
-    # Special case the pytorch-auto-revert app, whose does not have association
-    # But should be able to issue revert command
-    if comment.author_url == "https://github.com/apps/pytorch-auto-revert":
-        allowed_reverters.append("NONE")
-
    if author_association not in allowed_reverters:
        raise PostCommentError(
            f"Will not revert as @{author_login} is not one of "
--- a/.github/workflows/_get-changed-files.yml
+++ b/.github/workflows/_get-changed-files.yml
@ -40,15 +40,6 @@ jobs:
              # Use gh CLI to get changed files in the PR with explicit repo
              CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')

-              # See https://github.com/pytorch/pytorch/pull/134215#issuecomment-2332128790
-              PYI_FILES_TO_ADD=""
-              for file in ${CHANGED_FILES}; do
-                if [[ "${file}" == *".pyi.in" ]]; then
-                  PYI_FILES_TO_ADD="${PYI_FILES_TO_ADD} ${file//.in/}"
-                fi
-              done
-              CHANGED_FILES="${CHANGED_FILES}${PYI_FILES_TO_ADD}"
-
              if [ -z "$CHANGED_FILES" ]; then
                echo "No changed files found, setting to '*'"
                CHANGED_FILES="*"
--- a/.github/workflows/inductor-perf-test-nightly-h100.yml
+++ b/.github/workflows/inductor-perf-test-nightly-h100.yml
@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100

 on:
  schedule:
-    - cron: 15 0 * * 1-6
+    - cron: 15 0,12 * * 1-6
    - cron: 0 7 * * 0
  # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
  # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
--- a/.github/workflows/inductor-perf-test-nightly-macos.yml
+++ b/.github/workflows/inductor-perf-test-nightly-macos.yml
@ -63,7 +63,6 @@ jobs:
      # Same as the build job
      python-version: 3.12.7
      test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
-      timeout-minutes: 300
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
--- a/.github/workflows/inductor-periodic.yml
+++ b/.github/workflows/inductor-periodic.yml
@ -106,16 +106,6 @@ jobs:
          { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
          { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
          { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
        ]}
    secrets: inherit

--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@ -213,9 +213,9 @@ jobs:
      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
      test-matrix: |
        { include: [
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
        ]}
    secrets: inherit

--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -127,8 +127,9 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      # More memory is needed to build with asan
      runner: linux.2xlarge.memory
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
      test-matrix: |
--- a/.github/workflows/rocm.yml
+++ b/.github/workflows/rocm.yml
@ -59,29 +59,3 @@ jobs:
      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
    secrets: inherit
-
-  linux-jammy-rocm-py3_10-gfx1100-test:
-    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
-    permissions:
-      id-token: write
-      contents: read
-    name: linux-jammy-rocm-py3_10-gfx1100
-    uses: ./.github/workflows/_rocm-test.yml
-    needs:
-      - linux-jammy-rocm-py3_10-build
-      - target-determination
-    with:
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
-        ]}
-      tests-to-include: >
-         test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
-         test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
-         inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
-         inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
-         inductor/test_flex_attention inductor/test_max_autotune
-    secrets: inherit
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -140,6 +140,8 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
+      # More memory is needed to build with asan
+      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@ -160,10 +160,9 @@ jobs:
      runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
-          { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
-          { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
-          { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
+          { config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
+          { config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
+          { config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
        ]}
    secrets: inherit

@ -190,6 +189,41 @@ jobs:
      runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
    secrets: inherit

+  linux-jammy-rocm-py3_10-build:
+    if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+      sync-tag: rocm-build
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-rocm-py3_10-test:
+    if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
+    permissions:
+      id-token: write
+      contents: read
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_rocm-test.yml
+    needs:
+      - linux-jammy-rocm-py3_10-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
+      tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
+    secrets: inherit
+
  inductor-build:
    name: inductor-build
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/vllm.yml
+++ b/.github/workflows/vllm.yml
@ -42,7 +42,7 @@ jobs:
      build-external-packages: "vllm"
      build-environment: linux-jammy-cuda12.8-py3.12-gcc11
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
-      cuda-arch-list: '8.0 8.9 9.0'
+      cuda-arch-list: '8.0;8.9;9.0'
      runner: linux.24xlarge.memory
      test-matrix: |
        { include: [
--- a/.gitignore
+++ b/.gitignore
@ -88,7 +88,7 @@ torch_compile_debug/
 # Listed manually because some files in this directory are not generated
 torch/testing/_internal/generated/annotated_fn_args.py
 torch/testing/_internal/data/*.pt
-torch/headeronly/version.h
+torch/csrc/api/include/torch/version.h
 torch/csrc/cudnn/cuDNN.cpp
 torch/csrc/generated
 torch/csrc/generic/TensorMethods.cpp
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -18,7 +18,6 @@ exclude_patterns = [
    'torch/_inductor/autoheuristic/artifacts/**',
    'scripts/**',
    'test/generated_type_hints_smoketest.py',
-    'test/test_torchfuzz_repros.py',
    # CPython tests
    'test/dynamo/cpython/**',
    # Tests from the NumPy test suite
@ -28,7 +27,6 @@ exclude_patterns = [
    'torch/lib/**',
    'venv/**',
    '**/*.pyi',
-    "tools/experimental/torchfuzz/**",
    'tools/test/test_selective_build.py',
 ]
 command = [
@ -198,7 +196,7 @@ exclude_patterns = [
    'tools/test/gen_operators_yaml_test.py',
    'tools/test/gen_oplist_test.py',
    'tools/test/test_selective_build.py',
-    'tools/experimental/torchfuzz/**',
+    'tools/experimental/dynamic_shapes/torchfuzz/**',
 ]
 command = [
    'python3',
@ -1573,7 +1571,6 @@ exclude_patterns = [
    'torch/_inductor/fx_passes/serialized_patterns/**',
    'torch/_inductor/autoheuristic/artifacts/**',
    'test/dynamo/cpython/**',
-    'test/test_torchfuzz_repros.py',
    'scripts/**',
    'third_party/**',
    'fb/**',
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -13,9 +13,6 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt
 load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
 load("//:tools/bazel.bzl", "rules")

-# Export files for use by torch/headeronly (where version.h generation now lives)
-exports_files(["version.txt"])
-
 define_targets(rules = rules)

 COMMON_COPTS = [
@ -693,9 +690,7 @@ cc_library(
            "torch/csrc/*/generated/*.h",
            "torch/csrc/jit/serialization/mobile_bytecode_generated.h",
        ] + torch_cuda_headers,
-    ) + GENERATED_AUTOGRAD_CPP + [
-        "//torch/headeronly:version_h",
-    ],
+    ) + GENERATED_AUTOGRAD_CPP + [":version_h"],
    includes = [
        "third_party/kineto/libkineto/include",
        "torch/csrc",
--- a/18
+++ b/18
@ -181,15 +181,15 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
 /torch/csrc/jit/python/init.cpp @mikaylagawarecki

 # CUDA and CUDA math libraries
-aten/src/ATen/cuda/ @eqy @syed-ahmed @Aidyn-A
-aten/src/ATen/cudnn/ @eqy @syed-ahmed @Aidyn-A
-aten/src/ATen/native/cuda/ @eqy @syed-ahmed @Aidyn-A
-aten/src/ATen/native/cudnn/ @eqy @syed-ahmed @Aidyn-A
-c10/cuda @eqy @syed-ahmed @Aidyn-A
-torch/cuda/ @eqy @syed-ahmed @Aidyn-A
-torch/csrc/cuda/ @eqy @syed-ahmed @Aidyn-A
-torch/backends/cuda/ @eqy @syed-ahmed @Aidyn-A
-torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
+aten/src/ATen/cuda/ @eqy @syed-ahmed
+aten/src/ATen/cudnn/ @eqy @syed-ahmed
+aten/src/ATen/native/cuda/ @eqy @syed-ahmed
+aten/src/ATen/native/cudnn/ @eqy @syed-ahmed
+c10/cuda @eqy @syed-ahmed
+torch/cuda/ @eqy @syed-ahmed
+torch/csrc/cuda/ @eqy @syed-ahmed
+torch/backends/cuda/ @eqy @syed-ahmed
+torch/backends/cudnn/ @eqy @syed-ahmed

 # PyTree utilities
 /torch/utils/_pytree.py @XuehaiPan
--- a/5
+++ b/5
@ -50,10 +50,11 @@ RUN git submodule update --init --recursive
 FROM conda as conda-installs
 ARG PYTHON_VERSION=3.11
 ARG CUDA_PATH=cu121
+ARG CUDA_CHANNEL=nvidia
 ARG INSTALL_CHANNEL=whl/nightly
 # Automatically set by buildx
-# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
-RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0
+RUN /opt/conda/bin/conda update -y -n base -c defaults conda
+RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION}

 ARG TARGETPLATFORM

--- a/RELEASE.md
+++ b/RELEASE.md
@ -3,7 +3,6 @@
 <!-- toc -->

  - [Release Compatibility Matrix](#release-compatibility-matrix)
-    - [PyTorch CUDA Support Matrix](#pytorch-cuda-support-matrix)
  - [Release Cadence](#release-cadence)
  - [General Overview](#general-overview)
    - [Frequently Asked Questions](#frequently-asked-questions)
@ -64,22 +63,6 @@ Following is the Release Compatibility Matrix for PyTorch releases:
 | 1.13 | >=3.7, <=3.10 | C++14 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 |
 | 1.12 | >=3.7, <=3.10 | C++14 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 |

-### PyTorch CUDA Support Matrix
-
-For Release 2.9 PyTorch Supports following CUDA Architectures:
-
-| CUDA | architectures supported for Linux x86 and Windows builds | notes |
-| --- | --- | --- |
-| 12.6.3 | Maxwell(5.0), Pascal(6.0), Volta(7.0), Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0) | |
-| 12.8.1 | Volta(7.0), Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0), Blackwell(10.0, 12.0)  | |
-| 13.0.0 | Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0), Blackwell(10.0, 12.0+PTX) | +PTX available on linux builds only |
-
-| CUDA | architectures supported for Linux aarch64 builds |
-| --- | --- |
-| 12.6.3 | Ampere(8.0), Hopper(9.0) |
-| 12.8.1 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 12.0)  |
-| 13.0.0 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 11.0, 12.0+PTX) |
-
 ## Release Cadence

 Following is the release cadence. All future dates below are tentative. For latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional.
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -605,11 +605,6 @@ if(UNIX)
  if(HAVE_MALLOC_USABLE_SIZE)
    add_definitions(-DHAVE_MALLOC_USABLE_SIZE=1)
  endif(HAVE_MALLOC_USABLE_SIZE)
-  set(CMAKE_EXTRA_INCLUDE_FILES "fcntl.h")
-  CHECK_FUNCTION_EXISTS(posix_fallocate HAVE_POSIX_FALLOCATE)
-  if(HAVE_POSIX_FALLOCATE)
-    add_definitions(-DHAVE_POSIX_FALLOCATE=1)
-  endif(HAVE_POSIX_FALLOCATE)
 endif(UNIX)

 ADD_DEFINITIONS(-DUSE_EXTERNAL_MZCRC)
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@ -292,6 +292,42 @@ bool Context::userEnabledOverrideableSDP() const {
  return enabled_overrideable;
 }

+static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
+static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
+
+bool Context::checkCuBLASConfigDeterministic() {
+  // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
+  // is set to deterministic setting
+  if (hasCUDART()) {
+    const auto workspace_config = c10::utils::get_env(cublas_config_var_name);
+    return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]);
+  }
+  return true;
+}
+
+void Context::alertCuBLASConfigNotDeterministic() const {
+  static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic();
+  if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) {
+    return;
+  }
+
+  auto msg = c10::str(
+    "Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ",
+    "`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ",
+    "it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ",
+    "case, you must set an environment variable before running your PyTorch application: ",
+    cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ",
+    cublas_config_var_name, "=", cublas_deterministic_configs[1], ". For more information, go to ",
+    "https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility"
+  );
+
+  if (deterministicAlgorithmsWarnOnly()) {
+    TORCH_WARN(msg);
+  } else {
+    TORCH_CHECK(false, msg);
+  }
+}
+
 bool Context::benchmarkCuDNN() const {
  return benchmark_cudnn;
 }
@ -483,8 +519,8 @@ at::BlasBackend Context::blasPreferredBackend() {
 #if ROCM_VERSION >= 60300
          "gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
 #endif
-#if ROCM_VERSION >= 70000
-          "gfx950", "gfx1150", "gfx1151"
+#if ROCM_VERSION >= 60500
+          "gfx950"
 #endif
      };
      for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
@ -587,33 +623,20 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
  rocm_fa_preferred_backend = b;
 }

-CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const {
+bool Context::allowFP16ReductionCuBLAS() const {
  return allow_fp16_reduction_cublas;
 }

-CuBLASReductionOption inline get_reduction_option(bool allow_reduced_precision, bool allow_splitk) {
-  TORCH_CHECK(
-      !(allow_reduced_precision && !allow_splitk),
-      "allow_splitk=False is not supported when reduced precision reductions are enabled");
-  if (allow_reduced_precision) {
-    return CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
-  } else if (allow_splitk) {
-    return CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK;
-  } else {
-    return CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK;
-  }
+void Context::setAllowFP16ReductionCuBLAS(bool b) {
+  allow_fp16_reduction_cublas = b;
 }

-void Context::setAllowFP16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
-  allow_fp16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
-}
-
-CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const {
+bool Context::allowBF16ReductionCuBLAS() const {
  return allow_bf16_reduction_cublas;
 }

-void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
-  allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
+void Context::setAllowBF16ReductionCuBLAS(bool b) {
+  allow_bf16_reduction_cublas = b;
 }

 bool Context::allowFP16AccumulationCuBLAS() const {
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -25,7 +25,6 @@
 #include <c10/util/CallOnce.h>
 #include <c10/util/Exception.h>
 #include <c10/util/env.h>
-#include <c10/util/hash.h>
 #include <c10/util/irange.h>

 #include <cstdint>
@ -38,12 +37,6 @@ namespace at {
 class Tensor;

 enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };
-
-enum class CuBLASReductionOption : uint8_t {
-  AllowReducedPrecisionWithSplitK = 0,
-  DisallowReducedPrecisionAllowSplitK = 1,
-  DisallowReducedPrecisionDisallowSplitK = 2,
-};
 enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN };
 enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL };
 enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 };
@ -326,7 +319,13 @@ class TORCH_API Context {
  //
  // * Throw an error when `Context::deterministicAlgorithms()` is true. Most
  //   of the time, this should be accomplished by calling
-  //   `at::globalContext().alertNotDeterminstic().
+  //   `at::globalContext().alertNotDeterminstic()`.  However, if the
+  //   nondeterministic behavior is caused by the CuBLAS workspace
+  //   configuration in CUDA >= 10.2,
+  //   `at::globalContext().alertCuBLASConfigNotDeterministic()` should be
+  //   called instead (in this case, a comment explaining why the operation is
+  //   nondeterministic is not necessary). See below for details on these
+  //   methods.
  //
  // * Have an entry in the list of nondeterministic PyTorch operations in the
  //   docstring of `use_deterministic_algorithms()` in torch/__init__.py
@ -350,6 +349,12 @@ class TORCH_API Context {
  // Throws an error if `Context::deterministicAlgorithms()` is true
  static void alertNotDeterministic(std::string_view const& caller);

+  // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA
+  // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or
+  // ":4096:8". For more details:
+  // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
+  void alertCuBLASConfigNotDeterministic() const;
+
  void setFloat32MatmulPrecision(const std::string& s);
  void setFloat32Precision(
      Float32Backend backend,
@ -363,14 +368,10 @@ class TORCH_API Context {
  void setAllowTF32CuBLAS(bool);
  Float32MatmulPrecision float32MatmulPrecision() const;
  Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
-  CuBLASReductionOption allowFP16ReductionCuBLAS() const;
-  void setAllowFP16ReductionCuBLAS(
-      bool allow_reduced_precision,
-      bool allow_splitk = true);
-  CuBLASReductionOption allowBF16ReductionCuBLAS() const;
-  void setAllowBF16ReductionCuBLAS(
-      bool allow_reduced_precision,
-      bool allow_splitk = true);
+  bool allowFP16ReductionCuBLAS() const;
+  void setAllowFP16ReductionCuBLAS(bool);
+  bool allowBF16ReductionCuBLAS() const;
+  void setAllowBF16ReductionCuBLAS(bool);
  bool allowFP16AccumulationCuBLAS() const;
  void setAllowFP16AccumulationCuBLAS(bool);

@ -435,6 +436,7 @@ class TORCH_API Context {
  }

 private:
+  static bool checkCuBLASConfigDeterministic();
  std::array<c10::once_flag, at::COMPILE_TIME_MAX_DEVICE_TYPES> init_;
  bool enabled_cudnn = true;
  bool deterministic_cudnn = false;
@ -462,10 +464,8 @@ class TORCH_API Context {
      : at::Float32MatmulPrecision::HIGHEST;
  int benchmark_limit_cudnn = 10;
  bool allow_tf32_cudnn = true;
-  CuBLASReductionOption allow_fp16_reduction_cublas =
-      CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
-  CuBLASReductionOption allow_bf16_reduction_cublas =
-      CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
+  bool allow_fp16_reduction_cublas = true;
+  bool allow_bf16_reduction_cublas = true;
  bool allow_fp16_accumulation_cublas = false;
  std::optional<int32_t> sm_carveout = std::nullopt;
  bool enabled_mkldnn = true;
@ -495,22 +495,33 @@ class TORCH_API Context {
  bool enable_sparse_tensor_invariant_checks = false;
  bool allow_fp16_reduction_cpu = false;

-  using Key = std::pair<Float32Backend, Float32Op>;
-  std::unordered_map<Key, Float32Precision, c10::hash<Key>> fp32_precision = {
-      {{Float32Backend::GENERIC, Float32Op::ALL}, Float32Precision::NONE},
-      {{Float32Backend::MKLDNN, Float32Op::ALL}, Float32Precision::NONE},
-      {{Float32Backend::MKLDNN, Float32Op::CONV}, Float32Precision::NONE},
-      {{Float32Backend::MKLDNN, Float32Op::RNN}, Float32Precision::NONE},
-      {{Float32Backend::MKLDNN, Float32Op::MATMUL}, Float32Precision::NONE},
-      {{Float32Backend::CUDA, Float32Op::ALL}, Float32Precision::NONE},
-      {{Float32Backend::CUDA, Float32Op::CONV}, Float32Precision::TF32},
-      {{Float32Backend::CUDA, Float32Op::RNN}, Float32Precision::TF32},
-      {{Float32Backend::CUDA, Float32Op::MATMUL},
-       float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST
-           ? Float32Precision::NONE
-           : Float32Precision::TF32},
+  struct BackendOpHash {
+    size_t operator()(const std::pair<Float32Backend, Float32Op>& key) const {
+      size_t k1{static_cast<size_t>(key.first)};
+      size_t k2{static_cast<size_t>(key.second)};
+      return std::hash<size_t>{}((k1 << 32) | k2);
+    }
  };

+  std::unordered_map<
+      std::pair<Float32Backend, Float32Op>,
+      Float32Precision,
+      BackendOpHash>
+      fp32_precision = {
+          {{Float32Backend::GENERIC, Float32Op::ALL}, Float32Precision::NONE},
+          {{Float32Backend::MKLDNN, Float32Op::ALL}, Float32Precision::NONE},
+          {{Float32Backend::MKLDNN, Float32Op::CONV}, Float32Precision::NONE},
+          {{Float32Backend::MKLDNN, Float32Op::RNN}, Float32Precision::NONE},
+          {{Float32Backend::MKLDNN, Float32Op::MATMUL}, Float32Precision::NONE},
+          {{Float32Backend::CUDA, Float32Op::ALL}, Float32Precision::NONE},
+          {{Float32Backend::CUDA, Float32Op::CONV}, Float32Precision::TF32},
+          {{Float32Backend::CUDA, Float32Op::RNN}, Float32Precision::TF32},
+          {{Float32Backend::CUDA, Float32Op::MATMUL},
+           float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST
+               ? Float32Precision::NONE
+               : Float32Precision::TF32},
+      };
+
  Allocator* prev_allocator_ptr_{nullptr};
 };

@ -690,4 +701,5 @@ struct TORCH_API ROCmBackwardPassGuard {
  ~ROCmBackwardPassGuard();
  static bool is_backward_pass();
 };
+
 } // namespace at
--- a/aten/src/ATen/MapAllocator.cpp
+++ b/aten/src/ATen/MapAllocator.cpp
@ -292,28 +292,6 @@ MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags,
          if (ftruncate(fd, static_cast<off_t>(size)) == -1) {
            TORCH_CHECK(false, "unable to resize file <", filename_, "> to the right size: ", c10::utils::str_error(errno), " (", errno, ")");
          }
-
-#ifdef HAVE_POSIX_FALLOCATE
-          if (flags_ & ALLOCATOR_MAPPED_SHAREDMEM) {
-            for (;;) {
-              if (posix_fallocate(fd, 0, static_cast<off_t>(size)) == 0) {
-                break;
-              }
-
-              if (errno == EINTR) {
-                continue;
-              }
-
-              if (errno == EINVAL || errno == EOPNOTSUPP) {
-                // the underlying filesystem does not support the operation
-                break;
-              }
-
-              TORCH_CHECK(false, "unable to allocate shared memory(shm) for file <", filename_, ">: ", c10::utils::str_error(errno), " (", errno, ")");
-            }
-          }
-#endif
-
          if (fstat(fd, &file_stat) == -1 || file_stat.st_size < static_cast<int64_t>(size)) {
 #ifndef STRIP_ERROR_MESSAGES
            int last_err = errno;
--- a/aten/src/ATen/NamedTensorUtils.cpp
+++ b/aten/src/ATen/NamedTensorUtils.cpp
@ -179,7 +179,7 @@ void propagate_names_except(const Tensor& result, const Tensor& src, IntArrayRef
    return;
  }
  const auto src_names = src.names();
-  const auto result_dim = result.dim();
+  const auto result_dim = static_cast<int64_t>(result.dim());
  const auto src_dim = static_cast<int64_t>(src_names.size());
  const auto excluded_dim = static_cast<int64_t>(excluded_idxs.size());
  TORCH_INTERNAL_ASSERT(src_dim - excluded_dim == result_dim);
--- a/aten/src/ATen/SparseTensorImpl.h
+++ b/aten/src/ATen/SparseTensorImpl.h
@ -229,14 +229,14 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
  }

  void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef<int64_t> size) {
-    _resize_(sparse_dim, dense_dim, size);
+    return _resize_(sparse_dim, dense_dim, size);
  }

  void resize_(
      int64_t sparse_dim,
      int64_t dense_dim,
      ArrayRef<c10::SymInt> size) {
-    _resize_(sparse_dim, dense_dim, size);
+    return _resize_(sparse_dim, dense_dim, size);
  }

  // NOTE: this function will resize the sparse tensor and also set `indices`
--- a/aten/src/ATen/TensorIndexing.cpp
+++ b/aten/src/ATen/TensorIndexing.cpp
@ -59,7 +59,7 @@ static inline void set_item(const Tensor& self, ArrayRef<TensorIndex> indices, c
    }
  }

-  set_item(self, indices, value);
+  return set_item(self, indices, value);
 }

 } // namespace indexing
--- a/aten/src/ATen/TensorIndexing.h
+++ b/aten/src/ATen/TensorIndexing.h
@ -214,7 +214,7 @@ inline Tensor applySlice(
      "step must be greater than zero");

  // See NOTE [nested tensor size for indexing]
-  if (self_sizes.has_value() && !self_sizes.value().empty()) {
+  if (self_sizes.has_value() && self_sizes.value().size() > 0) {
    // Skip this optimization if we are tracing, as the trace may be polymorphic
    // over the shape of the `self` tensor, and we still want to record
    // the slice.
--- a/aten/src/ATen/TensorIterator.cpp
+++ b/aten/src/ATen/TensorIterator.cpp
@ -765,8 +765,7 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) {
  if (numel == 0) {
    return;
  } else if (numel < grain_size || at::get_num_threads() == 1) {
-    serial_for_each(loop, {0, numel});
-    return;
+    return serial_for_each(loop, {0, numel});
  } else {
    at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
      serial_for_each(loop, {begin, end});
--- a/aten/src/ATen/TensorUtils.cpp
+++ b/aten/src/ATen/TensorUtils.cpp
@ -273,11 +273,11 @@ void checkLayout(CheckedFrom c, at::ArrayRef<Tensor> tensors, at::Layout layout)
 }

 void * maybe_data_ptr(const Tensor& tensor) {
-  return tensor.defined() ? tensor.data_ptr() : nullptr;
+  return tensor.defined() ? (void *)tensor.data_ptr() : nullptr;
 }

 void * maybe_data_ptr(const TensorArg& tensor) {
-  return tensor->defined() ? tensor->data_ptr() : nullptr;
+  return tensor->defined() ? (void *)tensor->data_ptr() : nullptr;
 }

 void check_dim_size(
--- a/aten/src/ATen/core/CachingHostAllocator.h
+++ b/aten/src/ATen/core/CachingHostAllocator.h
@ -50,57 +50,19 @@ namespace {
  constexpr size_t MAX_SIZE_INDEX = 64;
 }

-// A large reserved pinned memory segment that is created in advance which is used
-// to allocate small pinned memory requests to avoid calling into expensive APIs.
-// We never free this memory and move up the pointer as we allocate new blocks
-// and when blocks are freed, they are cached in the free lists.
-struct PinnedReserveSegment {
-  PinnedReserveSegment(void *start, size_t size) : start_(start), size_(size),
-    current_ptr_(start_), initialized_(true) {}
-
-  PinnedReserveSegment() : start_(nullptr), size_(0), current_ptr_(nullptr), initialized_(false) {}
-
-  bool initialized() {
-    return initialized_;
-  }
-
-  void* allocate(size_t bytes) {
-    std::lock_guard<std::mutex> guard(mutex_);
-
-    // Round up the requested size to 4KB boundary for all including the small ones.
-    size_t rounded_bytes = (bytes + 4096 - 1) & ~(4096 - 1);
-
-    if (((uint8_t*)current_ptr_ + rounded_bytes) > ((uint8_t*)start_ + size_)) {
-      return nullptr;
-    }
-
-    void* ptr = current_ptr_;
-    current_ptr_ = (uint8_t*)current_ptr_ + rounded_bytes;
-    return ptr;
-  }
-
-  bool owns(void* ptr) {
-    return ptr >= start_ && ptr < (uint8_t*)start_ + size_;
-  }
-
-  std::mutex mutex_;
-  void* start_;
-  size_t size_;
-  void* current_ptr_;
-  bool initialized_;
-};
-
 // Struct containing memory allocator summary statistics for host.
 struct TORCH_API HostStats {
-  // COUNT: total allocations (active)
-  Stat active_requests;
-  // SUM: bytes allocated/reserved by this memory alocator. (active)
-  Stat active_bytes;
-  // COUNT: total allocations (active + free)
-  Stat allocations;
-  // SUM: bytes allocated/reserved by this memory alocator. This accounts
-  // for both free and in-use blocks.
+  // COUNT: allocations requested by client code. Note that active
+  // count can be extracted by looking at current allocations
+  Stat allocation;
+  // COUNT: number of allocated segments from host memory allocation.
+  Stat segment;
+
+  // SUM: bytes allocated by this memory alocator. Note that active bytes
+  // can be extracted by looking at current bytes allocated
  Stat allocated_bytes;
+  // SUM: bytes reserved by this memory allocator (both free and used)
+  Stat reserved_bytes;

  // SUM: time spent in cudaHostAlloc/cudaHostRegister in microseconds
  DurationStat host_alloc_time;
@ -115,7 +77,7 @@ struct TORCH_API HostStats {
  // COUNT: number of times cudaHostFree/cudaHostUnregister was called.
  int64_t num_host_free = 0; // This is derived from segment or timing

-  // Count of cudaHostAlloc/cudaHostRegister per bucket
+  // Count of cudaHostFree/cudaHostUnregister per bucket
  std::vector<int64_t> bucket_allocation = std::vector<int64_t>(MAX_SIZE_INDEX);
 };

@ -124,22 +86,17 @@ struct TORCH_API HostStats {
 // avoid locking the allocator while collecting stats.
 struct alignas(64) HostStatsStaged {
  std::mutex timing_mutex_;
-  // COUNT: total allocations (active + free)
+  // COUNT: allocations requested by client code resulting in a new segment/block allocation
+  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
+  Stat allocation;
+  // SUM: bytes within active memory blocks, including blocks that are
+  // currently in the free list.
  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
-  Stat allocations;
-  // SUM: bytes allocated/reserved by this memory alocator. This accounts
-  // for both free and in-use blocks.
  Stat allocated_bytes;
-  // COUNT: number of allocations per bucket (active)
-  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
-  std::vector<Stat> active_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
-  // SUM: bytes of allocation per bucket (active)
-  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
-  std::vector<Stat> active_bytes_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
-  // COUNT: number of allocations per bucket (active + free)
+  // COUNT: number of allocations per bucket
  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
  std::vector<Stat> allocation_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
-  // SUM: bytes of allocation per bucket (active + free)
+  // SUM: bytes of allocation per bucket
  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
  std::vector<Stat> allocated_bytes_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
  // SUM: time spent in cudaHostAlloc/cudaHostRegister
@ -243,21 +200,7 @@ struct CachingHostAllocatorImpl {
    // background.
    if (!pinned_use_background_threads()) {
      process_events();
-    }
-
-    // Round up the allocation to the nearest power of two to improve reuse.
-    // These power of two sizes are also used to index into the free list.
-    size_t roundSize = c10::llvm::PowerOf2Ceil(size);
-
-    // First, try to allocate from the free list
-    auto* block = get_free_block(roundSize);
-    if (block) {
-      return {block->ptr_, reinterpret_cast<void*>(block)};
-    }
-
-    // Check in the recently freed blocks with pending events to see if we
-    // can reuse them. Call get_free_block again after processing events
-    if (pinned_use_background_threads()) {
+    } else {
      // Launch the background thread and process events in a loop.
      static bool background_thread_flag [[maybe_unused]] = [this] {
        getBackgroundThreadPool()->run([&]() {
@ -270,6 +213,16 @@ struct CachingHostAllocatorImpl {
      }();
    }

+    // Round up the allocation to the nearest power of two to improve reuse.
+    // These power of two sizes are also used to index into the free list.
+    size_t roundSize = c10::llvm::PowerOf2Ceil(size);
+
+    // First, try to allocate from the free list
+    auto* block = get_free_block(roundSize);
+    if (block) {
+      return {block->ptr_, reinterpret_cast<void*>(block)};
+    }
+
    // Slow path: if we can't allocate from the cached free list, we need
    // to create a new block.
    void* ptr = nullptr;
@ -380,7 +333,7 @@ struct CachingHostAllocatorImpl {
        ptr_to_block_.erase(block->ptr_);
        auto index = size_index(block->size_);
        free_block(block);
-        stats_.allocations.decrease(1);
+        stats_.allocation.decrease(1);
        stats_.allocated_bytes.decrease(block->size_);
        stats_.allocation_bucket_stats[index].decrease(1);
        stats_.allocated_bytes_bucket_stats[index].decrease(block->size_);
@ -430,16 +383,16 @@ struct CachingHostAllocatorImpl {
      // per bucket (we pick index 0 arbitrarily). These are also all the host
      // allocations, not taking into account caching and free lists.
      if (i == 0) {
-        stats.allocations = stats_.allocations;
-        stats.allocated_bytes = stats_.allocated_bytes;
-        stats.num_host_alloc = stats.allocations.allocated;
-        stats.num_host_free = stats.allocations.freed;
+        stats.segment = stats_.allocation;
+        stats.reserved_bytes = stats_.allocated_bytes;
+        stats.num_host_alloc = stats.segment.allocated;
+        stats.num_host_free = stats.segment.freed;
      }

      // Bucket stats need to be merged with the slow-path stats. We do this in
      // a best effort manner, since we can't really replay the cached events per bucket.
-      add_bucket_stats(stats.active_requests, stats_.active_bucket_stats[i]);
-      add_bucket_stats(stats.active_bytes, stats_.active_bytes_bucket_stats[i]);
+      add_bucket_stats(stats.allocation, stats_.allocation_bucket_stats[i]);
+      add_bucket_stats(stats.allocated_bytes, stats_.allocated_bytes_bucket_stats[i]);
      stats.bucket_allocation[i] = stats_.allocation_bucket_stats[i].allocated;
    }

@ -464,11 +417,9 @@ struct CachingHostAllocatorImpl {
      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);

      if (i == 0) {
-        stats_.allocations.reset_accumulated();
+        stats_.allocation.reset_accumulated();
        stats_.allocated_bytes.reset_accumulated();
      }
-      stats_.active_bucket_stats[i].reset_accumulated();
-      stats_.active_bytes_bucket_stats[i].reset_accumulated();
      stats_.allocation_bucket_stats[i].reset_accumulated();
      stats_.allocated_bytes_bucket_stats[i].reset_accumulated();
    }
@ -491,11 +442,9 @@ struct CachingHostAllocatorImpl {
      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);

      if (i == 0) {
-        stats_.allocations.reset_peak();
+        stats_.allocation.reset_peak();
        stats_.allocated_bytes.reset_peak();
      }
-      stats_.active_bucket_stats[i].reset_peak();
-      stats_.active_bytes_bucket_stats[i].reset_peak();
      stats_.allocation_bucket_stats[i].reset_peak();
      stats_.allocated_bytes_bucket_stats[i].reset_peak();
    }
@ -512,7 +461,7 @@ struct CachingHostAllocatorImpl {
  virtual void add_allocated_block(B* block) {
    std::lock_guard<std::mutex> g(blocks_mutex_);
    blocks_.insert(block);
-    stats_.allocations.increase(1);
+    stats_.allocation.increase(1);
    stats_.allocated_bytes.increase(block->size_);
    ptr_to_block_.insert({block->ptr_, block});

@ -525,8 +474,6 @@ struct CachingHostAllocatorImpl {
      std::lock_guard<std::mutex> g(free_list_[index].mutex_);
      stats_.allocation_bucket_stats[index].increase(1);
      stats_.allocated_bytes_bucket_stats[index].increase(size);
-      stats_.active_bucket_stats[index].increase(1);
-      stats_.active_bytes_bucket_stats[index].increase(size);
    }
  }

@ -537,8 +484,6 @@ struct CachingHostAllocatorImpl {
      B* block = free_list_[index].list_.back();
      free_list_[index].list_.pop_back();
      block->allocated_ = true;
-      stats_.active_bucket_stats[index].increase(1);
-      stats_.active_bytes_bucket_stats[index].increase(size);
      return block;
    }
    return nullptr;
@ -632,8 +577,6 @@ struct CachingHostAllocatorImpl {
        auto index = size_index(block->size_);
        std::lock_guard<std::mutex> g(free_list_[index].mutex_);
        free_list_[index].list_.push_back(block);
-        stats_.active_bucket_stats[index].decrease(1);
-        stats_.active_bytes_bucket_stats[index].decrease(size);
        if (size != -1) {
          return;
        }
--- a/aten/src/ATen/core/NamedTensor.cpp
+++ b/aten/src/ATen/core/NamedTensor.cpp
@ -49,7 +49,7 @@ static void check_unique_names(DimnameList names) {
 }

 void check_names_valid_for(const TensorBase& tensor, DimnameList names) {
-  impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
+  return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
 }

 void check_names_valid_for(size_t tensor_dim, DimnameList names) {
--- a/aten/src/ATen/core/Tensor.cpp
+++ b/aten/src/ATen/core/Tensor.cpp
@ -138,7 +138,7 @@ void Tensor::_backward(TensorList inputs,
        const std::optional<Tensor>& gradient,
        std::optional<bool> keep_graph,
        bool create_graph) const {
-  impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
+  return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
 }

 const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const {
@ -173,12 +173,4 @@ unsigned TensorBase::_register_hook(std::function<TensorBase(const TensorBase&)>
  return impl::GetVariableHooks()->_register_hook(*this, std::move(hook));
 }

-std::optional<ScalarType> TensorBase::grad_dtype() const {
-  return impl::GetVariableHooks()->grad_dtype(*this);
-}
-
-void TensorBase::set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const {
-  return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype);
-}
-
 } // namespace at
--- a/aten/src/ATen/core/TensorBase.h
+++ b/aten/src/ATen/core/TensorBase.h
@ -930,10 +930,6 @@ public:

  const TensorBase& requires_grad_(bool _requires_grad=true) const;

-  std::optional<ScalarType> grad_dtype() const;
-
-  void set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const;
-
  // View Variables
  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/aten/src/ATen/core/TransformationHelper.h
+++ b/aten/src/ATen/core/TransformationHelper.h
@ -117,7 +117,7 @@ C10_HOST_DEVICE inline T cauchy(T val, T median, T sigma) {
 template <>
 C10_HOST_DEVICE inline double cauchy(double val, double median, double sigma) {
  // https://en.wikipedia.org/wiki/Cauchy_distribution#Cumulative_distribution_function
-  return median + sigma * at::tan(c10::pi<double> * (val - 0.5));
+  return median + sigma * at::tan(c10::pi<double> * (val - static_cast<double>(0.5)));
 }

 /**
--- a/aten/src/ATen/core/VariableHooksInterface.h
+++ b/aten/src/ATen/core/VariableHooksInterface.h
@ -68,8 +68,6 @@ struct TORCH_API VariableHooksInterface {
      const c10::OperatorHandle& op,
      c10::DispatchKeySet dispatch_keys,
      torch::jit::Stack* stack) const = 0;
-  virtual std::optional<c10::ScalarType> grad_dtype(const TensorBase&) const = 0;
-  virtual void set_grad_dtype(const TensorBase&, const std::optional<c10::ScalarType>&) const = 0;
 };

 TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);
--- a/aten/src/ATen/core/boxing/BoxedKernel_impl.h
+++ b/aten/src/ATen/core/boxing/BoxedKernel_impl.h
@ -2,7 +2,7 @@

 namespace c10 {

-inline BoxedKernel::BoxedKernel() : boxed_kernel_func_(nullptr) {}
+inline BoxedKernel::BoxedKernel() : functor_(), boxed_kernel_func_(nullptr) {}

 inline BoxedKernel::BoxedKernel(
    std::unique_ptr<OperatorKernel> functor,
--- a/aten/src/ATen/core/boxing/KernelFunction_impl.h
+++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h
@ -20,7 +20,9 @@ make_unique_base(Args&&... args) {
 } // namespace detail

 inline KernelFunction::KernelFunction()
-    : unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {}
+    : boxed_kernel_func_(),
+      unboxed_kernel_func_(nullptr),
+      sym_unboxed_kernel_func_(nullptr) {}

 inline KernelFunction::~KernelFunction() {
  if (tokens_) {
--- a/aten/src/ATen/core/dispatch/Dispatcher.cpp
+++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp
@ -76,7 +76,13 @@ void _print_dispatch_trace(const std::string& label, const std::string& op_name,

 OpRegistrationListener::~OpRegistrationListener()= default;

-Dispatcher::Dispatcher(): backendFallbackKernels_(), listeners_(std::make_unique<detail::RegistrationListenerList>()), guard_(std::make_shared<Guard>())
+Dispatcher::Dispatcher()
+: operators_()
+, operatorLookupTable_()
+, backendFallbackKernels_()
+, listeners_(std::make_unique<detail::RegistrationListenerList>())
+, cond_var_()
+, guard_(std::make_shared<Guard>())
 {}

 Dispatcher::~Dispatcher() {
--- a/aten/src/ATen/core/dispatch/Dispatcher.h
+++ b/aten/src/ATen/core/dispatch/Dispatcher.h
@ -96,7 +96,7 @@ class TORCH_API Dispatcher final {
  friend class TypedOperatorHandle;

  struct Guard final {
-    Guard() : alive(true) {}
+    Guard() : alive(true), mutex() {}
    std::atomic<bool> alive;
    std::mutex mutex;
  };
@ -496,7 +496,7 @@ class TORCH_API OperatorHandle {
  }

  void checkInvariants() const {
-    operatorDef_->op.checkInvariants();
+    return operatorDef_->op.checkInvariants();
  }

  c10::ArrayRef<at::Tag> getTags() const {
@ -932,7 +932,7 @@ inline void Dispatcher::redispatchBoxed(
  }
 #endif
  const auto& kernel = entry.lookup(dispatchKeySet);
-  kernel.callBoxed(op, dispatchKeySet, stack);
+  return kernel.callBoxed(op, dispatchKeySet, stack);
 }

 } // namespace c10
--- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp
+++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
@ -62,7 +62,17 @@ static const auto& getDispatchTableIndexToKey() {
 }

 OperatorEntry::OperatorEntry(OperatorName&& operator_name)
-: name_(std::move(operator_name)), dispatchTable_(), dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()), is_observed_(ObservedOperators::isObserved(name_))
+: name_(std::move(operator_name))
+, schema_()
+#ifndef C10_MOBILE
+, tags_()
+#endif
+, dispatchTable_()
+, dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized())
+, kernels_()
+, cpp_signature_()
+, sym_cpp_signature_()
+, is_observed_(ObservedOperators::isObserved(name_))
 {
  // Pick up any backend fallbacks that were registered prior to this
  // OperatorEntry being created.
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@ -357,7 +357,7 @@ IValue IValue::equals(const IValue& rhs) const {
    case Tag::Enum:
      return lhs.toEnumHolder()->is(*rhs.toEnumHolder());
    case Tag::Uninitialized:
-      // Uninitialized ivalues show up in no-ops when the compiler can prove a
+      // Unitialized ivalues show up in no-ops when the compiler can prove a
      // value will never be used. Just return false on any equality comparison.
      return false;
  }
--- a/aten/src/ATen/core/op_registration/op_allowlist.h
+++ b/aten/src/ATen/core/op_registration/op_allowlist.h
@ -114,7 +114,7 @@ constexpr bool allowlist_contains(std::string_view allowlist, std::string_view i
        }
        next++;
      } else {
-        if (allowlist.substr(cur) == item) {
+        if (allowlist.substr(cur).compare(item) == 0) {
          return true;
        }
        break;
--- a/aten/src/ATen/core/op_registration/op_registration.cpp
+++ b/aten/src/ATen/core/op_registration/op_registration.cpp
@ -73,7 +73,7 @@ c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_(

  std::optional<FunctionSchema> inferred_schema = std::nullopt;
  for (const auto& kernel : options.kernels) {
-    if (nullptr != kernel.inferred_function_schema) {
+    if (nullptr != kernel.inferred_function_schema.get()) {
      if (!inferred_schema.has_value()) {
        inferred_schema = *kernel.inferred_function_schema;
        break;
--- a/aten/src/ATen/core/op_registration/op_registration.h
+++ b/aten/src/ATen/core/op_registration/op_registration.h
@ -411,6 +411,7 @@ public:

    Options()
    : schemaOrName_(std::nullopt)
+    , kernels()
    , aliasAnalysisKind_(std::nullopt)
    {}

@ -419,6 +420,7 @@ public:
    struct KernelRegistrationConfig final {
      KernelRegistrationConfig()
        : dispatch_key(std::nullopt)
+        , func()
        , cpp_signature(std::nullopt)
        , inferred_function_schema(nullptr)
      {}
--- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
@ -905,7 +905,7 @@ class Vectorized8 : public Vectorizedi {
    // Because loadu(const void* ptr, T count) requires zero initialization for
    // upper 128 bits. However, by using _mm256_castsi128_si256, the upper 128
    // bits of the result are undefined.
-    // TODO<leslie> We can use _mm256_zextsi128_si256 in the future,
+    // TODO<leslie> We can use _mm256_zextsi128_si256 in the furture,
    // since gcc 9.3 doesn't support it now.
    __m128i input_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptr));
    return _mm256_castsi128_si256(input_128);
@ -1844,7 +1844,7 @@ Vectorized<int16_t> inline shift_256_16(
    c0 = _mm256_srav_epi32(a0, b0);
  c0 = _mm256_shuffle_epi8(c0, ctl_1_0);

-  // Perform shifting the same way for input array elements with
+  // Peform shifting the same way for input array elements with
  // idx%2==1.
  __m256i a1 = _mm256_and_si256(a, keep_1);
  __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
@ -2180,7 +2180,7 @@ Vectorized<T> inline shift_256_8(
    c0 = _mm256_srlv_epi32(a0, b0);
  c0 = _mm256_shuffle_epi8(c0, ctl_3_0);

-  // Perform shifting the same way for input array elements with
+  // Peform shifting the same way for input array elements with
  // idx%4==1.
  __m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3);
  __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
@ -2193,7 +2193,7 @@ Vectorized<T> inline shift_256_8(
    c1 = _mm256_srlv_epi32(a1, b1);
  c1 = _mm256_shuffle_epi8(c1, ctl_3_1);

-  // Perform shifting the same way for input array elements with
+  // Peform shifting the same way for input array elements with
  // idx%4==2.
  __m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3);
  __m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0);
@ -2206,7 +2206,7 @@ Vectorized<T> inline shift_256_8(
    c2 = _mm256_srlv_epi32(a2, b2);
  c2 = _mm256_shuffle_epi8(c2, ctl_3_2);

-  // Perform shifting the same way for input array elements with
+  // Peform shifting the same way for input array elements with
  // idx%4==3.
  __m256i a3 = _mm256_and_si256(a, keep_3);
  __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0);
--- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
@ -1088,7 +1088,7 @@ class Vectorized8 : public Vectorizedi {
    // Because loadu(const void* ptr, T count) requires zero initialization for
    // upper 384 bits. However, by using _mm512_castsi128_si512, the upper 384
    // bits of the result are undefined.
-    // TODO<leslie> We can use _mm512_zextsi128_si512 in the future,
+    // TODO<leslie> We can use _mm512_zextsi128_si512 in the furture,
    // since gcc 9.3 doesn't support it now.
    __m128i input_128 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
    return _mm512_castsi128_si512(input_128);
@ -2022,7 +2022,7 @@ Vectorized<T> inline shift_512_8(
    c0 = _mm512_srlv_epi16(a0, b0);
  c0 = _mm512_shuffle_epi8(c0, ctl_1_0);

-  // Perform shifting the same way for input array elements with
+  // Peform shifting the same way for input array elements with
  // idx%2==1.
  __m512i a1 = _mm512_and_si512(a, keep_1);
  __m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0);
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -323,7 +323,7 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor<
    descriptor_.reset(raw_descriptor);
  }
  template <typename T>
-  void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) {
+  inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) {
    // NOLINTNEXTLINE(bugprone-sizeof-expression)
    TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(value)));
  }
@ -345,7 +345,7 @@ class CuBlasLtMatrixLayout : public CuBlasLtDescriptor<
    descriptor_.reset(raw_descriptor);
  }
  template <typename T>
-  void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) {
+  inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) {
    TORCH_CUDABLAS_CHECK(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T)));
  }
 };
@ -360,7 +360,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
    descriptor_.reset(raw_descriptor);
  }
  template <typename T>
-  void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) {
+  inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) {
    TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T)));
  }
 };
@ -422,40 +422,25 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
    abType = CUDA_R_16F;
    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
 #ifndef USE_ROCM
-    auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
-    if (fp16_reduction !=
-        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
-      uint32_t mask =
-          fp16_reduction ==
-                  at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
-              ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
-                 CUBLASLT_REDUCTION_SCHEME_NONE)
-              : CUBLASLT_REDUCTION_SCHEME_NONE;
-      preference.setAttribute(
-          CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
+    if (!at::globalContext().allowFP16ReductionCuBLAS()) {
+      preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
+        CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
    }
 #endif
  } else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
    abType = CUDA_R_16BF;
    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
 #ifndef USE_ROCM
-    auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
-    if (bf16_reduction !=
-        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
-      uint32_t mask =
-          bf16_reduction ==
-                  at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
-              ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
-                 CUBLASLT_REDUCTION_SCHEME_NONE)
-              : CUBLASLT_REDUCTION_SCHEME_NONE;
-      preference.setAttribute(
-          CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
+    if (!at::globalContext().allowBF16ReductionCuBLAS()) {
+      preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
+        CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
    }
 #endif
  } else {
    static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
  }

+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -589,6 +574,8 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D

 template <>
 void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -600,6 +587,8 @@ void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {

 template <>
 void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -611,6 +600,8 @@ void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {

 template <>
 void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<double>)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -624,6 +615,8 @@ void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::co

 template <>
 void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<float>)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -637,6 +630,8 @@ void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::com

 template <typename C_Dtype>
 inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -708,6 +703,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP

 template <typename C_Dtype>
 inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  BGEMM_CHECK_ARGVALUES(at::BFloat16);
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
@ -1031,6 +1028,8 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty

 template <>
 void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1042,6 +1041,8 @@ void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {

 template <>
 void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1053,6 +1054,8 @@ void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {

 template <>
 void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1066,6 +1069,8 @@ void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::comp

 template <>
 void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1079,6 +1084,8 @@ void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::compl

 template <typename C_Dtype>
 inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1136,15 +1143,8 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
  }
  if (prop->major >= 5) {
    cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
-    auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
-    TORCH_CHECK(fp16_reduction !=
-        at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
-          "torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
-          "..., allow_splitk=False) requires the cuBLASLt backend");
-    if (fp16_reduction !=
-        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
-      cublas_flags = static_cast<cublasMath_t>(
-          cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+    if (!at::globalContext().allowFP16ReductionCuBLAS()) {
+      cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
    }
    // Disallow fp16 reductions that could lead to unexpected overflow issues.
    TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));
@ -1194,6 +1194,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(

 template <typename C_Dtype>
 inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1203,15 +1204,8 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT
  GEMM_CHECK_ARGVALUES(at::BFloat16);
 #ifndef USE_ROCM
  cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
-  auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
-  TORCH_CHECK(bf16_reduction !=
-      at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
-        "torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
-        "..., allow_splitk=False) requires the cuBLASLt backend");
-  if (bf16_reduction !=
-      at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
-    cublas_flags = static_cast<cublasMath_t>(
-        cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+  if (!at::globalContext().allowBF16ReductionCuBLAS()) {
+    cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
  }
 #endif
 #if defined(USE_ROCM)
@ -1300,7 +1294,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
  }
 #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
  else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
-    if (at::detail::getCUDAHooks().isGPUArch({"gfx11", "gfx12"})) { //no CK GEMM version
+    if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100
      gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
    } else{
      at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));
@ -1607,34 +1601,18 @@ bool gemm_and_bias(
    abType = CUDA_R_16F;
    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
 #ifndef USE_ROCM
-    auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
-    if (fp16_reduction !=
-        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
-      uint32_t mask =
-          fp16_reduction ==
-                  at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
-              ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
-                 CUBLASLT_REDUCTION_SCHEME_NONE)
-              : CUBLASLT_REDUCTION_SCHEME_NONE;
-      preference.setAttribute(
-          CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
+    if (!at::globalContext().allowFP16ReductionCuBLAS()) {
+      preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
+        CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
    }
 #endif
  } else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
    abType = CUDA_R_16BF;
    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
 #ifndef USE_ROCM
-    auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
-    if (bf16_reduction !=
-        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
-      uint32_t mask =
-          bf16_reduction ==
-                  at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
-              ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
-                 CUBLASLT_REDUCTION_SCHEME_NONE)
-              : CUBLASLT_REDUCTION_SCHEME_NONE;
-      preference.setAttribute(
-          CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
+    if (!at::globalContext().allowBF16ReductionCuBLAS()) {
+      preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
+        CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
    }
 #endif
  }
@ -2430,6 +2408,8 @@ void trsmBatched<c10::complex<double>>(

 template <>
 void gemv<c10::complex<double>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<double>)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t op = _cublasOpFromChar(trans);
  _cublasAdjustLdLevel2(m, n, &lda);
@ -2445,6 +2425,8 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
  // gemv is bw bound, and does not benefit from TF32. But the precision
  // loss still happens on TF32. So we disable it here.
  NoTF32Guard disable_tf32;
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t op = _cublasOpFromChar(trans);
  _cublasAdjustLdLevel2(m, n, &lda);
@ -2457,6 +2439,8 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {

 template <>
 void gemv<double>(CUDABLAS_GEMV_ARGTYPES(double)) {
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t op = _cublasOpFromChar(trans);
  _cublasAdjustLdLevel2(m, n, &lda);
@ -2470,6 +2454,8 @@ void gemv<float>(CUDABLAS_GEMV_ARGTYPES(float)) {
  // gemv is bw bound, and does not benefit from TF32. But the precision
  // loss still happens on TF32. So we disable it here.
  NoTF32Guard disable_tf32;
+  // See Note [Writing Nondeterministic Operations]
+  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t op = _cublasOpFromChar(trans);
  _cublasAdjustLdLevel2(m, n, &lda);
--- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
+++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
@ -109,7 +109,7 @@ void CUDAGeneratorState::increase(uint64_t increment) {
        offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4.");
    // Ensures the increment does not cause overflow.
    TORCH_INTERNAL_ASSERT(
-        offset_intragraph_ <= std::numeric_limits<uint64_t>::max() - increment,
+        offset_intragraph_ <= std::numeric_limits<uint32_t>::max() - increment,
        "Increment causes overflow in the offset value.");
    offset_intragraph_ += increment;
  } else {
@ -461,7 +461,7 @@ void CUDAGeneratorImpl::unregister_graph(cuda::CUDAGraph* graph) {
 */
 PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) {
  if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) {
-    uint64_t offset = state_->offset_intragraph_;
+    uint32_t offset = state_->offset_intragraph_;
    state_->increase(increment);
    return PhiloxCudaState(
        state_->seed_extragraph_.data_ptr<int64_t>(),
--- a/aten/src/ATen/cuda/CUDAGeneratorImpl.h
+++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.h
@ -96,16 +96,16 @@ struct CUDAGraph;
 struct CUDAGeneratorState : public c10::intrusive_ptr_target {
  uint64_t seed_;
  uint64_t philox_offset_per_thread_;
-  uint64_t offset_intragraph_;
+  uint32_t offset_intragraph_;
  bool capturing_{};
  std::unordered_set<cuda::CUDAGraph*> registered_graphs_;
-  at::TensorBase seed_extragraph_;
-  at::TensorBase offset_extragraph_;
+  at::TensorBase seed_extragraph_{};
+  at::TensorBase offset_extragraph_{};

  CUDAGeneratorState(
      uint64_t seed = default_rng_seed_val,
      uint64_t philox_offset_per_thread = 0,
-      uint64_t offset_intragraph = 0)
+      uint32_t offset_intragraph = 0)
      : seed_(seed),
        philox_offset_per_thread_(philox_offset_per_thread),
        offset_intragraph_(offset_intragraph) {}
@ -167,7 +167,7 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
  CUDAGeneratorImpl* clone_impl() const override;

  c10::intrusive_ptr<CUDAGeneratorState> state_;
-  std::atomic_flag no_reset_rnn_state_;
+  std::atomic_flag no_reset_rnn_state_{};
 };

 namespace cuda::detail {
--- a/aten/src/ATen/cuda/CUDAGraph.h
+++ b/aten/src/ATen/cuda/CUDAGraph.h
@ -56,7 +56,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph {

  // the ID assigned by cuda during graph capture,
  // used to identify when a stream is participating in capture
-  CaptureId_t capture_id_ = 0;
+  CaptureId_t capture_id_ = -1;

  // uuid used to request a particular private mempool from CUDACachingAllocator.
  // By default, this will be set to {id_, 0}.
--- a/aten/src/ATen/cuda/CUDASparse.h
+++ b/aten/src/ATen/cuda/CUDASparse.h
@ -6,15 +6,43 @@
 #define HIPSPARSE_VERSION ((hipsparseVersionMajor*100000) + (hipsparseVersionMinor*100) + hipsparseVersionPatch)
 #endif

+// cuSparse Generic API added in CUDA 10.1
+// Windows support added in CUDA 11.0
+#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && ((CUSPARSE_VERSION >= 10300) || (CUSPARSE_VERSION >= 11000 && defined(_WIN32)))
+#define AT_USE_CUSPARSE_GENERIC_API() 1
+#else
+#define AT_USE_CUSPARSE_GENERIC_API() 0
+#endif
+
+// cuSparse Generic API descriptor pointers were changed to const in CUDA 12.0
+#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \
+    (CUSPARSE_VERSION < 12000)
+#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 1
+#else
+#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 0
+#endif
+
+#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \
+    (CUSPARSE_VERSION >= 12000)
+#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 1
+#else
+#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 0
+#endif

 #if defined(USE_ROCM)
 // hipSparse const API added in v2.4.0
 #if HIPSPARSE_VERSION >= 200400
+#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 1
+#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0
 #define AT_USE_HIPSPARSE_GENERIC_API() 1
 #else
+#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0
+#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 1
 #define AT_USE_HIPSPARSE_GENERIC_API() 1
 #endif
 #else // USE_ROCM
+#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0
+#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0
 #define AT_USE_HIPSPARSE_GENERIC_API() 0
 #endif // USE_ROCM

--- a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp
+++ b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp
@ -12,6 +12,8 @@ cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr) {
  return cusparseDestroyDnMat(const_cast<cusparseDnMatDescr*>(dnMatDescr));
 }

+#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
+
 namespace {

 // If a specific GPU model does not provide native support for a given data
@ -208,4 +210,6 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6
  descriptor_.reset(raw_descriptor);
 }

+#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
+
 } // namespace at::cuda::sparse
--- a/aten/src/ATen/cuda/CUDASparseDescriptors.h
+++ b/aten/src/ATen/cuda/CUDASparseDescriptors.h
@ -35,6 +35,7 @@ class CuSparseDescriptor {
  std::unique_ptr<T, CuSparseDescriptorDeleter<T, destructor>> descriptor_;
 };

+#if AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
 template <typename T, cusparseStatus_t (*destructor)(const T*)>
 struct ConstCuSparseDescriptorDeleter {
  void operator()(T* x) {
@ -57,6 +58,7 @@ class ConstCuSparseDescriptor {
 protected:
  std::unique_ptr<T, ConstCuSparseDescriptorDeleter<T, destructor>> descriptor_;
 };
+#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS || AT_USE_HIPSPARSE_CONST_DESCRIPTORS

 #if defined(USE_ROCM)
 using cusparseMatDescr = std::remove_pointer_t<hipsparseMatDescr_t>;
@ -121,8 +123,39 @@ class TORCH_CUDA_CPP_API CuSparseBsrsm2Info

 #endif // AT_USE_HIPSPARSE_TRIANGULAR_SOLVE

+#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
+
 cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);

+#if AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS()
+class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
+    : public CuSparseDescriptor<cusparseDnMatDescr, &cusparseDestroyDnMat> {
+ public:
+  explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1);
+};
+
+class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor
+    : public CuSparseDescriptor<const cusparseDnMatDescr, &destroyConstDnMat> {
+ public:
+  explicit CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1);
+  cusparseDnMatDescr* unsafe_mutable_descriptor() const {
+    return const_cast<cusparseDnMatDescr*>(descriptor());
+  }
+  cusparseDnMatDescr* unsafe_mutable_descriptor() {
+    return const_cast<cusparseDnMatDescr*>(descriptor());
+  }
+};
+
+class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor
+    : public CuSparseDescriptor<cusparseDnVecDescr, &cusparseDestroyDnVec> {
+ public:
+  explicit CuSparseDnVecDescriptor(const Tensor& input);
+};
+
+class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor
+    : public CuSparseDescriptor<cusparseSpMatDescr, &cusparseDestroySpMat> {};
+
+#elif AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
  class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
      : public ConstCuSparseDescriptor<
            cusparseDnMatDescr,
@ -161,6 +194,7 @@ cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);
      : public ConstCuSparseDescriptor<
            cusparseSpMatDescr,
            &cusparseDestroySpMat> {};
+#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()

 class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor
    : public CuSparseSpMatDescriptor {
@ -249,4 +283,6 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
  }
 };

+#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
+
 } // namespace at::cuda::sparse
--- a/aten/src/ATen/cuda/CachingHostAllocator.cpp
+++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp
@ -9,6 +9,7 @@

 #include <cuda_runtime_api.h>
 #include <future>
+#include <unordered_map>

 namespace at::cuda {
 namespace {
@ -71,20 +72,9 @@ using Block = HostBlock<CUDAStream>;
 struct CUDACachingHostAllocatorImpl
    : public CachingHostAllocatorImpl<CUDAStream, EventPool::Event> {
 private:
-  ska::flat_hash_map<void*, bool> use_host_register;
+  std::unordered_map<void*, bool> use_host_register;

  void allocate_host_memory(size_t size, void** ptr) override {
-    // try allocating from reserve segment first before calling into expensive APIs
-    if (get_reserve_segment().initialized()) {
-      *ptr = get_reserve_segment().allocate(size);
-      if (*ptr != nullptr) {
-        return;
-      }
-    }
-    allocate_host_memory_slowpath(size, ptr);
-  }
-
-  void allocate_host_memory_slowpath(size_t size, void** ptr) {
    // Pinned memory pointers allocated by any device can be directly used by
    // any other device, regardless of the current device at the time of
    // allocation, since we assume unified addressing. So we grab any existing
@ -123,18 +113,6 @@ struct CUDACachingHostAllocatorImpl
  }

  void free_block(Block* block) override {
-    // We never free blocks from the reserve segment
-    if (get_reserve_segment().initialized()) {
-      // Check if the block is from the reserve segment
-      if (get_reserve_segment().owns(block->ptr_)) {
-        return;
-      }
-    }
-
-    free_block_slowpath(block);
-  }
-
-  void free_block_slowpath(Block* block) {
    auto start = std::chrono::steady_clock::now();
    // Users may change the allocator config at will. torch unit tests do this.
    // However, allocations using cudaHostRegister should use corresonding
@ -194,20 +172,6 @@ struct CUDACachingHostAllocatorImpl
    return event_pool->get(idx);
  }

-  PinnedReserveSegment& get_reserve_segment() {
-    static auto reserve_segment = [&]() {
-      if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() > 0) {
-        void *ptr;
-        size_t sz = c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() * 1024 * 1024;
-        allocate_host_memory_slowpath(sz, &ptr);
-        return PinnedReserveSegment(ptr, sz);
-      } else {
-        return PinnedReserveSegment();
-      }
-    } ();
-    return reserve_segment;
-  }
-
  TaskThreadPool* getThreadPool() {
    static TaskThreadPool* pool = new TaskThreadPool(
        static_cast<int>(c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
@ -222,15 +186,15 @@ struct CUDACachingHostAllocatorImpl
      size_t numThreads,
      size_t pageSize) {
    uintptr_t start = (uintptr_t)ptr + (size * i / numThreads);
-    uintptr_t end = start + (size / numThreads);
+    uintptr_t end = (uintptr_t)start + (size / numThreads);
    if (i == (numThreads - 1)) {
      end = (uintptr_t)ptr + size;
    }

    // pre-fault/map the pages by setting the first byte of the page
    uintptr_t alignedStart =
-        ((start + pageSize - 1) & ~(pageSize - 1));
-    for (uintptr_t p = alignedStart; p < (end); p += pageSize) {
+        (((uintptr_t)start + pageSize - 1) & ~(pageSize - 1));
+    for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) {
      // NOLINTNEXTLINE(performance-no-int-to-ptr)
      memset((void*)p, 0, 1);
    }
--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@ -326,23 +326,6 @@ bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
 #endif
 }

-bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const {
-#if AT_CUDNN_ENABLED() && (CUDNN_VERSION >= 91300)
-  if (!hasCUDA()) {
-    return false;
-  }
-  cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
-  // Check for Volta cores
-  if (prop->major >= 8) {
-    return true;
-  } else {
-    return false;
-  }
-#else
-  return false;
-#endif
-}
-
 long CUDAHooks::versionCuDNN() const {
 #if AT_CUDNN_ENABLED()
  return CUDNN_VERSION;
--- a/aten/src/ATen/cuda/detail/CUDAHooks.h
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.h
@ -45,7 +45,6 @@ struct CUDAHooks : public at::CUDAHooksInterface {
  bool supportsDilatedConvolutionWithCuDNN() const override;
  bool supportsDepthwiseConvolutionWithCuDNN() const override;
  bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
-  bool supportsBFloat16RNNWithCuDNN() const override;
  bool hasCUDART() const override;
  long versionCUDART() const override;
  long versionCuDNN() const override;
--- a/aten/src/ATen/cuda/detail/DeviceThreadHandles.h
+++ b/aten/src/ATen/cuda/detail/DeviceThreadHandles.h
@ -122,7 +122,7 @@ struct DeviceThreadHandlePool : public std::enable_shared_from_this<DeviceThread

    // Called by the destructor.  Releases this thread's handles back into the pool.
    void release() {
-        if(!my_handles.empty()) {
+        if(my_handles.size() > 0) {
            auto parent = weak_parent.lock();
            if (!parent) {
                // If this thread exits after atexit handlers have completed, the
--- a/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh
+++ b/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh
@ -19,7 +19,7 @@ struct PhiloxCudaState {
  // Called if graph capture is underway
  PhiloxCudaState(int64_t* seed,
                  int64_t* offset_extragraph,
-                  uint64_t offset_intragraph) {
+                  uint32_t offset_intragraph) {
    seed_.ptr = seed;
    offset_.ptr = offset_extragraph;
    offset_intragraph_ = offset_intragraph;
@ -36,7 +36,7 @@ struct PhiloxCudaState {

  Payload seed_{};
  Payload offset_{};
-  uint64_t offset_intragraph_ = 0;
+  uint32_t offset_intragraph_ = 0;
  bool captured_ = false;
 };

--- a/aten/src/ATen/cuda/tunable/Tunable.cpp
+++ b/aten/src/ATen/cuda/tunable/Tunable.cpp
@ -404,6 +404,8 @@ TuningContext::TuningContext() :
    max_warmup_iterations_{0},
    icache_flush_{true},
    rotating_buffer_size_{-1},
+    filename_{},
+    untuned_file_{},
    results_count_from_input_file_{0},
    is_shutting_down_{false}
 {
--- a/aten/src/ATen/cudnn/Descriptors.cpp
+++ b/aten/src/ATen/cudnn/Descriptors.cpp
@ -141,7 +141,7 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
    size[i] = (int) t.size(i);
  }
  for (const auto i : c10::irange(dim, pad)) {
-    size[i] = 1;
+    size[i] = (int) 1;
  }
  dim = std::max(dim, pad);
  cudnnTensorFormat_t filter_format{};
--- a/aten/src/ATen/detail/CUDAHooksInterface.h
+++ b/aten/src/ATen/detail/CUDAHooksInterface.h
@ -166,10 +166,6 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
    return false;
  }

-  virtual bool supportsBFloat16RNNWithCuDNN() const {
-    return false;
-  }
-
  virtual long versionCuDNN() const {
    TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
  }
--- a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp
+++ b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp
@ -176,7 +176,7 @@ struct LinalgCheckMatrixUnaryRuleHelper;

 template <char const *op_name, typename F, F Func, typename A, typename... T>
 struct LinalgCheckMatrixUnaryRuleHelper<op_name, F, Func, typelist<A, T...>> {
-  static Tensor check_and_reshape_input(const Tensor& tensor, std::optional<int64_t> batch_dim) {
+  static inline Tensor check_and_reshape_input(const Tensor& tensor, std::optional<int64_t> batch_dim) {
    TORCH_CHECK(rankWithoutBatchDim(tensor, batch_dim) >= 2, op_name, ": The input tensor A must have at least 2 dimensions.");
    return moveBatchDimToFront(tensor, batch_dim);
  }
@ -222,7 +222,7 @@ struct LinalgCheckMatrixBinaryRuleHelper;

 template <char const *op_name, typename F, F Func, typename A, typename B, typename... T>
 struct LinalgCheckMatrixBinaryRuleHelper<op_name, F, Func, typelist<A, B, T...>> {
-  static std::tuple<Tensor, Tensor> check_inputs_and_reshape_inputs(
+  static inline std::tuple<Tensor, Tensor> check_inputs_and_reshape_inputs(
      const Tensor& first, std::optional<int64_t> first_bdim,
      const Tensor& second, std::optional<int64_t> second_bdim) {
    TORCH_CHECK(rankWithoutBatchDim(first, first_bdim) >= 2,
--- a/aten/src/ATen/functorch/DynamicLayer.cpp
+++ b/aten/src/ATen/functorch/DynamicLayer.cpp
@ -465,11 +465,11 @@ static void dynamicLayerBack(const c10::OperatorHandle& op, torch::jit::Stack* s

 // used for functions that have aliasing operations but should be treated like they're out of place (i.e. lift_fresh)
 static void dynamicLayerBackGradSpecialCase(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
-  dynamicLayerBack(op, stack, true);
+  return dynamicLayerBack(op, stack, true);
 }

 static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
-  dynamicLayerBack(op, stack, false);
+  return dynamicLayerBack(op, stack, false);
 }

 TORCH_LIBRARY_IMPL(_, FuncTorchDynamicLayerFrontMode, m) {
--- a/aten/src/ATen/mps/EmptyTensor.cpp
+++ b/aten/src/ATen/mps/EmptyTensor.cpp
@ -12,7 +12,7 @@

 #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled"
 #define MPS_ERROR_RUNTIME_TOO_LOW \
-  "The MPS backend is supported on MacOS 14.0+. ", \
+  "The MPS backend is supported on MacOS 13.0+.", \
  "Current OS version can be queried using `sw_vers`"
 #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \
  "as the MPS framework doesn't support float64. Please use float32 instead."
--- a/aten/src/ATen/native/Blas.cpp
+++ b/aten/src/ATen/native/Blas.cpp
@ -58,7 +58,7 @@ scalar_t dot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y,
 template<typename scalar_t>
 scalar_t vdot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, int64_t incy);

-static constexpr bool lda_cond(int64_t m, int64_t n, int64_t lda) {
+static constexpr inline bool lda_cond(int64_t m, int64_t n, int64_t lda) {
  return n == 1 || lda >= std::max<int64_t>(1L, m);
 }

--- a/aten/src/ATen/native/BlasKernel.cpp
+++ b/aten/src/ATen/native/BlasKernel.cpp
@ -375,7 +375,7 @@ static void bf16_gemv_trans(
  const at::BFloat16 beta,
  at::BFloat16* y,
  const int incy) {
-  bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
+  return bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }

 template <>
--- a/aten/src/ATen/native/BucketizationUtils.h
+++ b/aten/src/ATen/native/BucketizationUtils.h
@ -70,7 +70,7 @@ inline void searchsorted_maybe_trim_input_tensors(
    const Tensor& raw_boundaries) {
  Tensor trimmed_sorter;
  Tensor raw_sorter;
-  searchsorted_maybe_trim_input_tensors(
+  return searchsorted_maybe_trim_input_tensors(
      trimmed_input,
      trimmed_boundaries,
      trimmed_sorter,
--- a/aten/src/ATen/native/CPUBlas.cpp
+++ b/aten/src/ATen/native/CPUBlas.cpp
@ -991,7 +991,7 @@ std::size_t UnsafeUkernelKeyHasher<PackKey>::operator()(const PackKey& key) cons
 template <typename key_t, typename value_t>
 struct KernelCache  {
  using kstore_t = std::unordered_map<key_t, std::shared_ptr<value_t>, UnsafeUkernelKeyHasher<key_t>>;
-  static std::shared_ptr<value_t>&& fetch_or_create(
+  static inline std::shared_ptr<value_t>&& fetch_or_create(
      const key_t& key,
      const std::function<std::shared_ptr<value_t>()>& callback) {
    auto&& search = get_store().find(key);
@ -1003,7 +1003,7 @@ struct KernelCache  {
    }
  }

-  static kstore_t& get_store() {
+  static inline kstore_t& get_store() {
    static thread_local kstore_t cache_kernels;
    return cache_kernels;
  }
@ -1067,7 +1067,7 @@ struct GemmHelper {
 struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
  // Fetch/create GemmHelper object and execute brgemm with batch size = 1
  template <typename scalar_t_a, typename scalar_t_b, typename scalar_t_c>
-  static void call(
+  static inline void call(
      int64_t M,
      int64_t N,
      int64_t K,
@ -1118,12 +1118,12 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
        .execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data());
  }

-  static std::shared_ptr<GemmHelper>& get_current() {
+  static inline std::shared_ptr<GemmHelper>& get_current() {
    static thread_local std::shared_ptr<GemmHelper> current;
    return current;
  }

-  static bool device_check(ScalarType dtype) {
+  static inline bool device_check(ScalarType dtype) {
    if (!at::globalContext().userEnabledMkldnn()) {
      return false;
    }
@ -1153,7 +1153,7 @@ using pack_t = dnnl::ukernel::brgemm_pack_B;
 using pack_t = dnnl::ukernel::transform;
 #endif
 struct Pack : public KernelCache <PackKey, pack_t> {
-  static void call(
+  static inline void call(
      int64_t K,
      int64_t N,
      int64_t ld_in,
@ -1182,7 +1182,7 @@ struct Pack : public KernelCache <PackKey, pack_t> {
    }
  }

-  static bool could_pack(ScalarType dtype) {
+  static inline bool could_pack(ScalarType dtype) {
    if (!at::globalContext().userEnabledMkldnn()) {
      return false;
    }
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -702,7 +702,7 @@ static void check_shape_forward(const at::Tensor& input,
      // If kernel size is incorrect
      std::ostringstream input_ss;
      std::ostringstream kernel_ss;
-      std::string separator;
+      std::string separator = "";

      for (int i = 0, len = input_shape.size(); i < len; ++i) {
        input_ss << separator << input_shape[i];
@ -1019,7 +1019,7 @@ static Tensor convolution_same(

  if (symmetric_padding) {
    // All backends handle symmetric padding natively
-    SymDimVector output_padding(dim);
+    SymDimVector output_padding(static_cast<size_t>(dim));
    return at::convolution_symint(input, weight, bias, stride, padding_l, dilation,
                               false, output_padding, groups);
  }
@ -1039,7 +1039,7 @@ static Tensor convolution_same(
    }
  }
  auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0);
-  SymDimVector output_padding(dim);
+  SymDimVector output_padding(static_cast<size_t>(dim));
  return at::convolution_symint(padded_input, weight, bias, stride, padding_l,
                                dilation, false, output_padding, groups);
 }
--- a/aten/src/ATen/native/Copy.cpp
+++ b/aten/src/ATen/native/Copy.cpp
@ -1,5 +1,6 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/Copy.h>
+#include <ATen/native/Copy.h>

 #include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
--- a/aten/src/ATen/native/GridSamplerUtils.h
+++ b/aten/src/ATen/native/GridSamplerUtils.h
@ -93,12 +93,6 @@ inline bool cond_cudnn_grid_sampler(
  const TensorBase& input,
  const TensorBase& grid
 ) {
-  auto st = input.scalar_type();
-  if (!(st == kDouble || st == kFloat || st == kHalf))
-    return false;
-  st = grid.scalar_type();
-  if (!(st == kDouble || st == kFloat || st == kHalf))
-    return false;
  return (
    at::native::cudnn_is_acceptable(input) &&
    at::native::cudnn_is_acceptable(grid) &&
--- a/aten/src/ATen/native/PadNd.cpp
+++ b/aten/src/ATen/native/PadNd.cpp
@ -70,7 +70,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
        new_shape.emplace_back(input_sizes[i]);
    }

-    for (const auto i : c10::irange(l_pad)) {
+    for (const auto i : c10::irange((size_t)l_pad)) {
        auto pad_idx = pad.size() - ((i + 1) * 2);
        auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
        TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
--- a/aten/src/ATen/native/RNN.cpp
+++ b/aten/src/ATen/native/RNN.cpp
@ -108,13 +108,6 @@ bool use_mkldnn(const Tensor& input, TensorList params, TensorList hx) {
  return false;
 }

-bool use_cudnn(const Tensor& t) {
-  bool acceptable = at::cudnn_is_acceptable(t);
-  auto st = t.scalar_type();
-  bool bfloat16_cond = st == kBFloat16 && at::detail::getCUDAHooks().supportsBFloat16RNNWithCuDNN();
-  return acceptable && (bfloat16_cond || st == kDouble || st == kFloat || st == kHalf);
-}
-
 template<typename T>
 using pair_of = std::pair<T, T>;

@ -1207,7 +1200,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
      bool train,                                                           \
      bool bidirectional,                                                   \
      bool batch_first) {                                                   \
-    if (use_cudnn(_input)) {                                                \
+    if (at::cudnn_is_acceptable(_input)) {                                  \
      Tensor output, hy;                                                    \
      NAME##_cudnn_stub(                                                    \
          _input.device().type(),                                           \
@ -1269,7 +1262,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
      double dropout_p,                                                     \
      bool train,                                                           \
      bool bidirectional) {                                                 \
-    if (use_cudnn(data)) {                                                  \
+    if (at::cudnn_is_acceptable(data)) {                                    \
      Tensor output, hy;                                                    \
      NAME##_packed_cudnn_stub(                                             \
          data.device().type(),                                             \
@ -1437,7 +1430,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
      TensorList _params, bool has_biases,
      int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
  TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
-  if (use_cudnn(_input)) {
+  if (at::cudnn_is_acceptable(_input)) {
    Tensor output, hy, cy;
    lstm_cudnn_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases,
            num_layers, dropout_p, train, bidirectional, batch_first);
@ -1498,7 +1491,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
      TensorList _params, bool has_biases,
      int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
  TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
-  if (use_cudnn(data)) {
+  if (at::cudnn_is_acceptable(data)) {
    Tensor output, hy, cy;
    lstm_packed_cudnn_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx,
            _params, has_biases, num_layers, dropout_p, train, bidirectional);
--- a/aten/src/ATen/native/RangeUtils.h
+++ b/aten/src/ATen/native/RangeUtils.h
@ -47,7 +47,7 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar
    int64_t sgn = (xstep > 0) - (xstep < 0);
    size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
  } else {
-    size_d = std::ceil((end.to<double>() - start.to<double>())
+    size_d = std::ceil(static_cast<double>(end.to<double>() - start.to<double>())
                        / step.to<double>());
  }

--- a/aten/src/ATen/native/Resize.cpp
+++ b/aten/src/ATen/native/Resize.cpp
@ -107,6 +107,11 @@ void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) {
  storage->set_nbytes(size_bytes);
 }

+// Call the sparse implementation in SparseTensor.cpp directly.
+// A dynamic dispatch here is NOT necessary, so I didn't put
+// this function in native_functions.yaml
+const Tensor& resize_as_sparse_(const Tensor& self, const Tensor& src);
+
 // TODO(VitalyFedyunin): Move it to HTML docs.
 //
 // Strides of the output tensor of `resize_as_` operator is defined by input
--- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp
+++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
@ -145,6 +145,12 @@
 #include <utility>
 #include <vector>

+namespace at::native {
+
+AdvancedIndex make_info(Tensor self, IOptTensorListRef orig);
+
+} // namespace at::native
+
 namespace at::meta {

 TORCH_META_FUNC(gather)
--- a/aten/src/ATen/native/TensorCompare.cpp
+++ b/aten/src/ATen/native/TensorCompare.cpp
@ -73,6 +73,7 @@
 #include <ATen/ops/where_native.h>
 #include <ATen/ops/zeros_like.h>

+#include <iostream>
 #include <utility>
 #endif

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
jainapurva	923822be65	updates	2025-10-01 20:12:21 -07:00
jainapurva	3662a0178a	<Replace this line with a title. Use 1 line only, 67 chars or less> Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:	2025-10-01 19:46:46 -07:00
Apurva Jain	7255fa2bb1	Change test workflow from Linux to ROCm	2025-10-01 10:46:10 -07:00
Apurva Jain	c2608e2b47	Remove comment about H100 A100 runners	2025-09-30 14:07:57 -07:00
Apurva Jain	19fec2087f	Update operator_microbenchmark workflow Removed pull request triggers and B200 runner configurations.	2025-09-30 13:55:19 -07:00
jainapurva	1bd7e2504a	Add b200	2025-09-29 14:24:47 -07:00
jainapurva	fbf67f6821	Add rocm flow to CI	2025-09-29 13:28:15 -07:00