Compare commits

3 Commits

Author SHA1 Message Date
e38ac55aae cleanup and remove alert test 2025-09-10 22:46:12 +00:00
f20c28eba1 update 2025-09-02 17:48:42 +00:00
daac58237f check in 2025-08-28 23:43:31 +00:00
246 changed files with 2531 additions and 9495 deletions

View File

@ -81,8 +81,8 @@ elif [[ "$image" == *riscv* ]]; then
DOCKERFILE="ubuntu-cross-riscv/Dockerfile"
fi
_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
if [[ "$image" == *rocm* ]]; then
_UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
@ -114,19 +114,31 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
CUDA_VERSION=13.0.0
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
@ -213,8 +225,7 @@ case "$tag" in
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
# TODO (huydhn): Upgrade this to Python >= 3.10
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
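
Note: the case statement above maps each Docker image tag to the build variables (CUDA version, Python version, GCC version, UCX/UCC commits, benchmark flags) baked into that image. As a rough sketch only, with the tag list trimmed and the fields simplified, the same selection could be expressed as a lookup table in Python:

# Illustrative sketch only -- not part of this change. It mirrors the shape of
# the case statement: one image tag selects one bundle of build settings.
IMAGE_CONFIGS = {
    "pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks": dict(
        CUDA_VERSION="12.8.1", ANACONDA_PYTHON_VERSION="3.10", GCC_VERSION="9",
        VISION=True, KATEX=True, TRITON=True, INDUCTOR_BENCHMARKS=True,
    ),
    "pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks": dict(
        CUDA_VERSION="12.8.1", ANACONDA_PYTHON_VERSION="3.12", GCC_VERSION="9",
        VISION=True, KATEX=True, TRITON=True, INDUCTOR_BENCHMARKS=True,
    ),
    "pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks": dict(
        CUDA_VERSION="12.8.1", ANACONDA_PYTHON_VERSION="3.13", GCC_VERSION="9",
        VISION=True, KATEX=True, TRITON=True, INDUCTOR_BENCHMARKS=True,
    ),
}

def config_for(tag: str) -> dict:
    # Unknown tags get an empty config here; the real script handles them separately.
    return IMAGE_CONFIGS.get(tag, {})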

View File

@ -1 +1 @@
74a23feff57432129df84d8099e622773cf77925
22bc29b4d503fc895ff73bc720ff396e9723465f

View File

@ -44,12 +44,8 @@ function install_ucc() {
./autogen.sh
if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then
NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86"
else
# We only run distributed tests on Tesla M60 and A10G
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
fi
# We only run distributed tests on Tesla M60 and A10G
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
if [[ -n "$ROCM_VERSION" ]]; then
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
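
Note: this hunk drops the CUDA 13 special case in install_ucc, so NVCC_GENCODE is always the Tesla M60 (sm_52) plus A10G (compute_86) pair used by the distributed test runners. A minimal Python sketch of the branch being removed, for reference only, with CUDA_VERSION handling simplified:

from typing import Optional

def nvcc_gencode(cuda_version: Optional[str]) -> str:
    # Sketch of the removed selection: CUDA 13 builds previously dropped the
    # sm_52 target; after this change every build uses the combined pair below.
    if cuda_version and cuda_version.startswith("13"):
        return "-gencode=arch=compute_86,code=compute_86"
    return ("-gencode=arch=compute_52,code=sm_52 "
            "-gencode=arch=compute_86,code=compute_86")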

View File

@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably

View File

@ -66,7 +66,6 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
# (optional) Install UCC
ARG UCX_COMMIT
ARG UCC_COMMIT
ARG CUDA_VERSION
ENV UCX_COMMIT $UCX_COMMIT
ENV UCC_COMMIT $UCC_COMMIT
ENV UCX_HOME /usr

View File

@ -35,10 +35,11 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
# Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then

View File

@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd
python -mpip install -r requirements.txt
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
python -mpip install --no-input -r requirements.txt
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to

View File

@ -496,14 +496,6 @@ test_inductor_cpp_wrapper_shard() {
-k 'take' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
python test/run_test.py \
--include inductor/test_mkldnn_pattern_matcher \
-k 'xpu' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
fi
}
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG

View File

@ -213,8 +213,7 @@ pip install requests ninja typing-extensions
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is build as part of tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of the tensorpipe submodule
export USE_DISTRIBUTED=1
export USE_MKLDNN=OFF

View File

@ -57,21 +57,6 @@ runs:
submodules: ${{ inputs.submodules }}
show-progress: false
- name: Clean submodules post checkout
id: clean-submodules
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
shell: bash
env:
NO_SUDO: ${{ inputs.no-sudo }}
run: |
cd "${GITHUB_WORKSPACE}"
# Clean stale submodule dirs
if [ -z "${NO_SUDO}" ]; then
sudo git submodule foreach --recursive git clean -ffdx
else
git submodule foreach --recursive git clean -ffdx
fi
- name: Clean workspace (try again)
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
(steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}

View File

@ -1 +1 @@
1c66402d0fa47ea74d365dcaa468d397da481918
10a5002c6195bd95e34df8fe28ff8a2d55a2a922

View File

@ -1 +1 @@
752d2e1c364e4195093e4f3f2fc33e3ae1840707
321938e9ac4000e0cb37e328359a7fd3026bc672

View File

@ -1 +1 @@
763e5b78d4fcd74a9e812256656c075f99d9a781
a1c6ee92c85e8b0955c20892ed68f032a6015c09

View File

@ -244,6 +244,8 @@ def generate_libtorch_matrix(
arches.remove("13.0")
elif os == "windows":
arches += CUDA_ARCHES
if "13.0" in arches:
arches.remove("13.0")
if libtorch_variants is None:
libtorch_variants = [
"shared-with-deps",
@ -308,6 +310,8 @@ def generate_wheels_matrix(
arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
elif os == "windows":
arches += CUDA_ARCHES + XPU_ARCHES
if "13.0" in arches:
arches.remove("13.0")
elif os == "linux-aarch64":
# Separate new if as the CPU type is different and
# uses different build/test scripts
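
Note: both hunks add the same guard for Windows: CUDA 13.0 is removed from the architecture list before the libtorch and wheel matrix entries are generated. A hedged sketch of the pattern, with stand-in values rather than the module's real constants:

# Stand-in values for illustration; the real CUDA_ARCHES list lives in
# generate_binary_build_matrix.py and may differ.
CUDA_ARCHES = ["12.6", "12.8", "13.0"]

def windows_arches(base_arches: list) -> list:
    arches = list(base_arches) + CUDA_ARCHES
    # Skip CUDA 13.0 binaries on Windows, as in this change.
    if "13.0" in arches:
        arches.remove("13.0")
    return arches

print(windows_arches(["cpu"]))  # ['cpu', '12.6', '12.8']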

View File

@ -27,7 +27,6 @@ from trymerge import (
get_drci_classifications,
gh_get_team_members,
GitHubPR,
iter_issue_timeline_until_comment,
JobCheckState,
main as trymerge_main,
MandatoryChecksMissingError,
@ -35,8 +34,6 @@ from trymerge import (
RE_GHSTACK_DESC,
read_merge_rules,
remove_job_name_suffix,
sha_from_committed_event,
sha_from_force_push_after,
validate_revert,
)
@ -1141,176 +1138,5 @@ Pull Request resolved: https://github.com/pytorch/pytorch/pull/154394"""
)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
)
class TestTimelineFunctions(TestCase):
"""Tests for the new timeline-related functions"""
def test_sha_from_committed_event(self, *args: Any) -> None:
"""Test extracting SHA from committed event"""
# Based on actual GitHub API format - committed events have "sha" at top level
event = {
"event": "committed",
"sha": "fb21ce932ded6670c918804a0d9151b773770a7c",
}
self.assertEqual(
sha_from_committed_event(event), "fb21ce932ded6670c918804a0d9151b773770a7c"
)
# Test with missing SHA
event_no_sha = {"event": "committed"}
self.assertIsNone(sha_from_committed_event(event_no_sha))
def test_sha_from_force_push_after(self, *args: Any) -> None:
"""Test extracting SHA from force push event"""
# NOTE: The current function doesn't handle the actual GitHub API format
# Real force push events have "commit_id" at top level, but this function
# looks for "after", "after_commit", "after_sha", or "head_sha" fields
# Test with the legacy format the current function handles
event_legacy = {
"event": "head_ref_force_pushed",
"after": {"sha": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e"},
}
self.assertEqual(
sha_from_force_push_after(event_legacy),
"ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
)
# Test with current GitHub API format (should return None with current implementation)
event_real_api = {
"event": "head_ref_force_pushed",
"commit_id": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
}
self.assertEqual(
sha_from_force_push_after(event_real_api),
"ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
) # Current function doesn't handle commit_id
# Test with missing SHA
event_no_sha = {"event": "head_ref_force_pushed"}
self.assertIsNone(sha_from_force_push_after(event_no_sha))
@mock.patch("trymerge.gh_fetch_json_list")
def test_iter_issue_timeline_until_comment(
self, mock_gh_fetch_json_list: Any, *args: Any
) -> None:
"""Test timeline iteration until target comment"""
# Mock timeline data based on actual GitHub API format
timeline_data = [
{"event": "commented", "id": 100, "body": "first comment"},
{"event": "committed", "sha": "fb21ce932ded6670c918804a0d9151b773770a7c"},
{"event": "commented", "id": 200, "body": "target comment"},
{"event": "commented", "id": 300, "body": "after target"},
]
mock_gh_fetch_json_list.return_value = timeline_data
# Test iteration stops at target comment
events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 200))
self.assertEqual(len(events), 3) # Should stop at target comment
self.assertEqual(events[0]["event"], "commented")
self.assertEqual(events[0]["id"], 100)
self.assertEqual(events[1]["event"], "committed")
self.assertEqual(events[1]["sha"], "fb21ce932ded6670c918804a0d9151b773770a7c")
self.assertEqual(events[2]["event"], "commented")
self.assertEqual(events[2]["id"], 200)
@mock.patch("trymerge.gh_fetch_json_list")
def test_iter_issue_timeline_until_comment_not_found(
self, mock_gh_fetch_json_list: Any, *args: Any
) -> None:
"""Test timeline iteration when target comment is not found"""
# Mock empty timeline
mock_gh_fetch_json_list.return_value = []
events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 999))
self.assertEqual(len(events), 0)
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_commit_after_comment(
self, mock_iter_timeline: Any, *args: Any
) -> None:
"""Test get_commit_sha_at_comment returns correct SHA after comment"""
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "commented", "id": 100},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit2")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_force_push_before_comment(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "head_ref_force_pushed", "commit_id": "commit3"},
{"event": "commented", "id": 100},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_force_push_before_comment_legacy_mode(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
{"event": "commented", "id": 100},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_multiple_comments(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "commented", "id": 100},
{"event": "committed", "sha": "commit2"},
{"event": "commented", "id": 200},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
{"event": "commented", "id": 300},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(200)
self.assertEqual(sha, "commit2")
sha = pr.get_commit_sha_at_comment(300)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_no_events(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "commented", "id": 100},
{"event": "labeled", "label": {"name": "test"}},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertIsNone(sha)
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_exception(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.side_effect = Exception("API error")
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertIsNone(sha)
if __name__ == "__main__":
main()

View File

@ -450,63 +450,6 @@ HAS_NO_CONNECTED_DIFF_TITLE = (
IGNORABLE_FAILED_CHECKS_THESHOLD = 10
def iter_issue_timeline_until_comment(
org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200
) -> Any:
"""
Yield timeline entries in order until (and including) the entry whose id == target_comment_id
for a 'commented' event. Stops once the target comment is encountered.
"""
page = 1
while page <= max_pages:
url = (
f"https://api.github.com/repos/{org}/{repo}/issues/{issue_number}/timeline"
)
params = {"per_page": 100, "page": page}
batch = gh_fetch_json_list(url, params)
if not batch:
return
for ev in batch:
# The target is the issue comment row with event == "commented" and id == issue_comment_id
if ev.get("event") == "commented" and ev.get("id") == target_comment_id:
yield ev # nothing in the timeline after this matters, so stop early
return
yield ev
if len(batch) < 100:
return
page += 1
# If we got here without finding the comment, then we either hit a bug or some github PR
# has a _really_ long timeline.
# The max # of pages found on any pytorch/pytorch PR at the time of this change was 41
raise RuntimeError(
f"Could not find a merge commit in the first {max_pages} pages of the timeline at url {url}."
f"This is most likely a bug, please report it to the @pytorch/pytorch-dev-infra team."
)
def sha_from_committed_event(ev: dict[str, Any]) -> Optional[str]:
"""Extract SHA from committed event in timeline"""
return ev.get("sha")
def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]:
"""Extract SHA from force push event in timeline"""
# The current GitHub API format
commit_id = ev.get("commit_id")
if commit_id:
return str(commit_id)
# Legacy format
after = ev.get("after") or ev.get("after_commit") or {}
if isinstance(after, dict):
return after.get("sha") or after.get("oid")
return ev.get("after_sha") or ev.get("head_sha")
def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
return rc["data"]["repository"]["pullRequest"]
@ -900,44 +843,6 @@ class GitHubPR:
def get_commit_count(self) -> int:
return int(self.info["commits_with_authors"]["totalCount"])
def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]:
"""
Get the PR head commit SHA that was present when a specific comment was posted.
This ensures we only merge the state of the PR at the time the merge command was issued,
not any subsequent commits that may have been pushed after.
Returns None if no head-changing events found before the comment or if the comment was not found.
"""
head = None
try:
for event in iter_issue_timeline_until_comment(
self.org, self.project, self.pr_num, comment_id
):
etype = event.get("event")
if etype == "committed":
sha = sha_from_committed_event(event)
if sha:
head = sha
print(f"Timeline: Found commit event for SHA {sha}")
elif etype == "head_ref_force_pushed":
sha = sha_from_force_push_after(event)
if sha:
head = sha
print(f"Timeline: Found force push event for SHA {sha}")
elif etype == "commented":
if event.get("id") == comment_id:
print(f"Timeline: Found final comment with sha {sha}")
return head
except Exception as e:
print(
f"Warning: Failed to reconstruct timeline for comment {comment_id}: {e}"
)
return None
print(f"Did not find comment with id {comment_id} in the PR timeline")
return None
def get_pr_creator_login(self) -> str:
return cast(str, self.info["author"]["login"])
@ -1329,14 +1234,11 @@ class GitHubPR:
skip_all_rule_checks: bool = False,
) -> list["GitHubPR"]:
"""
:param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally
:param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally
"""
branch_to_merge_into = self.default_branch() if branch is None else branch
if repo.current_branch() != branch_to_merge_into:
repo.checkout(branch_to_merge_into)
# It's okay to skip the commit SHA check for ghstack PRs since
# authoring requires write access to the repo.
if self.is_ghstack_pr():
return self.merge_ghstack_into(
repo,
@ -1347,41 +1249,14 @@ class GitHubPR:
msg = self.gen_commit_message()
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
# Determine which commit SHA to merge
commit_to_merge = None
if not comment_id:
raise ValueError("Must provide --comment-id when merging regular PRs")
# Get the commit SHA that was present when the comment was made
commit_to_merge = self.get_commit_sha_at_comment(comment_id)
if not commit_to_merge:
raise RuntimeError(
f"Could not find commit that was pushed before comment {comment_id}"
)
# Validate that this commit is the latest commit on the PR
latest_commit = self.last_commit_sha()
if commit_to_merge != latest_commit:
raise RuntimeError(
f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted "
f"but now the latest commit on the PR is {latest_commit}. "
f"Please re-issue the merge command to merge the latest commit."
)
print(f"Merging commit {commit_to_merge} locally")
repo.fetch(commit_to_merge, pr_branch_name)
repo.fetch(self.last_commit_sha(), pr_branch_name)
repo._run_git("merge", "--squash", pr_branch_name)
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
# Did the PR change since we started the merge?
pulled_sha = repo.show_ref(pr_branch_name)
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
if (
pulled_sha != latest_pr_status.last_commit_sha()
or pulled_sha != commit_to_merge
):
if pulled_sha != latest_pr_status.last_commit_sha():
raise RuntimeError(
"PR has been updated since CI checks last passed. Please rerun the merge command."
)
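
Note: the trymerge.py changes above remove the timeline-based check that resolved which commit was HEAD when the merge comment was posted (iter_issue_timeline_until_comment, sha_from_committed_event, sha_from_force_push_after, GitHubPR.get_commit_sha_at_comment) and return to fetching last_commit_sha() directly, keeping only the post-fetch freshness check. For reference, a condensed sketch of the removed resolution logic, with pagination and error handling elided:

from typing import Iterable, Optional

def head_sha_at_comment(events: Iterable[dict], comment_id: int) -> Optional[str]:
    # Walk timeline events in order, tracking the most recent head-changing SHA,
    # and return it once the target comment is reached (None if never found).
    head: Optional[str] = None
    for ev in events:
        etype = ev.get("event")
        if etype == "committed":
            head = ev.get("sha") or head
        elif etype == "head_ref_force_pushed":
            after = ev.get("after") or {}
            head = ev.get("commit_id") or after.get("sha") or head
        elif etype == "commented" and ev.get("id") == comment_id:
            return head
    return None

The removed production code additionally paginated the GitHub timeline API (up to 200 pages) and raised an error if the target comment was never found.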

View File

@ -77,7 +77,6 @@ jobs:
run: |
git config --global core.longpaths true
git config --global core.symlinks true
git config --global core.ignorecase false
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported

View File

@ -70,7 +70,6 @@ jobs:
run: |
git config --global core.longpaths true
git config --global core.symlinks true
git config --global core.ignorecase false
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported

View File

@ -275,7 +275,7 @@ jobs:
- name: Change permissions
if: ${{ always() && steps.test.conclusion }}
run: |
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1000:1000 test"
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
- name: Print remaining test logs
shell: bash

View File

@ -50,9 +50,10 @@ jobs:
runner: [linux.12xlarge]
docker-image-name: [
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,
@ -63,7 +64,7 @@ jobs:
pytorch-linux-jammy-rocm-n-py3-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12,
pytorch-linux-jammy-py3.10-gcc11,
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-n-1-py3,
pytorch-linux-jammy-xpu-n-py3,

View File

@ -1038,253 +1038,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda13_0-shared-with-deps-debug
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda13_0-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda13_0-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-debug-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda13_0-shared-with-deps-debug-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda13_0-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -1038,253 +1038,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda13_0-shared-with-deps-release
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda13_0-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda13_0-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda13_0-shared-with-deps-release-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda13_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

File diff suppressed because it is too large

View File

@ -18,13 +18,13 @@ permissions:
contents: read
jobs:
inductor-build:
linux-jammy-cpu-py3_9-gcc11-inductor-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: inductor-build
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
# Use metal host for benchmark jobs
test-matrix: |
{ include: [
@ -32,13 +32,13 @@ jobs:
]}
secrets: inherit
inductor-micro-benchmark-test:
name: inductor-micro-benchmark-test
linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@ -32,13 +32,13 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
nightly-dynamo-benchmarks-build:
name: nightly-dynamo-benchmarks-build
linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build:
name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
{ include: [
@ -51,13 +51,13 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
nightly-dynamo-benchmarks-test:
name: nightly-dynamo-benchmarks-test
linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test:
name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: nightly-dynamo-benchmarks-build
needs: linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@ -84,8 +84,9 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
# NB: Keep this in sync with trunk.yml
build:
name: build
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -127,7 +128,7 @@ jobs:
secrets: inherit
test-periodically:
name: test-periodically
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '15 0,12 * * 1-6'
@ -144,7 +145,7 @@ jobs:
secrets: inherit
test-weekly:
name: test-weekly
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
@ -161,7 +162,7 @@ jobs:
secrets: inherit
test:
name: test
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
# The pull_request trigger is used in PR to bump transformers pin which always

View File

@ -69,14 +69,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-zen-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cpu_x86_zen", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" },
@ -95,16 +95,16 @@ jobs:
selected-test-configs: ${{ inputs.benchmark_configs }}
secrets: inherit
inductor-test-nightly:
name: inductor-test-nightly
linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false
@ -112,16 +112,17 @@ jobs:
monitor-data-collect-interval: 4
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-zen-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false

View File

@ -74,14 +74,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" },
@ -101,16 +101,16 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-test-nightly-freezing:
name: inductor-test-nightly-freezing
linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly-freezing:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false
@ -118,16 +118,16 @@ jobs:
monitor-data-collect-interval: 4
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false

View File

@ -79,6 +79,7 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
# NB: Keep this in sync with trunk.yml
build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml

View File

@ -31,8 +31,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
periodic-dynamo-benchmarks-build:
name: periodic-dynamo-benchmarks-build
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build:
name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
@ -57,33 +57,23 @@ jobs:
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
periodic-dynamo-benchmarks-test:
name: periodic-dynamo-benchmarks-test
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test:
name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
rocm-periodic-dynamo-benchmarks-build:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build:
if: github.repository_owner == 'pytorch'
name: rocm-periodic-dynamo-benchmarks-build
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
@ -109,21 +99,21 @@ jobs:
]}
secrets: inherit
rocm-periodic-dynamo-benchmarks-test:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test:
permissions:
id-token: write
contents: read
name: rocm-periodic-dynamo-benchmarks-test
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_rocm-test.yml
needs: rocm-periodic-dynamo-benchmarks-build
needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-rocm-py3_10
docker-image: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
inductor-smoke-build:
name: inductor-smoke-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
@ -139,23 +129,23 @@ jobs:
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
inductor-smoke-test:
name: inductor-smoke-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-cpu-build:
name: periodic-dynamo-benchmarks-cpu-build
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build:
name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
{ include: [
@ -170,6 +160,68 @@ jobs:
{ config: "cpu_inductor_freezing_avx2_torchbench", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test:
name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_freezing_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_freezing_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_freezing_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
@ -195,12 +247,12 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
periodic-dynamo-benchmarks-cpu-test:
name: periodic-dynamo-benchmarks-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -28,8 +28,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -47,18 +47,44 @@ jobs:
]}
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-halide-build:
name: inductor-halide-build
linux-jammy-cuda12_8-py3_12-gcc9-inductor-build:
name: cuda12.8-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_12-gcc9-inductor-test:
name: cuda12.8-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_12-inductor-halide-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -71,18 +97,18 @@ jobs:
]}
secrets: inherit
inductor-halide-test:
name: inductor-halide-test
linux-jammy-cpu-py3_12-inductor-halide-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-test.yml
needs: inductor-halide-build
needs: linux-jammy-cpu-py3_12-inductor-halide-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }}
secrets: inherit
inductor-triton-cpu-build:
name: inductor-triton-cpu-build
linux-jammy-cpu-py3_12-inductor-triton-cpu-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -95,23 +121,23 @@ jobs:
]}
secrets: inherit
inductor-triton-cpu-test:
linux-jammy-cpu-py3_12-inductor-triton-cpu-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-test.yml
needs: inductor-triton-cpu-build
needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-triton-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
@ -122,12 +148,37 @@ jobs:
]}
secrets: inherit
inductor-cpu-test:
name: inductor-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_13-gcc9-inductor-build:
name: cuda12.8-py3.13-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_13-gcc9-inductor-test:
name: cuda12.8-py3.13-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -44,8 +44,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -53,6 +53,7 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
@ -64,24 +65,25 @@ jobs:
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
@ -96,12 +98,12 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-cpu-test:
name: inductor-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -24,38 +24,38 @@ permissions:
contents: read
jobs:
opbenchmark-build:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-build:
if: github.repository_owner == 'pytorch'
name: opbenchmark-build
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
opbenchmark-on-demand-build:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build:
if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }}
name: opbenchmark-on-demand-build
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
opbenchmark-test:
name: opbenchmark-test
linux-jammy-cpu-py3_9-gcc11-opbenchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-test.yml
needs: opbenchmark-build
needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }}
secrets: inherit

View File

@ -170,38 +170,6 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc11-build:
name: linux-jammy-cuda13.0-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
cuda-arch-list: 7.5
build-environment: linux-jammy-cuda13.0-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc11-test:
name: linux-jammy-cuda13.0-py3.10-gcc11
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda13_0-py3_10-gcc11-build
- target-determination
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml

View File

@ -224,12 +224,13 @@ jobs:
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
secrets: inherit
inductor-build:
name: inductor-build
# NB: Keep this in sync with inductor-perf-test-nightly.yml
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit
@ -241,7 +242,7 @@ jobs:
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },

View File

@ -1801,26 +1801,3 @@ command = [
"python3",
"tools/linter/adapters/gb_registry_linter.py",
]
[[linter]]
code = 'DISTRIBUTED_C10D_DIRECT_ACCESS'
include_patterns = ['**/*.py']
exclude_patterns = [
'torch/distributed/_distributed_c10d.py',
'fb/**',
'**/fb/**',
]
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=torch\._C\._distributed_c10d',
'--linter-name=DISTRIBUTED_C10D_DIRECT_ACCESS',
'--error-name=direct access to torch._C._distributed_c10d',
"""--error-description=\
Never access torch._C._distributed_c10d directly in code. Always \
import from and use torch.distributed._distributed_c10d which is \
guaranteed to have all functions available\
""",
'--',
'@{{PATHSFILE}}'
]

View File

@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",

View File

@ -181,9 +181,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@ -262,11 +261,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -431,10 +430,11 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()

View File

@ -216,7 +216,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention
if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp})
target_include_directories(flash_attention SYSTEM PUBLIC
target_include_directories(flash_attention PUBLIC
${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc
${PROJECT_SOURCE_DIR}/third_party/flash-attention/include
${PROJECT_SOURCE_DIR}/third_party/cutlass/include

View File

@ -279,45 +279,10 @@ bool Context::userEnabledOverrideableSDP() const {
return enabled_overrideable;
}
static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
#ifdef USE_ROCM
static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
#endif
bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
// is set to deterministic setting
if (hasCUDART()) {
const auto workspace_config = c10::utils::get_env(cublas_config_var_name);
return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]);
}
return true;
}
void Context::alertCuBLASConfigNotDeterministic() const {
static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic();
if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) {
return;
}
auto msg = c10::str(
"Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ",
"`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ",
"it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ",
"case, you must set an environment variable before running your PyTorch application: ",
cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ",
cublas_config_var_name, "=", cublas_deterministic_configs[1], ". For more information, go to ",
"https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility"
);
if (deterministicAlgorithmsWarnOnly()) {
TORCH_WARN(msg);
} else {
TORCH_CHECK(false, msg);
}
}
bool Context::benchmarkCuDNN() const {
return benchmark_cudnn;
}
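For reference, a minimal standalone sketch of the workspace-config check that this hunk removes, with std::getenv standing in for c10::utils::get_env; the two accepted values mirror the cublas_deterministic_configs array above.

// Returns true only when CUBLAS_WORKSPACE_CONFIG is set to one of the two
// settings cuBLAS documents as deterministic. Unset or any other value counts
// as non-deterministic in this sketch.
#include <cstdlib>
#include <string>

bool cublas_workspace_config_is_deterministic() {
  const char* cfg = std::getenv("CUBLAS_WORKSPACE_CONFIG");
  if (cfg == nullptr) {
    return false;
  }
  const std::string value(cfg);
  return value == ":4096:8" || value == ":16:8";
}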

View File

@ -310,13 +310,7 @@ class TORCH_API Context {
//
// * Throw an error when `Context::deterministicAlgorithms()` is true. Most
// of the time, this should be accomplished by calling
// `at::globalContext().alertNotDeterminstic()`. However, if the
// nondeterministic behavior is caused by the CuBLAS workspace
// configuration in CUDA >= 10.2,
// `at::globalContext().alertCuBLASConfigNotDeterministic()` should be
// called instead (in this case, a comment explaining why the operation is
// nondeterministic is not necessary). See below for details on these
// methods.
// `at::globalContext().alertNotDeterminstic().
//
// * Have an entry in the list of nondeterministic PyTorch operations in the
// docstring of `use_deterministic_algorithms()` in torch/__init__.py
@ -340,12 +334,6 @@ class TORCH_API Context {
// Throws an error if `Context::deterministicAlgorithms()` is true
static void alertNotDeterministic(std::string_view const& caller);
// Throws an error if `Context::deterministicAlgorithms()` is true, CUDA
// >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or
// ":4096:8". For more details:
// https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
void alertCuBLASConfigNotDeterministic() const;
void setFloat32MatmulPrecision(const std::string& s);
void setFloat32Precision(
const std::string& backend,
@ -429,7 +417,6 @@ class TORCH_API Context {
}
private:
static bool checkCuBLASConfigDeterministic();
std::array<c10::once_flag, at::COMPILE_TIME_MAX_DEVICE_TYPES> init_;
bool enabled_cudnn = true;
bool deterministic_cudnn = false;
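As a hedged illustration of the pattern the note above describes, a nondeterministic operation guards itself by alerting before doing any work; the op name below is a placeholder, not an operator touched by this diff.

#include <ATen/Context.h>

// Throws when deterministic algorithms are required, or only warns when the
// warn-only mode was requested.
void my_nondeterministic_op_guard() {
  at::globalContext().alertNotDeterministic("my_nondeterministic_op");
}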

View File

@ -436,7 +436,6 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
}
globalContext().alertCuBLASConfigNotDeterministic();
cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -570,8 +569,6 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D
template <>
void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -583,8 +580,6 @@ void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
template <>
void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -596,8 +591,6 @@ void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
template <>
void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -611,8 +604,6 @@ void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::co
template <>
void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<float>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -626,8 +617,6 @@ void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::com
template <typename C_Dtype>
inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -697,8 +686,6 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
template <typename C_Dtype>
inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
BGEMM_CHECK_ARGVALUES(at::BFloat16);
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
@ -1027,8 +1014,6 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty
template <>
void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1040,8 +1025,6 @@ void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
template <>
void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1053,8 +1036,6 @@ void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
template <>
void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1068,8 +1049,6 @@ void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::comp
template <>
void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1083,8 +1062,6 @@ void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::compl
template <typename C_Dtype>
inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1191,7 +1168,6 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
template <typename C_Dtype>
inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1947,11 +1923,11 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER;
cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER;
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
// hipblaslt supported row-wise before cublas, and did so their own way (via
// the SCALE_POINTERSs), but then migrated to match how cublas does it (via
// the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
if (use_rowwise) {
matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
@ -1966,8 +1942,12 @@ void scaled_gemm(
}
#endif
}
#else
// rowwise isn't supported using cublaslt or older hipblaslt
#elif (CUDA_VERSION < 12080) && !defined(USE_ROCM)
// hipblaslt supported row-wise before cublas, and did so their own way (via
// the SCALE_POINTERSs), but then migrated to match how cublas does it (via
// the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
// rowwise isn't supported using older cublaslt or older hipblaslt
TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
computeDesc.setAttribute(matmulDescA, mat1_scale_ptr);
@ -2411,8 +2391,6 @@ void trsmBatched<c10::complex<double>>(
template <>
void gemv<c10::complex<double>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2428,8 +2406,6 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2442,8 +2418,6 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
template <>
void gemv<double>(CUDABLAS_GEMV_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2457,8 +2431,6 @@ void gemv<float>(CUDABLAS_GEMV_ARGTYPES(float)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);

View File

@ -185,17 +185,6 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
// right: "lro, summed, ro" permuted with rpermutation and the three flattened
// then the permuted output is a view of bmm(left, right)
// finally, opermutation reverts the permutation to the original order of dimensions
// By default the output is "lro, lo, 1-for-summed-dims, ro" with original shape dimensions.
// However, if all dimensions from the right operand appear before those from the left
// operand in the final output, we can swap the operands so that bmm directly produces
// the result in the correct memory order.
bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front();
if (swap_lo_ro) {
std::swap(left, right);
std::swap(lo, ro);
std::swap(lo_size, ro_size);
}
auto out_num_dim = lro.size() + lo.size() + sum_dims_.size() + ro.size();
std::vector<SymInt> out_size;
out_size.reserve(out_num_dim);
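A small self-contained sketch of the swap heuristic removed above, operating on plain index vectors rather than Tensors (an assumption of this sketch):

#include <cstdint>
#include <utility>
#include <vector>

// If every right-only output dim precedes every left-only output dim, swapping
// the operands lets bmm produce the output directly in the desired order.
void maybe_swap_operands(std::vector<int64_t>& lo,  // left-only output dims
                         std::vector<int64_t>& ro,  // right-only output dims
                         int64_t& lo_size,
                         int64_t& ro_size) {
  const bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front();
  if (swap_lo_ro) {
    std::swap(lo, ro);
    std::swap(lo_size, ro_size);
  }
}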

View File

@ -156,7 +156,7 @@ void cpu_padding(
int64_t offset_h = ndim >= 2 ? p.offsets[ndim - 2] : 0;
int64_t offset_w = p.offsets[ndim - 1];
// do vectorized copy when output is overlapped with input on W,
// do vectorized copy whe output is overlapped with input on W,
// only applies to positive padding
auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) {
if (positive_padding) {

View File

@ -318,7 +318,7 @@ batch_norm_cpu_collect_stats_channels_last_impl(
//
// The optimal THRESHOLD to tile was found empirically.
// When C > THRESHOLD, C is large enough that the benefit from tiling and vectorization outweigh the synchronization overhead.
// When C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization.
// Wehn C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization.
//
// When num_threads == 1, always use Method 2 as there is no synchronization overhead.
//

View File

@ -1665,7 +1665,7 @@ const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum) {
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/false);
bool allowed_device = _scaled_mm_allowed_device();
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+");
TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed");

View File

@ -20,7 +20,7 @@
// SegmentReduce compilation with CUDA-12.9 causes NVCC crash on Windows
// See https://github.com/pytorch/pytorch/issues/156181
#if !(defined(_WIN32) && CUDART_VERSION == 12090)
#if !defined(_WIN32) || CUDART_VERSION < 12090
namespace at::native {
@ -606,4 +606,4 @@ REGISTER_DISPATCH(
} // namespace at::native
#endif
#endif
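The two guard forms in this hunk are not equivalent; a quick standalone check of both predicates (win32 standing for defined(_WIN32), cudart for CUDART_VERSION, version values illustrative) shows where they diverge:

#include <cstdio>

int main() {
  const long versions[] = {12080, 12090, 13000};
  for (int win32 = 0; win32 <= 1; ++win32) {
    for (long cudart : versions) {
      const bool guard_a = !(win32 && cudart == 12090);  // !(WIN32 && ==12090)
      const bool guard_b = !win32 || cudart < 12090;     // !WIN32 || <12090
      std::printf("win32=%d cudart=%ld  a=%d b=%d%s\n", win32, cudart,
                  guard_a, guard_b, guard_a != guard_b ? "  <- differ" : "");
    }
  }
  return 0;
}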

View File

@ -464,7 +464,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
#endif
int32_t trailingSize;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
if (isInOutAligned) {
// in this case we can and should flatten the tensors after the cat dim
// we want to view the tensors as if consisting of `alignment`-sized elements
@ -476,15 +475,16 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
// then we need to divide last out size by elems_per_vec,
// and divide all strides except last by elems_per_vec (last stride is 1 always)
// for input, we will fix up the sizes and strides in the kernel directly
kernelOutputParam = outputParam;
nDims = dimension + 1;
constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
auto out_size = dimension == 0 ? out.numel() : outputParam.tensorStride[dimension-1];
outputParam.tensorSize[dimension] = out_size / elems_per_vec;
trailingSize = outputParam.tensorStride[dimension];
kernelOutputParam.tensorStride[dimension] = 1;
for (int i = 0; i < dimension; ++i) {
kernelOutputParam.tensorStride[i] /= elems_per_vec;
outputParam.tensorStride[dimension] = 1;
for (int i = 0; i < nDims; ++i) {
if (i!=dimension) {
outputParam.tensorStride[i] /= elems_per_vec;
}
}
}
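A simplified sketch of the size/stride fix-up discussed in this hunk; the struct, field names, and helper are stand-ins, and the real code additionally derives the new concat-dim size from the output strides.

#include <cstddef>
#include <cstdint>

template <int MaxDims>
struct SizeStride {
  uint32_t tensorSize[MaxDims];
  uint32_t tensorStride[MaxDims];
};

// Viewing the flattened output as `alignment`-byte vectors shrinks the concat
// dim and every outer stride by elems_per_vec; the innermost stride becomes 1.
template <int MaxDims>
void shrink_for_vectorized_cat(SizeStride<MaxDims>& out, int dimension,
                               std::size_t alignment, std::size_t elem_size) {
  const auto elems_per_vec = alignment / elem_size;
  out.tensorSize[dimension] /= elems_per_vec;
  out.tensorStride[dimension] = 1;
  for (int i = 0; i < dimension; ++i) {
    out.tensorStride[i] /= elems_per_vec;
  }
}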
@ -505,7 +505,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
(char*)data, catMetaData, outputParam, dimension, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\

View File

@ -1,7 +1,5 @@
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/mkldnn/xpu/qconv.h>
#include <c10/core/MemoryFormat.h>
#include <c10/core/ScalarType.h>
#include <torch/library.h>
@ -9,7 +7,7 @@
using namespace at::native::onednn;
namespace at::native::xpu {
inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype(
static inline c10::ScalarType qconv_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype) {
bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
@ -21,7 +19,7 @@ inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype(
return dst_dtype;
}
at::Tensor QConvoneDNNXPU::qconv_prepack_xpu(
static at::Tensor qconv_prepack_xpu(
at::Tensor weight,
at::Tensor weight_scales,
double input_scale,
@ -35,265 +33,222 @@ at::Tensor QConvoneDNNXPU::qconv_prepack_xpu(
return weight;
}
at::Tensor QConvoneDNNXPU::run_pointwise(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
if (act.dim() == 3 || act.dim() == 5) {
TORCH_CHECK(
attr == "none",
"quantized pointwise conv",
act.dim() - 2,
"d doesn't support unary_post_op fusion. Got unary_post_op:",
attr,
".");
} else {
TORCH_CHECK(
attr == "none" || attr == "relu" || attr == "hardtanh" ||
attr == "hardswish" || attr == "swish",
"We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:",
attr,
".");
class QConvoneDNNXPU final {
public:
static at::Tensor run_pointwise(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
if (act.dim() == 3 || act.dim() == 5) {
TORCH_CHECK(
attr == "none",
"quantized pointwise conv",
act.dim() - 2,
"d doesn't support unary_post_op fusion. Got unary_post_op:",
attr,
".");
} else {
TORCH_CHECK(
attr == "none" || attr == "relu" || attr == "hardtanh" ||
attr == "hardswish" || attr == "swish",
"We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:",
attr,
".");
}
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested
? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
Tensor output =
at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
return quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
inv_output_scale,
output_zero_point,
/*accum*/ std::nullopt,
/*accum_scale*/ 0.0,
/*accum_zero_point*/ 0,
/*output_dtype*/ output_dtype,
/*binary_attr*/ std::nullopt,
/*binary_alpha*/ std::nullopt,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
Tensor output =
at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
return quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
inv_output_scale,
output_zero_point,
/*accum*/ std::nullopt,
/*accum_scale*/ 0.0,
/*accum_zero_point*/ 0,
/*output_dtype*/ output_dtype,
/*binary_attr*/ std::nullopt,
/*binary_alpha*/ std::nullopt,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
at::Tensor QConvoneDNNXPU::run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
return run_pointwise(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
at::Tensor QConvoneDNNXPU::run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
TORCH_CHECK(
act.dim() == 4 && binary_attr == "sum" &&
(!unary_attr.has_value() ||
(unary_attr.has_value() &&
(unary_attr.value() == "none" || unary_attr.value() == "relu"))),
"post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ",
binary_attr,
" unary_post_op: ",
unary_attr.has_value() ? unary_attr.value() : "none",
".")
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
bool has_accum_postop_sum = binary_attr == "sum";
Tensor output = has_accum_postop_sum
? accum
: at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
output = quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
output_scale,
output_zero_point,
/*accum*/ accum,
/*accum_scale*/ accum_scale,
/*accum_zero_point*/ accum_zero_point,
/*output_dtype*/ output_dtype,
/*binary_attr*/ binary_attr,
/*binary_alpha*/ alpha,
/*unary_attr*/ unary_attr,
/*unary_scalars*/ unary_scalars,
/*unary_algorithm*/ unary_algorithm);
if (!has_accum_postop_sum) {
return output;
} else {
return accum;
static at::Tensor run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
return run_pointwise(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
}
at::Tensor QConvoneDNNXPU::run_pointwise_binary_tensor(
at::Tensor act, // contains quantized values but not QTensor
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight, // contains quantized values but not QTensor
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum, // contains quantized values but not QTensor
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
return run_pointwise_binary(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
accum,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
accum_scale,
accum_zero_point,
binary_attr,
alpha,
unary_attr,
unary_scalars,
unary_algorithm);
}
static at::Tensor run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
TORCH_CHECK(
act.dim() == 4 && binary_attr == "sum" &&
(!unary_attr.has_value() ||
(unary_attr.has_value() &&
(unary_attr.value() == "none" || unary_attr.value() == "relu"))),
"post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ",
binary_attr,
" unary_post_op: ",
unary_attr.has_value() ? unary_attr.value() : "none",
".")
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested
? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
bool has_accum_postop_sum = binary_attr == "sum";
Tensor output = has_accum_postop_sum
? accum
: at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
output = quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
output_scale,
output_zero_point,
/*accum*/ accum,
/*accum_scale*/ accum_scale,
/*accum_zero_point*/ accum_zero_point,
/*output_dtype*/ output_dtype,
/*binary_attr*/ binary_attr,
/*binary_alpha*/ alpha,
/*unary_attr*/ unary_attr,
/*unary_scalars*/ unary_scalars,
/*unary_algorithm*/ unary_algorithm);
if (!has_accum_postop_sum) {
return output;
} else {
return accum;
}
}
};
TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv_prepack"),
TORCH_FN(QConvoneDNNXPU::qconv_prepack_xpu));
TORCH_FN(xpu::qconv_prepack_xpu));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv1d_pointwise"),
QConvoneDNNXPU::run_pointwise);
@ -312,9 +267,6 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv_pointwise.tensor"),
QConvoneDNNXPU::run_pointwise_tensor);
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary_tensor"),
QConvoneDNNXPU::run_pointwise_binary_tensor);
}
} // namespace at::native::xpu

View File

@ -1,111 +0,0 @@
#pragma once
#include <ATen/Config.h>
#include <ATen/Tensor.h>
namespace at::native::xpu {
class QConvoneDNNXPU final {
public:
C10_API static at::Tensor run_pointwise(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm);
C10_API static at::Tensor run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm);
C10_API static at::Tensor run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm);
C10_API static at::Tensor run_pointwise_binary_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm);
static inline c10::ScalarType qconv_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype);
static at::Tensor qconv_prepack_xpu(
at::Tensor weight,
at::Tensor weight_scales,
double input_scale,
int64_t input_zero_point,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
std::optional<torch::List<int64_t>> input_shape);
};
} // namespace at::native::xpu

View File

@ -1,14 +1,13 @@
#include <torch/library.h>
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/mkldnn/xpu/qlinear.h>
#include <c10/core/ScalarType.h>
using namespace at::native::onednn;
namespace at::native::xpu {
inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype(
static inline c10::ScalarType qlinear_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype) {
bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
@ -20,7 +19,7 @@ inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype(
return dst_dtype;
}
Tensor QLinearOnednnXPU::q_linear_pointwise(
static Tensor q_linear_pointwise(
Tensor act,
double act_scale,
int64_t act_zero_point,
@ -79,7 +78,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise(
return qout;
}
Tensor QLinearOnednnXPU::q_linear_pointwise_tensor(
static Tensor q_linear_pointwise_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
@ -138,7 +137,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_tensor(
return qout;
}
Tensor QLinearOnednnXPU::q_linear_pointwise_binary(
static Tensor q_linear_pointwise_binary(
Tensor act,
double act_scale,
int64_t act_zero_point,
@ -209,7 +208,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_binary(
return dim == 3 ? qout.reshape({act.size(0), -1, N}) : qout;
}
Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor(
static Tensor q_linear_pointwise_binary_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
@ -249,7 +248,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor(
unary_post_op_algorithm);
}
Tensor QLinearOnednnXPU::q_linear_prepack_onednn(
static at::Tensor q_linear_prepack_onednn(
at::Tensor weight,
std::optional<torch::List<int64_t>> input_shape) {
at::Tensor weight_transposed = weight.transpose(0, 1);
@ -259,19 +258,19 @@ Tensor QLinearOnednnXPU::q_linear_prepack_onednn(
TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise));
TORCH_FN(q_linear_pointwise));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_tensor));
TORCH_FN(q_linear_pointwise_tensor));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"),
TORCH_FN(QLinearOnednnXPU::q_linear_prepack_onednn));
TORCH_FN(q_linear_prepack_onednn));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary));
TORCH_FN(q_linear_pointwise_binary));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary_tensor));
TORCH_FN(q_linear_pointwise_binary_tensor));
}
} // namespace at::native::xpu
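
For orientation, the TORCH_LIBRARY_IMPL block above wires these XPU kernels into the onednn::qlinear_pointwise* ops. A rough Python sketch of the affine-quantized linear math they implement, with illustrative scales, zero points, and shapes (the real kernels additionally fuse post-ops and dispatch through oneDNN):

import torch

# y = ((x_int - x_zp) * x_scale) @ (((w_int - w_zp) * w_scale).T) + bias
x_int = torch.randint(0, 256, (2, 4), dtype=torch.uint8)    # quantized activation
w_int = torch.randint(-128, 128, (3, 4), dtype=torch.int8)  # quantized weight
x_scale, x_zp = 0.1, 128
w_scale, w_zp = 0.05, 0
bias = torch.zeros(3)

x_fp = (x_int.to(torch.float32) - x_zp) * x_scale  # dequantize activation
w_fp = (w_int.to(torch.float32) - w_zp) * w_scale  # dequantize weight
y = x_fp @ w_fp.t() + bias                          # plain linear in fp32
print(y.shape)  # torch.Size([2, 3])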

View File

@ -1,91 +0,0 @@
#pragma once
#include <ATen/Config.h>
#include <ATen/Tensor.h>
#include <ATen/core/List.h>
namespace at::native::xpu {
class QLinearOnednnXPU final {
public:
C10_API static Tensor q_linear_pointwise(
Tensor act,
double act_scale,
int64_t act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view post_op_name,
torch::List<std::optional<at::Scalar>> post_op_args,
std::string_view post_op_algorithm);
C10_API static Tensor q_linear_pointwise_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view post_op_name,
torch::List<std::optional<at::Scalar>> post_op_args,
std::string_view post_op_algorithm);
C10_API static Tensor q_linear_pointwise_binary(
Tensor act,
double act_scale,
int64_t act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<at::Tensor> other,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double other_scale,
int64_t other_zero_point,
std::string_view binary_post_op,
double binary_alpha,
std::string_view unary_post_op,
torch::List<std::optional<at::Scalar>> unary_post_op_args,
std::string_view unary_post_op_algorithm);
C10_API static Tensor q_linear_pointwise_binary_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<at::Tensor> other,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double other_scale,
int64_t other_zero_point,
std::string_view binary_post_op,
double binary_alpha,
std::string_view unary_post_op,
torch::List<std::optional<at::Scalar>> unary_post_op_args,
std::string_view unary_post_op_algorithm);
C10_API static Tensor q_linear_prepack_onednn(
at::Tensor weight,
std::optional<torch::List<int64_t>> input_shape);
static inline c10::ScalarType qlinear_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype);
}; // class QLinearOnednnXPU
} // namespace at::native::xpu

View File

@ -2,7 +2,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/MemoryOverlap.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
@ -12,85 +11,10 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/kthvalue_native.h>
#include <ATen/ops/sort.h>
#include <ATen/ops/sort_native.h>
#endif
namespace at::native {
namespace {
void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& values, Tensor& indices) {
using namespace mps;
if (self.dim() == 0 && self.numel() == 1) {
values.copy_(self);
indices.zero_();
return;
}
// Handle empty tensors
if (self.numel() == 0) {
values.copy_(self);
indices.copy_(values.toType(at::ScalarType::Long));
return;
}
// issue #154890, raising error to prevent crash within MPSGraph until
// workaround is implemented.
TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");
auto stream = getCurrentMPSStream();
struct CachedGraph : public MPSCachedGraph {
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil;
};
// MPSGraph kthvalue is always sorted.
@autoreleasepool {
// Input as placeholders
MPSShape* input_shape = getMPSShape(self);
NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
std::string key = std::string("kthvalue:") + [ns_shape_key UTF8String] + ":" + getMPSTypeString(self) + ":k" +
std::to_string(k) + ":dim" + std::to_string(dim);
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), input_shape);
MPSGraphTensor* castInputTensor = newCachedGraph->selfTensor;
MPSDataType dataType = getMPSDataType(self);
// #issue 104398441 sortWithTensor and argsortWithTensor
if (dataType != MPSDataTypeInt32 && dataType != MPSDataTypeFloat32 && dataType != MPSDataTypeFloat16) {
dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
castInputTensor = [mpsGraph castTensor:newCachedGraph->selfTensor toType:dataType name:@"castInputTensor"];
}
MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor
axis:(NSUInteger)dim
descending:false
name:nil];
sortedTensor = [mpsGraph sliceTensor:sortedTensor
dimension:(NSUInteger)dim
start:((NSUInteger)k - 1)
length:1
name:nil];
MPSGraphTensor* argSortedTensor = [mpsGraph argSortWithTensor:castInputTensor
axis:(NSInteger)dim
descending:false
name:@"kthvalue_out"];
argSortedTensor = [mpsGraph sliceTensor:argSortedTensor
dimension:dim
start:((NSUInteger)k - 1)
length:1
name:nil];
newCachedGraph->valuesTensor = sortedTensor;
newCachedGraph->indicesTensor = argSortedTensor;
});
Placeholder inputPlaceholder = Placeholder(cachedGraph->selfTensor, self);
// Outputs as placeholders
Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values);
Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices);
// Create dictionary of inputs and outputs
auto feeds = dictionaryFromPlaceholders(inputPlaceholder);
auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
}
}
} // anonymous namespace
// sort
TORCH_IMPL_FUNC(sort_stable_out_mps)
@ -157,31 +81,4 @@ TORCH_IMPL_FUNC(sort_stable_out_mps)
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
}
}
std::tuple<Tensor&, Tensor&> kthvalue_out_mps(const Tensor& self,
int64_t k,
int64_t dim_,
bool keepdim,
Tensor& values,
Tensor& indices) {
// See note [Writing Nondeterministic Operations]
// If there are duplicate elements of the kth value, the procedure for choosing which
// of the duplicates to use for the indices output is nondeterministic.
at::globalContext().alertNotDeterministic("kthvalue MPS");
int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
int64_t slicesize = self.dim() == 0 ? 1 : self.size(dim);
TORCH_CHECK(k >= 1 && k <= slicesize, "kthvalue(): selected number k out of range for dimension ", dim);
at::assert_no_overlap(self, values);
_reduction_with_indices_allocate_or_resize_output(values, indices, self, dim, keepdim);
kthvalue_out_mps_impl(self, k, dim, values, indices);
if (!keepdim) {
values.squeeze_(dim);
indices.squeeze_(dim);
}
return std::forward_as_tuple(values, indices);
}
} // namespace at::native
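
For reference, the removed kernel backed torch.kthvalue on MPS by sorting along the target dim and slicing out the (k-1)-th element. A minimal CPU illustration of the operator's semantics (values and shapes are illustrative); the index is nondeterministic when the k-th value is duplicated, which is what the alertNotDeterministic call above flags:

import torch

x = torch.tensor([[3.0, 1.0, 2.0],
                  [9.0, 7.0, 8.0]])
# k-th smallest along dim=1, plus the index it came from
values, indices = torch.kthvalue(x, k=2, dim=1)
print(values)   # tensor([2., 8.])
print(indices)  # tensor([2, 2])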

View File

@ -335,9 +335,6 @@ static void isin_Tensor_Tensor_out_mps(const Tensor& elements,
}
static void is_posneginf_helper(TensorIteratorBase& iter, bool is_neg) {
if (iter.numel() == 0) {
return;
}
const auto& self = iter.input(0);
auto& out = iter.output(0);
@autoreleasepool {

View File

@ -417,7 +417,6 @@ TORCH_IMPL_FUNC(sgn_out_mps)(const Tensor& self, const Tensor& output) {
Tensor& conj_physical_out_mps(const Tensor& self, Tensor& result) {
TORCH_CHECK(self.is_complex());
TORCH_CHECK(self.dtype() != at::kComplexDouble);
mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
return [mpsGraph conjugateWithTensor:inputTensor name:nil];
});

View File

@ -340,8 +340,8 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: abs
SparseCPU, SparseCUDA, SparseMPS: abs_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr
SparseCPU, SparseCUDA: abs_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs
tags: [core, pointwise]
@ -350,16 +350,16 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: abs_
SparseCPU, SparseCUDA, SparseMPS: abs_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_
SparseCPU, SparseCUDA: abs_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_
- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS, MTIA: abs_out
SparseCPU, SparseCUDA, SparseMPS: abs_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_out
SparseCPU, SparseCUDA: abs_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_out
tags: pointwise
# Note [Adding an alias]
@ -428,7 +428,7 @@
variants: function, method
structured_delegate: sgn.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse
SparseCPU, SparseCUDA: sgn_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn
tags: pointwise
@ -437,7 +437,7 @@
variants: method
structured_delegate: sgn.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_
SparseCPU, SparseCUDA: sgn_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_
tags: pointwise
@ -448,7 +448,7 @@
dispatch:
CPU, CUDA: sgn_out
MPS: sgn_out_mps
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_out
SparseCPU, SparseCUDA: sgn_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_out
tags: pointwise
@ -476,7 +476,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: _conj_physical
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr
autogen: _conj_physical.out
- func: conj_physical(Tensor self) -> Tensor
@ -487,8 +487,8 @@
dispatch:
CPU, CUDA: conj_physical_out
MPS: conj_physical_out_mps
SparseCPU, SparseCUDA, SparseMPS: conj_physical_out_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr_out
SparseCPU, SparseCUDA: conj_physical_out_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr_out
tags: pointwise
- func: conj_physical_(Tensor(a!) self) -> Tensor(a!)
@ -554,7 +554,7 @@
structured_delegate: add.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse
SparseCPU, SparseCUDA, SparseMeta: add_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
MkldnnCPU: mkldnn_add
ZeroTensor: add_zerotensor
@ -566,7 +566,7 @@
variants: method
structured_delegate: add.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse_
SparseCPU, SparseCUDA, SparseMeta: add_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
MkldnnCPU: mkldnn_add_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor
@ -582,7 +582,6 @@
dispatch:
SparseCPU, SparseMeta: add_out_sparse_cpu
SparseCUDA: add_out_sparse_cuda
SparseMPS: add_out_sparse_mps
SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu
SparseCsrCUDA: add_out_sparse_compressed_cuda
MkldnnCPU: mkldnn_add_out
@ -875,7 +874,7 @@
variants: function, method
structured_delegate: asinh.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse
SparseCPU, SparseCUDA: asinh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr
tags: [core, pointwise]
@ -883,7 +882,7 @@
variants: function, method
structured_delegate: asinh.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_
SparseCPU, SparseCUDA: asinh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_
tags: pointwise
@ -893,7 +892,7 @@
dispatch:
CPU, CUDA: asinh_out
MPS: asinh_out_mps
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_out
SparseCPU, SparseCUDA: asinh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_out
tags: pointwise
@ -910,7 +909,7 @@
structured_delegate: atanh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse
SparseCPU, SparseCUDA: atanh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr
tags: [core, pointwise]
@ -918,7 +917,7 @@
structured_delegate: atanh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_
SparseCPU, SparseCUDA: atanh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_
tags: pointwise
@ -928,7 +927,7 @@
dispatch:
CPU, CUDA: atanh_out
MPS: atanh_out_mps
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_out
SparseCPU, SparseCUDA: atanh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_out
tags: pointwise
# arctanh, alias for atanh
@ -965,7 +964,7 @@
variants: function, method
structured_delegate: asin.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asin_sparse
SparseCPU, SparseCUDA: asin_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr
tags: [core, pointwise]
@ -974,7 +973,7 @@
variants: function, method
structured_delegate: asin.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asin_sparse_
SparseCPU, SparseCUDA: asin_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_
tags: pointwise
@ -984,7 +983,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: asin_out
SparseCPU, SparseCUDA, SparseMPS: asin_sparse_out
SparseCPU, SparseCUDA: asin_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_out
tags: pointwise
@ -1002,7 +1001,7 @@
structured_delegate: atan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atan_sparse
SparseCPU, SparseCUDA: atan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr
tags: [core, pointwise]
@ -1011,7 +1010,7 @@
structured_delegate: atan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atan_sparse_
SparseCPU, SparseCUDA: atan_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_
tags: pointwise
@ -1021,7 +1020,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: atan_out
SparseCPU, SparseCUDA, SparseMPS: atan_sparse_out
SparseCPU, SparseCUDA: atan_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_out
tags: pointwise
@ -1460,7 +1459,7 @@
structured_delegate: ceil.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse
SparseCPU, SparseCUDA: ceil_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr
tags: [core, pointwise]
@ -1469,7 +1468,7 @@
structured_delegate: ceil.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_
SparseCPU, SparseCUDA: ceil_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_
tags: pointwise
@ -1479,7 +1478,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: ceil_out
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_out
SparseCPU, SparseCUDA: ceil_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_out
tags: pointwise
@ -2407,7 +2406,7 @@
MPS: empty_mps
Meta: empty_meta_symint
MkldnnCPU: empty_mkldnn
SparseCPU, SparseCUDA, SparseMPS: empty_sparse
SparseCPU, SparseCUDA: empty_sparse
SparseMeta: empty_sparse_symint
SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
SparseCsrMeta: empty_sparse_compressed_symint
@ -2535,7 +2534,7 @@
structured_delegate: erf.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: erf_sparse
SparseCPU, SparseCUDA: erf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr
tags: [core, pointwise]
@ -2544,7 +2543,7 @@
structured_delegate: erf.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: erf_sparse_
SparseCPU, SparseCUDA: erf_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_
tags: pointwise
@ -2554,7 +2553,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: erf_out
SparseCPU, SparseCUDA, SparseMPS: erf_sparse_out
SparseCPU, SparseCUDA: erf_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_out
tags: pointwise
@ -2620,7 +2619,7 @@
structured_delegate: expm1.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse
SparseCPU, SparseCUDA: expm1_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr
tags: [core, pointwise]
@ -2629,7 +2628,7 @@
structured_delegate: expm1.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_
SparseCPU, SparseCUDA: expm1_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_
tags: pointwise
@ -2639,7 +2638,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: expm1_out
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_out
SparseCPU, SparseCUDA: expm1_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_out
tags: pointwise
@ -2738,7 +2737,7 @@
structured_delegate: floor.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: floor_sparse
SparseCPU, SparseCUDA: floor_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr
tags: [core, pointwise]
@ -2747,7 +2746,7 @@
structured_delegate: floor.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: floor_sparse_
SparseCPU, SparseCUDA: floor_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_
tags: pointwise
@ -2757,7 +2756,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: floor_out
SparseCPU, SparseCUDA, SparseMPS: floor_sparse_out
SparseCPU, SparseCUDA: floor_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_out
tags: pointwise
@ -2765,7 +2764,7 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CPU, CUDA, MPS, MTIA: floor_divide
CPU, CUDA, MPS: floor_divide
SparseCPU, SparseCUDA: floor_divide_sparse
- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@ -2799,7 +2798,7 @@
structured_delegate: frac.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: frac_sparse
SparseCPU, SparseCUDA: frac_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr
tags: pointwise
@ -2808,7 +2807,7 @@
structured_delegate: frac.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: frac_sparse_
SparseCPU, SparseCUDA: frac_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_
tags: pointwise
@ -2819,7 +2818,7 @@
dispatch:
CPU, CUDA: frac_out
MPS: frac_out_mps
SparseCPU, SparseCUDA, SparseMPS: frac_sparse_out
SparseCPU, SparseCUDA: frac_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_out
tags: pointwise
@ -3209,7 +3208,7 @@
dispatch:
CPU, CUDA, MPS, MTIA: isnan
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan
SparseCPU, SparseCUDA, SparseMPS: isnan_sparse
SparseCPU, SparseCUDA: isnan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr
autogen: isnan.out
tags: [core, pointwise]
@ -3290,7 +3289,6 @@
dispatch:
CPU: kthvalue_out_cpu
CUDA: kthvalue_out_cuda
MPS: kthvalue_out_mps
- func: kthvalue.dimname(Tensor self, SymInt k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
variants: function, method
@ -3338,21 +3336,21 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: nan_to_num
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse
SparseCPU, SparseCUDA: nan_to_num_sparse
tags: pointwise
- func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!)
variants: function, method
dispatch:
CompositeExplicitAutograd: nan_to_num_
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_
SparseCPU, SparseCUDA: nan_to_num_sparse_
tags: pointwise
- func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA, MTIA: nan_to_num_out
MPS: nan_to_num_out_mps
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_out
SparseCPU, SparseCUDA: nan_to_num_sparse_out
tags: pointwise
- func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
@ -3555,7 +3553,7 @@
structured_delegate: log1p.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse
SparseCPU, SparseCUDA: log1p_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr
tags: [core, pointwise]
@ -3564,7 +3562,7 @@
structured_delegate: log1p.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_
SparseCPU, SparseCUDA: log1p_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_
tags: pointwise
@ -3574,7 +3572,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: log1p_out
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_out
SparseCPU, SparseCUDA: log1p_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_out
tags: pointwise
@ -4666,7 +4664,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: rad2deg
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse
SparseCPU, SparseCUDA: rad2deg_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr
tags: pointwise
@ -4674,14 +4672,14 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: rad2deg_
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_
SparseCPU, SparseCUDA: rad2deg_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_
tags: pointwise
- func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: rad2deg_out
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_out
SparseCPU, SparseCUDA: rad2deg_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_out
tags: pointwise
@ -4689,7 +4687,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: deg2rad
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse
SparseCPU, SparseCUDA: deg2rad_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr
tags: pointwise
@ -4697,14 +4695,14 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: deg2rad_
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_
SparseCPU, SparseCUDA: deg2rad_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_
tags: pointwise
- func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: deg2rad_out
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_out
SparseCPU, SparseCUDA: deg2rad_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_out
tags: pointwise
@ -4930,7 +4928,7 @@
structured_delegate: neg.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: neg_sparse
SparseCPU, SparseCUDA: neg_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg
tags: [core, pointwise]
@ -4940,7 +4938,7 @@
structured_delegate: neg.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: neg_sparse_
SparseCPU, SparseCUDA: neg_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_
tags: pointwise
@ -4951,7 +4949,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: neg_out
SparseCPU, SparseCUDA, SparseMPS: neg_out_sparse
SparseCPU, SparseCUDA: neg_out_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_out
tags: pointwise
# Alias for neg
@ -5035,7 +5033,7 @@
structured_delegate: round.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: round_sparse
SparseCPU, SparseCUDA: round_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr
tags: [core, pointwise]
@ -5044,7 +5042,7 @@
structured_delegate: round.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: round_sparse_
SparseCPU, SparseCUDA: round_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_
tags: pointwise
@ -5054,7 +5052,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: round_out
SparseCPU, SparseCUDA, SparseMPS: round_sparse_out
SparseCPU, SparseCUDA: round_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_out
tags: pointwise
@ -5097,7 +5095,7 @@
QuantizedCPU: relu_quantized_cpu
QuantizedCUDA: relu_quantized_cuda
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu
SparseCPU, SparseCUDA, SparseMPS: relu_sparse
SparseCPU, SparseCUDA: relu_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr
tags: [core, pointwise]
@ -5112,7 +5110,7 @@
QuantizedCPU: relu_quantized_cpu_
QuantizedCUDA: relu_quantized_cuda_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_
SparseCPU, SparseCUDA, SparseMPS: relu_sparse_
SparseCPU, SparseCUDA: relu_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_
autogen: relu.out
tags: pointwise
@ -5399,7 +5397,7 @@
variants: function, method
dispatch:
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr
SparseCPU, SparseCUDA, SparseMPS: sin_sparse
SparseCPU, SparseCUDA: sin_sparse
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin
tags: [core, pointwise]
@ -5409,7 +5407,7 @@
variants: function, method
dispatch:
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_
SparseCPU, SparseCUDA, SparseMPS: sin_sparse_
SparseCPU, SparseCUDA: sin_sparse_
tags: pointwise
- func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -5419,7 +5417,7 @@
dispatch:
CPU, CUDA, MPS, MTIA: sin_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_out
SparseCPU, SparseCUDA, SparseMPS: sin_sparse_out
SparseCPU, SparseCUDA: sin_sparse_out
tags: pointwise
- func: sinc(Tensor self) -> Tensor
@ -5444,7 +5442,7 @@
structured_delegate: sinh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse
SparseCPU, SparseCUDA: sinh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr
tags: [core, pointwise]
@ -5453,7 +5451,7 @@
structured_delegate: sinh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_
SparseCPU, SparseCUDA: sinh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_
tags: pointwise
@ -5463,7 +5461,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: sinh_out
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_out
SparseCPU, SparseCUDA: sinh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_out
# Returns a copy of this `Variable` that is detached from its autograd graph.
@ -5906,7 +5904,7 @@
variants: function, method
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse
SparseCPU, SparseCUDA: sqrt_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr
tags: [core, pointwise]
@ -5915,7 +5913,7 @@
structured_delegate: sqrt.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_
SparseCPU, SparseCUDA: sqrt_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_
tags: pointwise
@ -5925,7 +5923,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: sqrt_out
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_out
SparseCPU, SparseCUDA: sqrt_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_out
tags: pointwise
@ -6063,7 +6061,7 @@
structured_delegate: tan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: tan_sparse
SparseCPU, SparseCUDA: tan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr
tags: [core, pointwise]
@ -6072,7 +6070,7 @@
structured_delegate: tan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: tan_sparse_
SparseCPU, SparseCUDA: tan_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_
tags: pointwise
@ -6082,7 +6080,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: tan_out
SparseCPU, SparseCUDA, SparseMPS: tan_sparse_out
SparseCPU, SparseCUDA: tan_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_out
tags: pointwise
@ -6093,7 +6091,7 @@
dispatch:
QuantizedCPU: tanh_quantized_cpu
MkldnnCPU: mkldnn_tanh
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse
SparseCPU, SparseCUDA: tanh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh
tags: [core, pointwise]
@ -6104,7 +6102,7 @@
variants: function, method
dispatch:
MkldnnCPU: mkldnn_tanh_
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_
SparseCPU, SparseCUDA: tanh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_
tags: pointwise
@ -6115,7 +6113,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: tanh_out
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_out
SparseCPU, SparseCUDA: tanh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_out
tags: pointwise
@ -6387,8 +6385,8 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr
SparseCPU, SparseCUDA: trunc_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr
tags: [core, pointwise]
- func: trunc_(Tensor(a!) self) -> Tensor(a!)
@ -6396,8 +6394,8 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_
SparseCPU, SparseCUDA: trunc_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_
tags: pointwise
- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -6406,8 +6404,8 @@
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS: trunc_out
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_out
SparseCPU, SparseCUDA: trunc_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_out
tags: pointwise
# Alias for trunc
@ -7369,8 +7367,8 @@
- func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sparse_to_dense
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: sparse_compressed_to_dense
SparseCPU, SparseCUDA: sparse_to_dense
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_dense
MkldnnCPU: mkldnn_to_dense
autogen: _to_dense.out
@ -7396,8 +7394,8 @@
- func: dense_dim(Tensor self) -> int
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: dense_dim_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: dense_dim_sparse_csr
SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
CompositeExplicitAutograd: dense_dim_default
device_check: NoCheck
device_guard: False
@ -7530,7 +7528,7 @@
device_check: NoCheck # Allows copy into different device
variants: function
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: copy_sparse_
SparseCPU, SparseCUDA, SparseMeta: copy_sparse_
autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out
# By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors
@ -9721,7 +9719,7 @@
structured_delegate: sign.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sign_sparse
SparseCPU, SparseCUDA: sign_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr
tags: [core, pointwise]
@ -9730,7 +9728,7 @@
structured_delegate: sign.out
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sign_sparse_
SparseCPU, SparseCUDA: sign_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_
tags: pointwise
@ -9741,7 +9739,7 @@
dispatch:
CPU, CUDA: sign_out
MPS: sign_out_mps
SparseCPU, SparseCUDA, SparseMPS: sign_sparse_out
SparseCPU, SparseCUDA: sign_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_out
tags: pointwise
@ -9749,7 +9747,7 @@
variants: function, method
structured_delegate: signbit.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: signbit_sparse
SparseCPU, SparseCUDA: signbit_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr
tags: pointwise
@ -9760,7 +9758,7 @@
CPU: signbit_out
CUDA: signbit_out
MPS: signbit_out_mps
SparseCPU, SparseCUDA, SparseMPS: signbit_sparse_out
SparseCPU, SparseCUDA: signbit_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr_out
tags: pointwise
@ -13264,7 +13262,7 @@
dispatch:
CompositeExplicitAutograd: isinf
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf
SparseCPU, SparseCUDA, SparseMPS: isinf_sparse
SparseCPU, SparseCUDA: isinf_sparse
SparseMeta: isinf_sparse_meta
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr
autogen: isinf.out
@ -13280,7 +13278,7 @@
structured_delegate: isposinf.out
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf
SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse
SparseCPU, SparseCUDA: isposinf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr
tags: pointwise
@ -13289,7 +13287,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: isposinf_out
SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse_out
SparseCPU, SparseCUDA: isposinf_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr_out
tags: pointwise
@ -13298,7 +13296,7 @@
structured_delegate: isneginf.out
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf
SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse
SparseCPU, SparseCUDA: isneginf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr
tags: pointwise
@ -13307,7 +13305,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: isneginf_out
SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse_out
SparseCPU, SparseCUDA: isneginf_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr_out
tags: pointwise
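
For context, the dispatch keys edited above select which kernel runs per backend and layout; this hunk removes the SparseMPS/SparseCsrMPS registrations. A small CPU illustration of what such an entry controls (sparse COO abs routes through the SparseCPU kernel):

import torch

x = torch.tensor([[-1.0, 0.0],
                  [0.0, 2.0]]).to_sparse()  # sparse COO, CPU
y = x.abs()                                  # dispatches to abs_sparse via SparseCPU
print(y.to_dense())
# tensor([[1., 0.],
#         [0., 2.]])
# With the SparseMPS entries removed, the same call on a sparse MPS tensor no
# longer has a dedicated registration in this dispatch table.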

View File

@ -1,73 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/SparseTensorUtils.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/FlattenIndicesCommon.h>
#include <ATen/ExpandUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_coalesce_native.h>
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/zeros_native.h>
#endif
namespace at::native {
namespace {
using namespace mps;
using namespace at::sparse;
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/FlattenIndices_metallib.h>
#endif
Tensor flatten_indices_mps(const Tensor& indices, IntArrayRef size) {
TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D");
TORCH_CHECK(static_cast<size_t>(indices.size(0)) == size.size(),
"flatten_indices: indices.size(0) must equal size.size()");
const int64_t sparse_dim = indices.size(0);
const int64_t nnz = indices.size(1);
if (nnz == 0) {
return at::empty({0}, indices.options().dtype(kLong));
}
// Row-major multipliers for flattening: mul[d] = prod_{j>d}(size[j])
std::vector<int64_t> row_muls(sparse_dim);
row_muls[sparse_dim - 1] = 1;
for (int64_t i = sparse_dim - 2; i >= 0; --i) {
row_muls[i] = row_muls[i + 1] * size[i + 1];
}
auto flat_indices = at::empty({nnz}, indices.options().dtype(kLong));
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel");
auto encoder = stream->commandEncoder();
[encoder setComputePipelineState:pipeline];
mtl_setArgs(encoder,
indices,
row_muls,
flat_indices,
static_cast<uint>(sparse_dim),
indices.strides()
);
mtl_dispatch1DJob(encoder, pipeline, nnz);
}
});
return flat_indices;
}
} // namespace
REGISTER_MPS_DISPATCH(flatten_indices_stub, &flatten_indices_mps)
} // namespace at::native
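
The deleted helper (and its replacement in the next file) flattens 2-D COO indices into linear offsets with row-major multipliers mul[d] = prod_{j>d} size[j]. A small Python sketch of the same scheme; the helper name is illustrative, not part of the diff:

import torch

def flatten_coo_indices(indices: torch.Tensor, size: list) -> torch.Tensor:
    # indices: [sparse_dim, nnz]; returns [nnz] row-major linear offsets
    sparse_dim = indices.size(0)
    muls = torch.ones(sparse_dim, dtype=torch.long)
    for d in range(sparse_dim - 2, -1, -1):
        muls[d] = muls[d + 1] * size[d + 1]
    return (indices * muls.unsqueeze(1)).sum(dim=0)

idx = torch.tensor([[0, 1, 2],
                    [3, 2, 0]])
print(flatten_coo_indices(idx, [3, 4]))  # tensor([3, 6, 8]); e.g. (1, 2) -> 1*4 + 2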

View File

@ -20,9 +20,46 @@ using namespace at::sparse;
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/Coalesce_metallib.h>
#include <ATen/native/mps/Sparse_metallib.h>
#endif
static Tensor flatten_indices(const Tensor& indices, IntArrayRef size) {
TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D");
TORCH_CHECK(static_cast<size_t>(indices.size(0)) == size.size(),
"flatten_indices: indices.size(0) must equal size.size()");
int64_t sparse_dim = indices.size(0);
int64_t nnz = indices.size(1);
if (nnz == 0) {
return at::empty({0}, indices.options().dtype(kLong));
}
std::vector<int64_t> strides(sparse_dim);
strides[sparse_dim - 1] = 1;
for (int64_t i = sparse_dim - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * size[i + 1];
}
Tensor flat_indices = at::empty({nnz}, indices.options().dtype(kLong));
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel");
auto encoder = stream->commandEncoder();
[encoder setComputePipelineState:pipeline];
mtl_setArgs(encoder, indices, strides, flat_indices, sparse_dim, nnz);
mtl_dispatch1DJob(encoder, pipeline, nnz);
}
});
return flat_indices;
}
static Tensor compute_output_positions(const Tensor& is_unique) {
int64_t nnz = is_unique.size(0);

View File

@ -1,169 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/SparseTensorUtils.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_coalesce_native.h>
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
#include <ATen/ops/cat.h>
#include <ATen/ops/add_native.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/zeros_native.h>
#include <ATen/ops/result_type.h>
#include <ATen/ops/copy_sparse_to_sparse.h>
#include <ATen/ops/mul.h>
#endif
namespace at::native {
using namespace at::sparse;
Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha);
Tensor& add_out_dense_sparse_mps(
Tensor& out,
const Tensor& dense,
const SparseTensor& sparse,
const Scalar& alpha) {
TORCH_CHECK(dense.is_mps(), "add: expected 'self' to be an MPS tensor, got ", dense.device());
TORCH_CHECK(sparse.is_mps(), "add: expected 'other' to be an MPS tensor, got ", sparse.device());
TORCH_CHECK(out.is_mps(), "add: expected 'out' to be an MPS tensor, got ", out.device());
TORCH_CHECK(dense.sizes().equals(sparse.sizes()),
"add: expected 'self' and 'other' to have same size, but self has size ",
dense.sizes(), " while other has size ", sparse.sizes(),
" (FYI: dense-sparse addition does not currently support broadcasting)");
const int64_t nnz = sparse._nnz();
if (nnz == 0) {
out.resize_as_(dense);
out.copy_(dense);
return out;
}
auto commonDtype = at::result_type(dense, sparse);
TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
"Can't convert result type ", commonDtype, " to output ", out.scalar_type());
Tensor r;
const bool need_separate_buffer = out.is_same(dense) || (out.scalar_type() != commonDtype);
if (need_separate_buffer) {
r = at::empty(dense.sizes(), out.options().dtype(commonDtype));
} else {
r = out;
r.resize_as_(dense);
}
Tensor dense_buffer = dense.to(commonDtype);
if (!r.is_same(dense_buffer)) {
r.copy_(dense_buffer);
}
Tensor indices = sparse._indices();
Tensor values = sparse._values().to(commonDtype);
if (values.numel() == 0) {
if (!out.is_same(r)) {
out.resize_as_(dense);
out.copy_(r);
}
return out;
}
const int64_t nDim = r.dim();
const int64_t nDimI = sparse.sparse_dim();
TORCH_CHECK(nDimI >= 0 && nDimI <= nDim,
"Invalid sparse_dim=", nDimI, " for dense tensor of dim ", nDim);
Tensor indices1D = at::sparse::flatten_indices(indices, sparse.sizes()).contiguous();
int64_t view_rows = 1;
int64_t view_cols = 1;
for (int64_t i = 0; i < nDimI; i++) {
view_rows *= r.size(i);
}
for (int64_t i = nDimI; i < nDim; i++) {
view_cols *= r.size(i);
}
if (view_cols == 1) {
Tensor r_flat = r.reshape({view_rows});
Tensor values_1d = values.reshape({nnz});
r_flat.index_add_(0, indices1D, values_1d, alpha);
} else {
Tensor r_view = r.view({view_rows, view_cols});
Tensor values_2d = values.reshape({nnz, view_cols});
r_view.index_add_(0, indices1D, values_2d, alpha);
}
if (!out.is_same(r)) {
out.resize_as_(dense);
out.copy_(r);
}
return out;
}
SparseTensor& add_out_sparse_mps(const SparseTensor& self,
const SparseTensor& other,
const Scalar& alpha,
SparseTensor& out) {
TORCH_CHECK(other.is_sparse(), "add(sparse, dense) is not supported. Use add(dense, sparse) instead.");
TORCH_CHECK(self.is_mps(), "add: expected 'self' to be MPS, but got ", self.device());
TORCH_CHECK(other.is_mps(), "add: expected 'other' to be MPS, but got ", other.device());
TORCH_CHECK(out.is_mps(), "add: expected 'out' to be MPS, but got ", out.device());
if (!self.is_sparse()) {
return add_out_dense_sparse_mps(out, self, other, alpha);
}
auto commonDtype = at::result_type(self, other);
TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
"Can't convert result type ", commonDtype, " to output ", out.scalar_type());
TORCH_CHECK(self.sizes().equals(other.sizes()),
"add: expected 'self' and 'other' to have same size, but ", self.sizes(), " != ", other.sizes());
TORCH_CHECK(is_same_density(self, other),
"add: expected 'self' and 'other' to have same density, but 'self' has ",
self.sparse_dim(), " sparse dimensions while 'other' has ", other.sparse_dim(), " sparse dimensions");
if (other._nnz() == 0) {
out.resize_as_(self);
Tensor vals = self._values();
if (vals.scalar_type() != out.scalar_type()) {
vals = vals.to(out.scalar_type());
}
alias_into_sparse(out, self._indices(), vals);
out._coalesced_(self.is_coalesced());
return out;
}
Tensor t_indices_ = self._indices();
Tensor s_indices_ = other._indices();
Tensor t_values_ = self._values().to(commonDtype);
Tensor s_values_ = other._values().to(commonDtype);
if (!alpha.isIntegral(false) || alpha.to<double>() != 1.0) {
s_values_ = at::mul(s_values_, alpha);
}
Tensor r_indices_ = at::cat({t_indices_, s_indices_}, 1);
Tensor r_values_ = at::cat({t_values_, s_values_ }, 0);
SparseTensor tmp = empty({0}, out.options().dtype(commonDtype));
tmp.resize_as_(other);
alias_into_sparse(tmp, r_indices_, r_values_);
tmp = _coalesce_sparse_mps(tmp);
out.resize_as_(other);
Tensor out_vals = tmp._values();
if (out.scalar_type() != commonDtype) {
out_vals = out_vals.to(out.scalar_type());
}
alias_into_sparse(out, tmp._indices(), out_vals);
out._coalesced_(tmp.is_coalesced());
return out;
}
} // namespace at::native
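
The deleted file implemented out = dense + alpha * sparse on MPS by flattening the sparse dims and scattering values with index_add_. A rough CPU equivalent of that strategy, assuming illustrative shapes and alpha:

import torch

dense = torch.zeros(3, 4)
sparse = torch.sparse_coo_tensor(
    torch.tensor([[0, 2], [1, 3]]),  # indices
    torch.tensor([10.0, 20.0]),      # values
    (3, 4),
).coalesce()
alpha = 2.0

flat = dense.reshape(-1).clone()
lin_idx = sparse.indices()[0] * dense.size(1) + sparse.indices()[1]
flat.index_add_(0, lin_idx, sparse.values() * alpha)
result = flat.view_as(dense)

assert torch.equal(result, dense + alpha * sparse.to_dense())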

View File

@ -1,19 +0,0 @@
#include <metal_stdlib>
using namespace metal;
kernel void flatten_indices_kernel(
device const long* indices [[ buffer(0) ]],
device const long* row_muls [[ buffer(1) ]],
device long* flat_indices [[ buffer(2) ]],
constant uint& sparse_dim [[ buffer(3) ]],
constant long2& idx_strides [[ buffer(4) ]],
uint gid [[ thread_position_in_grid ]]) {
long flat = 0;
for (uint d = 0; d < sparse_dim; ++d) {
long off = (long)d * idx_strides.x + (long)gid * idx_strides.y;
long v = indices[off];
flat += v * row_muls[d];
}
flat_indices[gid] = flat;
}

View File

@ -2,6 +2,19 @@
#include <metal_stdlib>
using namespace metal;
kernel void flatten_indices_kernel(
device const int64_t* indices [[buffer(0)]],
device const int64_t* strides [[buffer(1)]],
device int64_t* flat_indices [[buffer(2)]],
constant uint& sparse_dim [[buffer(3)]],
constant uint& nnz [[buffer(4)]],
uint gid [[thread_position_in_grid]]) {
int64_t flat_idx = 0;
for (uint d = 0; d < sparse_dim; d++) {
flat_idx += indices[d * nnz + gid] * strides[d];
}
flat_indices[gid] = flat_idx;
}
kernel void compute_output_positions_kernel(
device const bool* is_unique [[buffer(0)]],

View File

@ -10,13 +10,8 @@ using namespace at::native::memory;
constexpr int buffer_size = 1024;
#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
__managed__ double4 buffer1[buffer_size];
__managed__ double4 buffer2[buffer_size];
#else
__managed__ double4_16a buffer1[buffer_size];
__managed__ double4_16a buffer2[buffer_size];
#endif
void reset_buffers() {
for (int i = 0; i < buffer_size; i++) {

View File

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4



View File

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4


View File

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4


View File

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4


View File

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4


View File

@ -1427,13 +1427,23 @@ class AOTInductorModelCache:
inductor_configs = {}
if mode == "max-autotune":
inductor_configs["max_autotune"] = True
ep = torch.export.export(
model_clone,
example_args,
example_kwargs,
dynamic_shapes=dynamic_shapes,
strict=False,
)
# We can't support this in non-strict
if hasattr(model_clone, "name") and model.name == "levit_128":
ep = torch.export.export(
model_clone,
example_args,
example_kwargs,
dynamic_shapes=dynamic_shapes,
strict=True,
)
else:
ep = torch.export.export(
model_clone,
example_args,
example_kwargs,
dynamic_shapes=dynamic_shapes,
strict=True,
)
with torch.no_grad():
package_path = torch._inductor.aoti_compile_and_package(
ep, inductor_configs=inductor_configs
@ -2317,6 +2327,7 @@ class BenchmarkRunner:
# no need for n iterations
# the logic should be the same to self.model_iter_fn (forward_pass)
with self.autocast(**self.autocast_arg):
model_copy.name = name
optimized_model_iter_fn = optimize_ctx(
model_copy, example_inputs
)
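
For context on the export change above, strict=True captures the model through TorchDynamo tracing while strict=False (non-strict) traces with the Python interpreter; the benchmark special-cases one model that only exports under strict mode. A minimal sketch of the call with an illustrative module:

import torch

class Tiny(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1

ep = torch.export.export(Tiny(), (torch.randn(2, 3),), strict=True)
print(ep)  # ExportedProgram with the captured graph and input specs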

View File

@ -759,13 +759,6 @@ libtorch_cuda_distributed_extra_sources = [
"torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
]
libtorch_nvshmem_sources = [
"torch/csrc/distributed/c10d/cuda/utils.cpp",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
"torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu",
"torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu",
]
libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources
libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_sources + [

View File

@ -61,14 +61,11 @@ void* get_symbol(const char* name, int version) {
}
#endif
// As of CUDA 13, this API is deprecated.
#if defined(CUDA_VERSION) && (CUDA_VERSION < 13000)
// This falls back to the old API to try getting the symbol again.
if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres);
st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
return out;
}
#endif
// If the symbol cannot be resolved, report and return nullptr;
// the caller is responsible for checking the pointer.

View File

@ -540,9 +540,11 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
)
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
if(USE_DISTRIBUTED)
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
endif()
endif()
endif()
@ -566,30 +568,32 @@ if(USE_CUDA)
list(APPEND Caffe2_GPU_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
endif()
endif()
set_source_files_properties(
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
@ -622,9 +626,11 @@ if(USE_ROCM)
list(APPEND Caffe2_HIP_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
endif()
endif()
# caffe2_nvrtc's stubs to driver APIs are useful for HIP.
# See NOTE [ ATen NVRTC Stub and HIP ]
@ -1057,7 +1063,7 @@ elseif(USE_CUDA)
UNFUSE_FMA # Addressing issue #121558
)
target_sources(torch_cuda PRIVATE $<TARGET_OBJECTS:flash_attention>)
target_include_directories(torch_cuda SYSTEM PUBLIC
target_include_directories(torch_cuda PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc>
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/flash-attention/include>
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/cutlass/include>
@ -1345,10 +1351,12 @@ if(BUILD_TEST)
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
if(USE_DISTRIBUTED)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
endif()
endif()
if(NOT NO_API)
add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api)
@ -1453,40 +1461,46 @@ if(BUILD_LITE_INTERPRETER)
endif()
endif()
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
endif()
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
# Pass USE_DISTRIBUTED to torch_cpu, as some code in jit/pickler.cpp and
# jit/unpickler.cpp needs to be compiled only when USE_DISTRIBUTED is set
if(USE_DISTRIBUTED)
target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED)
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
endif()
endif()
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
endif()
endif()
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled with USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled with USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()
if(NOT INTERN_BUILD_MOBILE)
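
These build files appear to re-gate the c10d/distributed sources and test targets behind USE_DISTRIBUTED. At runtime, Python code typically probes for that support rather than assuming it; a small sketch:

import torch

if torch.distributed.is_available():
    # the build included USE_DISTRIBUTED; individual backends may still vary
    print("gloo:", torch.distributed.is_gloo_available())
    print("nccl:", torch.distributed.is_nccl_available())
else:
    print("this build was compiled without distributed support")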

View File

@ -1126,7 +1126,7 @@ if(USE_CUDA AND CUDA_VERSION VERSION_LESS 13.0)
include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
endif()
if(USE_TENSORPIPE)
if(USE_DISTRIBUTED AND USE_TENSORPIPE)
if(MSVC)
message(WARNING "Tensorpipe cannot be used on Windows.")
else()

View File

@ -191,11 +191,13 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
message(STATUS " USE_XNNPACK : ${USE_XNNPACK}")
message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}")
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
if(${USE_DISTRIBUTED})
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
endif()
if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
endif()

View File

@ -3331,6 +3331,13 @@ def coverage_post_process(app, exception):
if not isinstance(app.builder, CoverageBuilder):
return
if not torch.distributed.is_available():
raise RuntimeError(
"The coverage tool cannot run with a version "
"of PyTorch that was built with USE_DISTRIBUTED=0 "
"as this module's API changes."
)
# These are all the modules that have "automodule" in an rst file
# These modules are the ones for which coverage is checked
# Here, we make sure that no module is missing from that list

View File

@ -24,12 +24,17 @@ For Intel Client GPU
+-------------------------------------+----------------------------------------------------------------------------------------------------+
| Supported OS | Validated Hardware |
+=====================================+====================================================================================================+
|| Windows 11 & Ubuntu 24.04/25.04 || Intel® Arc A-Series Graphics (CodeName: Alchemist) |
|| Windows 11 & Ubuntu 24.10 || Intel® Arc A-Series Graphics (CodeName: Alchemist) |
|| || Intel® Arc B-Series Graphics (CodeName: Battlemage) |
|| || Intel® Core™ Ultra Processors with Intel® Arc™ Graphics (CodeName: Meteor Lake-H) |
|| || Intel® Core™ Ultra Desktop Processors (Series 2) with Intel® Arc™ Graphics (CodeName: Lunar Lake) |
|| || Intel® Core™ Ultra Mobile Processors (Series 2) with Intel® Arc™ Graphics (CodeName: Arrow Lake-H)|
+-------------------------------------+----------------------------------------------------------------------------------------------------+
|| Ubuntu 24.04 & WSL2 (Ubuntu 24.04) || Intel® Arc A-Series Graphics (CodeName: Alchemist) |
|| || Intel® Core™ Ultra Processors with Intel® Arc™ Graphics (CodeName: Meteor Lake-H) |
|| || Intel® Core™ Ultra Desktop Processors (Series 2) with Intel® Arc™ Graphics (CodeName: Lunar Lake) |
|| || Intel® Core™ Ultra Mobile Processors (Series 2) with Intel® Arc™ Graphics (CodeName: Arrow Lake-H)|
+-------------------------------------+----------------------------------------------------------------------------------------------------+
Intel GPU support (Prototype) has been available since PyTorch* 2.5 for Intel® Client GPUs and Intel® Data Center GPU Max Series on both Linux and Windows, bringing Intel GPUs and the SYCL* software stack into the official PyTorch stack with a consistent user experience across more AI application scenarios.

View File

@ -125,10 +125,6 @@ deterministic implementation will be used::
[[ 0.1509, 1.8027],
[ 0.0333, -1.1444]]], device='cuda:0')
Furthermore, if you are using CUDA tensors, and your CUDA version is 10.2 or greater, you
should set the environment variable `CUBLAS_WORKSPACE_CONFIG` according to CUDA documentation:
`<https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility>`_
CUDA convolution determinism
----------------------------
While disabling CUDA convolution benchmarking (discussed above) ensures that

View File

@ -32,7 +32,7 @@ torch.cuda.memory._dump_snapshot("my_snapshot.pickle")
## Using the visualizer
Open <https://pytorch.org/memory_viz> and drag/drop the pickled snapshot file into the visualizer.
Open [pytorch.org/memory_viz](https://pytorch.org/memory_viz) and drag/drop the pickled snapshot file into the visualizer.
The visualizer is a javascript application that runs locally on your computer. It does not upload any snapshot data.

View File

@ -1,4 +1,4 @@
if(NOT WIN32)
if(USE_DISTRIBUTED AND NOT WIN32)
set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd")
set(DIST_AUTOGRAD_TEST_SOURCES
${TORCH_ROOT}/test/cpp/common/main.cpp

View File

@ -15,8 +15,6 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
set(CMAKE_CXX_VISIBILITY_PRESET hidden)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
if(APPLE)
set(CMAKE_INSTALL_RPATH "@loader_path/lib;@loader_path")
elseif(UNIX)
@ -36,7 +34,6 @@ else()
message(FATAL_ERROR "Cannot find Python directory")
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include(${PROJECT_SOURCE_DIR}/cmake/TorchPythonTargets.cmake)
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/openreg)

View File

@ -164,13 +164,13 @@ at::Tensor view(const at::Tensor& self, c10::SymIntArrayRef size) {
// LITERALINCLUDE START: FALLBACK IMPL
void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
static const std::unordered_set<c10::OperatorName> cpu_fallback_blocklist = {
static const std::unordered_set<c10::OperatorName> cpu_fallback_blacklist = {
c10::OperatorName("aten::abs", ""),
c10::OperatorName("aten::abs", "out"),
};
const auto& op_name = op.schema().operator_name();
if (cpu_fallback_blocklist.count(op_name)) {
if (cpu_fallback_blacklist.count(op_name)) {
TORCH_CHECK(
false,
"Operator '",

View File

@ -1,146 +0,0 @@
#pragma once
#include <include/openreg.h>
#include "OpenRegException.h"
#include "OpenRegStream.h"
namespace c10::openreg {
struct OpenRegEvent {
OpenRegEvent(bool enable_timing) noexcept : enable_timing_{enable_timing} {}
~OpenRegEvent() {
if (is_created_) {
OPENREG_CHECK(orEventDestroy(event_));
}
}
OpenRegEvent(const OpenRegEvent&) = delete;
OpenRegEvent& operator=(const OpenRegEvent&) = delete;
OpenRegEvent(OpenRegEvent&& other) noexcept {
moveHelper(std::move(other));
}
OpenRegEvent& operator=(OpenRegEvent&& other) noexcept {
if (this != &other) {
moveHelper(std::move(other));
}
return *this;
}
operator orEvent_t() const {
return event();
}
std::optional<at::Device> device() const {
if (is_created_) {
return at::Device(at::kPrivateUse1, device_index_);
} else {
return std::nullopt;
}
}
bool isCreated() const {
return is_created_;
}
DeviceIndex device_index() const {
return device_index_;
}
orEvent_t event() const {
return event_;
}
bool query() const {
if (!is_created_) {
return true;
}
orError_t err = orEventQuery(event_);
if (err == orSuccess) {
return true;
}
return false;
}
void record() {
record(getCurrentOpenRegStream());
}
void recordOnce(const OpenRegStream& stream) {
if (!was_recorded_)
record(stream);
}
void record(const OpenRegStream& stream) {
if (!is_created_) {
createEvent(stream.device_index());
}
TORCH_CHECK(
device_index_ == stream.device_index(),
"Event device ",
device_index_,
" does not match recording stream's device ",
stream.device_index(),
".");
OPENREG_CHECK(orEventRecord(event_, stream));
was_recorded_ = true;
}
void block(const OpenRegStream& stream) {
if (is_created_) {
OPENREG_CHECK(orStreamWaitEvent(stream, event_, 0));
}
}
float elapsed_time(const OpenRegEvent& other) const {
TORCH_CHECK_VALUE(
!(enable_timing_ & orEventDisableTiming) &&
!(other.enable_timing_ & orEventDisableTiming),
"Both events must be created with argument 'enable_timing=True'.");
TORCH_CHECK_VALUE(
is_created_ && other.isCreated(),
"Both events must be recorded before calculating elapsed time.");
TORCH_CHECK(
query() && other.query(),
"Both events must be completed before calculating elapsed time.");
float time_ms = 0;
OPENREG_CHECK(orEventElapsedTime(&time_ms, event_, other.event_));
return time_ms;
}
void synchronize() const {
if (is_created_) {
OPENREG_CHECK(orEventSynchronize(event_));
}
}
private:
unsigned int enable_timing_{orEventDisableTiming};
bool is_created_{false};
bool was_recorded_{false};
DeviceIndex device_index_{-1};
orEvent_t event_{};
void createEvent(DeviceIndex device_index) {
device_index_ = device_index;
OPENREG_CHECK(orEventCreateWithFlags(&event_, enable_timing_));
is_created_ = true;
}
void moveHelper(OpenRegEvent&& other) {
std::swap(enable_timing_, other.enable_timing_);
std::swap(is_created_, other.is_created_);
std::swap(was_recorded_, other.was_recorded_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
}
};
} // namespace c10::openreg

View File

@ -1,9 +0,0 @@
#include "OpenRegException.h"
void orCheckFail(
const char* func,
const char* file,
uint32_t line,
const char* msg) {
throw ::c10::Error({func, file, line}, msg);
}

View File

@ -1,20 +0,0 @@
#pragma once
#include <include/openreg.h>
#include <c10/util/Exception.h>
void orCheckFail(
const char* func,
const char* file,
uint32_t line,
const char* msg = "");
#define OPENREG_CHECK(EXPR, ...) \
do { \
const orError_t __err = EXPR; \
if (__err != orSuccess) { \
orCheckFail( \
__func__, __FILE__, static_cast<uint32_t>(__LINE__), ##__VA_ARGS__); \
} \
} while (0)

View File

@ -1,6 +1,5 @@
#include <include/openreg.h>
#include "OpenRegException.h"
#include "OpenRegFunctions.h"
namespace c10::openreg {

View File

@ -1,10 +1,14 @@
#pragma once
#ifdef _WIN32
#define OPENREG_EXPORT __declspec(dllexport)
#else
#define OPENREG_EXPORT __attribute__((visibility("default")))
#endif
#include <c10/core/Device.h>
#include <c10/macros/Macros.h>
#include <include/Macros.h>
#include <limits>
namespace c10::openreg {

View File

@ -1,253 +0,0 @@
#include "OpenRegStream.h"
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <array>
#include <atomic>
#include <cstdint>
#include <deque>
namespace c10::openreg {
namespace {
// Global stream state and constants
static c10::once_flag init_flag;
static DeviceIndex num_devices = -1;
static constexpr int kStreamsPerPoolBits = 5;
static constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
static constexpr int kStreamTypeBits = 2;
/*
* The stream pools are lazily initialized when the first queue is requested
* for a device. The device flags track the initialization of each device. When
 * a queue is requested, the next queue in the pool is returned in a
* round-robin fashion, see Note [Stream Management].
*/
static std::deque<c10::once_flag> device_flags;
static std::vector<std::array<
std::array<orStream_t, kStreamsPerPool>,
c10::openreg::max_compile_time_stream_priorities>>
streams;
static std::deque<
std::array<std::atomic<uint32_t>, max_compile_time_stream_priorities>>
priority_counters;
static thread_local std::unique_ptr<StreamId[]> current_streams = nullptr;
/*
* Note [StreamId assignment]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~
* How do we assign stream IDs?
*
* -- 56 bits -- -- 5 bits -- -- 2 bits -- -- 1 bit --
* zeros StreamIdIndex StreamIdType Ext/native stream
* ignored for ext ignored for ext
*
* Where StreamIdType:
* 00 = default stream
* 01 = normal stream
* 11 = external stream
*
 * For an external stream, the StreamId is an orStream_t pointer. This means the last
 * bit will always be 0. So when constructing a StreamId for a native stream we
 * set the last bit to 1 to distinguish between native and external streams.
*
* StreamId is 64-bit, so we can just rely on regular promotion rules.
* We rely on StreamIdIndex and StreamIdType being non-negative;
*/
using StreamIdIndex = uint8_t;
enum class StreamIdType : uint8_t {
DEFAULT = 0x0,
NORMAL = 0x1,
EXT = 0x3,
};
inline std::ostream& operator<<(std::ostream& stream, StreamIdType s) {
switch (s) {
case StreamIdType::DEFAULT:
return stream << "DEFAULT";
case StreamIdType::NORMAL:
return stream << "NORMAL";
case StreamIdType::EXT:
return stream << "EXT";
default:
break;
}
return stream << static_cast<int16_t>(s);
}
static inline StreamIdType streamIdType(StreamId s) {
// Externally allocated streams have their id being the orStream_ptr
// so the last bit will be 0
if (!(s & 1)) {
return StreamIdType(StreamIdType::EXT);
}
int mask_for_type = (1 << kStreamTypeBits) - 1;
auto st = static_cast<StreamIdType>((s >> 1) & mask_for_type);
TORCH_CHECK(
st == StreamIdType::DEFAULT || st == StreamIdType::NORMAL,
"invalid StreamId: ",
s);
return st;
}
static inline size_t streamIdIndex(StreamId s) {
return static_cast<size_t>(
(s >> (kStreamTypeBits + 1)) & ((1 << kStreamsPerPoolBits) - 1));
}
StreamId makeStreamId(StreamIdType st, size_t si) {
if (st == StreamIdType::EXT) {
return static_cast<StreamId>(0);
}
return (static_cast<StreamId>(si) << (kStreamTypeBits + 1)) |
(static_cast<StreamId>(st) << 1) | 1;
}
static void initGlobalStreamState() {
num_devices = device_count();
device_flags.resize(num_devices);
streams.resize(num_devices);
priority_counters.resize(num_devices);
}
static void initSingleDeviceStream(
int priority,
DeviceIndex device_index,
int i) {
auto& stream = streams[device_index][priority][i];
OPENREG_CHECK(orStreamCreateWithPriority(&stream, 0, priority));
priority_counters[device_index][priority] = 0;
}
// Creates stream pools for the specified device. It should be called only once.
static void initDeviceStreamState(DeviceIndex device_index) {
for (const auto i : c10::irange(kStreamsPerPool)) {
for (const auto p : c10::irange(max_compile_time_stream_priorities)) {
initSingleDeviceStream(p, device_index, i);
}
}
}
static void initOpenRegStreamsOnce() {
c10::call_once(init_flag, initGlobalStreamState);
if (current_streams) {
return;
}
// Inits current streams (thread local) to the last queue in the "normal
// priority" queue pool. Note: the queue pool has not been initialized yet.
// It will be initialized in initDeviceStreamState for the specified device.
current_streams = std::make_unique<StreamId[]>(num_devices);
for (const auto i : c10::irange(num_devices)) {
current_streams[i] = makeStreamId(StreamIdType::DEFAULT, 0);
}
}
static uint32_t get_idx(std::atomic<uint32_t>& counter) {
auto raw_idx = counter++;
return raw_idx % kStreamsPerPool;
}
OpenRegStream OpenRegStreamForId(DeviceIndex device_index, StreamId stream_id) {
return OpenRegStream(
OpenRegStream::UNCHECKED,
Stream(
Stream::UNSAFE,
c10::Device(DeviceType::PrivateUse1, device_index),
stream_id));
}
} // anonymous namespace
// See Note [StreamId assignment]
orStream_t OpenRegStream::stream() const {
c10::DeviceIndex device_index = stream_.device_index();
StreamId stream_id = stream_.id();
StreamIdType st = streamIdType(stream_id);
size_t si = streamIdIndex(stream_id);
switch (st) {
// The index 0 stream is default as well.
case StreamIdType::DEFAULT:
case StreamIdType::NORMAL:
return streams[device_index][static_cast<uint8_t>(st)][si];
case StreamIdType::EXT:
return reinterpret_cast<orStream_t>(stream_id);
default:
TORCH_CHECK(
false,
"Unrecognized stream ",
stream_,
" (I didn't recognize the stream type, ",
st,
").",
" Did you manufacture the StreamId yourself? Don't do that;");
}
}
// Returns a stream from the requested pool
// Note: when called the first time on a device, this will create the
// stream pools for that device.
OpenRegStream getStreamFromPool(const int priority, DeviceIndex device_index) {
initOpenRegStreamsOnce();
if (device_index == -1) {
device_index = current_device();
}
c10::call_once(
device_flags[device_index], initDeviceStreamState, device_index);
auto pri_idx =
std::clamp(priority, 0, max_compile_time_stream_priorities - 1);
const auto idx = get_idx(priority_counters[device_index][pri_idx]);
auto id_type = static_cast<StreamIdType>(pri_idx);
return OpenRegStreamForId(device_index, makeStreamId(id_type, idx));
}
OpenRegStream getStreamFromPool(const bool isHighPriority, DeviceIndex device) {
initOpenRegStreamsOnce();
int priority = 0;
return getStreamFromPool(priority, device);
}
OpenRegStream getStreamFromExternal(
orStream_t ext_stream,
DeviceIndex device_index) {
return OpenRegStreamForId(
device_index, reinterpret_cast<int64_t>(ext_stream));
}
OpenRegStream getDefaultOpenRegStream(DeviceIndex device_index) {
initOpenRegStreamsOnce();
if (device_index == -1) {
device_index = current_device();
}
return OpenRegStreamForId(
device_index, makeStreamId(StreamIdType::DEFAULT, 0));
}
OpenRegStream getCurrentOpenRegStream(DeviceIndex device_index) {
initOpenRegStreamsOnce();
if (device_index == -1) {
device_index = current_device();
}
return OpenRegStreamForId(device_index, current_streams[device_index]);
}
void setCurrentOpenRegStream(OpenRegStream stream) {
initOpenRegStreamsOnce();
current_streams[stream.device_index()] = stream.id();
}
std::ostream& operator<<(std::ostream& stream, const OpenRegStream& s) {
return stream << s.unwrap();
}
} // namespace c10::openreg

View File

@ -1,162 +0,0 @@
#pragma once
#include <include/openreg.h>
#include "OpenRegException.h"
#include "OpenRegFunctions.h"
#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/util/Exception.h>
namespace c10::openreg {
static constexpr int max_compile_time_stream_priorities = 1;
class OpenRegStream {
public:
enum Unchecked { UNCHECKED };
explicit OpenRegStream(Stream stream) : stream_(stream) {
TORCH_CHECK(stream_.device_type() == DeviceType::PrivateUse1);
}
explicit OpenRegStream(Unchecked, Stream stream) : stream_(stream) {}
bool operator==(const OpenRegStream& other) const noexcept {
return unwrap() == other.unwrap();
}
bool operator!=(const OpenRegStream& other) const noexcept {
return unwrap() != other.unwrap();
}
operator orStream_t() const {
return stream();
}
operator Stream() const {
return unwrap();
}
DeviceType device_type() const {
return DeviceType::PrivateUse1;
}
DeviceIndex device_index() const {
return stream_.device_index();
}
Device device() const {
return Device(DeviceType::PrivateUse1, device_index());
}
StreamId id() const {
return stream_.id();
}
bool query() const {
DeviceGuard guard{stream_.device()};
if (orStreamQuery(stream()) == orSuccess) {
return true;
}
return false;
}
void synchronize() const {
DeviceGuard guard{stream_.device()};
OPENREG_CHECK(orStreamSynchronize(stream()));
}
int priority() const {
DeviceGuard guard{stream_.device()};
int priority = 0;
OPENREG_CHECK(orStreamGetPriority(stream(), &priority));
return priority;
}
orStream_t stream() const;
Stream unwrap() const {
return stream_;
}
struct c10::StreamData3 pack3() const {
return stream_.pack3();
}
static OpenRegStream unpack3(
StreamId stream_id,
DeviceIndex device_index,
DeviceType device_type) {
return OpenRegStream(Stream::unpack3(stream_id, device_index, device_type));
}
private:
Stream stream_;
};
/*
* Get a stream from the pool in a round-robin fashion.
*
* You can request a stream from the highest priority pool by setting
* isHighPriority to true for a specific device.
*/
OPENREG_EXPORT OpenRegStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
/*
* Get a stream from the pool in a round-robin fashion.
*
* You can request a stream by setting a priority value for a specific device.
 * The lower the priority number, the higher the priority.
*/
OPENREG_EXPORT OpenRegStream
getStreamFromPool(const int priority, DeviceIndex device = -1);
/*
 * Get an OpenRegStream from an externally allocated one.
*
* This is mainly for interoperability with different libraries where we
* want to operate on a non-torch allocated stream for data exchange or similar
* purposes
*/
OPENREG_EXPORT OpenRegStream
getStreamFromExternal(orStream_t ext_stream, DeviceIndex device_index);
/*
* Get the default OpenReg stream, for the passed OpenReg device, or for the
* current device if no device index is passed.
*/
OPENREG_EXPORT OpenRegStream
getDefaultOpenRegStream(DeviceIndex device_index = -1);
/*
* Get the current OpenReg stream, for the passed OpenReg device, or for the
* current device if no device index is passed.
*/
OPENREG_EXPORT OpenRegStream
getCurrentOpenRegStream(DeviceIndex device_index = -1);
/*
* Set the current stream on the device of the passed in stream to be the passed
* in stream.
*/
OPENREG_EXPORT void setCurrentOpenRegStream(OpenRegStream stream);
OPENREG_EXPORT std::ostream& operator<<(
std::ostream& stream,
const OpenRegStream& s);
} // namespace c10::openreg
namespace std {
template <>
struct hash<c10::openreg::OpenRegStream> {
size_t operator()(c10::openreg::OpenRegStream s) const noexcept {
return std::hash<c10::Stream>{}(s.unwrap());
}
};
} // namespace std

View File

@ -1,7 +0,0 @@
#pragma once
#ifdef _WIN32
#define OPENREG_EXPORT __declspec(dllexport)
#else
#define OPENREG_EXPORT __attribute__((visibility("default")))
#endif

View File

@ -53,7 +53,7 @@ def build_deps():
".",
"--target",
"install",
"--config", # For multi-config generators
"--config",
"Release",
"--",
]

View File

@ -1,14 +1,7 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(TORCH_OPENREG CXX C)
option(USE_TEST "Build and run unit tests" ON)
set(LIBRARY_NAME openreg)
set(LIBRARY_TEST ortests)
file(GLOB_RECURSE SOURCE_FILES
"${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
)
add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES})
@ -20,26 +13,3 @@ install(TARGETS ${LIBRARY_NAME}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
if(USE_TEST)
enable_testing()
include(${CMAKE_CURRENT_LIST_DIR}/cmake/GTestTargets.cmake)
file(GLOB_RECURSE TEST_FILES
"${CMAKE_CURRENT_SOURCE_DIR}/tests/*.cpp"
)
add_executable(${LIBRARY_TEST} ${TEST_FILES})
target_link_libraries(${LIBRARY_TEST}
PRIVATE
${LIBRARY_NAME}
gtest
gtest_main
)
add_test(NAME alltests COMMAND ${LIBRARY_TEST})
add_custom_command(TARGET ${LIBRARY_TEST}
POST_BUILD
COMMAND ${CMAKE_CTEST_COMMAND} -C Release --output-on-failure --verbose)
endif()

View File

@ -4,9 +4,9 @@
OpenReg is a C++ backend library that simulates the behavior of a CUDA-like device on a CPU. Its core objective is **not to accelerate computation or improve performance**, but rather to **simulate modern CUDA programming, enabling developers to prototype and test in an environment without actual GPU hardware**. The current design principles are as follows:
* **API Consistency**: Provide an interface consistent with the CUDA Runtime API, allowing upper-level applications (like PyTorch's `PrivateUse1` backend) to switch and test seamlessly.
* **API Consistency**: Provide an interface consistent with the CUDA Runtime API, allowing upper-level applications (like PyTorch's PrivateUse1 backend) to switch and test seamlessly.
* **Functional Consistency**: Provide behavior consistent with the CUDA Runtime, such as memory isolation, device context management, etc.
* **Completeness**: Aim to support `PrivateUse1` device integration and safeguard the third-party device integration mechanism, without striving to cover all capabilities of the CUDA Runtime.
* **Completeness**: Aim to support PrivateUse1 device integration and safeguard the third-party device integration mechanism, without striving to cover all capabilities of the CUDA Runtime.
## Directory Structure
@ -14,34 +14,19 @@ The project's code is organized with a clear structure and separation of respons
```text
openreg/
├── README.md # Comprehensive introduction of OpenReg.
├── CMakeLists.txt # Top-level CMake build script, used to compile and generate libopenreg.so
├── cmake/
│ └── GTestTargets.cmake # Utilities for fetching GoogleTest.
├── CMakeLists.txt # Top-level CMake build script, used to compile and generate libopenreg.so
├── include/
│ ├── openreg.h # Public API header file, external users only need to include this file
│ └── openreg.inl # Public API header file, as an extension of openreg.h, cannot be included separately.
├── example/
│ └── example.cpp # Example for OpenReg.
├── tests/
│ ├── event_tests.cpp # Testcases about OpenReg Event.
│ ├── stream_tests.cpp # Testcases about OpenReg Stream.
│ ├── device_tests.cpp # Testcases about OpenReg Device.
│ └── memory_tests.cpp # Testcases about OpenReg Memory.
│ └── openreg.h # Public API header file, external users only need to include this file
└── csrc/
├── device.cpp # Implementation of device management APIs
├── memory.cpp # Implementation of memory management APIs
└── stream.cpp # Implementation of stream and event APIs.
├── device.cpp # Implementation of device management-related APIs
└── memory.cpp # Implementation of APIs for memory management, copying, and protection
```
* `CMakeLists.txt`: Responsible for compiling and linking all source files under the `csrc/` directory to generate the final `libopenreg.so` shared library.
* `include`: Defines all externally exposed APIs, data structures, and enums.
* `openreg.h`: Defines all externally exposed C-style APIs.
* `openreg.inl`: Defines all externally exposed C++ APIs.
* `include/openreg.h`: Defines all externally exposed C-style APIs, data structures, and enums. It is the "public face" of this library.
* `csrc/`: Contains the C++ implementation source code for all core functionalities.
* `device.cpp`: Implements the core functions of device management: device discovery and context management.
* `memory.cpp`: Implements the core functions of memory management: allocation, free, copy and memory protection.
* `stream.cpp`: Implements the core functions of streams and events: creation, destruction, recording, synchronization, and so on.
* `device.cpp`: Implements device discovery (`orGetDeviceCount`) and thread context management (`orSetDevice`/`orGetDevice`).
* `memory.cpp`: Implements the core functions of memory allocation (`orMalloc`/`orMallocHost`), deallocation, copying, and memory protection (`orMemoryProtect`, `orMemoryUnprotect`).
* `CMakeLists.txt`: Responsible for compiling and linking all source files under the `csrc/` directory to generate the final `libopenreg.so` shared library.
## Implemented APIs
@ -49,49 +34,25 @@ OpenReg currently provides a set of APIs covering basic memory and device manage
### Device Management APIs
| OpenReg | CUDA | Feature Description |
| :------------------------------- | :--------------------------------- | :--------------------------------- |
| `orGetDeviceCount` | `cudaGetDeviceCount` | Get the number of available GPUs |
| `orSetDevice` | `cudaSetDevice` | Set the active GPU |
| `orGetDevice` | `cudaGetDevice` | Get the current GPU |
| `orDeviceSynchronize` | `cudaDeviceSynchronize` | Wait for all GPU tasks to finish |
| `orDeviceGetStreamPriorityRange` | `cudaDeviceGetStreamPriorityRange` | Get the range of stream priorities |
| OpenReg | CUDA | Feature Description |
| :------------------- | :------------------- | :------------------------------------------------ |
| `orGetDeviceCount` | `cudaGetDeviceCount` | Get the number of devices |
| `orSetDevice` | `cudaSetDevice` | Set the current device for the current thread |
| `orGetDevice` | `cudaGetDevice` | Get the current device for the current thread |
### Memory Management APIs
| OpenReg | CUDA | Feature Description |
| :----------------------- | :------------------------- | :---------------------------------------- |
| `orMalloc` | `cudaMalloc` | Allocate device memory |
| `orFree` | `cudaFree` | Free device memory |
| `orMallocHost` | `cudaMallocHost` | Allocate page-locked (Pinned) host memory |
| `orFreeHost` | `cudaFreeHost` | Free page-locked host memory |
| `orMemcpy` | `cudaMemcpy` | Synchronous memory copy |
| `orMemcpyAsync` | `cudaMemcpyAsync` | Asynchronous memory copy |
| `orPointerGetAttributes` | `cudaPointerGetAttributes` | Get pointer attributes |
### Stream APIs
| OpenReg | CUDA | Feature Description |
| :--------------------------- | :----------------------------- | :------------------------------------- |
| `orStreamCreate` | `cudaStreamCreate` | Create a default-priority stream |
| `orStreamCreateWithPriority` | `cudaStreamCreateWithPriority` | Create a stream with a given priority |
| `orStreamDestroy` | `cudaStreamDestroy` | Destroy a stream |
| `orStreamQuery` | `cudaStreamQuery` | Check if a stream has completed |
| `orStreamSynchronize` | `cudaStreamSynchronize` | Wait for a stream to complete |
| `orStreamWaitEvent` | `cudaStreamWaitEvent` | Make a stream wait for an event |
| `orStreamGetPriority` | `cudaStreamGetPriority` | Get a stream's priority |
### Event APIs
| OpenReg | CUDA | Feature Description |
| :----------------------- | :------------------------- | :---------------------------------- |
| `orEventCreate` | `cudaEventCreate` | Create an event with default flag |
| `orEventCreateWithFlags` | `cudaEventCreateWithFlags` | Create an event with specific flag |
| `orEventDestroy` | `cudaEventDestroy` | Destroy an event |
| `orEventRecord` | `cudaEventRecord` | Record an event in a stream |
| `orEventSynchronize` | `cudaEventSynchronize` | Wait for an event to complete |
| `orEventQuery` | `cudaEventQuery` | Check if an event has completed |
| `orEventElapsedTime` | `cudaEventElapsedTime` | Get time elapsed between two events |
| OpenReg | CUDA | Feature Description |
| :----------------------- | :--------------------------- | :----------------------------------------- |
| `orMalloc` | `cudaMalloc` | Allocate device memory |
| `orFree` | `cudaFree` | Free device memory |
| `orMallocHost` | `cudaMallocHost` | Allocate page-locked (Pinned) host memory |
| `orFreeHost` | `cudaFreeHost` | Free page-locked host memory |
| `orMemcpy` | `cudaMemcpy` | Synchronous memory copy |
| `orMemcpyAsync` | `cudaMemcpyAsync` | Asynchronous memory copy |
| `orPointerGetAttributes` | `cudaPointerGetAttributes` | Get pointer attributes |
| `orMemoryUnprotect` | - | (Internal use) Unprotect memory |
| `orMemoryProtect` | - | (Internal use) Restore memory protection |
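
Taken together, the tables above mirror the familiar CUDA Runtime pattern of device selection, allocation, and explicit copies. The following is a minimal sketch of how these calls compose; it uses only APIs listed in `openreg.h`, and the include path and link line are assumptions borrowed from the usage example later in this README.

```cpp
// Minimal sketch: copy a buffer to a simulated device and back using the OpenReg C API.
#include "include/openreg.h"

#include <cstdio>
#include <cstdlib>
#include <vector>

#define OR_CHECK(expr)                                       \
  do {                                                       \
    orError_t err_ = (expr);                                 \
    if (err_ != orSuccess) {                                 \
      std::fprintf(stderr, "%s failed (%d)\n", #expr, err_); \
      std::exit(EXIT_FAILURE);                               \
    }                                                        \
  } while (0)

int main() {
  int count = 0;
  OR_CHECK(orGetDeviceCount(&count));  // OpenReg simulates a small, fixed number of devices
  OR_CHECK(orSetDevice(0));

  constexpr size_t kBytes = 256 * sizeof(int);
  std::vector<int> host_in(256, 7), host_out(256, 0);

  void* dev = nullptr;
  OR_CHECK(orMalloc(&dev, kBytes));
  OR_CHECK(orMemcpy(dev, host_in.data(), kBytes, orMemcpyHostToDevice));
  OR_CHECK(orMemcpy(host_out.data(), dev, kBytes, orMemcpyDeviceToHost));

  orPointerAttributes attr{};
  OR_CHECK(orPointerGetAttributes(&attr, dev));  // attr.type should report device memory
  std::printf("roundtrip ok: %d, device: %d\n", host_out[0] == 7, attr.device);

  OR_CHECK(orFree(dev));
  return 0;
}
```

Compile and run it the same way as `example.cpp`, linking against `libopenreg.so`.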
## Implementation Principles
@ -110,42 +71,67 @@ Simulating device memory, host memory, and memory copies:
2. **Deallocation**: Memory is freed using `munmap`.
3. **Authorization**: When a legitimate memory access is required, an RAII guard restores the memory permissions to `PROT_READ | PROT_WRITE`; the permissions are automatically reverted to `PROT_NONE` when the scope is exited (a sketch of such a guard follows).
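
The authorization step is essentially an RAII guard around `mprotect`. Below is a minimal, self-contained sketch of that idea written directly against POSIX `mmap`/`mprotect` (Linux-only); the class name `ProtectionGuard` is illustrative and not part of OpenReg, which implements the equivalent logic internally.

```cpp
// Illustrative RAII guard: "device" memory stays PROT_NONE except inside a guard's scope.
#include <sys/mman.h>

#include <cstddef>
#include <cstring>
#include <stdexcept>

class ProtectionGuard {  // hypothetical name, for illustration only
 public:
  ProtectionGuard(void* ptr, size_t size) : ptr_(ptr), size_(size) {
    // Temporarily allow CPU access to the simulated device allocation.
    if (::mprotect(ptr_, size_, PROT_READ | PROT_WRITE) != 0)
      throw std::runtime_error("mprotect(RW) failed");
  }
  ~ProtectionGuard() {
    // Revert to no-access on scope exit, restoring the fault-on-touch behavior.
    ::mprotect(ptr_, size_, PROT_NONE);
  }
  ProtectionGuard(const ProtectionGuard&) = delete;
  ProtectionGuard& operator=(const ProtectionGuard&) = delete;

 private:
  void* ptr_;
  size_t size_;
};

int main() {
  const size_t size = 4096;  // one page, so the mapping is page-aligned by construction
  void* fake_device_mem =
      ::mmap(nullptr, size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (fake_device_mem == MAP_FAILED) return 1;
  // Direct access here would fault; a guarded scope makes the access legal.
  {
    ProtectionGuard guard(fake_device_mem, size);
    std::memset(fake_device_mem, 0, size);
  }
  ::munmap(fake_device_mem, size);
  return 0;
}
```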
### Stream & Event Principles
Simulating creation, release, and synchronization for events and streams:
1. **Event**: Each event is encapsulated as a task function and placed into a stream, which acts as a thread. Upon completion of the task, a flag within the event is flipped to simulate the event's status.
2. **Stream**: When a stream is created, a new thread is spawned that sequentially processes each task in the stream's task queue. Tasks can be wrappers around kernel functions or events.
3. **Synchronization**: Synchronization between streams and events is achieved using threads, condition variables, and mutexes (see the sketch after this list).
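
As a rough illustration of the stream half of this design, the sketch below models a stream as a worker thread draining a FIFO task queue guarded by a mutex and condition variable. It mirrors the structure of OpenReg's `stream.cpp` but is simplified; `MiniStream` and its methods are illustrative names, not library APIs.

```cpp
// Simplified stream model: one worker thread per stream, executing queued tasks in FIFO order.
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>

class MiniStream {  // illustrative only; the real type is the opaque orStream
 public:
  MiniStream() : worker_([this] { run(); }) {}
  ~MiniStream() {
    {
      std::lock_guard<std::mutex> lock(mtx_);
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();  // drains remaining tasks, akin to synchronize-then-destroy
  }

  // Enqueue a task; asynchronous APIs such as kernel launches reduce to this.
  void enqueue(std::function<void()> task) {
    {
      std::lock_guard<std::mutex> lock(mtx_);
      tasks_.push(std::move(task));
    }
    cv_.notify_one();
  }

 private:
  void run() {
    for (;;) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lock(mtx_);
        cv_.wait(lock, [this] { return stop_ || !tasks_.empty(); });
        if (stop_ && tasks_.empty()) return;
        task = std::move(tasks_.front());
        tasks_.pop();
      }
      task();  // run outside the lock so long tasks do not block enqueue()
    }
  }

  std::queue<std::function<void()>> tasks_;
  std::mutex mtx_;
  std::condition_variable cv_;
  bool stop_ = false;
  std::thread worker_;  // declared last so all other members are initialized first
};

int main() {
  MiniStream stream;
  stream.enqueue([] { std::puts("task 1"); });
  stream.enqueue([] { std::puts("task 2"); });
}
```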
## Usage Example
Please refer to [example](example/example.cpp) for example.
The following is a simple code snippet demonstrating how to use the core features of the OpenReg library.
The command to compile example.cpp is as follows:
```cpp
#include "openreg.h"
#include <iostream>
#include <vector>
#include <cstdio>
```Shell
mkdir build
#define OR_CHECK(call) do { \
orError_t err = call; \
if (err != orSuccess) { \
fprintf(stderr, "OR Error code %d in %s at line %d\n", err, __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while (0)
pushd build
cmake ..
make -j 32
popd
int main() {
int device_count = 0;
OR_CHECK(orGetDeviceCount(&device_count));
std::cout << "Found " << device_count << " simulated devices." << std::endl;
g++ -o out example/example.cpp -L ./build -lopenreg
LD_LIBRARY_PATH=./build ./out
```
int current_device = -1;
OR_CHECK(orSetDevice(1));
OR_CHECK(orGetDevice(&current_device));
std::cout << "Set current device to " << current_device << "." << std::endl;
The output is as follows:
const int n = 1024;
const size_t size = n * sizeof(int);
int *h_a, *d_a;
OR_CHECK(orMallocHost((void**)&h_a, size));
OR_CHECK(orMalloc((void**)&d_a, size));
```Shell
Current environment have 2 devices
Current is 0 device
All tasks have been submitted.
Kernel execution time: 0.238168 ms
Verification PASSED!
orPointerAttributes attr;
OR_CHECK(orPointerGetAttributes(&attr, d_a));
std::cout << "Pointer " << (void*)d_a << " is of type " << attr.type
<< " on device " << attr.device << std::endl;
for (int i = 0; i < n; ++i) {
h_a[i] = i;
}
OR_CHECK(orMemcpy(d_a, h_a, size, orMemcpyHostToDevice));
std::cout << "Data copied from Host to Device." << std::endl;
// std::cout << "Trying to access device memory directly from CPU..." << std::endl;
// int val = d_a[0]; // CRASH!
// Clean up resources
OR_CHECK(orFree(d_a));
OR_CHECK(orFreeHost(h_a));
std::cout << "Resources freed." << std::endl;
return 0;
}
```
## Next Steps
The most basic functions of the OpenReg backend are currently supported; they will be optimized and expanded based on the needs of PyTorch integration.
To better support PrivateUse1 device integration, the following capabilities are planned for the future:
* **Stream Support**: Provide the ability to simulate CUDA Streams.
* **Event Support**: Provide the ability to simulate CUDA Events.
* **Cross-Platform Support**: Add support for Windows and macOS (low priority).

View File

@ -1,12 +0,0 @@
set(GTest_REL_PATH "../../../../../../../third_party/googletest")
get_filename_component(GTest_DIR "${CMAKE_CURRENT_LIST_DIR}/${GTest_REL_PATH}" ABSOLUTE)
if(EXISTS "${GTest_DIR}/CMakeLists.txt")
message(STATUS "Found GTest: ${GTest_DIR}")
set(BUILD_GMOCK OFF CACHE BOOL "Disable GMock build")
set(INSTALL_GTEST OFF CACHE BOOL "Disable GTest install")
add_subdirectory(${GTest_DIR} "${CMAKE_BINARY_DIR}/gtest")
else()
message(FATAL_ERROR "GTest Not Found")
endif()

View File

@ -1,12 +1,10 @@
#include <include/openreg.h>
namespace {
// Total device numbers
constexpr int DEVICE_COUNT = 2;
// Current device index
thread_local int gCurrentDevice = 0;
} // namespace
orError_t orGetDeviceCount(int* count) {

View File

@ -1,20 +1,10 @@
#include "memory.h"
#include <include/openreg.h>
#include <map>
#include <mutex>
namespace {
struct Block {
orMemoryType type = orMemoryType::orMemoryTypeUnmanaged;
int device = -1;
void* pointer = nullptr;
size_t size = 0;
int refcount{0};
};
class MemoryManager {
public:
static MemoryManager& getInstance() {
@ -48,7 +38,7 @@ class MemoryManager {
}
}
m_registry[mem] = {type, current_device, mem, aligned_size, 0};
m_registry[mem] = {type, current_device, mem, aligned_size};
*ptr = mem;
return orSuccess;
}
@ -61,15 +51,14 @@ class MemoryManager {
auto it = m_registry.find(ptr);
if (it == m_registry.end())
return orErrorUnknown;
const auto& info = it->second;
if (info.type == orMemoryType::orMemoryTypeDevice) {
openreg::mprotect(info.pointer, info.size, F_PROT_READ | F_PROT_WRITE);
openreg::munmap(info.pointer, info.size);
} else {
openreg::free(info.pointer);
}
m_registry.erase(it);
return orSuccess;
}
@ -81,39 +70,36 @@ class MemoryManager {
orMemcpyKind kind) {
if (!dst || !src || count == 0)
return orErrorUnknown;
std::lock_guard<std::mutex> lock(m_mutex);
Block* dst_info = getBlockInfoNoLock(dst);
Block* src_info = getBlockInfoNoLock(src);
orPointerAttributes dst_info = getPointerInfo(dst);
orPointerAttributes src_info = getPointerInfo(src);
switch (kind) {
case orMemcpyHostToDevice:
if ((!dst_info || dst_info->type != orMemoryType::orMemoryTypeDevice) ||
(src_info && src_info->type == orMemoryType::orMemoryTypeDevice))
if (dst_info.type != orMemoryType::orMemoryTypeDevice ||
src_info.type == orMemoryType::orMemoryTypeDevice)
return orErrorUnknown;
break;
case orMemcpyDeviceToHost:
if ((dst_info && dst_info->type == orMemoryType::orMemoryTypeDevice) ||
(!src_info || src_info->type != orMemoryType::orMemoryTypeDevice))
if (dst_info.type == orMemoryType::orMemoryTypeDevice ||
src_info.type != orMemoryType::orMemoryTypeDevice)
return orErrorUnknown;
break;
case orMemcpyDeviceToDevice:
if ((!dst_info || dst_info->type != orMemoryType::orMemoryTypeDevice) ||
(!src_info || src_info->type != orMemoryType::orMemoryTypeDevice))
if (dst_info.type != orMemoryType::orMemoryTypeDevice ||
src_info.type != orMemoryType::orMemoryTypeDevice)
return orErrorUnknown;
break;
case orMemcpyHostToHost:
if ((dst_info && dst_info->type == orMemoryType::orMemoryTypeDevice) ||
(src_info && src_info->type == orMemoryType::orMemoryTypeDevice))
if (dst_info.type == orMemoryType::orMemoryTypeDevice ||
src_info.type == orMemoryType::orMemoryTypeDevice)
return orErrorUnknown;
break;
}
unprotectNoLock(dst_info);
unprotectNoLock(src_info);
::memcpy(dst, src, count);
protectNoLock(dst_info);
protectNoLock(src_info);
{
ScopedMemoryProtector dst_protector(dst_info);
ScopedMemoryProtector src_protector(src_info);
::memcpy(dst, src, count);
}
return orSuccess;
}
@ -125,16 +111,17 @@ class MemoryManager {
return orErrorUnknown;
std::lock_guard<std::mutex> lock(m_mutex);
Block* info = getBlockInfoNoLock(ptr);
orPointerAttributes info = getPointerInfo(ptr);
if (!info) {
attributes->type = orMemoryType::orMemoryTypeUnmanaged;
attributes->type = info.type;
if (info.type == orMemoryType::orMemoryTypeUnmanaged) {
attributes->device = -1;
attributes->pointer = const_cast<void*>(ptr);
attributes->size = 0;
} else {
attributes->type = info->type;
attributes->device = info->device;
attributes->pointer = info->pointer;
attributes->device = info.device;
attributes->pointer = info.pointer;
attributes->size = info.size;
}
return orSuccess;
@ -142,61 +129,71 @@ class MemoryManager {
orError_t unprotect(void* ptr) {
std::lock_guard<std::mutex> lock(m_mutex);
return unprotectNoLock(getBlockInfoNoLock(ptr));
orPointerAttributes info = getPointerInfo(ptr);
if (info.type != orMemoryType::orMemoryTypeDevice) {
return orErrorUnknown;
}
if (openreg::mprotect(
info.pointer, info.size, F_PROT_READ | F_PROT_WRITE) != 0) {
return orErrorUnknown;
}
return orSuccess;
}
orError_t protect(void* ptr) {
std::lock_guard<std::mutex> lock(m_mutex);
return protectNoLock(getBlockInfoNoLock(ptr));
orPointerAttributes info = getPointerInfo(ptr);
if (info.type != orMemoryType::orMemoryTypeDevice) {
return orErrorUnknown;
}
if (openreg::mprotect(info.pointer, info.size, F_PROT_NONE) != 0) {
return orErrorUnknown;
}
return orSuccess;
}
private:
class ScopedMemoryProtector {
public:
ScopedMemoryProtector(const orPointerAttributes& info)
: m_info(info), m_protected(false) {
if (m_info.type == orMemoryType::orMemoryTypeDevice) {
if (openreg::mprotect(
m_info.pointer, m_info.size, F_PROT_READ | F_PROT_WRITE) == 0) {
m_protected = true;
}
}
}
~ScopedMemoryProtector() {
if (m_protected) {
openreg::mprotect(m_info.pointer, m_info.size, F_PROT_NONE);
}
}
ScopedMemoryProtector(const ScopedMemoryProtector&) = delete;
ScopedMemoryProtector& operator=(const ScopedMemoryProtector&) = delete;
private:
orPointerAttributes m_info;
bool m_protected;
};
MemoryManager() = default;
orError_t unprotectNoLock(Block* info) {
if (info && info->type == orMemoryType::orMemoryTypeDevice) {
if (info->refcount == 0) {
if (openreg::mprotect(
info->pointer, info->size, F_PROT_READ | F_PROT_WRITE) != 0) {
return orErrorUnknown;
}
}
info->refcount++;
}
return orSuccess;
}
orError_t protectNoLock(Block* info) {
if (info && info->type == orMemoryType::orMemoryTypeDevice) {
if (info->refcount == 1) {
if (openreg::mprotect(info->pointer, info->size, F_PROT_NONE) != 0) {
return orErrorUnknown;
}
}
info->refcount--;
}
return orSuccess;
}
Block* getBlockInfoNoLock(const void* ptr) {
orPointerAttributes getPointerInfo(const void* ptr) {
auto it = m_registry.upper_bound(const_cast<void*>(ptr));
if (it != m_registry.begin()) {
--it;
const char* p_char = static_cast<const char*>(ptr);
const char* base_char = static_cast<const char*>(it->first);
if (p_char >= base_char && p_char < (base_char + it->second.size)) {
return &it->second;
return it->second;
}
}
return nullptr;
return {};
}
std::map<void*, Block> m_registry;
std::map<void*, orPointerAttributes> m_registry;
std::mutex m_mutex;
};
@ -228,22 +225,6 @@ orError_t orMemcpy(
return MemoryManager::getInstance().memcpy(dst, src, count, kind);
}
orError_t orMemcpyAsync(
void* dst,
const void* src,
size_t count,
orMemcpyKind kind,
orStream_t stream) {
if (!stream) {
return orErrorUnknown;
}
auto& mm = MemoryManager::getInstance();
return orLaunchKernel(
stream, &MemoryManager::memcpy, &mm, dst, src, count, kind);
}
orError_t orPointerGetAttributes(
orPointerAttributes* attributes,
const void* ptr) {

View File

@ -4,6 +4,8 @@
#include <cstdlib>
#include <cstring>
#include <include/openreg.h>
#if defined(_WIN32)
#include <windows.h>
#else

View File

@ -1,313 +0,0 @@
#include <include/openreg.h>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <queue>
#include <set>
#include <thread>
static std::mutex g_mutex;
static std::once_flag g_flag;
static std::vector<std::set<orStream_t>> g_streams_per_device;
static void initialize_registries() {
int device_count = 0;
orGetDeviceCount(&device_count);
g_streams_per_device.resize(device_count);
}
struct orEventImpl {
std::mutex mtx;
std::condition_variable cv;
std::atomic<bool> completed{true};
int device_index = -1;
bool timing_enabled{false};
std::chrono::high_resolution_clock::time_point completion_time;
};
struct orEvent {
std::shared_ptr<orEventImpl> impl;
};
struct orStream {
std::queue<std::function<void()>> tasks;
std::mutex mtx;
std::condition_variable cv;
std::thread worker;
std::atomic<bool> stop_flag{false};
int device_index = -1;
orStream() {
worker = std::thread([this] {
while (true) {
std::function<void()> task;
{
std::unique_lock<std::mutex> lock(this->mtx);
this->cv.wait(lock, [this] {
return this->stop_flag.load() || !this->tasks.empty();
});
if (this->stop_flag.load() && this->tasks.empty()) {
return;
}
task = std::move(this->tasks.front());
this->tasks.pop();
}
task();
}
});
}
~orStream() {
stop_flag.store(true);
cv.notify_one();
worker.join();
}
};
orError_t openreg::addTaskToStream(
orStream_t stream,
std::function<void()> task) {
if (!stream)
return orErrorUnknown;
{
std::lock_guard<std::mutex> lock(stream->mtx);
stream->tasks.push(std::move(task));
}
stream->cv.notify_one();
return orSuccess;
}
orError_t orEventCreateWithFlags(orEvent_t* event, unsigned int flags) {
if (!event)
return orErrorUnknown;
auto impl = std::make_shared<orEventImpl>();
orGetDevice(&(impl->device_index));
if (flags & orEventEnableTiming) {
impl->timing_enabled = true;
}
*event = new orEvent{std::move(impl)};
return orSuccess;
}
orError_t orEventCreate(orEvent_t* event) {
return orEventCreateWithFlags(event, orEventDisableTiming);
}
orError_t orEventDestroy(orEvent_t event) {
if (!event)
return orErrorUnknown;
delete event;
return orSuccess;
}
orError_t orEventRecord(orEvent_t event, orStream_t stream) {
if (!event || !stream)
return orErrorUnknown;
auto event_impl = event->impl;
event_impl->completed.store(false);
auto record_task = [event_impl]() {
if (event_impl->timing_enabled) {
event_impl->completion_time = std::chrono::high_resolution_clock::now();
}
{
std::lock_guard<std::mutex> lock(event_impl->mtx);
event_impl->completed.store(true);
}
event_impl->cv.notify_all();
};
return openreg::addTaskToStream(stream, record_task);
}
orError_t orEventSynchronize(orEvent_t event) {
if (!event)
return orErrorUnknown;
auto event_impl = event->impl;
std::unique_lock<std::mutex> lock(event_impl->mtx);
event_impl->cv.wait(lock, [&] { return event_impl->completed.load(); });
return orSuccess;
}
orError_t orEventQuery(orEvent_t event) {
if (!event)
return orErrorUnknown;
return event->impl->completed.load() ? orSuccess : orErrorNotReady;
}
orError_t orEventElapsedTime(float* ms, orEvent_t start, orEvent_t end) {
if (!ms || !start || !end)
return orErrorUnknown;
auto start_impl = start->impl;
auto end_impl = end->impl;
if (start_impl->device_index != end_impl->device_index) {
return orErrorUnknown;
}
if (!start_impl->timing_enabled || !end_impl->timing_enabled) {
return orErrorUnknown;
}
if (!start_impl->completed.load() || !end_impl->completed.load()) {
return orErrorUnknown;
}
auto duration = end_impl->completion_time - start_impl->completion_time;
*ms = std::chrono::duration_cast<std::chrono::duration<float, std::milli>>(
duration)
.count();
return orSuccess;
}
orError_t orStreamCreateWithPriority(
orStream_t* stream,
[[maybe_unused]] unsigned int flag,
int priority) {
if (!stream) {
return orErrorUnknown;
}
int min_p, max_p;
orDeviceGetStreamPriorityRange(&min_p, &max_p);
if (priority < min_p || priority > max_p) {
return orErrorUnknown;
}
int current_device = 0;
orGetDevice(&current_device);
orStream_t new_stream = nullptr;
new_stream = new orStream();
new_stream->device_index = current_device;
{
std::lock_guard<std::mutex> lock(g_mutex);
std::call_once(g_flag, initialize_registries);
g_streams_per_device[current_device].insert(new_stream);
}
*stream = new_stream;
return orSuccess;
}
orError_t orStreamCreate(orStream_t* stream) {
int min_p, max_p;
orDeviceGetStreamPriorityRange(&min_p, &max_p);
return orStreamCreateWithPriority(stream, 0, max_p);
}
orError_t orStreamGetPriority(
[[maybe_unused]] orStream_t stream,
int* priority) {
// Since OpenReg has only one priority level, the following code
// returns 0 directly for convenience.
*priority = 0;
return orSuccess;
}
orError_t orStreamDestroy(orStream_t stream) {
if (!stream)
return orErrorUnknown;
{
std::lock_guard<std::mutex> lock(g_mutex);
int device_idx = stream->device_index;
if (device_idx >= 0 && device_idx < g_streams_per_device.size()) {
g_streams_per_device[device_idx].erase(stream);
}
}
delete stream;
return orSuccess;
}
orError_t orStreamQuery(orStream_t stream) {
if (!stream) {
return orErrorUnknown;
}
std::lock_guard<std::mutex> lock(stream->mtx);
return stream->tasks.empty() ? orSuccess : orErrorNotReady;
}
orError_t orStreamSynchronize(orStream_t stream) {
if (!stream)
return orErrorUnknown;
orEvent_t event;
orEventCreate(&event);
orEventRecord(event, stream);
orError_t status = orEventSynchronize(event);
orEventDestroy(event);
return status;
}
orError_t orStreamWaitEvent(orStream_t stream, orEvent_t event, unsigned int) {
if (!stream || !event)
return orErrorUnknown;
auto event_impl = event->impl;
auto wait_task = [event_impl]() {
std::unique_lock<std::mutex> lock(event_impl->mtx);
event_impl->cv.wait(lock, [&] { return event_impl->completed.load(); });
};
return openreg::addTaskToStream(stream, wait_task);
}
orError_t orDeviceGetStreamPriorityRange(
int* leastPriority,
int* greatestPriority) {
if (!leastPriority || !greatestPriority) {
return orErrorUnknown;
}
// OpenReg has only one priority now.
*leastPriority = 0;
*greatestPriority = 0;
return orSuccess;
}
orError_t orDeviceSynchronize(void) {
int current_device = 0;
orGetDevice(&current_device);
std::vector<orStream_t> streams;
{
std::lock_guard<std::mutex> lock(g_mutex);
std::call_once(g_flag, initialize_registries);
auto& streams_on_device = g_streams_per_device[current_device];
streams.assign(streams_on_device.begin(), streams_on_device.end());
}
for (orStream_t stream : streams) {
orError_t status = orStreamSynchronize(stream);
if (status != orSuccess) {
return status;
}
}
return orSuccess;
}

View File

@ -1,112 +0,0 @@
#include "include/openreg.h"
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
struct MemoryGuard {
MemoryGuard(void* ptr) : ptr_(ptr) {
orMemoryUnprotect(ptr_);
}
~MemoryGuard() {
orMemoryProtect(ptr_);
}
private:
void* ptr_{};
};
void add_kernel(float* out, float* a, float* b, int num) {
for (int i = 0; i < num; ++i) {
out[i] = a[i] + b[i];
}
}
int main() {
int device_count = 0;
orGetDeviceCount(&device_count);
std::cout << "Current environment have " << device_count << " devices"
<< std::endl;
orSetDevice(0);
int current_device = -1;
orGetDevice(&current_device);
std::cout << "Current is " << current_device << " device" << std::endl;
constexpr int num = 50000;
constexpr size_t size = num * sizeof(float);
std::vector<float> host_a(num), host_b(num), host_out(num, 0.0f);
std::iota(host_a.begin(), host_a.end(), 0.0f);
for (int i = 0; i < num; ++i) {
host_b[i] = 2.0f;
}
float *dev_a, *dev_b, *dev_out;
orMalloc((void**)&dev_a, size);
orMalloc((void**)&dev_b, size);
orMalloc((void**)&dev_out, size);
// There will be subsequent memory access operations, so memory protection
// needs to be released
MemoryGuard a{dev_a};
MemoryGuard b{dev_b};
MemoryGuard c{dev_out};
orStream_t stream1, stream2;
orEvent_t start_event, stop_event;
orStreamCreate(&stream1);
orStreamCreate(&stream2);
orEventCreateWithFlags(&start_event, orEventEnableTiming);
orEventCreateWithFlags(&stop_event, orEventEnableTiming);
// Copy input from host to device
orMemcpyAsync(dev_a, host_a.data(), size, orMemcpyHostToDevice, stream1);
orMemcpyAsync(dev_b, host_b.data(), size, orMemcpyHostToDevice, stream1);
// Submit the compute kernel and two events that are used for timing.
orEventRecord(start_event, stream1);
orLaunchKernel(stream1, add_kernel, dev_out, dev_a, dev_b, num);
orEventRecord(stop_event, stream1);
// Synchronization between streams.
orStreamWaitEvent(stream2, stop_event, 0);
orMemcpyAsync(host_out.data(), dev_out, size, orMemcpyDeviceToHost, stream2);
orStreamSynchronize(stream2);
std::cout << "All tasks have been submitted." << std::endl;
float elapsed_ms = 0.0f;
orEventElapsedTime(&elapsed_ms, start_event, stop_event);
std::cout << "Kernel execution time: " << elapsed_ms << " ms" << std::endl;
bool success = true;
for (int i = 0; i < num; ++i) {
if (std::abs(host_out[i] - (host_a[i] + host_b[i])) > 1e-5) {
std::cout << "Verification FAILED at index " << i << "! Expected "
<< (host_a[i] + host_b[i]) << ", got " << host_out[i]
<< std::endl;
success = false;
break;
}
}
if (success) {
std::cout << "Verification PASSED!" << std::endl;
}
orFree(dev_a);
orFree(dev_b);
orFree(dev_out);
orStreamDestroy(stream1);
orStreamDestroy(stream2);
orEventDestroy(start_event);
orEventDestroy(stop_event);
return 0;
}

View File

@ -3,20 +3,16 @@
#include <cstddef>
#ifdef _WIN32
#define OPENREG_EXPORT __declspec(dllexport)
#define OPENREG_EXPORT __declspec(dllexport)
#else
#define OPENREG_EXPORT __attribute__((visibility("default")))
#define OPENREG_EXPORT __attribute__((visibility("default")))
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef enum orError_t {
orSuccess = 0,
orErrorUnknown = 1,
orErrorNotReady = 2
} orError_t;
typedef enum orError_t { orSuccess = 0, orErrorUnknown = 1 } orError_t;
typedef enum orMemcpyKind {
orMemcpyHostToHost = 0,
@ -35,75 +31,25 @@ struct orPointerAttributes {
orMemoryType type = orMemoryType::orMemoryTypeUnmanaged;
int device;
void* pointer;
size_t size;
};
typedef enum orEventFlags {
orEventDisableTiming = 0x0,
orEventEnableTiming = 0x1,
} orEventFlags;
struct orStream;
struct orEvent;
typedef struct orStream* orStream_t;
typedef struct orEvent* orEvent_t;
// Memory
OPENREG_EXPORT orError_t orMalloc(void** devPtr, size_t size);
OPENREG_EXPORT orError_t orFree(void* devPtr);
OPENREG_EXPORT orError_t orMallocHost(void** hostPtr, size_t size);
OPENREG_EXPORT orError_t orFreeHost(void* hostPtr);
OPENREG_EXPORT orError_t
orMemcpy(void* dst, const void* src, size_t count, orMemcpyKind kind);
OPENREG_EXPORT orError_t orMemcpyAsync(
void* dst,
const void* src,
size_t count,
orMemcpyKind kind,
orStream_t stream);
OPENREG_EXPORT orError_t
orPointerGetAttributes(orPointerAttributes* attributes, const void* ptr);
OPENREG_EXPORT orError_t orMemcpy(void* dst, const void* src, size_t count, orMemcpyKind kind);
OPENREG_EXPORT orError_t orMemoryUnprotect(void* devPtr);
OPENREG_EXPORT orError_t orMemoryProtect(void* devPtr);
// Device
OPENREG_EXPORT orError_t orGetDeviceCount(int* count);
OPENREG_EXPORT orError_t orSetDevice(int device);
OPENREG_EXPORT orError_t orGetDevice(int* device);
OPENREG_EXPORT orError_t
orDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
OPENREG_EXPORT orError_t orDeviceSynchronize(void);
// Stream
OPENREG_EXPORT orError_t orStreamCreateWithPriority(
orStream_t* stream,
unsigned int flags,
int priority);
OPENREG_EXPORT orError_t orStreamCreate(orStream_t* stream);
OPENREG_EXPORT orError_t orStreamGetPriority(orStream_t stream, int* priority);
OPENREG_EXPORT orError_t orStreamDestroy(orStream_t stream);
OPENREG_EXPORT orError_t orStreamQuery(orStream_t stream);
OPENREG_EXPORT orError_t orStreamSynchronize(orStream_t stream);
OPENREG_EXPORT orError_t
orStreamWaitEvent(orStream_t stream, orEvent_t event, unsigned int flags);
// Event
OPENREG_EXPORT orError_t
orEventCreateWithFlags(orEvent_t* event, unsigned int flags);
OPENREG_EXPORT orError_t orEventCreate(orEvent_t* event);
OPENREG_EXPORT orError_t orEventDestroy(orEvent_t event);
OPENREG_EXPORT orError_t orEventRecord(orEvent_t event, orStream_t stream);
OPENREG_EXPORT orError_t orEventSynchronize(orEvent_t event);
OPENREG_EXPORT orError_t orEventQuery(orEvent_t event);
OPENREG_EXPORT orError_t
orEventElapsedTime(float* ms, orEvent_t start, orEvent_t end);
OPENREG_EXPORT orError_t orPointerGetAttributes(
orPointerAttributes* attributes,
const void* ptr);
#ifdef __cplusplus
} // extern "C"
#endif
#ifdef __cplusplus
#define OPENREG_H
#include "openreg.inl"
#endif

View File

@ -1,42 +0,0 @@
#ifndef OPENREG_H
#error "Don`t include openreg.inl directly, include openreg.h instead."
#endif
#include <functional>
#include <tuple>
#include <utility>
namespace openreg {
OPENREG_EXPORT orError_t
addTaskToStream(orStream* stream, std::function<void()> task);
}
template <typename Func, typename... Args>
OPENREG_EXPORT inline orError_t orLaunchKernel(
orStream* stream,
Func&& kernel_func,
Args&&... args) {
if (!stream) {
return orErrorUnknown;
}
/*
 * Some tests in PyTorch still use C++11, so we use a conditional macro to
 * select different approaches for different C++ versions.
 *
 * std::apply is only supported in C++17, so for C++11/14 std::bind is
 * a more appropriate approach, although the former has better performance.
*/
#if __cplusplus >= 201703L
auto task = [func = std::forward<Func>(kernel_func),
args_tuple =
std::make_tuple(std::forward<Args>(args)...)]() mutable {
std::apply(func, std::move(args_tuple));
};
#else
auto task =
std::bind(std::forward<Func>(kernel_func), std::forward<Args>(args)...);
#endif
return openreg::addTaskToStream(stream, std::move(task));
}

View File

@ -1,41 +0,0 @@
#include <gtest/gtest.h>
#include <include/openreg.h>
namespace {
class DeviceTest : public ::testing::Test {
protected:
void SetUp() override {
orSetDevice(0);
}
};
TEST_F(DeviceTest, GetDeviceCountValid) {
int count = -1;
EXPECT_EQ(orGetDeviceCount(&count), orSuccess);
EXPECT_EQ(count, 2);
}
TEST_F(DeviceTest, GetDeviceValid) {
int device = -1;
EXPECT_EQ(orGetDevice(&device), orSuccess);
EXPECT_EQ(device, 0);
}
TEST_F(DeviceTest, SetDeviceValid) {
EXPECT_EQ(orSetDevice(1), orSuccess);
int device = -1;
EXPECT_EQ(orGetDevice(&device), orSuccess);
EXPECT_EQ(device, 1);
EXPECT_EQ(orSetDevice(0), orSuccess);
EXPECT_EQ(orGetDevice(&device), orSuccess);
EXPECT_EQ(device, 0);
}
TEST_F(DeviceTest, SetDeviceInvalidNegative) {
EXPECT_EQ(orSetDevice(-1), orErrorUnknown);
}
} // namespace

Some files were not shown because too many files have changed in this diff.