Compare commits

...

158 Commits

Author SHA1 Message Date
ac3dabf652 [Dynamo] Remove ignored modes from torch function mode stack guard
ghstack-source-id: c3398f28b58561ba6241279c6a7cf404aabfa8c7
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135503
2024-09-11 14:02:23 -07:00
54ab06fc07 [Dynamo] Remove ignored modes workaround
ghstack-source-id: 1ec9a6b4c31d310659b4a116abf5bfb1de393b12
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135502
2024-09-11 14:02:22 -07:00
32542724be [Dynamo] Trace enter/exit of TorchFunctionModes
ghstack-source-id: 8f0811c156177e2b54b3aea97835e1b15044080b
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135422
2024-09-11 14:02:22 -07:00
dfbb990dc4 [Dynamo] Simplify torch function mode stack guard
ghstack-source-id: 5fad7e6481132b96b594e6755b0fdb394aa9d56f
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135444
2024-09-09 16:02:11 -07:00
194d46e91c [Dynamo] Support thread local setattr
ghstack-source-id: d7ca565f27a57ba0aed030f74b90b3ce8faa59bd
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135443
2024-09-09 16:02:11 -07:00
9094fb5c7c [Dynamo] Trace torch function modes
ghstack-source-id: 188be474d3f4685d45141153c6425a0a2684715d
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133137
2024-09-09 16:02:11 -07:00
ec6b49eed9 [Dynamo] Disable metadata tf mode when tracing cond
ghstack-source-id: a8d524089ad362b08adb98b6851de5490815fd38
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134732
2024-09-07 23:50:38 -07:00
042f2f7746 [ONNX] Re-raise the exception if the dynamic shapes cannot be refined (#135418)
Improve error reporting. Otherwise, users will most of the time only see that the dynamic shapes could not be refined, without the underlying reason.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135418
Approved by: https://github.com/titaiwangms
2024-09-08 05:30:34 +00:00
fd494dd426 Change wrapped_linear_prepack and wrapped_quantized_linear_prepacked to private by adding _ as prefix (#135401)
Summary: In https://github.com/pytorch/pytorch/pull/134232, we added two new ops, wrapped_linear_prepack and wrapped_quantized_linear_prepacked. Following the review comments and offline discussion, we are changing them to private by adding `_` as a prefix.

Differential Revision: D62325142

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135401
Approved by: https://github.com/houseroad
2024-09-08 04:16:24 +00:00
8334cb2fb9 remove commented out breakpoints (#135363)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135363
Approved by: https://github.com/oulgen
2024-09-08 02:15:45 +00:00
e72ed4717e [Dynamo] Fix Huggingface PretrainedConfig get non const attr (#135413)
Fixes #135329

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135413
Approved by: https://github.com/anijain2305
2024-09-07 19:16:29 +00:00
3bebc09be9 [FlexAttention] Align the matmul tensorcore usage (#135168)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135168
Approved by: https://github.com/Chillee
2024-09-07 16:33:41 +00:00
a2db22e6bb [inductor] Catch BrokenProcessPool and print a more helpful message. (#135120)
Summary: BrokenProcessPool means a parallel-compile subprocess exited, which we never expect. It's likely due to a crash, so print a more meaningful error message and a note that it's probably easier to debug by turning off parallel compilation. Output looks like:
```
...
  File "/data/users/slarsen/pytorch/torch/_inductor/runtime/compile_tasks.py", line 45, in _reload_python_module
    exec(code, mod.__dict__, mod.__dict__)
  File "/tmp/torchinductor_slarsen/4q/c4qw7xk5lbb7whg5txnk4hwbc7z6kepak3o666tr3d64gcad5r5b.py", line 815, in <module>
    async_compile.wait(globals())
  File "/data/users/slarsen/pytorch/torch/_inductor/async_compile.py", line 265, in wait
    raise RuntimeError(
RuntimeError: A compilation subprocess exited unexpectedly. This is likely due to a crash. To facilitate debugging, you can re-run with TORCHINDUCTOR_COMPILE_THREADS=1 to cause compilation to occur in the main process.
```
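As a rough illustration of the general pattern (standard library only; the helper here is hypothetical, not the actual Inductor internals):
```python
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures.process import BrokenProcessPool

def wait_for_results(futures):
    try:
        return [f.result() for f in futures]
    except BrokenProcessPool as exc:
        # A worker process died unexpectedly (likely a crash); re-raise with a clearer message.
        raise RuntimeError(
            "A compilation subprocess exited unexpectedly. This is likely due to a crash. "
            "Re-run with TORCHINDUCTOR_COMPILE_THREADS=1 to compile in the main process."
        ) from exc

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=4) as pool:
        futures = [pool.submit(pow, 2, n) for n in range(8)]
        print(wait_for_results(futures))
```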

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135120
Approved by: https://github.com/Chillee
2024-09-07 16:33:37 +00:00
eac5e12548 [inductor] Move LoopBody to its own file (#135257)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135257
Approved by: https://github.com/oulgen
2024-09-07 16:29:15 +00:00
18479c5f70 [Doc] update max-autotune for CPU (#134986)
The current doc for `max-autotune` is applicable only for GPU. This PR adds the corresponding content for CPU.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134986
Approved by: https://github.com/jgong5, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2024-09-07 13:42:40 +00:00
f7c0c06692 Add oneDNN BRGEMM support on CPU (#131878)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131878
Approved by: https://github.com/jgong5, https://github.com/peterbell10
2024-09-07 13:22:30 +00:00
b53d97c7be [Intel GPU] Add XPU memory-related APIs (#129919)
# Motivation
According to https://github.com/pytorch/pytorch/issues/116322, we will help unify the device allocator. So we first introduce a simple XPU device allocator with only the key functionality, and expect to add memory statistics-related functionality after the unification.
However, some memory statistics-related APIs listed in https://github.com/pytorch/pytorch/issues/127929 have been requested, and unifying the device allocator will take more time. To improve the user experience, we support these memory statistics-related APIs before the unification.
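A hedged sketch of the kind of memory-statistics usage this enables (requires an XPU-enabled build; the names mirror the CUDA-style allocator APIs):
```python
import torch

# Hedged sketch; requires a PyTorch build with XPU support.
if torch.xpu.is_available():
    x = torch.randn(1024, 1024, device="xpu")
    print("allocated:", torch.xpu.memory_allocated())  # bytes currently held by tensors
    print("reserved:", torch.xpu.memory_reserved())    # bytes reserved by the caching allocator
    del x
    torch.xpu.empty_cache()
```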

# Additional Context
Fixes: #127929

Pull Request resolved: https://github.com/pytorch/pytorch/pull/129919
Approved by: https://github.com/dvrogozh, https://github.com/abhilash1910, https://github.com/gujinghui, https://github.com/EikanWang, https://github.com/albanD
ghstack dependencies: #130923
2024-09-07 11:15:17 +00:00
6c1da66407 [Reland] Refactor caching device allocator utils (#130923)
# Motivation
Following [[RFC] Intel GPU Runtime Upstreaming for Allocator ](https://github.com/pytorch/pytorch/issues/116322), this PR aims to refactor the caching device allocator utils to improve code reuse.
This is the first PR; follow-up PRs will continue refactoring the device caching allocator.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130923
Approved by: https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/albanD, https://github.com/eqy
2024-09-07 11:14:17 +00:00
d7c97e7245 [inductor][cpp][gemm] cache blocking config for dynamic shapes (#133538)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133538
Approved by: https://github.com/leslie-fang-intel
ghstack dependencies: #135277, #133447

Co-authored-by: Wu, Chunyuan <chunyuan.wu@intel.com>
2024-09-07 11:09:30 +00:00
be9f4ffe88 [inductor][cpp][gemm] enable dynamic M for k-slicing (#133447)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133447
Approved by: https://github.com/leslie-fang-intel
ghstack dependencies: #135277

Co-authored-by: Wu, Chunyuan <chunyuan.wu@intel.com>
2024-09-07 11:09:30 +00:00
692faa9bc6 [inductor][cpp][gemm] reduce memory alloc overhead by allocating local acc once per thread (#135277)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135277
Approved by: https://github.com/leslie-fang-intel

Co-authored-by: Wu, Chunyuan <chunyuan.wu@intel.com>
2024-09-07 11:09:25 +00:00
32f3af72b7 [ONNX] Support FakeTensor in ONNXProgram (#135399)
Sync with https://github.com/justinchuby/torch-onnx/compare/v0.1.20...v0.1.21 to support FakeTensors in ONNXProgram. Specifically, this PR implements the `apply_weights` method to allow users to supply a dictionary of concrete tensors to replace FakeTensors in the exported model weights.

An error is raised when users try to serialize a FakeTensor to avoid segfaults.

Also fixed a bug in `.save()` when `keep_initializers_as_inputs` is True and `include_initializers` is False.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135399
Approved by: https://github.com/titaiwangms
2024-09-07 04:48:18 +00:00
ebab5c85c4 [FlexAttention] Skip very small block size unit tests on H100 due to Triton bug (#135393)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135393
Approved by: https://github.com/BoyuanFeng
2024-09-07 04:35:22 +00:00
3d734d837b [ONNX] Handle mixed sequence inputs properly (#135378)
Previously, when an input contains a mixture of `Value` and python constants like `[SymbolicTensor('sym_size_int_3', type=Tensor(INT64), shape=[], producer=node_Shape_0, index=0), 512]`, we get errors like

```pytb
Traceback (most recent call last):
  File "/Users/justinc/Documents/GitHub/torch-onnx/src/torch_onnx/_building.py", line 367, in _call_op
    converted_named_inputs = _process_python_constants_and_sequences(
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/justinc/Documents/GitHub/torch-onnx/src/torch_onnx/_building.py", line 275, in _process_python_constants_and_sequences
    raise TypeError(
TypeError: Constant input '[SymbolicTensor('sym_size_int_3', type=Tensor(INT64), shape=[], producer=node_Shape_0, index=0), 512]' of type '<class 'list'>' is not supported
```

This PR updates Sequence handling to support this case, as well as variadic inputs and ONNX Sequence inputs.

Synced from https://github.com/justinchuby/torch-onnx/pull/187
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135378
Approved by: https://github.com/titaiwangms
2024-09-07 03:07:39 +00:00
c92227c41a [quant][pt2e] fix placeholder typo and related quantization tests (#135379)
A previous typo in "placeholder" and the related quantization tests are fixed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135379
Approved by: https://github.com/jerryzh168
2024-09-07 02:31:43 +00:00
e6a0221fc6 [Inductor] Optionally allow padding on non-GPU devices (#135280)
This is the OSS component of a larger MTIA diff.

Currently, Inductor disables padding for non-GPU devices. We need to change this behavior to enable padding on MTIA.

This PR adds a config option to enable padding on the CPU, or any other non-GPU device. In the future, we might want to enable padding on all devices by default. However, that might require supporting device-dependent padding defaults, since CPUs will likely use different settings than H100 GPUs.

Differential Revision: D61038114

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135280
Approved by: https://github.com/jfix71, https://github.com/shunting314
2024-09-07 02:19:14 +00:00
a6b9d444fb [ONNX] Refactor exporter errors (#135180)
Refactor exporter errors to combine old errors and new errors for API consistency.

This PR also

1. Removes the `_C._check_onnx_proto(proto)` call in the old exporter. We don't need the ONNX checker because it is limited.
2. Removes the `OnnxExporterError` defined in the dynamo module. This class unnecessarily stores the onnx program object, making it very bulky. Instead, we revert to using the plain `OnnxExporterError` defined in the `errors` module and use it as the base class for all errors.
3. Continues to expose `OnnxExporterError` in `torch.onnx` and the rest of the errors in `torch.onnx.errors`.
4. Removes the `CheckerError` and `InvalidExportOptionsError` from `torch.onnx`. This is BC breaking but should have low impact.
5. I did not rename existing errors out of compatibility considerations, even though `ExporterError` would have been more succinct.

Fixes https://github.com/pytorch/pytorch/issues/135125
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135180
Approved by: https://github.com/titaiwangms
2024-09-07 00:50:15 +00:00
d42b0c8f22 Add release matrix for 2.5 (#135383)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135383
Approved by: https://github.com/huydhn
2024-09-07 00:49:53 +00:00
941d094dd1 [Dynamo][DTensor] Fixes SymNodeVariable() is not a constant error in Compiled DDP + TP unit test (#135315)
Before the fix, the unit test fails during forward Dynamo tracing:
```
  File "/data/users/willfeng/pytorch/test/distributed/_composable/test_replicate_with_compiler.py", line 415, in test_ddp_tp
    loss = compiled_replicate_model(data).sum()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...
torch._dynamo.exc.InternalTorchDynamoError: SymNodeVariable() is not a constant

from user code:
   File "/data/users/willfeng/pytorch/torch/distributed/tensor/parallel/_data_parallel_utils.py", line 34, in _unflatten_tensor
    result = DTensor.from_local(
```
After the fix, the compilation fails at a later step (Compiled Autograd tracing), due to needing "pre-dispatch tracing of backward graph" feature (see details at https://github.com/pytorch/pytorch/issues/127797#issuecomment-2291695474).

I believe this PR is a net improvement, because it should also fix the 1D Traceable FSDP2 failure case on internal models (https://github.com/pytorch/pytorch/issues/130978#issuecomment-2319476690), which is much harder to build a minimal unit test for.

Fixes https://github.com/pytorch/pytorch/issues/130978.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135315
Approved by: https://github.com/bdhirsh
2024-09-07 00:11:25 +00:00
b1a934741e Change test_constant_prop_preserve_metadata (#135268)
Summary: In new export_for_training, "stack_trace" does not exist in node meta anymore.

Test Plan:
```
buck run fbcode//mode/dev-nosan fbcode//caffe2/test:quantization_pt2e -- -r test_constant_prop_preserve_metadata
```

Reviewed By: angelayi

Differential Revision: D62219974

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135268
Approved by: https://github.com/angelayi
2024-09-07 00:02:35 +00:00
0c661f3e1a [Split Build] Refactor split build binary builds into their own workflows and move split build binary builds to periodic (#134624)
As we need to move the split build binary tests from trunk to periodic, this PR refactors those jobs into their own workflow.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134624
Approved by: https://github.com/malfet
2024-09-06 23:57:56 +00:00
2c7e314803 [Inductor][CPP] Fix the issue of view dtype (#135301)
**Summary**
Fix issue: https://github.com/pytorch/pytorch/issues/135160, it's a regression introduced by https://github.com/pytorch/pytorch/pull/134569, where the dtype of `to_dtype_bitcast` was incorrectly handled when using the scalarize implementation.

**TestPlan**
```
python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_view_dtype
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135301
Approved by: https://github.com/jgong5, https://github.com/jansel
2024-09-06 23:36:44 +00:00
ead4407f57 [inductor] Fix loop split optimization (#135303)
Fix https://github.com/pytorch/pytorch/issues/135274.

Improve the check of whether the div expr matches: add a check that `split_var` is in `original_body.iter_vars`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135303
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jansel
2024-09-06 23:06:25 +00:00
2f5b40c099 [aoti test] Disable FP8 funz dtypes in fp8 runtime check test (#135373)
Fixing https://github.com/pytorch/pytorch/issues/126734

The key point is that the fnuz FP8 types are for AMD only.

source: https://github.com/openxla/stablehlo/blob/main/rfcs/20230321-fp8_fnuz.md
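For reference, the four FP8 dtypes involved; the fnuz variants (no infinities, no negative zero) are the AMD-oriented formats referenced above:
```python
import torch

# Standard FP8 formats vs. the fnuz variants used on AMD.
standard_fp8 = [torch.float8_e4m3fn, torch.float8_e5m2]
fnuz_fp8 = [torch.float8_e4m3fnuz, torch.float8_e5m2fnuz]
print(standard_fp8, fnuz_fp8)
```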

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135373
Approved by: https://github.com/chenyang78
2024-09-06 23:05:47 +00:00
993b5647ab [export] fix placeholder name collision tests by removing map call (#135366)
The current test is failing because of the current unstable state of map: torch.compile and non-strict export are taking two separate routes, unlike cond and while_loop. This PR fixes the test itself. We'll fix map in follow-up PRs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135366
Approved by: https://github.com/angelayi
2024-09-06 22:02:50 +00:00
2ab26806f1 Require tlparse for failing tests in test_structured_trace.py (#135376)
Summary: These tests are currently failing internally. Per discussion, skip if tlparse is unavailable

Test Plan:
```
feature remove tlparse
buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/dynamo:test_dynamo -- --run-disabled --regex test_structured_trace.py
feature install tlparse
buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/dynamo:test_dynamo -- --run-disabled --regex test_structured_trace.py
```

Differential Revision: D62310342

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135376
Approved by: https://github.com/ezyang
2024-09-06 21:53:41 +00:00
b1612569f6 [BE] Clarify defaulting behavior in optimizer (#135384)
Fixes #135340

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135384
Approved by: https://github.com/drisspg, https://github.com/jainapurva
2024-09-06 21:52:55 +00:00
dc0e818738 [FR] Automatically infer a common filename prefix (#135158)
Save the annoyance of specifying this on the command line each time
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135158
Approved by: https://github.com/fduwjj, https://github.com/c-p-i-o
ghstack dependencies: #135157
2024-09-06 21:44:27 +00:00
06e414d7fe [FR] Make trace_dir a required argument (#135157)
Ensures users get a clean error if they forget to specify the dir, and
improves the help message.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135157
Approved by: https://github.com/c-p-i-o, https://github.com/fduwjj
2024-09-06 21:44:27 +00:00
a681260caf Revert "[ONNX] Refactor exporter errors (#135180)"
This reverts commit 5eebd9315a72422d59b6f8d8ca8e4e573e231d5c.

Reverted https://github.com/pytorch/pytorch/pull/135180 on behalf of https://github.com/clee2000 due to I think this broke test_public_bindings.py::TestPublicBindings::test_correct_module_names [GH job link](https://github.com/pytorch/pytorch/actions/runs/10743909338/job/29800779403) [HUD commit link](5eebd9315a), possibly a landrace with the PR that landed before it ([comment](https://github.com/pytorch/pytorch/pull/135180#issuecomment-2334844191))
2024-09-06 21:39:18 +00:00
95e976a63f [dynamo] recursively skip frames when Dynamo cache limit is hit (#135144)
Fixes https://github.com/pytorch/pytorch/pull/135144 and [T197117723](https://www.internalfb.com/intern/tasks/?t=197117723).

In general, adds `SkipCodeRecursiveException` to Dynamo - when raised in Dynamo, convert_frame will return a `skip_code_recursive_flag` back to C Dynamo, signaling it to skip the current frame and all recursive calls.
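A hedged sketch of how the cache (recompile) limit can be hit, which is the situation this skip path handles:
```python
# Hedged sketch: force recompiles until the Dynamo cache limit is hit.
import torch
import torch._dynamo as dynamo

dynamo.config.cache_size_limit = 2  # tiny limit so the fallback triggers quickly

@torch.compile
def f(x, n):
    return x + n  # `n` is a Python int, so each new value specializes and recompiles

x = torch.randn(4)
for n in range(8):
    f(x, n)
# Once the limit is exceeded, Dynamo falls back to eager for this frame; with this PR,
# the frame and its recursive calls are skipped instead of being re-attempted.
```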

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135144
Approved by: https://github.com/jansel, https://github.com/anijain2305
2024-09-06 21:38:53 +00:00
306ac44eaa [ez][TD] Fix request for issue body returns None (#135389)
I assumed it would be an empty string if the body is empty, but it's just None.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135389
Approved by: https://github.com/malfet
2024-09-06 21:02:01 +00:00
a7643baceb Revert expectFailureIf condition on tests with torch.compile on Windows (#134759)
Fixes #134716

This PR reverts some changes introduced in 6eae569546 (#133987)

torch.compile is not available on Windows, so the tests should be expected to fail.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134759
Approved by: https://github.com/malfet
2024-09-06 20:51:55 +00:00
a4030e37be [dynamo] reland map/zip iterator related changes (#135074)
Differential Revision: [D62211019](https://our.internmc.facebook.com/intern/diff/D62211019)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135074
Approved by: https://github.com/jansel, https://github.com/anijain2305, https://github.com/mlazos
2024-09-06 20:38:02 +00:00
22e1fb6faa [test][easy] Add debug utils for cpu select algorithm test (#135038)
Summary: Add debug utils to debug a flaky test in fbcode ci.

Some context: https://github.com/pytorch/pytorch/pull/126545

Test Plan: ci

Differential Revision: D62005445

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135038
Approved by: https://github.com/jgong5, https://github.com/XuehaiPan
2024-09-06 20:30:49 +00:00
2a4890e315 [ONNX] Clean up the missed lines from previous PRs (#135368)
Delete some lines that were missed in previous PRs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135368
Approved by: https://github.com/justinchuby
2024-09-06 20:27:52 +00:00
3ce433aef2 [TCPStore] use wait counters (#135283)
This replaces the existing TCPStore counters with the new shared wait counters. There are no users of the TCPStore counters, so they should be completely safe to remove.

Test plan:

Existing tests + build

There is no OSS backend for wait counters, so we can't write any tests with them currently.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135283
Approved by: https://github.com/c-p-i-o
2024-09-06 19:54:25 +00:00
7f2d20e687 Run all autograd node post hooks (#134728)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134728
Approved by: https://github.com/albanD, https://github.com/soulitzer
2024-09-06 19:44:28 +00:00
32fd29c1ea [ONNX] Properly handle Attributes in traceable functions (#135367)
Previously, the attributes were sent in as Attr objects even when the function was called as a plain Python function. This PR turns them into Python objects.

From https://github.com/justinchuby/torch-onnx/pull/186
Related https://github.com/microsoft/onnxscript/issues/1846

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135367
Approved by: https://github.com/justinchuby
2024-09-06 19:35:22 +00:00
5eebd9315a [ONNX] Refactor exporter errors (#135180)
Refactor exporter errors to combine old errors and new errors for API consistency.

This PR also

1. Removes the `_C._check_onnx_proto(proto)` call in the old exporter. We don't need the ONNX checker because it is limited.
2. Removes the `OnnxExporterError` defined in the dynamo module. This class unnecessarily stores the onnx program object, making it very bulky. Instead, we revert to using the plain `OnnxExporterError` defined in the `errors` module and use it as the base class for all errors.
3. Continues to expose `OnnxExporterError` in `torch.onnx` and the rest of the errors in `torch.onnx.errors`.
4. Removes the `CheckerError` and `InvalidExportOptionsError` from `torch.onnx`. This is BC breaking but should have low impact.
5. I did not rename existing errors out of compatibility considerations, even though `ExporterError` would have been more succinct.

Fixes https://github.com/pytorch/pytorch/issues/135125
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135180
Approved by: https://github.com/titaiwangms
2024-09-06 19:10:56 +00:00
a15aabc975 Add MaskedTensor passthrough: unfold, F.Unfold, F.Fold, stack (#125262)
Hi,
I noticed the `unfold` operator was missing on MaskedTensor.

I tested that my change works when calling unfold and backward on a `MaskedTensor`, but I didn't find the tests for the dispatch of such an operation. Where are they?
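A minimal sketch of the new passthrough (MaskedTensor is a prototype feature, so details may change):
```python
import torch
from torch.masked import masked_tensor

data = torch.arange(6.0)
mask = torch.tensor([True, False, True, True, False, True])
mt = masked_tensor(data, mask)

# unfold now passes through to the underlying data and mask instead of erroring
windows = mt.unfold(0, 2, 2)
print(windows)
```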
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125262
Approved by: https://github.com/cpuhrsch
2024-09-06 19:06:23 +00:00
b143426db3 [Inductor] Use argument names as the key for the constants dict and the signature dict (#135170)
Referencing how triton constructs these dictionaries

ca3fb5f6fa/python/triton/runtime/jit.py (L639)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135170
Approved by: https://github.com/htyu
2024-09-06 19:05:00 +00:00
13ba0a2e5c Run bypassed graph compile outside the except block to avoid chaining of exceptions (#135175)
Fixes #135172

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135175
Approved by: https://github.com/masnesral, https://github.com/ezyang
2024-09-06 19:03:57 +00:00
8520ce5f78 Fix incorrect trace of post-accumulate grad hook on tensor with zero dims (#135226)
Fix incorrect trace of post-accumulate grad hook on tensor with zero dimensions

Fixes #135207
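For context, a minimal eager example of a post-accumulate grad hook on a zero-dim tensor (the traced/compiled path of this pattern is what the fix addresses):
```python
import torch

p = torch.tensor(1.0, requires_grad=True)  # zero-dim leaf tensor

def hook(t):
    print("accumulated grad:", t.grad)

p.register_post_accumulate_grad_hook(hook)
(p * p).sum().backward()  # prints the accumulated gradient (2.0)
```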

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135226
Approved by: https://github.com/xmfan
2024-09-06 18:19:54 +00:00
196748d491 [elastic] support local_addr across all rendezvous impls (#135262)
Summary:
There was a regression introduced in https://github.com/pytorch/pytorch/pull/125743 that made `local_addr` no longer used. This fixes that by passing `local_addr` to `RendezvousStoreInfo.build` everywhere it's used.

This also fixes a number of tests, allowing them to be run in parallel, which hugely sped up the testing cycle since this change touches many different rendezvous implementations. This required a few fixes in unrelated tests.

Test Plan:
Added tests for the common rendezvous implementations that exercise `local_addr`, to prevent future regressions.

```
buck2 test @//mode/dev-nosan fbcode//caffe2/test/distributed/elastic/... fbcode//caffe2/torch/distributed/elastic/... -- --stress-runs 3
```

To vet the parallelism changes I also ran with 3 stress runs each to identify flakiness caused by parallelism.

Differential Revision: D62256407

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135262
Approved by: https://github.com/fduwjj, https://github.com/wz337
2024-09-06 17:55:43 +00:00
177e4f4218 remove _check call on item() for torch.istft (#135234)
Fixes #135014

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135234
Approved by: https://github.com/tugsbayasgalan
2024-09-06 17:31:25 +00:00
3988b3468b [aoti][easy] remove breakpoint() in wrapper.py (#134807)
Differential Revision: D61687146

Remove an unintended breakpoint in code.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134807
Approved by: https://github.com/YUNQIUGUO
2024-09-06 17:25:05 +00:00
04118d8617 [export] Record the global torch version in serialization. (#135243)
Summary: In general I think it will be useful to also record the global torch version in the EP, so that we can track them in the logging in addition to the schema version.

Test Plan: CI

Reviewed By: henryoier

Differential Revision: D62252626

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135243
Approved by: https://github.com/yushangdi
2024-09-06 17:02:06 +00:00
24482e5c68 [torch][fx] Set maximum warning count during fx.Graph.lint (#135069)
Summary:
resnet152 spent about 15 minutes writing warning messages in _unlift
during `to_executorch` because they're all written to unbuffered stderr
by the `warnings` module.

These warnings are almost always about get_attr nodes referencing a
non-existent name:
```lang=py
warnings.warn(f'Node {node} target {node.target} {atom} of {seen_qualname} does '
  'not reference an nn.Module, nn.Parameter, or buffer, which is '
  'what \'get_attr\' Nodes typically target'
)
```
I'm not aware of a way to configure the warnings module to write this out
at most once, so I'm just going to disable the lint for now.

Test Plan:
Re-ran resnet152 with Executorch and the XNNPackBackend, it is much faster now

Differential Revision: D62156090

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135069
Approved by: https://github.com/yushangdi
2024-09-06 16:41:59 +00:00
c0ec599f27 Update submodule ideep to include aarch64 change (#134897)
This PR is per ARM request, which is in https://github.com/intel/ideep/issues/334.

Context for the request: the Arm team has upstreamed the dynamic quantization changes and all the PRs were merged (torch, ideep, oneDNN), but without this ideep submodule update the feature will not work. The change is isolated to the matmul operator and the quantization path alone.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134897
Approved by: https://github.com/jgong5, https://github.com/atalman, https://github.com/snadampal
2024-09-06 16:40:26 +00:00
7074de43c0 Porting to GCC 15 (#135188)
uint8_t is found in the `<cstdint>` header

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135188
Approved by: https://github.com/Skylion007
2024-09-06 16:16:53 +00:00
771dcce11d [AOTI][Tooling][6/n] Fix long dtype input tensors calling mean() in aoti_torch_print_tensor_handle (#135072)
Differential Revision: D61635232

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135072
Approved by: https://github.com/hl475, https://github.com/ColinPeppler
2024-09-06 15:59:32 +00:00
de74aafff4 error on exporting ScriptModule (#135302)
Test Plan: added test

Differential Revision: D62279179

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135302
Approved by: https://github.com/yushangdi
2024-09-06 15:12:40 +00:00
ad29a2c0dc Add Inductor config for default stride behavior (#135238)
By default, Inductor is allowed to manipulate the layout
(strides+storage offset) of input tensors to custom operators.

We want to change it so that the default is that Inductor should respect
the stride order of input tensors to custom operators.

This PR adds a config to toggle the behavior, in the next PR up we'll
change the default. We also make the following changes:
- We add a new operator Tag (flexible_layout), which means that
inductor is allowed to manipulate the layout. When we flip the default,
users can specify they want the old behavior by using this tag.

This is a reland of https://github.com/pytorch/pytorch/pull/126986,
which was previously reverted due to silent incorrectness. We've since
fixed the silent incorrectness
(https://github.com/pytorch/pytorch/pull/133639)

Test Plan:
- new test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135238
Approved by: https://github.com/albanD
2024-09-06 14:48:24 +00:00
3a9e33dca8 [torchelastic] Don't do signal handling when off the main thread (#135088)
Summary:
In multiprocessing, signal handling is not possible if the thread is not the main thread. This resulted in the following error:
> "ValueError('signal only works in main thread of the main interpreter')"

To address this issue, the diff checks whether the thread is the main thread and, if not, skips signal handling.
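A minimal sketch of the guard described above (standard library only; the helper name is hypothetical):
```python
import signal
import threading

def maybe_register_signal_handlers(handler):
    # Signal handlers can only be installed from the main thread of the main interpreter.
    if threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGTERM, handler)
        signal.signal(signal.SIGINT, handler)
    # Otherwise, skip signal handling entirely.
```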

Test Plan:
Before this change, MAST job failed:
https://fburl.com/mlhub/iq2m10v8

With this change, MAST job succeeded:
https://fburl.com/mlhub/q6kb8343

Differential Revision: D62166943

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135088
Approved by: https://github.com/d4l3k
2024-09-06 14:47:03 +00:00
a086882d72 [inductor][triton] mark workspace args as mutated (#134648)
SplitScan makes use of a workspace arg that needs to be zeroed before it is used - then, it is used to communicate between thread blocks during the triton kernel implementation. It is mutated during the execution of the kernel, so it should be marked as such.

Before this PR, it is not marked as mutated; AFAIK this is fine during normal execution, but during autotuning it causes problems. The workspace starts off zeroed (as expected), but during autotuning the kernel will be executed multiple times and the workspace does not get re-set between executions, resulting in incorrect data. If the data is used for indexing, then you can fail device-side asserts (and the results after the initial run (with autotuning) could be wrong). The test added in this PR repros the issue when the fix is removed.

When we mark the arg as mutated, then the arg gets cloned before autotuning, so that the arg passed to the kernel during autotuning will always be zeroed as expected.
804852c1f9/torch/_inductor/runtime/triton_heuristics.py (L685-L689)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134648
Approved by: https://github.com/peterbell10, https://github.com/jansel
2024-09-06 14:23:37 +00:00
84ae6b7d6b AOTDispatcher: limit cases when we detach() graph inputs to non-leaves (#134193)
This PR is slightly a revival / update to the discussion from https://github.com/pytorch/pytorch/pull/98960:

Part of FSDP2's tracing strategy right now is that:

(1) it is painful/difficult to handle the case where we have multiple graph input tensors that are aliased to each other and at least one of them is duplicated

(2) we already have long-standing logic to remove duplicate input tensors from the graph in dynamo. Morally, FSDP2 gives us duplicate input tensors in the backward graph for every `unsharded_param`, because we have (a) the `unsharded_param` being closed over by the backward hook to resize/allgather, and (b) the same `unsharded_param` being saved for backward by autograd (we now guarantee in the partitioner that we will always save the base tensor for backward and recompute views)

(3) However, we were still seeing cases where the `unsharded_param` showed up twice in the backward graph inputs, as distinct tensor objects (with different python ids) instead of being true duplicates that dynamo can de-dup.

It turns out that this was because we were `.detach()`ing the `unsharded_param` in AOTDispatcher before plumbing it through the compiled forward (and so autograd would save a detach'd version of the `unsharded_param`). This is precisely because of the logic from https://github.com/pytorch/pytorch/pull/98960.

However, re-reading the detailed comments, it seems unnecessary to do a detach() on a graph input that is a (leaf) `nn.Parameter`, even if it happens to get no gradients in the backward. Since it is a leaf, we don't have to worry about the autograd engine "continuing to backprop through the graph beyond the current tensor" (the leaf has no other grad_fn for autograd to backprop through).

So this PR makes us a bit less aggressive about calling detach() on inputs: we only do it when:

(1) our graph input statically will get a `None` gradient (and also has no metadata mutations, the existing state)

(2) **and** our graph input is a non-leaf tensor (so detach()ing is actually required to prevent autograd from incorrectly backpropping past the non-leaf).
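A rough sketch of the resulting condition (hypothetical names; not the actual AOTDispatcher code):
```python
def should_detach_input(inp, statically_gets_no_grad, has_metadata_mutation):
    # Detach only when the input will get no gradient, has no metadata mutations,
    # and is a non-leaf tensor (a leaf has no grad_fn for autograd to backprop through).
    return statically_gets_no_grad and not has_metadata_mutation and not inp.is_leaf
```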

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134193
Approved by: https://github.com/yf225

Co-authored-by: Will Feng <yf225@cornell.edu>
2024-09-06 14:06:48 +00:00
60a097a071 [CD] Update binary_linux_test.sh to include calling builder smoke test (#133869)
Run smoke test

Fixes #1969

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133869
Approved by: https://github.com/atalman

Co-authored-by: Andrey Talman <atalman@fb.com>
2024-09-06 13:27:24 +00:00
13bae39e22 [inductor] [cpp] improve cache blocking for is_dynamic_M (#131306)
## Performance
Models with >= 3% performance speedup are listed below:

### AMP single-thread dynamic shape (measured on CPU with AMX support)
No regressions

| Model Family | Model Name | Speedup |
|--------------|------------|---------|
| torchbench | soft_actor_critic | 3% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/131306
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel
ghstack dependencies: #135275

Co-authored-by: Jiong Gong <jiong.gong@intel.com>
2024-09-06 13:21:24 +00:00
4ef6c05f65 [inductor][cpp][gemm] fix autotune runtime error from linear_binary fusion (#135275)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135275
Approved by: https://github.com/leslie-fang-intel
2024-09-06 13:21:23 +00:00
d6b9bd3e60 Also handle compiler collective when input variable doesn't exist on all ranks (#135147)
Internal xref:
https://fb.workplace.com/groups/3095840833991792/permalink/3810738595835342/

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135147
Approved by: https://github.com/jansel
2024-09-06 13:18:36 +00:00
d0591f4658 Ignore fresh unbacked when doing recursive make_fx inside HOPs (#135053)
Internal xref: https://fb.workplace.com/groups/6829516587176185/posts/7705964779531357/

This now also incorporates a test from https://github.com/pytorch/pytorch/pull/133585 (which it fixes) and the prep PR https://github.com/pytorch/pytorch/pull/134407 Including the PR desc from that:

I am trying to fix a problem reported by user in [fb.workplace.com/groups/6829516587176185/permalink/7705964779531357](https://fb.workplace.com/groups/6829516587176185/permalink/7705964779531357/) The summary of this problem is that when we do collect metadata analysis in AOTAutograd, we accumulate pending unbacked symbols which are going to be discarded at the end of the trace. However, if we do a recursive make_fx inside tracing, as occurs with torch.cond, we end up seeing that there are pending unbacked symbols that aren't associated with a binding, even though it's spurious (they've leaked into the inner make_fx call from the outer AOTAutograd analysis).

In https://github.com/pytorch/pytorch/pull/133588 I tried to just prevent adding the symbols to the pending list at all in the first place. But this itself caused some problems which were fixed in https://github.com/pytorch/pytorch/pull/124785 . The problem fixed in that PR is that when we allocate tangents that have unbacked size, something prevented them from having correct unbacked SymInts when ignore fresh unbacked SymInts was enabled. So I had patched it at the time by just not suppressing pending symbols and clearing them out some other way.

I think... I was wrong in that PR? That is to say, it was OK to avoid putting the fresh unbacked symbols in the pending list; the real problem was suppressing unbacked renamings. But there doesn't seem to be a good reason to suppress these; this PR shows that it doesn't actually fail any tests if you do these anyway. Intuitively, this makes sense, because you can't trigger renamings unless you're actually adding unbacked symbols to the pending set.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135053
Approved by: https://github.com/ydwu4
2024-09-06 13:13:15 +00:00
b5dea061c8 check compilation status before query cudnn version in conv (#135332)
This PR fixes https://github.com/pytorch/pytorch/issues/135322. The cuDNN compilation status should be checked first before querying the version; otherwise, conv may trigger a RuntimeError before any check in other non-CUDA backends.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135332
Approved by: https://github.com/EikanWang, https://github.com/atalman
2024-09-06 12:50:04 +00:00
041960a1ce [Dynamo] Automatically in-graph traceable tensor subclass ctors (#135151)
Fixes https://github.com/pytorch/pytorch/issues/114389

Previously, Dynamo would attempt to trace through the `__init__` of traceable tensor subclasses. Since their constructors are AOT dispatcher traceable by definition, Dynamo should automatically put these in the graph, as we do for any other tensors; tracing through them instead is difficult because Dynamo would need to apply mutations after tensor subclass creation in the graph.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135151
Approved by: https://github.com/bdhirsh
2024-09-06 12:23:38 +00:00
67c7924ea1 [inductor] Fix gen_transposed_tile_load_store (#135307)
A recent PR (https://github.com/pytorch/pytorch/pull/131745) brought new VLA logic into the cpp codegen, which raises a build failure on MSVC with error code `Compiler Error C2131`: https://learn.microsoft.com/en-us/cpp/error-messages/compiler-errors-1/compiler-error-c2131?view=msvc-170

reproduce UT:
```cmd
pytest test\inductor\test_torchinductor_dynamic_shapes.py -v -k test_large_block_sizes_dynamic_shapes_cpu
```

Original generated code:
```c++
alignas(16) float tmp1[static_cast<int64_t>(((-256LL)*(c10::div_floor_integer(static_cast<int64_t>(ks1), static_cast<int64_t>(16LL)))) + (16LL*ks1))];
```

Changes:
allocate a large-enough fixed-sized buffer.

New generated code:
```c++
alignas(16) float tmp1[16*16];
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135307
Approved by: https://github.com/jgong5, https://github.com/jansel
2024-09-06 10:44:08 +00:00
217ba7b2ab [Docs] Update FileCheck doc (#135199)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135199
Approved by: https://github.com/soulitzer
2024-09-06 08:18:38 +00:00
758d515d98 [Inductor][CPP] Select tiling factor for lower precision data types (#133830)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133830
Approved by: https://github.com/jgong5, https://github.com/jansel
2024-09-06 08:12:37 +00:00
60d98b4cfb Update torch-xpu-ops pin (ATen XPU implementation) (#135300)
Release cycle for PyTorch 2.5
1. Bugfixing: correct reduction logic in cdist kernel.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135300
Approved by: https://github.com/EikanWang
2024-09-06 07:30:09 +00:00
590a3e9f8a [export][training ir migration] quantized_decomposed.quantize_per_tensor decomposition (#134525)
Summary:
In the graph of the TestXNNPACKQuantizer.test_dynamic_linear_with_conv test, some quantized_decomposed.quantize_per_tensor.default ops are becoming quantized_decomposed.dequantize_per_tensor.tensor ops when using the new training IR.

This is because we lift params/buffers before calling make_fx. So previously, for the graph that's passed to make_fx, `graph.L__self___linear1.weight` is a tensor; now, in the training IR, `graph.L__self___linear1.weight` is a FakeTensor. This caused the node overload to be different.

Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_dynamic_linear_with_conv
```

Differential Revision: D61364547

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134525
Approved by: https://github.com/tugsbayasgalan, https://github.com/jerryzh168
2024-09-06 07:06:06 +00:00
764ee6e3f9 [FlexAttention] Specify padding_value for boundary checked loads (#134573)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134573
Approved by: https://github.com/Chillee
2024-09-06 06:47:26 +00:00
67f98a99a4 [DeviceMesh][Easy] Make RuntimeError a bit more descriptive by including the actual world_size (#135271)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135271
Approved by: https://github.com/fduwjj
2024-09-06 06:23:20 +00:00
e020a8755a [Fix][FR][ez] Remove debugging logs (#135308)
Removing the print added during the debugging process.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135308
Approved by: https://github.com/wz337
2024-09-06 06:14:33 +00:00
7ffb3b201c [inductor] Remove LoopBody.reads,writes,other (#135256)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135256
Approved by: https://github.com/oulgen
ghstack dependencies: #135070, #135076, #135082, #135084, #135079, #135235
2024-09-06 06:11:55 +00:00
f946bf88c4 [inductor] Skip retracing an existing LoopBody (#135235)
This is roughly a 7% speedup in inductor compile time for hf_Bert_large.  The time spent in `LoopBody.__init__` improves from 15% to 8% of `fx_codegen_and_compile`.

Before
![image](https://github.com/user-attachments/assets/7de0f28e-35bd-472f-b4be-b52733d2a85c)

After
![image](https://github.com/user-attachments/assets/5f0cf11a-43c5-43ae-b13c-f32383a75a7f)

Overall
![image](https://github.com/user-attachments/assets/6a369d8c-fb5e-4ad2-9504-0fc745ad6568)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135235
Approved by: https://github.com/oulgen
ghstack dependencies: #135070, #135076, #135082, #135084, #135079
2024-09-06 06:11:55 +00:00
66da3b3b2a [fx] Bypass custom __setattr__ in Node.__init__ (#135079)
Before:
![image](https://github.com/user-attachments/assets/5f0a6ae6-6049-44d0-b5f2-a549a23ad97f)

After:
![image](https://github.com/user-attachments/assets/51c9f91b-f8a0-4043-8362-65813feec823)
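An illustrative sketch of the technique (bypassing a custom `__setattr__` during initialization); not the actual `fx.Node` code:
```python
class Node:
    def __setattr__(self, name, value):
        # Imagine expensive bookkeeping that runs on every attribute write.
        super().__setattr__(name, value)

    def __init__(self, graph, name):
        # Hot path: use the default object.__setattr__ to skip the custom one.
        object.__setattr__(self, "graph", graph)
        object.__setattr__(self, "name", name)

n = Node("my_graph", "relu_1")
print(n.graph, n.name)
```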

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135079
Approved by: https://github.com/oulgen
ghstack dependencies: #135070, #135076, #135082, #135084
2024-09-06 06:11:46 +00:00
41e653456e [RDP] Fix "No module named 'libfb’" (#135244)
Summary:
D62215095 introduced an import error in arvr pipelines, as the is_fbcode() function does not work as intended.

This changes is_fbcode() to be a much stricter check.

Test Plan:
```
buck2 run arvr/mode/platform010/opt-stripped //arvr/libraries/depthlink/clients/mr_replay:pipeline_runner -c bolt.use_eva3_sim=True -- --config_file arvr/libraries/depthlink/clients/mr_replay/configs/runner_config.yaml --features DEPTH
```

Differential Revision: D62237502

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135244
Approved by: https://github.com/aorenste
2024-09-06 04:52:31 +00:00
e40a0a9359 Add randomness checking for sdpa vmap (#135176)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135176
Approved by: https://github.com/zou3519
2024-09-06 04:50:49 +00:00
c05a7adb36 [inductor][debug] fix draw_buffers (#135266)
**Before:**
![image](https://github.com/user-attachments/assets/aac756f3-1349-4647-9da3-87cf105cf647)

**After:**
<img width="791" alt="image" src="https://github.com/user-attachments/assets/d72c663c-e598-42fa-ac40-9e58956f1ec1">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135266
Approved by: https://github.com/yf225
2024-09-06 04:12:41 +00:00
5f57be7571 [Distributed] Change function call in test to non-deprecated to eliminate warning (#134938)
Migrate function calls in the test to eliminate the warning message below and reduce the chance of test failures when the deprecated methods are removed.

-  from deprecated `save_state_dict` change to `save`
-  from deprecated `load_state_dict` change to `load`

Warning message:
```bash
pytorch/test/distributed/checkpoint/test_fsdp_model_state.py:37: FutureWarning: `save_state_dict` is deprecated and will be removed in future versions.Please use `save` instead.

```
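A hedged sketch of the migration (single-process usage with a local checkpoint directory; exact writer/reader setup may differ):
```python
import torch
import torch.distributed.checkpoint as dcp

model = torch.nn.Linear(4, 4)
state_dict = {"model": model.state_dict()}

# Old (deprecated):
# dcp.save_state_dict(state_dict=state_dict, storage_writer=dcp.FileSystemWriter("ckpt"))
# dcp.load_state_dict(state_dict=state_dict, storage_reader=dcp.FileSystemReader("ckpt"))

# New:
dcp.save(state_dict, checkpoint_id="ckpt")
dcp.load(state_dict, checkpoint_id="ckpt")
```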

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134938
Approved by: https://github.com/wz337, https://github.com/fegin
2024-09-06 03:25:09 +00:00
29d72c1100 [inductor] check intel compiler minimal version (#135209)
On Windows, early versions of icx have a `-print-file-name` issue and can't preload correctly for Inductor. Add a minimal version check for the Intel compiler.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135209
Approved by: https://github.com/ezyang
2024-09-06 03:21:07 +00:00
3b1a334c0f [Inductor][CPP] Avoid mistake wgt tensor delete (#135100)
**Summary**
Fix issue: https://github.com/pytorch/pytorch/issues/134998. Previously, we only checked whether the `get_attr` FX node for the weight had a single user node. However, two `get_attr` nodes may share the same tensor, and the tensor should not be deleted in such cases. In this PR, we add the count of users of the tensor, along with the number of users of the nodes, to decide whether the tensor can be deleted.

**TestPlan**
```
 python test/inductor/test_cpu_select_algorithm.py -k test_linear_wgt_multi_users
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135100
Approved by: https://github.com/jgong5
2024-09-06 03:13:36 +00:00
07689a38bf [Inductor] Fix AOT weight alignment issue on CPU (#135205)
**Summary**
Fix issue: https://github.com/pytorch/pytorch/issues/135027. On CPU, the `consts_size` used to generate `_binary_constants_bin_start` is not padded to `ALIGN_BYTES`, while `serialized_weights` is, causing a failure in the 16K alignment check.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135205
Approved by: https://github.com/jgong5, https://github.com/desertfire
2024-09-06 03:06:51 +00:00
06a7dc21c1 Remove dead expect_rational (#135105)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135105
Approved by: https://github.com/malfet
2024-09-06 02:57:27 +00:00
d9a18173fa Report qualname of exception type rather than <class 'RuntimeError'> (#135146)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135146
Approved by: https://github.com/Skylion007, https://github.com/albanD, https://github.com/yanboliang
ghstack dependencies: #135148, #135145
2024-09-06 02:56:50 +00:00
d8543e3162 Include exception type qualname when rewrapping InternalTorchDynamoError (#135145)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135145
Approved by: https://github.com/drisspg, https://github.com/anijain2305
ghstack dependencies: #135148
2024-09-06 02:56:50 +00:00
ad01fc194d Consolidate raise and rewrap raise error branches (#135148)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135148
Approved by: https://github.com/anijain2305, https://github.com/albanD, https://github.com/yanboliang, https://github.com/malfet
2024-09-06 02:56:46 +00:00
e162414963 add instrumentation of CCA stats for reserved and allocated memory size (#135231)
As titled
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135231
Approved by: https://github.com/c-p-i-o
2024-09-06 02:48:56 +00:00
9e5a797771 Improve test_public_bindings import module error reporting (#135258)
Error was hard to understand without message. Render it now. See https://github.com/pytorch/pytorch/pull/135259 for it in action.

Example failure:

```
2024-09-05T20:04:45.3022000Z FAILED [5.9524s] test_public_bindings.py::TestPublicBindings::test_modules_can_be_imported - AssertionError: String comparison failed: '' != "torch._logging.scribe failed to import w[112 chars].py)"
2024-09-05T20:04:45.3025413Z + torch._logging.scribe failed to import with error ImportError: cannot import name 'TypeAlias' from 'typing' (/opt/conda/envs/py_3.9/lib/python3.9/typing.py)
2024-09-05T20:04:45.3026990Z
```

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135258
Approved by: https://github.com/albanD
2024-09-06 02:40:03 +00:00
b46a1b9e2d Use Python 3.9 on all libtorch jobs (#135245)
Part of the migration py3.8->3.9

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135245
Approved by: https://github.com/izaitsevfb
2024-09-06 02:27:22 +00:00
9688014820 aarch64: extend matmul heuristic checks to all neoverse platforms (#134548)
For aarch64 Neoverse platforms there are two GEMM backends available for the matmul operator in PyTorch: (1) Arm Compute Library and (2) OpenBLAS. While Arm Compute Library provides better performance than OpenBLAS, it has kernel launch time overhead, and hence we use OpenBLAS for smaller tensor compute. The heuristic was originally implemented for neoverse_v1. This commit extends the heuristic to other Neoverse platforms.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134548
Approved by: https://github.com/malfet
2024-09-06 01:40:50 +00:00
8f6e73f068 [ONNX] Enable experimental exporter logic to dynamo_export and support refine dynamic_shapes (#134976)
(1) Enable experimental exporter logic to dynamo_export
(2) Refine dynamic shapes and retry export in export strategies
(3) Delete `torch_export_graph_extractor` and use the new export logic
(4) Disable ExportedProgram test in `test_fx_onnx_with_onnxruntime.py`, as ONNXProgram is different now.

Fixes https://github.com/pytorch/pytorch/issues/126479
Fixes #135183
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134976
Approved by: https://github.com/justinchuby
2024-09-06 01:29:56 +00:00
1e57ef08fa [AOTI] Support MKLDNN qconv ops in cpp wrapper (#134795)
Summary: Similar to https://github.com/pytorch/pytorch/pull/134475, support qconv in the ABI-compatible mode for cpp-wrapper Inductor.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134795
Approved by: https://github.com/leslie-fang-intel, https://github.com/chunyuan-w, https://github.com/angelayi
ghstack dependencies: #134475, #134783
2024-09-06 01:01:53 +00:00
614b86d602 [AOTI] Support MKLDNN qlinear ops in cpp wrapper (#134783)
Summary: Similar to https://github.com/pytorch/pytorch/pull/134475, support qlinear in the ABI-compatible mode for cpp-wrapper Inductor.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134783
Approved by: https://github.com/leslie-fang-intel, https://github.com/chunyuan-w, https://github.com/angelayi
ghstack dependencies: #134475
2024-09-06 01:01:53 +00:00
0b96dfb736 [AOTI] Support MKLDNN conv ops in cpp wrapper (#134475)
Summary: Partially fix https://github.com/pytorch/pytorch/issues/123040. In the ABI-compatible mode, MKLDNN fallback ops do not have C shim implementations and thus need to go through the custom ops launch path. Other MLKDNN ops will be fixed in following PRs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134475
Approved by: https://github.com/leslie-fang-intel, https://github.com/chunyuan-w, https://github.com/angelayi
2024-09-06 01:01:53 +00:00
62b221d5cc Add Percentages to Function Events (#135155)
Summary: Users have recently asked that the profiler add self/total CPU and device percentages to FunctionEvents so that teams can process the data procedurally. Some of this could be done mathematically via subroutines, but since we already have the information in `_build_table`, let's build it there.

Test Plan: Check that we produce the same table as before, and also check that the parameters in question have the expected values.

Differential Revision: D62210351

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135155
Approved by: https://github.com/shanw-meta, https://github.com/kit1980
2024-09-06 00:39:11 +00:00
66dd4577b1 Track base of FunctionalTensor in inference mode. (#135141)
The idea behind the tracking is the following: whenever we see a tensor, if the tensor is a root tensor (does not have any view metas), we consider it as the base of all the tensors that share its storage.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135141
Approved by: https://github.com/zou3519
2024-09-06 00:10:25 +00:00
cyy
cc28634172 [Submodule] Bump pybind11 to v2.13.5 (#135202)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135202
Approved by: https://github.com/Skylion007
2024-09-06 00:09:00 +00:00
c83cdf068b [DTensor] Fix view op replicating on tensor dim when the size of the tensor dim = 1 (#135054)
We found a corner case: when a tensor dimension is 1, calling `view(1)` would result in an unexpected replication (see case 1 below). When the tensor dimension to shard is not 1, no matter whether the tensor dimension is evenly shardable across the mesh dimension, it won't cause an implicit replication behind the scenes as long as view doesn't change the size of the given tensor dimension (see cases 2 and 3).

When the tensor dimension to shard is of size 1, it is not being added to shardable_dims here:
https://github.com/pytorch/pytorch/blob/main/torch/distributed/_tensor/ops/_view_ops.py#L518

```
import torch
from torch.distributed._tensor import Shard, distribute_tensor
from torch.distributed.device_mesh import init_device_mesh

# uneven case where the size of the tensor dimension to shard is 1
p = torch.randn(1, 2)
mesh = init_device_mesh("cuda", (2,))
dtensor = distribute_tensor(p, mesh, [Shard(0)])
t = dtensor.view(1, 2)
# this would result in replication, meaning t is now replicated across all ranks.

# uneven case where the size of the tensor dimension to shard is not 1
p = torch.randn(3, 2)
mesh = init_device_mesh("cuda", (2,))
dtensor = distribute_tensor(p, mesh, [Shard(0)])
t = dtensor.view(3, 2)
# this would not result in replication, meaning t stays sharded.

# even case
p = torch.randn(2, 2)
dtensor = distribute_tensor(p, mesh, [Shard(0)])
t = dtensor.view(2, 2)
# this would not result in replication, meaning t stays sharded.
```

Differential Revision: [D62155606](https://our.internmc.facebook.com/intern/diff/D62155606)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135054
Approved by: https://github.com/tianyu-l, https://github.com/wanchaol
2024-09-06 00:03:54 +00:00
28ccfba248 [ONNX] Delete ONNXProgramSerializer (#135261)
Fixes #135182

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135261
Approved by: https://github.com/justinchuby
2024-09-05 23:52:51 +00:00
b2386bdca1 [debug] Add helper to run cProfile on a function (#135084)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135084
Approved by: https://github.com/oulgen
ghstack dependencies: #135070, #135076, #135082
2024-09-05 23:41:30 +00:00
bdfc8d9f96 [fx] Don't use generators in map_aggregate (#135082)
While the generators avoid a copy, they are slow.
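An illustrative micro-comparison of the tradeoff (not the fx code itself):
```python
import timeit

data = list(range(32))
gen = timeit.timeit(lambda: tuple(x + 1 for x in data), number=100_000)
lst = timeit.timeit(lambda: tuple([x + 1 for x in data]), number=100_000)
print(f"generator expr: {gen:.3f}s  list comprehension: {lst:.3f}s")
```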

Before:
![image](https://github.com/user-attachments/assets/70a55a9a-0595-4105-b0ab-22cf77c7409c)

After:
![image](https://github.com/user-attachments/assets/cecb9c59-ae36-47de-8b08-cab2c7cb3d57)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135082
Approved by: https://github.com/oulgen
ghstack dependencies: #135070, #135076
2024-09-05 23:41:30 +00:00
70779dded8 [fx] Compile time optimization in Node.__update_args_kwargs (#135076)
Before this we took two passes over all of the args.

Before:
![image](https://github.com/user-attachments/assets/24ce5628-03f4-4983-9f2d-5ddf0ca5816e)

After:
![image](https://github.com/user-attachments/assets/c9681aa2-32f0-4f6b-a598-fc6f90ffafb5)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135076
Approved by: https://github.com/Chillee
ghstack dependencies: #135070
2024-09-05 23:41:30 +00:00
ea231300d1 [inductor] Improve compile time regression from MemoryDep.normalize (#135070)
Possible fix for #135056

Before
![image](https://github.com/user-attachments/assets/3962cb85-e808-4fd4-991f-471ff5ef7eae)

After
![image](https://github.com/user-attachments/assets/2322d48d-6518-4518-baca-336027b5cda8)

Measured based on:
```
python benchmarks/dynamo/torchbench.py --ci --accuracy --timing --explain --inductor --device cuda --training --only hf_Bert_large --stats -n1
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135070
Approved by: https://github.com/Chillee
2024-09-05 23:41:30 +00:00
8f66995459 Revert "Support rolling over a percentage of workflows (#134816)"
This reverts commit fc890b55b51098437b6149abf1026a8b2aaee389.

Reverted https://github.com/pytorch/pytorch/pull/134816 on behalf of https://github.com/malfet due to Causes lint to intermittently fail ([comment](https://github.com/pytorch/pytorch/pull/134816#issuecomment-2332902609))
2024-09-05 23:39:41 +00:00
144fde4fd2 [MPS] Add support for autocast in MPS (#99272)
Fixes https://github.com/pytorch/pytorch/issues/88415

Need to run inductor/test_cpu_select_algorithm

Pull Request resolved: https://github.com/pytorch/pytorch/pull/99272
Approved by: https://github.com/malfet

Co-authored-by: Siddharth Kotapati <skotapati@apple.com>
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Co-authored-by: Roy Hvaara <roy@lightyear.no>
2024-09-05 23:23:17 +00:00
43f4947d44 fix fake tensor tolist implementation (#135131)
Summary:
When exporting for training with `tolist`, we do not hit `FunctionalTensor.tolist` since we do not functionalize. Unfortunately, this means we hit `FakeTensor.tolist`, which creates unbacked symints that are not backed by proxies.

Rather than trying to patch up this low-level implementation, we replace it with essentially what `FunctionalTensor.tolist` does, which is higher-level: we desugar to `item()` calls and let them take care of unbacked symints.
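
A minimal sketch of that desugaring (illustrative, not the exact code):

```python
import torch

def tolist_via_item(t: torch.Tensor):
    # Recurse over the leading dimension and bottom out in item() calls, so
    # data-dependent values go through the usual item()/unbacked-symint handling.
    if t.dim() == 0:
        return t.item()
    return [tolist_via_item(t[i]) for i in range(t.shape[0])]

print(tolist_via_item(torch.tensor([[1, 2], [3, 4]])))  # [[1, 2], [3, 4]]
```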

Test Plan:
Some expected failures are gone now.
Also found a test for `tolist` that was written when `FunctionalTensor.tolist` was implemented but not really doing much; repurposed it now to exercise more modes.

Differential Revision: D62197742

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135131
Approved by: https://github.com/ezyang
2024-09-05 23:20:31 +00:00
65e1c34061 [rfc] scuba for flight recorder (#134794)
Summary: Record flight recorder status in a scuba table.

Test Plan: Tested by timing out a job. Will post results soon.

Differential Revision: D61729221

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134794
Approved by: https://github.com/fduwjj
2024-09-05 23:18:10 +00:00
830247c355 [Intel Triton] Update Intel Triton to release/2.5.0 (#134074)
This PR relands https://github.com/pytorch/pytorch/pull/134053

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134074
Approved by: https://github.com/EikanWang
2024-09-05 22:46:31 +00:00
4262755b5a [cond] fix typo in cond codegen (#134708)
As titled.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134708
Approved by: https://github.com/jansel
2024-09-05 22:38:24 +00:00
3825607144 Add torch._logging.scribe (#135224)
See https://github.com/pytorch/pytorch/pull/135138 for a usage example. Meta-only; see https://docs.google.com/document/d/1JpbAQvRhTmuxjnKKjT7qq57dsnV84nxSLpWJo1abJuE/edit#heading=h.9wi46k7np6xw for context

fbscribelogger is a library that allows us to write to scribe, which is Meta's logging infrastructure, when you have an appropriate access token (this token is available for jobs running on main, as well as authorized jobs with the ci-scribe label). The resulting data is accessible via Scuba (a real-time in-memory database) and Hive (a more traditional SQL-persisted database).

Here's the motivating use case. Suppose there is somewhere in PyTorch's codebase where you'd like to log an event, and then you'd like to find all the situations where this log is called. If PyTorch is rolled out to our internal users, we have some FB-oriented APIs (like torch._utils_internal.signpost_event) with which you can do this. But you have to actually land your PR to main, wait for it to be ingested into fbcode, and then wait for us to actually roll out this version, before you get any data. But what if you want the results within the next few hours? Instead, you can use torch._logging.scribe to directly write to our logging infrastructure *from inside CI jobs.* The most convenient approach is to log unstructured JSON blobs to `open_source_signpost` (added in this PR; you can also add your own dedicated table as described in the GDoc above). After adding logging code, you can push your PR to CI, add the 'ci-scribe' label, and in a few hours view the results in Scuba, e.g., (Meta-only) https://fburl.com/scuba/torch_open_source_signpost/z2mq8o4l If you want continuous logging for all commits on main, you can land your PR and it will continuously get logged for all CI runs that happen on main.
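
A hypothetical usage sketch of the flow described above; the exact signature of `open_source_signpost` (subsystem / name / JSON-string parameters) is an assumption here, not a documented contract:

```python
import json
from torch._logging import scribe

def log_my_event(duration_s: float) -> None:
    # Assumed signature: subsystem, event name, and an unstructured JSON blob.
    scribe.open_source_signpost(
        subsystem="my_experiment",       # hypothetical subsystem name
        name="compile_finished",         # hypothetical event name
        parameters=json.dumps({"duration_s": duration_s}),
    )
```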

Eventually, if your dataset is important enough, you can consider collaborating with PyTorch Dev Infra to get the data collected in our public AWS cloud so that OSS users can view it without Meta-internal access. But this facility is really good for prototyping / one-off experiments. It's entirely self-serve: just add your logging, run your PR CI with ci-scribe, get results, and do the analysis in Scuba.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135224
Approved by: https://github.com/Skylion007
2024-09-05 22:37:13 +00:00
eqy
3c8f71ff93 [cuDNN][64-bit indexing] cuDNN v9.3+ supports non-batch-splittable convolutions with > 2**31 elements (#134890)
For longstanding issues such as #95024

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134890
Approved by: https://github.com/Skylion007
2024-09-05 22:22:45 +00:00
fc890b55b5 Support rolling over a percentage of workflows (#134816)
In order to support adding a rollover percentage, this ended up being a complete rewrite of runner_determinator.py.

Details of the new format are in the comments up top.

On the plus side, this now includes some unit tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134816
Approved by: https://github.com/PaliC, https://github.com/zxiiro
2024-09-05 22:21:45 +00:00
058a69d91a [fbcode][dynamo] Turn on guard_nn_modules using justknobs_check (#134928)
As Title

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134928
Approved by: https://github.com/ezyang
2024-09-05 22:05:54 +00:00
6c5920d515 Tune int8 AMX WoQ micro-kernel for CPU (#134832)
This patch prevents a performance regression against the default ATen implementation for the LLaMA 3.1 int8 GPTQ WoQ workload.

Uses the AMX micro-kernel only if `M` >= `block_m`
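
A minimal sketch of that dispatch condition (the helper name is hypothetical):

```python
def use_amx_microkernel(M: int, block_m: int) -> bool:
    # For M < block_m the AMX tiles would be mostly idle, so fall back to the
    # default ATen path instead.
    return M >= block_m
```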

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134832
Approved by: https://github.com/jgong5
2024-09-05 22:01:14 +00:00
116fd474da [export] Expand coverage to more copied sym ops for unflattener. (#135119)
Test Plan:
buck2 test 'fbcode//mode/opt' fbcode//torchrec/ir/tests:test_serializer -- --run-disabled

```
File changed: fbcode//caffe2/torch/export/unflatten.py
Buck UI: https://www.internalfb.com/buck2/2e0377e7-e2b6-4bd0-8133-a787245165a0
Test UI: https://www.internalfb.com/intern/testinfra/testrun/5066549824883887
Network: Up: 0B  Down: 0B
Jobs completed: 16. Time elapsed: 10.2s.
Tests finished: Pass 6. Fail 0. Fatal 0. Skip 0. Build failure 0
```

Differential Revision: D62190172

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135119
Approved by: https://github.com/yushangdi
2024-09-05 21:58:20 +00:00
a5d70cf545 [PyTorch] Add isfinite to BFloat16-math.h (#135052)
Missing function from <cmath>.

Differential Revision: [D62148884](https://our.internmc.facebook.com/intern/diff/D62148884/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135052
Approved by: https://github.com/PaliC, https://github.com/albanD
ghstack dependencies: #135031
2024-09-05 21:50:36 +00:00
7fe819d917 [PyTorch] Fix -Wshadow -Werror build in BFloat16-inl.h (#135031)
`float_t` is required to exist in C99 math.h, which causes -Wshadow to fire. We don't need the alias, fortunately.

Differential Revision: [D62135908](https://our.internmc.facebook.com/intern/diff/D62135908/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135031
Approved by: https://github.com/albanD
2024-09-05 21:48:21 +00:00
f63571060c Revert "Use actions/upload-artifact@v4.4.0 for rest of workflows (#135264)"
This reverts commit 9c0b03020b7204ca5d5dbe18174bab005f79c47b.

Reverted https://github.com/pytorch/pytorch/pull/135264 on behalf of https://github.com/atalman due to broke CI ([comment](https://github.com/pytorch/pytorch/pull/135264#issuecomment-2332674607))
2024-09-05 21:43:05 +00:00
38fead8f7c [hop] preserve metadata in re-tracing hop subgraph by running with interpreter (#135159)
This way, interpreter.run can preserve the current metadata of the subgraphs correctly when tracing them.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135159
Approved by: https://github.com/tugsbayasgalan
2024-09-05 21:36:56 +00:00
24a223c49d Run inductor micro benchmark on x86 metal runner (#135042)
This enables the inductor micro benchmark on CPU (x86):

* Running on an AWS metal runner for a more accurate benchmark
* I add a new `arch` column, which will be either x86_64 or arm64 for CPU, or the GPU name for GPU. We can use this later to differentiate between different setups, e.g. cuda (a100) vs cuda (a10g), or cpu (x86_64) vs cpu (arm64).

The next step would be to run this on cpu (arm64) and cuda (a10g).

### Testing
Here is the CSV results from my test run https://github.com/pytorch/pytorch/actions/runs/10709344180

```
name,metric,target,actual,dtype,device,arch,is_model
mlp_layer_norm_gelu,flops_utilization,0.8,17.36,bfloat16,cpu,x86_64,False
gather_gemv,memory_bandwidth(GB/s),990,170.80,int8,cpu,x86_64,False
gather_gemv,memory_bandwidth(GB/s),1060,204.78,bfloat16,cpu,x86_64,False
Mixtral-8x7B-v0.1,token_per_sec,175,26.68,int8,cpu,x86_64,True
Mixtral-8x7B-v0.1,memory_bandwidth(GB/s),1130,171.91,int8,cpu,x86_64,True
Mixtral-8x7B-v0.1,compilation_time(s),162,47.36,int8,cpu,x86_64,True
gemv,memory_bandwidth(GB/s),870,236.36,int8,cpu,x86_64,False
gemv,memory_bandwidth(GB/s),990,305.71,bfloat16,cpu,x86_64,False
Llama-2-7b-chat-hf,token_per_sec,94,14.01,bfloat16,cpu,x86_64,True
Llama-2-7b-chat-hf,memory_bandwidth(GB/s),1253,185.18,bfloat16,cpu,x86_64,True
Llama-2-7b-chat-hf,compilation_time(s),162,74.99,bfloat16,cpu,x86_64,True
Llama-2-7b-chat-hf,token_per_sec,144,25.09,int8,cpu,x86_64,True
Llama-2-7b-chat-hf,memory_bandwidth(GB/s),957,165.83,int8,cpu,x86_64,True
Llama-2-7b-chat-hf,compilation_time(s),172,70.69,int8,cpu,x86_64,True
layer_norm,memory_bandwidth(GB/s),950,172.03,bfloat16,cpu,x86_64,False
```
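Not part of the PR itself, but a hypothetical sketch of how the new `arch` column can be derived:

```python
import platform

import torch

def benchmark_arch(device: str) -> str:
    # CPU rows report the machine architecture (x86_64/arm64);
    # GPU rows report the device name (e.g. A100).
    if device == "cuda":
        return torch.cuda.get_device_name(0)
    return platform.machine()

print(benchmark_arch("cpu"))  # e.g. x86_64
```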
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135042
Approved by: https://github.com/yanboliang
2024-09-05 21:31:36 +00:00
e4920a1364 [Traceable FSDP2][Dynamo] allow tracing through auto_functionalized HOP (#135169)
If an `auto_functionalized` HOP is included in the backward graph due to activation checkpointing, we will run into a scenario where Compiled Autograd Dynamo tracing needs to trace through the `auto_functionalized` HOP. This PR adds support for it.

Test commands:
- `pytest -rA test/inductor/test_compiled_autograd.py::TestCompiledAutograd::test_trace_auto_functionalized`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135169
Approved by: https://github.com/zou3519
2024-09-05 21:22:45 +00:00
bc5ecf83d7 [training ir migration] Fix quantization tests (#135184)
Summary:
Fixed some quantization tests for the new training IR:

Fix the batch norm node pattern matcher. In the training IR, we have an `aten.batch_norm` node instead of `aten._native_batch_norm_legit` and `aten._native_batch_norm_legit_no_training`.
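
A minimal sketch (assumed helper name) of matching batch-norm nodes across both IRs:

```python
import torch

_BN_TARGETS = {
    torch.ops.aten.batch_norm.default,                            # new training IR
    torch.ops.aten._native_batch_norm_legit.default,              # old export IR
    torch.ops.aten._native_batch_norm_legit_no_training.default,  # old export IR (eval)
}

def is_batch_norm_node(node: torch.fx.Node) -> bool:
    return node.op == "call_function" and node.target in _BN_TARGETS
```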

Test Plan:
```
buck run fbcode//mode/dev-nosan fbcode//caffe2/test:quantization_pt2e
```

Reviewed By: tugsbayasgalan

Differential Revision: D62209819

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135184
Approved by: https://github.com/tugsbayasgalan
2024-09-05 21:19:28 +00:00
e55c0f59e5 Revert "[Reland] Refactor caching device allocator utils (#130923)"
This reverts commit 9809080b9ed657a8c0ea0383be7cbdce3a26e05e.

Reverted https://github.com/pytorch/pytorch/pull/130923 on behalf of https://github.com/kit1980 due to breaking internal builds - Error: Relocation overflow has occured ([comment](https://github.com/pytorch/pytorch/pull/130923#issuecomment-2332640961))
2024-09-05 21:16:14 +00:00
a4cf9653ee Revert "Remove Caffe2 code from tool scripts (#134941)"
This reverts commit c818ecd1698a28d9fadf4a81453a89914b18374a.

Reverted https://github.com/pytorch/pytorch/pull/134941 on behalf of https://github.com/kit1980 due to breaking internal builds - The path `caffe2/operators/hip/gather_op.cuh` does not exist ([comment](https://github.com/pytorch/pytorch/pull/134941#issuecomment-2332636624))
2024-09-05 21:12:54 +00:00
9c0b03020b Use actions/upload-artifact@v4.4.0 for rest of workflows (#135264)
To be consistent with https://github.com/pytorch/pytorch/pull/135263 and rest of workflows. Use v4.4.0.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135264
Approved by: https://github.com/kit1980, https://github.com/malfet
2024-09-05 21:05:06 +00:00
034717a029 [ROCm] remove triton-rocm commit pin and merge pins with triton.txt (#133438)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133438
Approved by: https://github.com/jithunnair-amd, https://github.com/malfet

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
2024-09-05 20:36:45 +00:00
9c38b00999 [export] Add ability to run eagerly on UnflattenedModule (#133996)
Summary:
Added the contextmanager, `_disable_interpreter`, which is meant to be put around a call to `unflatten`. This will generate an UnflattenedModule and sub-InterpreterModules which will not use torch.fx.Interpreter to run eagerly. We want to have this as a state of the module instead of a contextmanager around running the module because it's not clear where we are calling the unflattened module.

This seems to improve the performance: https://fb.workplace.com/groups/1075192433118967/posts/1473590629945810/?comment_id=1473621763276030

Test Plan: CI

Differential Revision: D60939034

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133996
Approved by: https://github.com/pianpwk
2024-09-05 20:28:42 +00:00
8efe547046 Use actions/upload-artifact@v4.4.0 for triton builds (#135263)
Same as: https://github.com/pytorch/pytorch/pull/135139
Fixes upload failure: https://github.com/pytorch/pytorch/actions/runs/10722567217/job/29748125015
fix regression introduced by https://github.com/pytorch/pytorch/pull/135068

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135263
Approved by: https://github.com/kit1980, https://github.com/huydhn
2024-09-05 20:03:39 +00:00
82d00acfee Allow cross-device copies for cpu scalars in refs (#135140)
This copies our eager-mode behavior where someone can do torch.add(a, b, out=c)
with a and b being CPU scalar tensors and c a CUDA tensor.
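
A small usage example of the eager-mode behavior being mirrored (requires a CUDA device):

```python
import torch

a = torch.tensor(2.0)               # CPU scalar tensor
b = torch.tensor(3.0)               # CPU scalar tensor
c = torch.empty((), device="cuda")  # CUDA output tensor
torch.add(a, b, out=c)              # cross-device write is allowed for CPU scalars
print(c)                            # tensor(5., device='cuda:0')
```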

Fixes https://github.com/pytorch/pytorch/issues/121619 by side effect (we get into a situation where we're writing a CPU scalar into a FakeTensor that is actually a meta tensor)

Test Plan:
- new test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135140
Approved by: https://github.com/williamwen42, https://github.com/yanboliang
2024-09-05 19:08:48 +00:00
098431a29d Update Resize.cpp with new device type (#135117)
Update Resize.cpp with new device type

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135117
Approved by: https://github.com/egienvalue
2024-09-05 18:53:13 +00:00
be660ea2d3 [PT2] Directly set meta.val in group_batch_fusion_aten (#135078)
Summary: set meta.val directly in the pass, instead of using FakeTensorProp after the pass
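
A minimal sketch of the idea for a fused add (the helper is hypothetical, not the pass itself):

```python
import torch

def set_fused_node_meta(new_node: torch.fx.Node, lhs: torch.fx.Node, rhs: torch.fx.Node) -> None:
    # Compute the fake value for the node the fusion pass just created from its
    # inputs' meta["val"], instead of re-running FakeTensorProp over the whole graph.
    fake_lhs, fake_rhs = lhs.meta["val"], rhs.meta["val"]
    with fake_lhs.fake_mode:
        new_node.meta["val"] = torch.add(fake_lhs, fake_rhs)
```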

Differential Revision: D62162640

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135078
Approved by: https://github.com/frank-wei
2024-09-05 18:17:06 +00:00
52c7c89ea4 [Inductor][CPP] Leverage full bits for BF16/FP16 vectorization (#126502)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126502
Approved by: https://github.com/jgong5, https://github.com/jansel
2024-09-05 17:17:46 +00:00
1efd341d15 [fake_tensor] Move unrecognized_type NotImplemented before ConstProp (#135033)
We should not try to do ConstProp on unrecognized types (e.g. subclasses).
For those types, throwing NotImplemented will jump to the next torch_dispatch.
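
For reference, a minimal sketch of the deferral mechanism being relied on (not the fake-tensor code itself):

```python
import torch

class UnrecognizedTensor(torch.Tensor):
    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        # Returning NotImplemented hands the op to the next torch_dispatch
        # (subclass or mode) instead of handling it here.
        return NotImplemented
```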

Test:
```
 python test/functorch/test_aotdispatch.py -k test_aot_test_subclasses_with_tensor_factories
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135033
Approved by: https://github.com/zou3519, https://github.com/bdhirsh
2024-09-05 17:09:41 +00:00
a096f2899d Add torch.serialization.skip_data context manager (#134504)
## Semantics

The semantics are as follows:
(1) By default `torch.serialization.skip_data(materialize_fake_tensors=False)` will make `torch.save` skip writing storages (but reserve space for them in the checkpoint).

```python
import torch
import torch.nn as nn

sd = nn.Linear(3, 5).state_dict()
with torch.serialization.skip_data():
    torch.save(sd, 'foo.pt')
print(torch.load('foo.pt', weights_only=True))
```

(2) With `torch.serialization.skip_data(materialize_fake_tensors=True)`, if a FakeTensor is passed to `torch.save`, the pickler will treat these FakeTensors as being "materialized": space will be reserved in the checkpoint for the associated storage bytes, and when loading, the type will be Tensor instead of FakeTensor.

```python
import torch
import torch.nn as nn
from torch._subclasses.fake_tensor import FakeTensorMode

with FakeTensorMode():
    m = nn.Linear(3, 5, dtype=torch.float16, device='cuda')

sd = m.state_dict()
with torch.serialization.skip_data(materialize_fake_tensors=True):
    torch.save(sd, 'bla.pt')
print(torch.load('bla.pt', weights_only=True))
# OrderedDict([('weight', tensor([[0., 0., 0.],
#        [0., 0., 0.],
#        [0., 0., 0.],
#        [0., 0., 0.],
#        [0., 0., 0.]], device='cuda:0', dtype=torch.float16)), ('bias', tensor([0., 0., 0., 0., 0.], device='cuda:0', dtype=torch.float16))])

```

## Follow Ups

- [ ] `torch.load` semantic for skip_data context manager
- [ ] Mechanism for getting offsets of storages saved via this method (for writing in a separate pass)

Differential Revision: [D62238610](https://our.internmc.facebook.com/intern/diff/D62238610)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134504
Approved by: https://github.com/albanD
2024-09-05 16:53:39 +00:00
dbeb8a1691 Render log filepaths that are not anchored in torch's directory in a reasonable way (#135165)
For example, if I do TORCH_LOGS=fbscribelogger I'll get:

```
I0904 17:59:07.567000 3672513 fbscribelogger/__init__.py:161] stop
```

instead of

```
I0904 12:46:15.332000 2930287 ../../../../../home/ezyang/local/a/pytorch-env/lib/python3.10/site-packages/fbscribelogger/__init__.py:161] stop
```

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135165
Approved by: https://github.com/Skylion007
2024-09-05 16:48:09 +00:00
b1f72e2984 Gradient scaler for DTensor (#132816)
Solves the request [here](https://github.com/pytorch/pytorch/issues/120003#issuecomment-2248805798).
Enables DTensor input in the gradient scaler's APIs, especially `.unscale_()`.
A related dispatch strategy is added to accept DTensor input.

To let found_inf be reduced across devices, we add an allreduce over the args at dispatch, after the dispatch strategy and kernel.
Since `aten._amp_foreach_non_finite_check_and_unscale_.default` is an in-place op, grad_scale as arg[0] will be modified in place, so redesigning the strategy or refactoring the kernel would not help.

The test files cover the following under the 1-d (dp) and 2-d (dp, tp) cases:
1. whether the non-inf values are unscaled
2. whether all DTensors on each device can find inf, even if it did not occur on their device
3. if inf is not found, whether new parameters are generated
4. if inf is found, whether the scale is updated

Pull Request resolved: https://github.com/pytorch/pytorch/pull/132816
Approved by: https://github.com/XilunWu, https://github.com/weifengpy, https://github.com/wanchaol
2024-09-05 16:44:32 +00:00
bb3c2408f4 [inductor][test] in test_unbacked_symints, replace inductor's skipCUDAIf with common device type's skipcudaif (#133936)
Differential Revision: D61506212

Use `skipCUDAIf` from `torch.testing._internal.common_device_type` if we create the test class with `instantiate_device_type_tests`.

`instantiate_device_type_tests` makes sure the class has the attr `device_type`, which works with `skipCUDAIf` from `torch.testing._internal.common_device_type`.

Also skipping test_vertical_pointwise_reduction_fusion for the cpu test class, since the test expects cuda.

FAILED [0.0026s] test/inductor/test_unbacked_symints.py::TestUnbackedSymintsCPU::test_vertical_pointwise_reduction_fusion_cpu - AttributeError: 'TestUnbackedSymintsCPU' object has no attribute 'device'

repro:
```
CUDA_VISIBLE_DEVICES="" pytest test/inductor/test_unbacked_symints.py -k cpu -v
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/133936
Approved by: https://github.com/ColinPeppler, https://github.com/desertfire
2024-09-05 16:40:14 +00:00
2c99f17a32 Implement VariableTracker.python_type() (#134215)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134215
Approved by: https://github.com/amjames, https://github.com/jansel
2024-09-05 16:35:47 +00:00
0043dcd79e Switch torch pt2e xnnpack tests to use export_for_training (#134788)
Migrate all the callsites inside the pt2e XNNPACK tests to use export_for_training.

Differential Revision: D61994553

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134788
Approved by: https://github.com/mergennachin
2024-09-05 16:11:18 +00:00
2e2fb668fa Upgrade expecttest to 0.2.1 (#135136)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135136
Approved by: https://github.com/albanD, https://github.com/atalman, https://github.com/Skylion007
2024-09-05 16:05:35 +00:00
9d24f945ba [CI] Use larger instance for building triton whl (#135201)
When running the "Build Triton Wheels" CI jobs, they failed due to a lack of resources. This PR uses a larger runner to avoid these issues.

The failure message looks like:

```
Process completed with exit code 137.
```

Related running actions:
Failed actions: https://github.com/pytorch/pytorch/actions/runs/10714445036
Success actions: https://github.com/pytorch/pytorch/actions/runs/10716710830

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135201
Approved by: https://github.com/chuanqi129, https://github.com/atalman
2024-09-05 14:36:23 +00:00
ecbd715363 [Intel GPU][Windows] Fix overriding default CMAKE_CXX_FLAGS (#135093)
The root cause is that `/EHsc` is part of the default `CMAKE_CXX_FLAGS` in CMake.
The fix is to not override the default `CMAKE_CXX_FLAGS`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135093
Approved by: https://github.com/EikanWang, https://github.com/atalman
2024-09-05 12:52:43 +00:00
58f2477a26 [Dynamo] Support builtin function frozenset (#134563)
Support builtin function frozenset in dynamo
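
An illustrative example (not from the PR's test suite) of the kind of code this enables:

```python
import torch

@torch.compile(fullgraph=True)
def f(x):
    allowed_dims = frozenset({1, 2, 3})  # the frozenset builtin is now traceable
    if x.dim() in allowed_dims:
        return x + 1
    return x - 1

print(f(torch.randn(4)))
```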

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134563
Approved by: https://github.com/anijain2305, https://github.com/EikanWang, https://github.com/jansel
2024-09-05 12:15:10 +00:00
43dcb4bb61 Revise CPU vectorization ISA support API (#135075)
Revising (mostly renaming) CPU vectorization ISA support API (non-frontend-user-facing). Also added AVX512_BF16 ISA detection API.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135075
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5, https://github.com/ezyang
2024-09-05 12:14:56 +00:00
50d1e37079 [AOTI] Fix a unbacked symint retrieve bug (#134670)
Summary: Fix https://github.com/pytorch/pytorch/issues/134081. When an unbacked symint is computed as the shape of a tensor from a tuple, the generated C++ code needs to use std::get<> to extract the tensor.

Differential Revision: [D62142113](https://our.internmc.facebook.com/intern/diff/D62142113)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134670
Approved by: https://github.com/angelayi, https://github.com/22quinn, https://github.com/chenyang78
2024-09-05 11:34:14 +00:00
b99ef1a02e Update torch-xpu-ops pin (ATen XPU implementation) (#135185)
Release cycle for PyTorch 2.5
1. Update specific AOT targets for Windows. On Windows, the AOT target list prefers Intel client GPUs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135185
Approved by: https://github.com/EikanWang
2024-09-05 10:05:23 +00:00
8a5c8e5db9 Update unbacked symints in masked_select more precisely (#134899)
## Summary
At the moment, the fake impl for `masked_select` simply sets the upper bound of its size-like SymInt's range to `sys.maxsize` (9223372036854775807, the max value for a signed int64) if there are any SymInts in the original input tensor shape. This PR constrains the range more intelligently by using the upper ranges of each SymInt in the input tensor shape.

This solves an issue where a model being lowered to Executorch errors during memory planning because the memory allocated for `masked_select` ended up exceeding the 64-bit address space (`INT_MAX * size(dtype)`).
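
A minimal sketch of the tightened bound (assumed helper, numbers illustrative):

```python
import math

def masked_select_upper_bound(input_dim_upper_bounds):
    # The number of selected elements can never exceed the number of elements in
    # the input, so the size-like unbacked SymInt's upper range is the product of
    # the per-dimension upper bounds rather than sys.maxsize.
    return math.prod(input_dim_upper_bounds)

print(masked_select_upper_bound([4096, 128]))  # 524288
```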

## Test plan
- Passes existing unit tests (tests the case where the upper bound is inf)
- Added unit test to verify upper bound reduction calculation
- Tested end-to-end by exporting with TORCH_LOGS="export" and ensuring that the range for `masked_select`'s SymInt size has the correct upper bound
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134899
Approved by: https://github.com/ezyang
2024-09-05 09:01:06 +00:00
c7328dff7f Enhance the stability of the complex divide code (#134647)
In C++, when a floating-point literal (e.g., 3.14) is compared with a variable of type float, the literal is by default interpreted as a double.
```c++
float f = 3.14f;
if (f == 3.14) {
    // Do something
}
```
If a device does not support double, an error will occur.
This PR addresses the issue of complex64 errors on machines that do not support double operations.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134647
Approved by: https://github.com/EikanWang, https://github.com/albanD
2024-09-05 08:36:37 +00:00
316 changed files with 9703 additions and 4135 deletions

View File

@ -108,10 +108,10 @@ ENV CMAKE_C_COMPILER cc
ENV CMAKE_CXX_COMPILER c++
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
COPY ci_commit_pins/triton.txt triton.txt
COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
# Install AOTriton (Early fail)
COPY ./aotriton_version.txt aotriton_version.txt

View File

@ -1 +0,0 @@
21eae954efa5bf584da70324b640288c3ee7aede

View File

@ -1 +1 @@
1b2f15840e0d70eec50d84c7a0575cb835524def
cc981feba10a3f4c2e46f3fe368e8fcf5f5643df

View File

@ -1 +1 @@
dedb7bdf339a3546896d4820366ca562c586bfa0
757b6a61e7df814ba806f498f8bb3160f84b120c

View File

@ -12,10 +12,7 @@ conda_reinstall() {
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
}
if [ -n "${ROCM_VERSION}" ]; then
TRITON_REPO="https://github.com/openai/triton"
TRITON_TEXT_FILE="triton-rocm"
elif [ -n "${XPU_VERSION}" ]; then
if [ -n "${XPU_VERSION}" ]; then
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
TRITON_TEXT_FILE="triton-xpu"
else

View File

@ -30,9 +30,14 @@ dill==0.3.7
#Pinned versions: 0.3.7
#test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py
expecttest==0.1.6
expecttest==0.2.1
#Description: method for writing tests where test framework auto populates
# the expected output based on previous runs
#Pinned versions: 0.2.1
#test that import:
fbscribelogger==0.1.6
#Description: write to scribe from authenticated jobs on CI
#Pinned versions: 0.1.6
#test that import:

View File

@ -100,10 +100,10 @@ ARG TRITON
# try to reach out to S3, which docker build runners don't have access
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
COPY ci_commit_pins/triton.txt triton.txt
COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
# Install AOTriton
COPY ./aotriton_version.txt aotriton_version.txt

View File

@ -596,6 +596,9 @@ test_single_dynamo_benchmark() {
test_inductor_micro_benchmark() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
test_inductor_set_cpu_affinity
fi
python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
}

View File

@ -119,6 +119,11 @@ fi
# Test the package
/builder/check_binary.sh
if [[ "\$GPU_ARCH_TYPE" != *s390x* && "\$GPU_ARCH_TYPE" != *xpu* && "\$GPU_ARCH_TYPE" != *rocm* && "$PACKAGE_TYPE" != libtorch ]]; then
# Exclude s390, xpu, rocm and libtorch builds from smoke testing
python /builder/test/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled
fi
# Clean temp files
cd /builder && git clean -ffdx

View File

@ -90,7 +90,7 @@ fi
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt)
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
fi
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then

View File

@ -9,6 +9,7 @@ ciflow_push_tags:
- ciflow/inductor-rocm
- ciflow/inductor-perf-compare
- ciflow/inductor-micro-benchmark
- ciflow/inductor-micro-benchmark-cpu-x86
- ciflow/inductor-cu124
- ciflow/linux-aarch64
- ciflow/mps

View File

@ -1,6 +1,7 @@
boto3==1.19.12
hypothesis==6.56.4
expecttest==0.1.6
expecttest==0.2.1
fbscribelogger==0.1.6
librosa>=0.6.2
mpmath==1.3.0
networkx==2.8.7

View File

@ -15,9 +15,7 @@ REPO_DIR = SCRIPT_DIR.parent.parent
def read_triton_pin(device: str = "cuda") -> str:
triton_file = "triton.txt"
if device == "rocm":
triton_file = "triton-rocm.txt"
elif device == "xpu":
if device == "xpu":
triton_file = "triton-xpu.txt"
with open(REPO_DIR / ".ci" / "docker" / "ci_commit_pins" / triton_file) as f:
return f.read().strip()

View File

@ -325,6 +325,7 @@ def generate_wheels_matrix(
os: str,
arches: Optional[List[str]] = None,
python_versions: Optional[List[str]] = None,
use_split_build: bool = False,
) -> List[Dict[str, str]]:
package_type = "wheel"
if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
@ -371,7 +372,17 @@ def generate_wheels_matrix(
) and python_version == "3.13":
continue
if use_split_build and (
arch_version not in ["12.4", "12.1", "11.8", "cpu"] or os != "linux"
):
raise RuntimeError(
"Split build is only supported on linux with cuda 12.4, 12.1, 11.8, and cpu.\n"
f"Currently attempting to build on arch version {arch_version} and os {os}.\n"
"Please modify the matrix generation to exclude this combination."
)
# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
if (
arch_version in ["12.4", "12.1", "11.8"]
and os == "linux"
@ -385,6 +396,7 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"devtoolset": (
"cxx11-abi" if arch_version == "cuda-aarch64" else ""
),
@ -400,7 +412,8 @@ def generate_wheels_matrix(
),
}
)
if arch_version != "cuda-aarch64":
# Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
if python_version == "3.10" and arch_version == "12.1":
ret.append(
{
"python_version": python_version,
@ -409,40 +422,16 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True",
"use_split_build": "True" if use_split_build else "False",
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version] # fmt: skip
if os != "linux-aarch64"
else ""
),
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-split".replace( # noqa: B950
"pytorch_extra_install_requirements": "",
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
".", "_"
),
}
)
# Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
if python_version == "3.10" and arch_version == "12.1":
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "False",
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": "",
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
".", "_"
),
}
)
else:
ret.append(
{
@ -452,6 +441,7 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"devtoolset": (
"cxx11-abi" if arch_version == "cpu-cxx11-abi" else ""
),
@ -467,6 +457,7 @@ def generate_wheels_matrix(
),
}
)
return ret

View File

@ -61,6 +61,7 @@ class BinaryBuildWorkflow:
# Mainly for macos
cross_compile_arm64: bool = False
macos_runner: str = "macos-14-xlarge"
use_split_build: bool = False
def __post_init__(self) -> None:
if self.abi_version:
@ -75,6 +76,11 @@ class BinaryBuildWorkflow:
GITHUB_DIR
/ f"workflows/generated-{self.build_environment}-{self.branches}.yml"
)
if self.use_split_build:
output_file_path = (
GITHUB_DIR
/ f"workflows/generated-{self.build_environment}-{self.branches}-split.yml"
)
with open(output_file_path, "w") as output_file:
GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file
output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"])
@ -110,6 +116,20 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
isolated_workflow=True,
),
),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
use_split_build=True,
arches=["11.8", "12.1", "12.4", "cpu"],
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
isolated_workflow=True,
),
use_split_build=True,
),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="conda",
@ -162,6 +182,21 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
),
branches="main",
),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["11.8", "12.1", "12.4"],
python_versions=["3.9"],
use_split_build=True,
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_PERIODIC},
),
branches="main",
use_split_build=True,
),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",

View File

@ -45,7 +45,7 @@
{%- if is_windows %}
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
{%- endif %}
{%- else %}

View File

@ -13,7 +13,6 @@ on:
- .github/scripts/build_triton_wheel.py
- .github/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton-rocm.txt
- .ci/docker/ci_commit_pins/triton-xpu.txt
pull_request:
paths:
@ -21,7 +20,6 @@ on:
- .github/scripts/build_triton_wheel.py
- .github/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton-rocm.txt
- .ci/docker/ci_commit_pins/triton-xpu.txt
concurrency:
@ -31,7 +29,7 @@ concurrency:
jobs:
build-wheel:
name: "Build Triton Wheel"
runs-on: [self-hosted, linux.2xlarge]
runs-on: [self-hosted, linux.4xlarge]
strategy:
fail-fast: false
matrix:
@ -120,7 +118,7 @@ jobs:
fi
docker exec -t "${container_name}" chown -R 1000.1000 /artifacts
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4.4.0
with:
name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }}
if-no-files-found: error
@ -253,7 +251,7 @@ jobs:
docker exec -t "${container_name}" python /pytorch/.github/scripts/build_triton_wheel.py --build-conda --py-version="${PY_VERS}" $RELEASE
docker exec -t "${container_name}" chown -R 1000.1000 /artifacts
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4.4.0
with:
name: pytorch-triton-conda-${{ matrix.py_vers }}
if-no-files-found: error

View File

@ -58,6 +58,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.9"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -81,6 +82,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -103,6 +105,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
secrets:
@ -125,6 +128,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.9"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -149,6 +153,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda-aarch64
secrets:
@ -170,6 +175,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.10"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -193,6 +199,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -215,6 +222,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
secrets:
@ -237,6 +245,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.10"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -261,6 +270,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64
secrets:
@ -282,6 +292,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.11"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -305,6 +316,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -327,6 +339,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
secrets:
@ -349,6 +362,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.11"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -373,6 +387,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64
secrets:
@ -394,6 +409,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.12"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -417,6 +433,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -439,6 +456,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
secrets:
@ -461,6 +479,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.12"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
@ -485,6 +504,7 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64
secrets:

View File

@ -0,0 +1,182 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-binary-manywheel
on:
push:
branches:
- main
tags:
- 'ciflow/periodic/*'
workflow_dispatch:
env:
# Needed for conda builds
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
ANACONDA_USER: pytorch
AWS_DEFAULT_REGION: us-east-1
BINARY_ENV_FILE: /tmp/env
BUILD_ENVIRONMENT: linux-binary-manywheel
BUILDER_ROOT: /builder
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
PYTORCH_ROOT: /pytorch
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 0
concurrency:
group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_9-cuda11_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda11_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda11_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_1-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_1-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_1-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_4-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_4
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -54,6 +54,7 @@ jobs:
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda11_8
@ -77,6 +78,7 @@ jobs:
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel
@ -85,53 +87,6 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda11_8-split-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda11_8-split
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda11_8-split-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda11_8-split-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8-split
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_1-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -146,6 +101,7 @@ jobs:
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_1
@ -169,6 +125,7 @@ jobs:
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel
@ -177,53 +134,6 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_1-split-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_1-split
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_1-split-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_1-split-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1-split
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -238,6 +148,7 @@ jobs:
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_4
@ -261,6 +172,7 @@ jobs:
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_4
build_environment: linux-binary-manywheel
@ -268,50 +180,3 @@ jobs:
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_4-split-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_4-split
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_4-split-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_4-split-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_4-split
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -58,6 +58,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.9"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -81,6 +82,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -103,6 +105,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
secrets:
@ -124,6 +127,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.10"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -147,6 +151,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -169,6 +174,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
secrets:
@ -190,6 +196,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.11"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -213,6 +220,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -235,6 +243,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
secrets:
@ -256,6 +265,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.12"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -279,6 +289,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -301,6 +312,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
secrets:
@ -322,6 +334,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.13"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -345,6 +358,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -367,6 +381,7 @@ jobs:
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
secrets:


@ -49,7 +49,7 @@ jobs:
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the


@ -51,7 +51,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -169,7 +169,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash


@ -58,7 +58,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -176,7 +176,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -290,7 +290,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cpu-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -316,7 +316,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -435,7 +435,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -550,7 +550,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda11_8-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -576,7 +576,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -695,7 +695,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -810,7 +810,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_1-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -836,7 +836,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -955,7 +955,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -1070,7 +1070,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_4-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}


@ -51,7 +51,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -169,7 +169,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash


@ -58,7 +58,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -176,7 +176,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -290,7 +290,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cpu-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -316,7 +316,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -435,7 +435,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -550,7 +550,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda11_8-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -576,7 +576,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -695,7 +695,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -810,7 +810,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_1-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -836,7 +836,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -955,7 +955,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@ -1070,7 +1070,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_4-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}


@ -0,0 +1,40 @@
name: inductor-micro-benchmark-x86
on:
schedule:
- cron: 0 7 * * *
push:
tags:
- ciflow/inductor-micro-benchmark-cpu-x86/*
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
# Use metal host for benchmark jobs
test-matrix: |
{ include: [
{ config: "inductor-micro-benchmark-cpu-x86", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
]}
linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha
timeout-minutes: 720


@ -223,7 +223,7 @@ jobs:
cache: pip
- name: Install dependencies
run: |
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.1.* numpy==1.24.*
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.2.* fbscribelogger==0.1.* numpy==1.24.*
pip install torch --pre --index-url https://download.pytorch.org/whl/nightly/cpu/
- name: Run run_test.py (nonretryable)
run: |


@ -2,7 +2,7 @@ name: Upload test stats
on:
workflow_run:
workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, inductor-micro-benchmark, inductor-cu124, inductor-rocm]
workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, inductor-micro-benchmark, inductor-micro-benchmark-x86, inductor-cu124, inductor-rocm]
types:
- completed


@ -138,7 +138,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'numpy==1.24.3 ; python_version == "3.8"',
'numpy==1.26.0 ; python_version >= "3.9"',
'expecttest==0.1.6',
'expecttest==0.2.1',
'mypy==1.10.0',
'sympy==1.12.1 ; python_version == "3.8"',
'sympy==1.13.0 ; python_version >= "3.9"',


@ -332,6 +332,7 @@ intern_build_aten_ops(
"@fbgemm",
"@mkl",
"@sleef",
"@mkl_dnn//:mkl-dnn",
],
)


@ -57,7 +57,6 @@ nn/qat/ @jerryzh168
# Docker
/.ci/docker/ @jeffdaily
/.ci/docker/ci_commit_pins/triton.txt @desertfire @Chillee @eellison @shunting314 @bertmaher @jeffdaily @jataylo @jithunnair-amd @pruthvistony
/.ci/docker/ci_commit_pins/triton-rocm.txt @jeffdaily @jataylo @jithunnair-amd @pruthvistony
/.ci/docker/ci_commit_pins/triton-xpu.txt @EikanWang @gujinghui
# Github Actions


@ -50,6 +50,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:
| PyTorch version | Python | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- |
| 2.5 | >=3.9, <=3.12, (3.13 experimental) | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 |
| 2.4 | >=3.8, <=3.12 | CUDA 11.8, CUDA 12.1, CUDNN 9.1.0.70 | CUDA 12.4, CUDNN 9.1.0.70 | ROCm 6.1 |
| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |


@ -707,7 +707,12 @@ bool are_all_mutations_under_no_grad_or_inference_mode(const Tensor& functional_
}
bool isFunctionalTensor(const at::Tensor& tensor) {
return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize);
}
bool isBaseTensor(const at::Tensor& tensor) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isFunctionalTensor(tensor));
return unsafeGetFunctionalWrapper(tensor)->isBaseTensor();
}
bool isFunctionalTensor(const std::optional<Tensor>& t) {


@ -165,6 +165,12 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
was_storage_changed_ = true;
}
// A FunctionalTensor is considered a base if it's not a view of another
// tensor.
bool isBaseTensor() const {
return view_metas_.empty();
}
c10::SymInt get_storage_size(bool before) {
return functional_storage_impl()->get_storage_size(before);
}
@ -290,6 +296,8 @@ TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
return functional_impl;
}
TORCH_API bool isBaseTensor(const at::Tensor& tensor);
TORCH_API bool isFunctionalTensor(const at::Tensor& tensor);
TORCH_API bool isFunctionalTensor(const std::optional<Tensor>& t);
TORCH_API bool isFunctionalTensor(


@ -69,7 +69,7 @@ thread_local std::array<at::ScalarType, at::COMPILE_TIME_MAX_DEVICE_TYPES>
at::ScalarType::Undefined, // Vulkan
at::ScalarType::Undefined, // Metal
at::kHalf, // XPU
at::ScalarType::Undefined, // MPS
at::kHalf, // MPS
at::ScalarType::Undefined, // Meta (tensors with no data)
at::kBFloat16, // HPU / HABANA
at::ScalarType::Undefined, // SX-Aurora / NEC
@ -206,6 +206,118 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
TORCH_FN((&at::autocast::binary_cross_entropy_banned)));
}
TORCH_LIBRARY_IMPL(_, AutocastMPS, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) {
// lower_precision_fp
KERNEL_MPS2(_convolution, deprecated, lower_precision_fp)
KERNEL_MPS(_convolution, lower_precision_fp)
KERNEL_MPS(conv1d, lower_precision_fp)
KERNEL_MPS(conv2d, lower_precision_fp)
KERNEL_MPS(conv_tbc, lower_precision_fp)
KERNEL_MPS(conv_transpose1d, lower_precision_fp)
KERNEL_MPS2(conv_transpose2d, input, lower_precision_fp)
KERNEL_MPS(convolution, lower_precision_fp)
KERNEL_MPS(_mps_convolution, lower_precision_fp)
KERNEL_MPS(prelu, lower_precision_fp)
KERNEL_MPS(addmm, lower_precision_fp)
KERNEL_MPS(addmv, lower_precision_fp)
KERNEL_MPS(addr, lower_precision_fp)
KERNEL_MPS(matmul, lower_precision_fp)
KERNEL_MPS(einsum, lower_precision_fp)
KERNEL_MPS(mm, lower_precision_fp)
KERNEL_MPS(mv, lower_precision_fp)
KERNEL_MPS(linear, lower_precision_fp)
KERNEL_MPS(addbmm, lower_precision_fp)
KERNEL_MPS(baddbmm, lower_precision_fp)
KERNEL_MPS(bmm, lower_precision_fp)
KERNEL_MPS(chain_matmul, lower_precision_fp)
KERNEL_MPS(linalg_multi_dot, lower_precision_fp)
KERNEL_MPS(lstm_cell, lower_precision_fp)
// fp32
KERNEL_MPS(acos, fp32)
KERNEL_MPS(asin, fp32)
KERNEL_MPS(cosh, fp32)
KERNEL_MPS(erfinv, fp32)
KERNEL_MPS(exp, fp32)
KERNEL_MPS(expm1, fp32)
KERNEL_MPS(log, fp32)
KERNEL_MPS(log10, fp32)
KERNEL_MPS(log2, fp32)
KERNEL_MPS(log1p, fp32)
KERNEL_MPS(reciprocal, fp32)
KERNEL_MPS(rsqrt, fp32)
KERNEL_MPS(sinh, fp32)
KERNEL_MPS(tan, fp32)
KERNEL_MPS2(pow, Tensor_Scalar, fp32)
KERNEL_MPS2(pow, Tensor_Tensor, fp32)
KERNEL_MPS2(pow, Scalar, fp32)
KERNEL_MPS(softplus, fp32)
KERNEL_MPS(layer_norm, fp32)
KERNEL_MPS(native_layer_norm, fp32)
KERNEL_MPS(group_norm, fp32)
KERNEL_MPS2(frobenius_norm, dim, fp32)
KERNEL_MPS(nuclear_norm, fp32)
KERNEL_MPS2(nuclear_norm, dim, fp32)
KERNEL_MPS(batch_norm, fp32)
KERNEL_MPS(cosine_similarity, fp32)
KERNEL_MPS(poisson_nll_loss, fp32)
KERNEL_MPS(cosine_embedding_loss, fp32)
KERNEL_MPS(nll_loss, fp32)
KERNEL_MPS(nll_loss2d, fp32)
KERNEL_MPS(hinge_embedding_loss, fp32)
KERNEL_MPS(kl_div, fp32)
KERNEL_MPS(l1_loss, fp32)
KERNEL_MPS(smooth_l1_loss, fp32)
KERNEL_MPS(huber_loss, fp32)
KERNEL_MPS(mse_loss, fp32)
KERNEL_MPS(margin_ranking_loss, fp32)
KERNEL_MPS(multilabel_margin_loss, fp32)
KERNEL_MPS(soft_margin_loss, fp32)
KERNEL_MPS(triplet_margin_loss, fp32)
KERNEL_MPS(multi_margin_loss, fp32)
KERNEL_MPS(binary_cross_entropy_with_logits, fp32)
KERNEL_MPS(dist, fp32)
KERNEL_MPS(pdist, fp32)
KERNEL_MPS(cdist, fp32)
KERNEL_MPS(renorm, fp32)
KERNEL_MPS(logsumexp, fp32)
// fp32_set_opt_dtype
KERNEL_MPS(prod, fp32)
KERNEL_MPS2(prod, dim_int, fp32)
KERNEL_MPS2(prod, dim_Dimname, fp32)
KERNEL_MPS2(softmax, int, fp32)
KERNEL_MPS2(softmax, Dimname, fp32)
KERNEL_MPS2(log_softmax, int, fp32)
KERNEL_MPS2(log_softmax, Dimname, fp32)
KERNEL_MPS(cumprod, fp32)
KERNEL_MPS2(cumprod, dimname, fp32)
KERNEL_MPS(cumsum, fp32)
KERNEL_MPS2(cumsum, dimname, fp32)
KERNEL_MPS(linalg_vector_norm, fp32)
KERNEL_MPS(linalg_matrix_norm, fp32)
KERNEL_MPS2(linalg_matrix_norm, str_ord, fp32)
KERNEL_MPS(sum, fp32)
KERNEL_MPS2(sum, dim_IntList, fp32)
KERNEL_MPS2(sum, dim_DimnameList, fp32)
//
// promote
KERNEL_MPS(addcdiv, promote)
KERNEL_MPS(addcmul, promote)
KERNEL_MPS(atan2, promote)
KERNEL_MPS(bilinear, promote)
KERNEL_MPS(cross, promote)
KERNEL_MPS(dot, promote)
KERNEL_MPS(grid_sampler, promote)
KERNEL_MPS(index_put, promote)
KERNEL_MPS(tensordot, promote)
KERNEL_MPS(scatter_add, promote)
}
TORCH_LIBRARY_IMPL(_, AutocastCPU, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
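The registrations above make MPS behave like the other autocast backends: ops listed under lower_precision_fp are cast to the MPS autocast dtype (which the dispatch-table change above now defaults to at::kHalf), ops listed under fp32 are forced back to float32, and everything else hits the fallthrough fallback. A minimal C++ sketch of that behavior, assuming the device-generic at::autocast getters/setters (only is_autocast_available appears in this change; the setters and the exact header are assumptions):

#include <ATen/ATen.h>
#include <ATen/autocast_mode.h>

void autocast_mps_sketch() {
  // Assumed API: device-generic enable/dtype setters in at::autocast.
  if (!at::autocast::is_autocast_available(at::kMPS)) {
    return;  // autocast not available for this device type
  }
  at::autocast::set_autocast_dtype(at::kMPS, at::kHalf);  // matches the new default above
  at::autocast::set_autocast_enabled(at::kMPS, true);
  auto a = at::randn({8, 8}, at::device(at::kMPS).dtype(at::kFloat));
  auto b = at::randn({8, 8}, at::device(at::kMPS).dtype(at::kFloat));
  // mm is registered with CastPolicy::lower_precision_fp, so its fp32 inputs
  // are cast to half before redispatch and the result comes back as half.
  auto c = at::mm(a, b);
  at::autocast::set_autocast_enabled(at::kMPS, false);
}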


@ -145,6 +145,8 @@ inline bool is_autocast_eligible(
return tensor.is_xla() && tensor.is_floating_point();
case c10::DeviceType::PrivateUse1:
return tensor.is_privateuseone() && tensor.is_floating_point();
case c10::DeviceType::MPS:
return tensor.is_mps() && tensor.is_floating_point();
default:
return false;
}
@ -168,6 +170,8 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type(
return DispatchKey::AutocastXLA;
case c10::DeviceType::PrivateUse1:
return DispatchKey::AutocastPrivateUse1;
case c10::DeviceType::MPS:
return DispatchKey::AutocastMPS;
default:
throw std::runtime_error(
"unknown device type for autocast in get_autocast_dispatch_key_from_device_type");
@ -178,7 +182,7 @@ inline bool is_autocast_available(c10::DeviceType device_type) {
if (device_type == at::kCPU || device_type == at::kCUDA ||
device_type == at::kXPU || device_type == at::kIPU ||
device_type == at::kHPU || device_type == at::kXLA ||
device_type == at::kPrivateUse1) {
device_type == at::kPrivateUse1 || device_type == at::kMPS) {
return true;
} else {
return false;
@ -745,6 +749,27 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions.
REDISPATCH_SIGNATURE, \
POLICY)
// KERNEL_MPS registration for AutocastMPS
#define KERNEL_MPS(OP, POLICY) \
m.impl( \
TORCH_SELECTIVE_NAME("aten::" #OP), \
&WrapFunction< \
CastPolicy::POLICY, \
DeviceType::MPS, \
decltype(ATEN_FN(OP)), \
decltype(ATEN_FN(OP)), \
&ATEN_FN(OP)>::type::call);
#define KERNEL_MPS2(OP, OVERLOAD, POLICY) \
m.impl( \
TORCH_SELECTIVE_NAME("aten::" #OP "." #OVERLOAD), \
&WrapFunction< \
CastPolicy::POLICY, \
DeviceType::MPS, \
decltype(ATEN_FN2(OP, OVERLOAD)), \
decltype(ATEN_FN2(OP, OVERLOAD)), \
&ATEN_FN2(OP, OVERLOAD)>::type::call);
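For readers unfamiliar with these helpers, the first registration in the AutocastMPS block, KERNEL_MPS(mm, lower_precision_fp), expands to approximately the following (a sketch of the macro expansion inside TORCH_LIBRARY_IMPL(aten, AutocastMPS, m), not additional source in this change):

// KERNEL_MPS(mm, lower_precision_fp) expands roughly to:
m.impl(
    TORCH_SELECTIVE_NAME("aten::mm"),
    &WrapFunction<
        CastPolicy::lower_precision_fp,  // cast eligible fp32 args down first
        DeviceType::MPS,
        decltype(ATEN_FN(mm)),           // signature of at::mm
        decltype(ATEN_FN(mm)),
        &ATEN_FN(mm)>::type::call);      // then redispatch to the real mm kernel

KERNEL_MPS2 does the same for a named overload, e.g. KERNEL_MPS2(pow, Tensor_Scalar, fp32) registers "aten::pow.Tensor_Scalar" with CastPolicy::fp32.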
// Op lists for different policies.
// To make sure other backends can reuse the policy op list.
#define AT_FORALL_LOWER_PRECISION_FP(_) \


@ -228,6 +228,7 @@ namespace c10 {
_(aten, is_autocast_cpu_enabled) \
_(aten, is_autocast_xla_enabled) \
_(aten, get_autocast_dtype) \
_(aten, is_autocast_mps_enabled) \
FORALL_ATEN_BASE_SYMBOLS(_) \
_(onnx, Add) \
_(onnx, Concat) \


@ -9,7 +9,7 @@
#endif
namespace at::cpu {
bool is_cpu_support_avx2() {
bool is_avx2_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx2();
#else
@ -17,7 +17,7 @@ bool is_cpu_support_avx2() {
#endif
}
bool is_cpu_support_avx512() {
bool is_avx512_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512vl() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq();
#else
@ -25,7 +25,7 @@ bool is_cpu_support_avx512() {
#endif
}
bool is_cpu_support_avx512_vnni() {
bool is_avx512_vnni_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx512vnni();
#else
@ -33,7 +33,15 @@ bool is_cpu_support_avx512_vnni() {
#endif
}
bool is_cpu_support_amx_tile() {
bool is_avx512_bf16_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx512bf16();
#else
return false;
#endif
}
bool is_amx_tile_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_amx_tile();
#else
@ -42,7 +50,7 @@ bool is_cpu_support_amx_tile() {
}
bool init_amx() {
if (!is_cpu_support_amx_tile()) {
if (!is_amx_tile_supported()) {
return false;
}


@ -6,14 +6,17 @@
namespace at::cpu {
TORCH_API bool is_cpu_support_avx2();
TORCH_API bool is_cpu_support_avx512();
TORCH_API bool is_avx2_supported();
TORCH_API bool is_avx512_supported();
// Detect if CPU support Vector Neural Network Instruction.
TORCH_API bool is_cpu_support_avx512_vnni();
TORCH_API bool is_avx512_vnni_supported();
// Detect if CPU supports AVX512_BF16 ISA
TORCH_API bool is_avx512_bf16_supported();
// Detect if CPU support Advanced Matrix Extension.
TORCH_API bool is_cpu_support_amx_tile();
TORCH_API bool is_amx_tile_supported();
// Enable the system to use AMX instructions.
TORCH_API bool init_amx();
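A small usage sketch for the renamed predicates declared above (illustrative only; the include path ATen/cpu/Utils.h is an assumption based on the surrounding context):

#include <ATen/cpu/Utils.h>
#include <iostream>

int main() {
  // is_cpu_support_* has been renamed to is_*_supported; avx512_bf16 is new.
  std::cout << "avx2:        " << at::cpu::is_avx2_supported() << '\n';
  std::cout << "avx512:      " << at::cpu::is_avx512_supported() << '\n';
  std::cout << "avx512_vnni: " << at::cpu::is_avx512_vnni_supported() << '\n';
  std::cout << "avx512_bf16: " << at::cpu::is_avx512_bf16_supported() << '\n';
  std::cout << "amx_tile:    " << at::cpu::is_amx_tile_supported() << '\n';
  // init_amx() requests AMX tile permission for the calling thread.
  if (at::cpu::is_amx_tile_supported() && at::cpu::init_amx()) {
    std::cout << "AMX enabled\n";
  }
  return 0;
}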


@ -636,6 +636,21 @@ inline void transpose_mxn<float, 8, 8>(
_mm256_storeu_ps(&dst[7 * ld_dst], th);
}
template<>
inline void transpose_mxn<float, 16, 16>(
const float* src,
int64_t ld_src,
float* dst,
int64_t ld_dst) {
transpose_mxn<float, 8, 8>(
src , ld_src, dst, ld_dst);
transpose_mxn<float, 8, 8>(
src + 8, ld_src, dst + 8 * ld_dst, ld_dst);
transpose_mxn<float, 8, 8>(
src + 8 * ld_src, ld_src, dst + 8, ld_dst);
transpose_mxn<float, 8, 8>(
src + 8 * ld_src + 8, ld_src, dst + 8 * ld_dst + 8, ld_dst);
}
#endif
}} // namespace at::vec::CPU_CAPABILITY
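The new 16x16 specialization is the block-matrix transpose identity applied to 8x8 tiles, which is why the two off-diagonal sub-blocks swap their source/destination offsets (src + 8 lands at dst + 8 * ld_dst and src + 8 * ld_src lands at dst + 8):

\begin{pmatrix} A & B \\ C & D \end{pmatrix}^{\top}
= \begin{pmatrix} A^{\top} & C^{\top} \\ B^{\top} & D^{\top} \end{pmatrix},
\qquad A, B, C, D \in \mathbb{R}^{8 \times 8}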


@ -582,8 +582,7 @@ Vectorized<float> inline fmsub(const Vectorized<float>& a, const Vectorized<floa
// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L230-L304
// kernel for transposing mxn where m, n <= 16
// M + (M + 1) / 2 * 2 + (M + 3) / 4 * 4 + (M + 7) / 8 * 8 + 2 * N instructions
template <>
inline void transpose_mxn<float>(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) {
inline void transpose_mxn_16x16(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) {
TORCH_CHECK(M <= 16 && N <= 16, "transpose_mxn<float> expects M, N <= 16.");
// load from src to registers
__m512 input[16];
@ -667,8 +666,39 @@ inline void transpose_mxn<float>(const float* src, int64_t ld_src, float* dst, i
}
}
template<>
inline void transpose_mxn<float>(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) {
int64_t i = 0;
for (; i < M / 16 * 16; i += 16) {
int64_t j = 0;
for (; j < N / 16 * 16; j += 16) {
transpose_mxn_16x16(
src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, 16, 16);
}
// handle remainder j
int nrem = N - j;
if (nrem > 0) {
transpose_mxn_16x16(
src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, 16, nrem);
}
}
// handle remainder i
int mrem = M - i;
if (mrem > 0) {
int j = 0;
for (; j < N / 16 * 16; j += 16) {
transpose_mxn_16x16(
src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, mrem, 16);
}
// handle remainder j
int nrem = N - j;
transpose_mxn_16x16(
src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, mrem, nrem);
}
}
template <typename T, int M, int N,
typename std::enable_if_t<std::is_same<T, float>::value && M <= 16 && N <= 16, int> = 0>
typename std::enable_if_t<std::is_same<T, float>::value, int> = 0>
inline void transpose_mxn(const float* src, int64_t ld_src, float* dst, int64_t ld_dst) {
transpose_mxn<float>(src, ld_src, dst, ld_dst, M, N);
}
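As a worked example of the tiling above (the sizes are illustrative, not from this change): for M = 20, N = 35 the main loop runs i over {0} since 20 / 16 * 16 = 16, and j over {0, 16} since 35 / 16 * 16 = 32, leaving mrem = 20 - 16 = 4 and nrem = 35 - 32 = 3. The function therefore issues six transpose_mxn_16x16 calls: two full 16x16 tiles, one 16x3 tile for the column remainder, two 4x16 tiles for the row remainder, and one 4x3 corner tile.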


@ -23,6 +23,9 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) {
OP_DECOMPOSE(dropout_);
OP_DECOMPOSE(feature_alpha_dropout_);
OP_DECOMPOSE(feature_dropout_);
OP_DECOMPOSE(dropout);
OP_DECOMPOSE(_scaled_dot_product_attention_math);
OP_DECOMPOSE(scaled_dot_product_attention);
}
static void unsupportedData(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
@ -235,7 +238,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
OP_DECOMPOSE(relu6_);
OP_DECOMPOSE(prelu);
OP_DECOMPOSE2(softmax, int);
OP_DECOMPOSE(scaled_dot_product_attention);
OP_DECOMPOSE(special_gammainc);
OP_DECOMPOSE(special_gammaincc);
OP_DECOMPOSE(special_logit);
@ -261,7 +263,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
OP_DECOMPOSE(special_xlogy);
OP_DECOMPOSE2(special_xlogy, other_scalar);
OP_DECOMPOSE2(special_xlogy, self_scalar);
OP_DECOMPOSE(_scaled_dot_product_attention_math);
m.impl("split.sizes", native::split_symint);
@ -386,6 +387,11 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
OP_DECOMPOSE2(to, dtype);
OP_DECOMPOSE2(to, dtype_layout);
OP_DECOMPOSE2(to, other);
// Random ops that are also registered here
OP_DECOMPOSE(dropout);
OP_DECOMPOSE(_scaled_dot_product_attention_math);
OP_DECOMPOSE(scaled_dot_product_attention);
}
} // namespace at::functorch


@ -496,6 +496,11 @@ _scaled_dot_product_flash_attention_batch_rule(
bool return_debug_mask,
c10::optional<double> scale
) {
if (dropout_p > 0) {
auto maybe_layer = maybeCurrentDynamicLayer();
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
}
auto batch_size = get_bdim_size3(query, query_bdim, key, key_bdim, value, value_bdim);
auto query_ = moveBatchDimToFront(query, query_bdim);
auto key_ = moveBatchDimToFront(key, key_bdim);
@ -540,6 +545,11 @@ fourOutputs _scaled_dot_product_efficient_attention_batch_rule(
bool is_causal,
c10::optional<double> scale
) {
if (dropout_p > 0) {
auto maybe_layer = maybeCurrentDynamicLayer();
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
}
auto batch_size = get_bdim_size3(query, query_bdim, key, key_bdim, value, value_bdim);
auto query_ = moveBatchDimToFront(query, query_bdim);
auto key_ = moveBatchDimToFront(key, key_bdim);
@ -577,6 +587,11 @@ _scaled_dot_product_cudnn_attention_batch_rule(
bool return_debug_mask,
c10::optional<double> scale
) {
if (dropout_p > 0) {
auto maybe_layer = maybeCurrentDynamicLayer();
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
}
auto batch_size = get_bdim_size3(query, query_bdim, key, key_bdim, value, value_bdim);
auto query_ = moveBatchDimToFront(query, query_bdim);
auto key_ = moveBatchDimToFront(key, key_bdim);


@ -41,6 +41,17 @@ extern "C" void zaxpy_(int *n, void *a, const void *x, int *incx, void *y, int *
#include <fbgemm/FbgemmI64.h>
#endif // USE_FBGEMM
#if AT_MKLDNN_ENABLED()
#include <oneapi/dnnl/dnnl_version.h>
#endif // oneDNN
#define ONEDNN_UKERNEL_ENABLED (DNNL_VERSION_MAJOR >=3 && DNNL_VERSION_MINOR >=5)
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
#include <oneapi/dnnl/dnnl_ukernel.hpp>
#include <oneapi/dnnl/dnnl.hpp>
#endif // oneDNN BRGEMM
namespace at::native::cpublas {
namespace internal {
@ -822,4 +833,366 @@ void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<fl
n, x, incx, y, incy);
}
} // namespace at::native::cpublas
// oneDNN BRGEMM
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
struct BrgemmKey {
int64_t M;
int64_t N;
int64_t K;
int64_t batch_size;
int64_t lda;
int64_t ldb;
int64_t ldc;
ScalarType dt_a;
ScalarType dt_b;
ScalarType dt_c;
float alpha;
float beta;
BrgemmKey(
int64_t M,
int64_t N,
int64_t K,
int64_t batch_size,
int64_t lda,
int64_t ldb,
int64_t ldc,
ScalarType dt_a,
ScalarType dt_b,
ScalarType dt_c,
float alpha,
float beta)
: M(M),
N(N),
K(K),
batch_size(batch_size),
lda(lda),
ldb(ldb),
ldc(ldc),
dt_a(dt_a),
dt_b(dt_b),
dt_c(dt_c),
alpha(alpha),
beta(beta) {}
bool operator==(const BrgemmKey& other) const {
return M == other.M && N == other.N && K == other.K &&
batch_size == other.batch_size && lda == other.lda &&
ldb == other.ldb && ldc == other.ldc && dt_a == other.dt_a &&
dt_b == other.dt_b && dt_c == other.dt_c && alpha == other.alpha &&
beta == other.beta;
}
};
struct PackKey {
int64_t K;
int64_t N;
int64_t ld_in;
int64_t ld_out;
ScalarType dt_in;
ScalarType dt_out;
PackKey(
int64_t K,
int64_t N,
int64_t ld_in,
int64_t ld_out,
ScalarType dt_in,
ScalarType dt_out)
: K(K),
N(N),
ld_in(ld_in),
ld_out(ld_out),
dt_in(dt_in),
dt_out(dt_out) {}
bool operator==(const PackKey& other) const {
return N == other.N && K == other.K && ld_in == other.ld_in &&
ld_out == other.ld_out && dt_in == other.dt_in &&
dt_out == other.dt_out;
}
};
inline dnnl::memory::data_type get_dnnl_dtype(ScalarType dtype) {
if (dtype == ScalarType::Float) {
return dnnl::memory::data_type::f32;
} else if (dtype == ScalarType::BFloat16) {
return dnnl::memory::data_type::bf16;
} else if (dtype == ScalarType::Half) {
return dnnl::memory::data_type::f16;
} else if (dtype == ScalarType::Byte) {
return dnnl::memory::data_type::u8;
} else if (dtype == ScalarType::Char) {
return dnnl::memory::data_type::s8;
} else {
TORCH_CHECK(false, "get_dnnl_dtype expects float/bfloat16/half/int8 tensor input");
}
}
template<typename key_t>
struct UnsafeUkernelKeyHasher {
std::size_t operator()(const key_t& key) const;
};
template<>
std::size_t UnsafeUkernelKeyHasher<BrgemmKey>::operator()(const BrgemmKey& key) const {
// Use beta, M, N, and K to compute hash to reduce the overhead as
// batch size, alpha, and data types are unlikely to change within the same kernel and
// leading dimensions are likely to be related to M, K, N or use fixed values.
std::size_t h = std::hash<float>()(key.beta + 1);
h = std::hash<int64_t>()(key.M) ^ (h << 1);
h = std::hash<int64_t>()(key.N) ^ (h << 1);
h = std::hash<int64_t>()(key.K) ^ (h << 1);
h = std::hash<int64_t>()(key.ldc) ^ (h << 1);
return h;
}
template<>
std::size_t UnsafeUkernelKeyHasher<PackKey>::operator()(const PackKey& key) const {
// Use K and N to compute hash to reduce the overhead as
// data types are unlikely to change and
// ld_in/ld_out is likely to be related to K, N or use fixed values
std::size_t h = std::hash<int64_t>()(key.K);
h = std::hash<int64_t>()(key.N) ^ (h << 1);
return h;
}
template <typename key_t, typename value_t>
struct KernelCache {
using kstore_t = std::unordered_map<key_t, std::shared_ptr<value_t>, UnsafeUkernelKeyHasher<key_t>>;
static inline std::shared_ptr<value_t>&& fetch_or_create(
const key_t& key,
const std::function<std::shared_ptr<value_t>()>& callback) {
auto&& search = get_store().find(key);
if (search != get_store().end()) {
return std::move(search->second);
} else {
get_store().insert({key, callback()});
return std::move(get_store()[key]);
}
}
static inline kstore_t& get_store() {
static thread_local kstore_t cache_kernels;
return cache_kernels;
}
};
// Helper struct for convenient brgemm configuration
struct GemmHelper {
GemmHelper(
int64_t M,
int64_t N,
int64_t K,
int64_t bs,
int64_t ld_a,
int64_t ld_b,
int64_t ld_c,
ScalarType dt_a,
ScalarType dt_b,
ScalarType dt_c,
const float alpha,
const float beta) {
// Create brgemm
brg = dnnl::ukernel::brgemm(
M,
N,
K,
bs,
ld_a,
ld_b,
ld_c,
get_dnnl_dtype(dt_a),
get_dnnl_dtype(dt_b),
get_dnnl_dtype(dt_c),
alpha,
beta);
// Create a scratchpad buffer for the brgemm execution
scratchpad = std::vector<uint8_t>(brg.get_scratchpad_size());
// Prepare default vector of pairs of tensors A and B offsets for each batch.
A_B_offsets.resize(1);
A_B_offsets[0] = std::make_pair(0, 0);
}
dnnl::ukernel::brgemm brg;
std::vector<uint8_t> scratchpad;
std::vector<std::pair<int64_t, int64_t>> A_B_offsets;
};
struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
// Fetch/create GemmHelper object and execute brgemm with batch size = 1
template <typename scalar_t_a, typename scalar_t_b, typename scalar_t_c>
static inline void call(
int64_t M,
int64_t N,
int64_t K,
int64_t ld_a,
int64_t ld_b,
int64_t ld_c,
const float alpha,
const float beta,
const scalar_t_a* A,
const scalar_t_b* B,
scalar_t_c* C) {
auto&& key = BrgemmKey(
M,
N,
K,
int64_t(1),
ld_a,
ld_b,
ld_c,
c10::CppTypeToScalarType<scalar_t_a>::value,
c10::CppTypeToScalarType<scalar_t_b>::value,
c10::CppTypeToScalarType<scalar_t_c>::value,
alpha,
beta);
// Fetch/create GemmHelper object
auto&& value = fetch_or_create(key, [&]() {
auto&& v = std::make_shared<GemmHelper>(
M,
N,
K,
1,
ld_a,
ld_b,
ld_c,
c10::CppTypeToScalarType<scalar_t_a>::value,
c10::CppTypeToScalarType<scalar_t_b>::value,
c10::CppTypeToScalarType<scalar_t_c>::value,
alpha,
beta);
(*v).brg.generate();
return std::move(v);
});
if (get_current() != value) {
dnnl::ukernel::brgemm::release_hw_context();
((*value).brg).set_hw_context();
get_current() = value;
}
((*value).brg)
.execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data());
}
static inline std::shared_ptr<GemmHelper>& get_current() {
static thread_local std::shared_ptr<GemmHelper> current;
return current;
}
static inline bool device_check(ScalarType dtype) {
if (!at::globalContext().userEnabledMkldnn()) {
return false;
}
if (dtype == ScalarType::Half) {
static bool fp16_support = dnnl::get_effective_cpu_isa() >= dnnl::cpu_isa::avx512_core_fp16;
return fp16_support;
}
return false;
}
};
using pack_t = dnnl::ukernel::brgemm_pack_B;
struct Pack : public KernelCache <PackKey, pack_t> {
static inline void call(
int64_t K,
int64_t N,
int64_t ld_in,
int64_t ld_out,
ScalarType dt_in,
ScalarType dt_out,
const void* in,
void* out) {
auto&& key = PackKey(K, N, ld_in, ld_out, dt_in, dt_out);
auto&& pack = fetch_or_create(key, [&]() {
auto&& p = std::make_shared<pack_t>(
K, N, ld_in, ld_out, get_dnnl_dtype(dt_in), get_dnnl_dtype(dt_out));
if (need_pack(dt_in)) {
(*p).generate();
}
return std::move(p);
});
if (need_pack(dt_in)) {
(*pack).execute(in, out);
} else {
TORCH_CHECK(false, "No need to pack");
}
}
static inline bool need_pack(ScalarType dtype) {
if (!at::globalContext().userEnabledMkldnn()) {
return false;
}
if (dtype == ScalarType::Half) {
static bool fp16_pack = dnnl::get_effective_cpu_isa() >= dnnl::cpu_isa::avx512_core_amx_fp16;
return fp16_pack;
}
return false;
}
};
#endif
void brgemm(
int64_t M,
int64_t N,
int64_t K,
int64_t ld_a,
int64_t ld_b,
int64_t ld_c,
const float alpha,
const float beta,
const at::Half* A,
const at::Half* B,
float* C) {
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
if (Brgemm::device_check(ScalarType::Half)) {
Brgemm::call<at::Half, at::Half, float>(
M, N, K, ld_a, ld_b, ld_c, alpha, beta, A, B, C);
return;
}
#endif
TORCH_CHECK(false,
"Half Brgemm is only supported on X64 when oneDNN ukernel is enabled and avx512_fp16 is supported");
}
void brgemm(
int64_t M,
int64_t N,
int64_t K,
int64_t ld_a,
int64_t ld_b,
int64_t ld_c,
const float alpha,
const float beta,
const at::BFloat16* A,
const at::BFloat16* B,
float* C) {
TORCH_CHECK(false,
"BFloat16 Brgemm is currently not supported");
}
void brgemm_release() {
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
dnnl::ukernel::brgemm::release_hw_context();
#endif
}
void pack(
int64_t K,
int64_t N,
int64_t ld_in,
int64_t ld_out,
ScalarType dt_in,
ScalarType dt_out,
const void* in,
void* out) {
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
Pack::call(K, N, ld_in, ld_out, dt_in, dt_out, in, out);
#else
TORCH_CHECK(false, "pack is only supported on X64 with oneDNN ukernel enabled");
#endif
}
bool need_pack(ScalarType dt_in) {
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
return Pack::need_pack(dt_in);
#else
return false;
#endif
}
} // namespace at::native::cpublas


@ -7,6 +7,7 @@
#include <c10/core/ScalarType.h>
#include <c10/core/Scalar.h>
namespace at::native::cpublas {
namespace internal {
@ -186,4 +187,58 @@ void copy(int64_t n, const float *x, int64_t incx, float *y, int64_t incy);
void copy(int64_t n, const c10::complex<double> *x, int64_t incx, c10::complex<double> *y, int64_t incy);
void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<float> *y, int64_t incy);
} // namespace at::native::cpublas
// Batch-reduce GEMM
// Operates by the following formula:
// C = alpha * SUM(A[i] x B[i]) + beta * C, i = 0 to batch size
// A Base pointer to a tensor A.
// B Base pointer to a tensor B.
// Byte offsets vector of pairs of tensors A and B offsets for
// each batch. The number of batches must coincide with the
// `batch_size` value passed at object construction stage.
// C Pointer to a tensor C (accumulation buffer).
// scratchpad Pointer to a scratchpad buffer.
// Currently, only brgemm with batch size = 1 will be used
TORCH_API void brgemm(
int64_t M,
int64_t N,
int64_t K,
int64_t ld_a,
int64_t ld_b,
int64_t ld_c,
const float alpha,
const float beta,
const at::Half* A,
const at::Half* B,
float* C);
TORCH_API void brgemm(
int64_t M,
int64_t N,
int64_t K,
int64_t ld_a,
int64_t ld_b,
int64_t ld_c,
const float alpha,
const float beta,
const at::BFloat16* A,
const at::BFloat16* B,
float* C);
// Release brgemm hardware context
void brgemm_release();
// Pack B matrix to get better performance if needed
void pack(
int64_t K,
int64_t N,
int64_t ld_in,
int64_t ld_out,
ScalarType dt_in,
ScalarType dt_out,
const void* in,
void* out);
// Whether pack is needed in the platform.
bool need_pack(ScalarType dt_in);
} // namespace at::native::cpublas
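A minimal usage sketch of this interface (assumptions: the header path ATen/native/CPUBlas.h, and a machine where the Half brgemm path is actually available; on other platforms the call raises the TORCH_CHECK shown in the .cpp diff):

#include <ATen/native/CPUBlas.h>
#include <vector>

void brgemm_sketch() {
  constexpr int64_t M = 32, N = 32, K = 64;
  std::vector<at::Half> A(M * K);        // row-major, ld_a = K
  std::vector<at::Half> B(K * N);        // row-major, ld_b = N
  std::vector<float> C(M * N, 0.f);      // accumulator, ld_c = N

  // Repack B first when the ISA prefers a blocked layout.
  std::vector<at::Half> B_packed(K * N);
  const at::Half* b_ptr = B.data();
  if (at::native::cpublas::need_pack(at::ScalarType::Half)) {
    at::native::cpublas::pack(K, N, /*ld_in=*/N, /*ld_out=*/N,
                              at::ScalarType::Half, at::ScalarType::Half,
                              B.data(), B_packed.data());
    b_ptr = B_packed.data();
  }

  // C = 1.0 * (A x B) + 0.0 * C with batch size 1.
  at::native::cpublas::brgemm(M, N, K, /*ld_a=*/K, /*ld_b=*/N, /*ld_c=*/N,
                              /*alpha=*/1.f, /*beta=*/0.f,
                              A.data(), b_ptr, C.data());

  // Release the ukernel hardware context once the brgemm sequence is done.
  at::native::cpublas::brgemm_release();
}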


@ -144,7 +144,7 @@ static void col2im_out_cpu_template(
output.resize_({batch_size, n_output_plane, output_height, output_width});
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kBFloat16, kHalf, kBool,
input.scalar_type(), "col2im_out_cpu", [&] {
Tensor input_n = Tensor();
Tensor output_n = Tensor();


@ -421,12 +421,18 @@ struct ConvParams {
// cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
// that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
#if !defined(C10_MOBILE)
if (needs_64bit_indexing_no_split(input, weight)) {
return false;
}
if (!detail::getCUDAHooks().compiledWithCuDNN()) {
return false;
}
if (needs_64bit_indexing_no_split(input, weight)) {
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
" if the V8 API is not enabled or before cuDNN version 9.3+."
" Consider upgrading cuDNN and/or enabling the V8 API for better efficiency.");
return false;
}
}
if (!input.is_cuda() || !cudnn_enabled) {
return false;
}


@ -94,7 +94,7 @@ static void im2col_out_cpu_template(
output.resize_({batch_size, n_output_plane, output_length});
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kBFloat16, kHalf, kBool,
input.scalar_type(), "im2col_out_cpu", [&] {
Tensor input_n;
Tensor output_n;


@ -19,6 +19,7 @@
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/mkldnn/Matmul.h>
#include <ATen/native/mkldnn/Utils.h>
#include <c10/core/GradMode.h>
#include <c10/util/accumulate.h>
#include <c10/util/irange.h>
@ -1358,13 +1359,8 @@ static inline int64_t get_mkldnn_matmul_min_dim() {
static auto value = [&] {
const int64_t default_min_dim = [&] {
// Minimum dimension requirement for MKLDNN; derived based on experiments.
// By default, it's only enabled on Neoverse V1.
#if !defined(__s390x__) && !defined(__powerpc__)
if (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 && cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1) {
return 8;
}
#endif
return 0;
// It's enabled on all Neoverse CPUs.
return is_arm_neoverse() ? 8 : 0;
}();
const char* ptr = std::getenv("TORCH_MKLDNN_MATMUL_MIN_DIM");
return ptr != nullptr ? std::atoi(ptr) : default_min_dim;
@ -1377,13 +1373,8 @@ static inline int64_t get_mkldnn_matmul_min_size() {
static auto value = [&] {
const int64_t default_min_size = [&] {
// Minimum size requirement for MKLDNN; derived based on experiments.
// By default, it's only enabled on Neoverse V1.
#if !defined(__s390x__) && !defined(__powerpc__)
if (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 && cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1) {
return 8 * 1024;
}
#endif
return 0;
// It's enabled on all Neoverse CPUs.
return is_arm_neoverse() ? 8 * 1024 : 0;
}();
const char* ptr = std::getenv("TORCH_MKLDNN_MATMUL_MIN_SIZE");
return ptr != nullptr ? std::atoi(ptr) : default_min_size;


@ -284,7 +284,7 @@ void resize_bytes_nocuda(const Storage& storage, const c10::SymInt& newsize) {
} else if (device_type == at::kPrivateUse1) {
at::detail::getPrivateUse1Hooks().resizePrivateUse1Bytes(
storage, newsize.expect_int());
} else if (device_type == at::kXPU || device_type == at::kHPU) {
} else if (device_type == at::kXPU || device_type == at::kHPU || device_type == at::kMTIA) {
ptrdiff_t size_bytes_i = newsize.expect_int();
TORCH_CHECK(
!c10::overflows<int64_t>(size_bytes_i),


@ -102,7 +102,7 @@ void col2im_out_cuda_template(
output.resize_({batch_size, n_output_plane, output_height, output_width});
int64_t output_batch_stride = output.stride(0);
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16,
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kBool,
input.scalar_type(), "col2im_out_cuda", [&] {
int64_t height_col = (output_height + 2 * pad_height -
(dilation_height * (kernel_height - 1) + 1)) /


@ -103,7 +103,7 @@ static void im2col_out_cuda_template(
output.resize_({batch_size, n_output_plane, output_length});
// Launch kernel
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16,
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kBool,
input.scalar_type(), "im2col_out_cuda", [&] {
Tensor input_n;
Tensor output_n;


@ -89,10 +89,22 @@ const std::map<c10::string_view, ideep::algorithm>& fusion_binary_alg_map();
inline bool mkldnn_bf16_device_check_arm() {
return cpuinfo_initialize() && cpuinfo_has_arm_bf16();
}
inline bool is_arm_neoverse() {
return (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 &&
(cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1 ||
cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v2 ||
cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_n1 ||
cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_n2));
}
#else
constexpr bool mkldnn_bf16_device_check_arm() {
return false;
}
constexpr bool is_arm_neoverse() {
return false;
}
#endif
#if AT_MKLDNN_ENABLED()


@ -3400,9 +3400,9 @@
- func: fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor
- func: wrapped_linear_prepack(Tensor weight, Tensor weight_scale, Tensor weight_zero_point, Tensor bias) -> Tensor
- func: _wrapped_linear_prepack(Tensor weight, Tensor weight_scale, Tensor weight_zero_point, Tensor bias) -> Tensor
- func: wrapped_quantized_linear_prepacked(Tensor input, Tensor input_scale, Tensor input_zero_point, Tensor packed_weight, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor
- func: _wrapped_quantized_linear_prepacked(Tensor input, Tensor input_scale, Tensor input_zero_point, Tensor packed_weight, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor
- func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor


@ -436,12 +436,12 @@ at::Tensor wrapped_quantized_linear_meta(
#endif // USE_FBGEMM
}
at::Tensor wrapped_linear_prepack(const at::Tensor& weight,
at::Tensor _wrapped_linear_prepack(const at::Tensor& weight,
const at::Tensor& weight_scale,
const at::Tensor& weight_zero_point,
const at::Tensor& bias);
at::Tensor wrapped_linear_prepack(const at::Tensor& weight,
at::Tensor _wrapped_linear_prepack(const at::Tensor& weight,
const at::Tensor& weight_scale,
const at::Tensor& weight_zero_point,
const at::Tensor& bias) {
@ -474,14 +474,14 @@ at::Tensor wrapped_linear_prepack(const at::Tensor& weight,
#endif // USE_FBGEMM
}
at::Tensor wrapped_quantized_linear_prepacked(const at::Tensor& input, const at::Tensor& input_scale,
at::Tensor _wrapped_quantized_linear_prepacked(const at::Tensor& input, const at::Tensor& input_scale,
const at::Tensor& input_zero_point,
const at::Tensor& packed_weight,
const at::Tensor& output_scale,
const at::Tensor& output_zero_point,
[[maybe_unused]] const int64_t out_channel);
at::Tensor wrapped_quantized_linear_prepacked(const at::Tensor& input, const at::Tensor& input_scale,
at::Tensor _wrapped_quantized_linear_prepacked(const at::Tensor& input, const at::Tensor& input_scale,
const at::Tensor& input_zero_point,
const at::Tensor& packed_weight,
const at::Tensor& output_scale,
@ -507,12 +507,12 @@ at::Tensor wrapped_quantized_linear_prepacked(const at::Tensor& input, const at:
#endif // USE_FBGEMM
}
at::Tensor wrapped_linear_prepack_meta(const at::Tensor& weight,
at::Tensor _wrapped_linear_prepack_meta(const at::Tensor& weight,
[[maybe_unused]] const at::Tensor& weight_scale,
[[maybe_unused]] const at::Tensor& weight_zero_point,
[[maybe_unused]] const at::Tensor& bias);
at::Tensor wrapped_linear_prepack_meta(const at::Tensor& weight,
at::Tensor _wrapped_linear_prepack_meta(const at::Tensor& weight,
[[maybe_unused]] const at::Tensor& weight_scale,
[[maybe_unused]] const at::Tensor& weight_zero_point,
[[maybe_unused]] const at::Tensor& bias) {
@ -530,7 +530,7 @@ at::Tensor wrapped_linear_prepack_meta(const at::Tensor& weight,
#endif // USE_FBGEMM
}
at::Tensor wrapped_quantized_linear_prepacked_meta(const at::Tensor& input,
at::Tensor _wrapped_quantized_linear_prepacked_meta(const at::Tensor& input,
[[maybe_unused]] const at::Tensor& input_scale,
[[maybe_unused]] const at::Tensor& input_zero_point,
[[maybe_unused]] const at::Tensor& packed_weight,
@ -538,7 +538,7 @@ at::Tensor wrapped_quantized_linear_prepacked_meta(const at::Tensor& input,
[[maybe_unused]] const at::Tensor& output_zero_point,
const int64_t out_channel);
at::Tensor wrapped_quantized_linear_prepacked_meta(const at::Tensor& input,
at::Tensor _wrapped_quantized_linear_prepacked_meta(const at::Tensor& input,
[[maybe_unused]] const at::Tensor& input_scale,
[[maybe_unused]] const at::Tensor& input_zero_point,
[[maybe_unused]] const at::Tensor& packed_weight,
@ -695,21 +695,21 @@ TORCH_LIBRARY_IMPL(_quantized, CPU, m) {
m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_prepack_fp16_legacy"), TORCH_FN(QLinearPackWeightFp16Legacy::run));
m.impl(TORCH_SELECTIVE_NAME("_quantized::wrapped_quantized_linear"), TORCH_FN(wrapped_quantized_linear));
m.impl(
TORCH_SELECTIVE_NAME("_quantized::wrapped_linear_prepack"),
wrapped_linear_prepack);
TORCH_SELECTIVE_NAME("_quantized::_wrapped_linear_prepack"),
_wrapped_linear_prepack);
m.impl(
TORCH_SELECTIVE_NAME("_quantized::wrapped_quantized_linear_prepacked"),
wrapped_quantized_linear_prepacked);
TORCH_SELECTIVE_NAME("_quantized::_wrapped_quantized_linear_prepacked"),
_wrapped_quantized_linear_prepacked);
}
TORCH_LIBRARY_IMPL(_quantized, Meta, m) {
m.impl(TORCH_SELECTIVE_NAME("_quantized::wrapped_quantized_linear"), TORCH_FN(wrapped_quantized_linear_meta));
m.impl(
TORCH_SELECTIVE_NAME("_quantized::wrapped_linear_prepack"),
wrapped_linear_prepack_meta);
TORCH_SELECTIVE_NAME("_quantized::_wrapped_linear_prepack"),
_wrapped_linear_prepack_meta);
m.impl(
TORCH_SELECTIVE_NAME("_quantized::wrapped_quantized_linear_prepacked"),
wrapped_quantized_linear_prepacked_meta);
TORCH_SELECTIVE_NAME("_quantized::_wrapped_quantized_linear_prepacked"),
_wrapped_quantized_linear_prepacked_meta);
}
TORCH_LIBRARY_IMPL(onednn, CPU, m) {


@ -251,8 +251,8 @@ TORCH_LIBRARY(_quantized, m) {
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_fbgemm_pack_gemm_matrix_fp16(Tensor W) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_fbgemm_linear_fp16_weight(Tensor X, Tensor W, Tensor B, int out_channel) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_quantized_linear(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor W, Tensor W_scale, Tensor W_zero_point, Tensor B, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor Y"));
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_linear_prepack(Tensor W, Tensor W_scale, Tensor W_zero_point, Tensor B) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_quantized_linear_prepacked(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor W_prepack, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor Y"));
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::_wrapped_linear_prepack(Tensor W, Tensor W_scale, Tensor W_zero_point, Tensor B) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::_wrapped_quantized_linear_prepacked(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor W_prepack, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor Y"));
}
TORCH_LIBRARY(onednn, m) {


@ -46,6 +46,15 @@
desc: |
This tag indicates that the operator should be passed Tensors following
the same stride permutation as observed in eager when compiled in inductor.
Only one of {needs_fixed_stride_order, flexible_layout} can apply; if
multiple are assigned then we assume the most restrictive one.
- tag: flexible_layout
desc: |
This tag indicates that the custom operator can accept inputs with varying
strides/storage_offset and that when compiled, Inductor is allowed to change
the strides/storage_offset of inputs to the custom operator.
Only one of {needs_fixed_stride_order, flexible_layout} can apply; if
multiple are assigned then we assume the most restrictive one.
# NOTE [Core ATen Ops]
- tag: core
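
A minimal sketch of how a custom-op author might opt into one of these two layout tags from Python (the operator name and schema below are hypothetical, and passing `tags=` to `torch.library.define` is assumed to be available as in recent PyTorch releases):

```python
import torch

# Hypothetical op "mylib::my_filter"; pick exactly one of the two layout tags.
torch.library.define(
    "mylib::my_filter",
    "(Tensor x) -> Tensor",
    tags=(torch.Tag.needs_fixed_stride_order,),  # or torch.Tag.flexible_layout
)

@torch.library.impl("mylib::my_filter", "cpu")
def my_filter_cpu(x):
    # With needs_fixed_stride_order, inductor passes x with the same stride
    # permutation eager mode would use; with flexible_layout it may change it.
    return x.clone()
```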


@ -90,7 +90,7 @@ detectron2_maskrcnn_r_50_fpn,fail_to_run,0
dlrm,fail_to_run,0
dlrm,pass,0



@ -90,7 +90,7 @@ detectron2_maskrcnn_r_50_fpn,fail_to_run,0
dlrm,fail_to_run,0
dlrm,pass,0



@ -74,7 +74,7 @@ detectron2_fasterrcnn_r_50_fpn,fail_to_run,0
dlrm,fail_to_run,0
dlrm,pass,0



@ -74,7 +74,7 @@ detectron2_fasterrcnn_r_50_fpn,fail_to_run,0
dlrm,fail_to_run,0
dlrm,pass,0



@ -3,7 +3,12 @@ import csv
import dataclasses
import os
from generate import run_llama2_7b_bf16, run_llama2_7b_int8, run_mixtral_8x7b_int8
from generate import (
get_arch_name,
run_llama2_7b_bf16,
run_llama2_7b_int8,
run_mixtral_8x7b_int8,
)
import torch
import torch.nn as nn
@ -24,6 +29,7 @@ class Experiment:
actual: float
dtype: str
device: str
arch: str # GPU name for CUDA or CPU arch for CPU
is_model: bool = False
@ -71,7 +77,12 @@ def run_mlp_layer_norm_gelu(device: str = "cuda"):
for _ in range(WARMUP_ITER):
compiled_mod(x)
us_per_iter = benchmarker.benchmark_gpu(lambda: compiled_mod(x)) * 1000
benchmark_fn = (
benchmarker.benchmark_gpu
if device == "cuda"
else benchmarker.benchmark_cpu
)
us_per_iter = benchmark_fn(lambda: compiled_mod(x)) * 1000
flops_utilization += us_per_iter * flops / 1e9 / A100_40G_BF16_TFLOPS
flops_utilization = flops_utilization / len(input_shapes)
@ -84,6 +95,7 @@ def run_mlp_layer_norm_gelu(device: str = "cuda"):
f"{flops_utilization:.02f}",
dtype_str,
device,
get_arch_name(),
)
)
return results
@ -108,7 +120,12 @@ def run_layer_norm(device: str = "cuda"):
for _ in range(WARMUP_ITER):
compiled_mod(x)
us_per_iter = benchmarker.benchmark_gpu(lambda: compiled_mod(x)) * 1000
benchmark_fn = (
benchmarker.benchmark_gpu
if device == "cuda"
else benchmarker.benchmark_cpu
)
us_per_iter = benchmark_fn(lambda: compiled_mod(x)) * 1000
memory_bandwidth += (1e6 / us_per_iter) * 2 * BS * D * dtype.itemsize / 1e9
memory_bandwidth = memory_bandwidth / len(input_shapes)
@ -121,6 +138,7 @@ def run_layer_norm(device: str = "cuda"):
f"{memory_bandwidth:.02f}",
dtype_str,
device,
get_arch_name(),
)
)
return results
@ -151,9 +169,12 @@ def run_gather_gemv(device: str = "cuda"):
for _ in range(WARMUP_ITER):
compiled_fn(W, score_idxs, x)
us_per_iter = (
benchmarker.benchmark_gpu(lambda: compiled_fn(W, score_idxs, x)) * 1000
benchmark_fn = (
benchmarker.benchmark_gpu
if device == "cuda"
else benchmarker.benchmark_cpu
)
us_per_iter = benchmark_fn(lambda: compiled_fn(W, score_idxs, x)) * 1000
memory_bandwidth += (1e6 / us_per_iter) * 2 * D * D * dtype.itemsize / 1e9
memory_bandwidth = memory_bandwidth / len(input_shapes)
@ -166,6 +187,7 @@ def run_gather_gemv(device: str = "cuda"):
f"{memory_bandwidth:.02f}",
dtype_str,
device,
get_arch_name(),
)
)
return results
@ -186,15 +208,20 @@ def run_gemv(device: str = "cuda"):
def gemv(W, x):
return W.to(x.dtype) @ x
W = torch.randn(D, D, device="cuda").to(dtype=dtype)
x = torch.randn(D, device="cuda", dtype=torch.bfloat16)
W = torch.randn(D, D, device=device).to(dtype=dtype)
x = torch.randn(D, device=device, dtype=torch.bfloat16)
compiled_fn = torch.compile(gemv, dynamic=False)
for _ in range(WARMUP_ITER):
compiled_fn(W, x)
us_per_iter = benchmarker.benchmark_gpu(lambda: compiled_fn(W, x)) * 1000
benchmark_fn = (
benchmarker.benchmark_gpu
if device == "cuda"
else benchmarker.benchmark_cpu
)
us_per_iter = benchmark_fn(lambda: compiled_fn(W, x)) * 1000
memory_bandwidth += (1e6 / us_per_iter) * D * D * dtype.itemsize / 1e9
memory_bandwidth = memory_bandwidth / len(input_shapes)
@ -207,6 +234,7 @@ def run_gemv(device: str = "cuda"):
f"{memory_bandwidth:.02f}",
dtype_str,
device,
get_arch_name(),
)
)
return results
@ -252,7 +280,13 @@ def main(output_file=DEFAULT_OUTPUT_FILE):
results = []
for func in all_experiments:
lst = func()
try:
device = "cuda" if torch.cuda.is_available() else "cpu"
except AssertionError:
# This happens when torch is compiled with CUDA turned off completely
device = "cpu"
lst = func(device)
for x in lst:
results.append(dataclasses.astuple(x))


@ -1,5 +1,6 @@
import dataclasses
import itertools
import platform
import time
from typing import Optional, Tuple
@ -41,6 +42,14 @@ def device_sync(device):
print(f"device={device} is not yet suppported")
def get_arch_name() -> str:
if torch.cuda.is_available():
return torch.cuda.get_device_name()
else:
# This returns x86_64 or arm64 (for aarch64)
return platform.machine()
def multinomial_sample_one_no_sync(
probs_sort,
): # Does multinomial sampling without a cuda synchronization
@ -198,7 +207,7 @@ def run_experiment(
) -> None:
print(f"Loading model {x.name}")
t0 = time.time()
model = _load_model(x)
model = _load_model(x, device=device)
device_sync(device=device) # MKG
print(f"Time to load model: {time.time() - t0:.02f} seconds")
@ -257,7 +266,9 @@ def run_llama2_7b_bf16(device: str = "cuda"):
1253,
162,
)
token_per_sec, memory_bandwidth, compilation_time = run_experiment(model)
token_per_sec, memory_bandwidth, compilation_time = run_experiment(
model, device=device
)
return [
Experiment(
model.name,
@ -266,6 +277,7 @@ def run_llama2_7b_bf16(device: str = "cuda"):
f"{token_per_sec:.02f}",
model.mode,
device,
get_arch_name(),
True,
),
Experiment(
@ -275,6 +287,7 @@ def run_llama2_7b_bf16(device: str = "cuda"):
f"{memory_bandwidth:.02f}",
model.mode,
device,
get_arch_name(),
True,
),
Experiment(
@ -284,6 +297,7 @@ def run_llama2_7b_bf16(device: str = "cuda"):
f"{compilation_time:.02f}",
model.mode,
device,
get_arch_name(),
True,
),
]
@ -302,7 +316,9 @@ def run_llama2_7b_int8(device: str = "cuda"):
957,
172,
)
token_per_sec, memory_bandwidth, compilation_time = run_experiment(model)
token_per_sec, memory_bandwidth, compilation_time = run_experiment(
model, device=device
)
return [
Experiment(
model.name,
@ -311,6 +327,7 @@ def run_llama2_7b_int8(device: str = "cuda"):
f"{token_per_sec:.02f}",
model.mode,
device,
get_arch_name(),
True,
),
Experiment(
@ -320,6 +337,7 @@ def run_llama2_7b_int8(device: str = "cuda"):
f"{memory_bandwidth:.02f}",
model.mode,
device,
get_arch_name(),
True,
),
Experiment(
@ -329,6 +347,7 @@ def run_llama2_7b_int8(device: str = "cuda"):
f"{compilation_time:.02f}",
model.mode,
device,
get_arch_name(),
True,
),
]
@ -348,7 +367,9 @@ def run_mixtral_8x7b_int8(device: str = "cuda"):
1130,
162,
)
token_per_sec, memory_bandwidth, compilation_time = run_experiment(model)
token_per_sec, memory_bandwidth, compilation_time = run_experiment(
model, device=device
)
return [
Experiment(
model.name,
@ -357,6 +378,7 @@ def run_mixtral_8x7b_int8(device: str = "cuda"):
f"{token_per_sec:.02f}",
model.mode,
device,
get_arch_name(),
True,
),
Experiment(
@ -366,6 +388,7 @@ def run_mixtral_8x7b_int8(device: str = "cuda"):
f"{memory_bandwidth:.02f}",
model.mode,
device,
get_arch_name(),
True,
),
Experiment(
@ -375,6 +398,7 @@ def run_mixtral_8x7b_int8(device: str = "cuda"):
f"{compilation_time:.02f}",
model.mode,
device,
get_arch_name(),
True,
),
]


@ -149,6 +149,8 @@ const char* toString(DispatchKey t) {
return "AutocastXLA";
case DispatchKey::AutocastPrivateUse1:
return "AutocastPrivateUse1";
case DispatchKey::AutocastMPS:
return "AutocastMPS";
case DispatchKey::FuncTorchBatched:
return "FuncTorchBatched";
@ -297,6 +299,7 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
{"AutocastCUDA", c10::DispatchKey::AutocastCUDA},
{"AutocastXLA", c10::DispatchKey::AutocastXLA},
{"AutocastPrivateUse1", c10::DispatchKey::AutocastPrivateUse1},
{"AutocastMPS", c10::DispatchKey::AutocastMPS},
{"FuncTorchBatched", c10::DispatchKey::FuncTorchBatched},
{"BatchedNestedTensor", c10::DispatchKey::BatchedNestedTensor},
{"FuncTorchVmapMode", c10::DispatchKey::FuncTorchVmapMode},


@ -359,6 +359,7 @@ enum class DispatchKey : uint16_t {
AutocastXLA,
// AutocastXLA is only being used for TPUs. XLA GPUs continue to use
// AutocastCUDA.
AutocastMPS,
AutocastCUDA,
AutocastPrivateUse1,


@ -655,6 +655,7 @@ constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({
constexpr DispatchKeySet autocast_dispatch_keyset = DispatchKeySet({
DispatchKey::AutocastCPU,
DispatchKey::AutocastMPS,
DispatchKey::AutocastCUDA,
DispatchKey::AutocastXPU,
DispatchKey::AutocastIPU,
@ -671,6 +672,7 @@ constexpr DispatchKeySet default_included_set = DispatchKeySet({
constexpr DispatchKeySet default_excluded_set = DispatchKeySet({
DispatchKey::AutocastCPU,
DispatchKey::AutocastMPS,
DispatchKey::AutocastCUDA,
DispatchKey::AutocastXPU,
DispatchKey::AutocastIPU,
@ -863,6 +865,7 @@ inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) {
constexpr auto autocast_xla_ks = DispatchKeySet(DispatchKey::AutocastXLA);
constexpr auto autocast_privateuse1_ks =
DispatchKeySet(DispatchKey::AutocastPrivateUse1);
constexpr auto autocast_mps_ks = DispatchKeySet(DispatchKey::AutocastMPS);
switch (t) {
case BackendComponent::CPUBit:
return autocast_cpu_ks;
@ -878,6 +881,8 @@ inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) {
return autocast_xla_ks;
case BackendComponent::PrivateUse1Bit:
return autocast_privateuse1_ks;
case BackendComponent::MPSBit:
return autocast_mps_ks;
default:
return DispatchKeySet();
}
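
Taken together, these changes register an AutocastMPS dispatch key and fold it into the autocast-related key sets. At the Python level the intended effect is presumably that `torch.autocast` can target MPS like the other backends; a sketch (assumes a build with MPS available, e.g. Apple Silicon, and that the Python API accepts `device_type="mps"` once this key is wired up):

```python
import torch

x = torch.randn(8, 8, device="mps")
w = torch.randn(8, 8, device="mps")

with torch.autocast(device_type="mps", dtype=torch.float16):
    y = x @ w  # matmul runs in float16 under the MPS autocast key

print(y.dtype)  # expected: torch.float16
```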


@ -6,6 +6,7 @@
#include <c10/cuda/CUDAFunctions.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/CallOnce.h>
#include <c10/util/Gauge.h>
#include <c10/util/ScopeExit.h>
#include <c10/util/UniqueVoidPtr.h>
#include <c10/util/flat_hash_map.h>
@ -1429,6 +1430,12 @@ class DeviceCachingAllocator {
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_allocations.increase(1);
auto allocated_bytes_gauge =
STATIC_GAUGE(pytorch.CUDACachingAllocator.allocated_bytes);
allocated_bytes_gauge.record(
stats.allocated_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
.current);
c10::reportMemoryUsageToProfiler(
block->ptr,
static_cast<int64_t>(block->size),
@ -1456,6 +1463,11 @@ class DeviceCachingAllocator {
stats.allocation[stat_type].decrease(1);
stats.allocated_bytes[stat_type].decrease(block->size);
});
auto allocated_bytes_gauge =
STATIC_GAUGE(pytorch.CUDACachingAllocator.allocated_bytes);
allocated_bytes_gauge.record(
stats.allocated_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
.current);
record_trace(
TraceEntry::FREE_REQUESTED,
@ -2245,6 +2257,11 @@ class DeviceCachingAllocator {
for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
stats.reserved_bytes[stat_type].increase(mapped_range.size);
});
auto reserved_bytes_gauge =
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
reserved_bytes_gauge.record(
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
.current);
stats.num_device_alloc++;
record_trace(
@ -2683,6 +2700,11 @@ class DeviceCachingAllocator {
});
if (size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_segments.increase(1);
auto reserved_bytes_gauge =
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
reserved_bytes_gauge.record(
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
.current);
// p.block came from new, not cudaMalloc. It should not be nullptr here.
TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr);
@ -2820,6 +2842,11 @@ class DeviceCachingAllocator {
stats.segment[stat_type].decrease(1);
stats.reserved_bytes[stat_type].decrease(block->size);
});
auto reserved_bytes_gauge =
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
reserved_bytes_gauge.record(
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
.current);
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_segments.decrease(1);
@ -2876,6 +2903,11 @@ class DeviceCachingAllocator {
for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
stats.reserved_bytes[stat_type].decrease(unmapped.size);
});
auto reserved_bytes_gauge =
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
reserved_bytes_gauge.record(
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
.current);
if (block->pool->owner_PrivatePool) {
// The cudaFreed block belonged to a CUDA graph's PrivatePool.


@ -68,6 +68,12 @@ template <
inline T expm1(T a) {
return std::expm1(float(a));
}
template <
typename T,
typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0>
inline bool isfinite(T a) {
return std::isfinite(float(a));
}
template <
typename T,
typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0>
@ -237,10 +243,9 @@ C10_HOST_DEVICE inline T nextafter(T from, T to) {
// Reference:
// https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c
using int_repr_t = uint16_t;
using float_t = T;
constexpr uint8_t bits = 16;
union {
float_t f;
T f;
int_repr_t i;
} ufrom = {from}, uto = {to};


@ -261,19 +261,19 @@ struct alignas(sizeof(T) * 2) complex {
#endif
if (abs_c >= abs_d) {
if (abs_c == 0 && abs_d == 0) {
if (abs_c == U(0) && abs_d == U(0)) {
/* divide by zeros should yield a complex inf or nan */
real_ = a / abs_c;
imag_ = b / abs_d;
} else {
auto rat = d / c;
auto scl = 1.0 / (c + d * rat);
auto scl = U(1.0) / (c + d * rat);
real_ = (a + b * rat) * scl;
imag_ = (b - a * rat) * scl;
}
} else {
auto rat = c / d;
auto scl = 1.0 / (d + c * rat);
auto scl = U(1.0) / (d + c * rat);
real_ = (a * rat + b) * scl;
imag_ = (b * rat - a) * scl;
}


@ -9,6 +9,8 @@
namespace c10::xpu::XPUCachingAllocator {
using namespace c10::CachingDeviceAllocator;
// Newly allocated memory is aligned to 512 bytes.
constexpr size_t kDeviceAlignment = 512;
// all sizes are rounded to at least 512 bytes
@ -117,6 +119,7 @@ struct AllocParams {
BlockPool* pool;
size_t alloc_size;
Block* block;
StatTypes stat_types = {};
};
} // anonymous namespace
@ -124,6 +127,7 @@ struct AllocParams {
class DeviceCachingAllocator {
private:
mutable std::recursive_mutex mutex;
DeviceStats stats;
BlockPool large_blocks; // unallocated cached blocks larger than 1 MB
BlockPool small_blocks; // unallocated cached blocks 1 MB or smaller
ska::flat_hash_set<Block*> active_blocks; // allocated or in use by a stream
@ -173,6 +177,12 @@ class DeviceCachingAllocator {
active_blocks.erase(block);
bool inserted = pool.blocks.insert(block).second;
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted);
StatTypes stat_types = get_stat_types_for_pool(pool);
for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
stats.active_bytes[stat_type].decrease(block->size);
stats.requested_bytes[stat_type].decrease(block->requested_size);
});
}
void process_events() {
@ -250,6 +260,9 @@ class DeviceCachingAllocator {
return false;
}
p.block = new Block(device, p.queue(), size, p.pool, ptr);
for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
stats.reserved_bytes[stat_type].increase(size);
});
return true;
}
@ -281,6 +294,12 @@ class DeviceCachingAllocator {
sycl::free(block->ptr, xpu::get_device_context());
auto* pool = block->pool;
pool->blocks.erase(block);
StatTypes stat_types = get_stat_types_for_pool(*pool);
for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
stats.reserved_bytes[stat_type].decrease(block->size);
});
delete block;
}
@ -314,6 +333,14 @@ class DeviceCachingAllocator {
}
}
StatTypes get_stat_types_for_pool(const BlockPool& pool) {
StatTypes stat_types = {};
stat_types[static_cast<size_t>(StatType::AGGREGATE)] = true;
stat_types[static_cast<size_t>(
pool.is_small ? StatType::SMALL_POOL : StatType::LARGE_POOL)] = true;
return stat_types;
}
Block* alloc_found_block(
AllocParams params,
size_t orig_size,
@ -350,6 +377,12 @@ class DeviceCachingAllocator {
bool inserted = active_blocks.insert(block).second;
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted)
for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
stats.allocated_bytes[stat_type].increase(block->size);
stats.active_bytes[stat_type].increase(block->size);
stats.requested_bytes[stat_type].increase(block->requested_size);
});
return block;
}
@ -376,6 +409,7 @@ class DeviceCachingAllocator {
auto& pool = get_pool(size);
const size_t alloc_size = get_allocation_size(size);
AllocParams params(device, size, &queue, &pool, alloc_size);
params.stat_types = get_stat_types_for_pool(pool);
// First, try to get a block from the existing pool.
bool block_found = get_free_block(params);
@ -384,9 +418,32 @@ class DeviceCachingAllocator {
block_found = alloc_block(params) ||
(release_cached_blocks() && alloc_block(params));
}
TORCH_CHECK(
block_found,
"XPU out of memory, please use `empty_cache` to release all unoccupied cached memory.");
if (!block_found) {
c10::xpu::DeviceProp device_prop;
c10::xpu::get_device_properties(&device_prop, device);
auto device_total = device_prop.global_mem_size;
auto allocated_bytes =
stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
.current;
auto reserved_bytes =
stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
.current;
TORCH_CHECK_WITH(
OutOfMemoryError,
false,
"XPU out of memory. Tried to allocate ",
format_size(alloc_size),
". GPU ",
static_cast<int>(device),
" has a total capacity of ",
format_size(device_total),
". Of the allocated memory ",
format_size(allocated_bytes),
" is allocated by PyTorch, and ",
format_size(reserved_bytes - allocated_bytes),
" is reserved by PyTorch but unallocated.",
" Please use `empty_cache` to release all unoccupied cached memory.");
}
bool split_remainder = should_split(params.block, params.size());
return alloc_found_block(std::move(params), orig_size, split_remainder);
}
@ -395,6 +452,11 @@ class DeviceCachingAllocator {
std::scoped_lock<std::recursive_mutex> lock(mutex);
block->allocated = false;
StatTypes stat_types = get_stat_types_for_pool(*block->pool);
for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
stats.allocated_bytes[stat_type].decrease(block->size);
});
if (!block->stream_uses.empty()) {
insert_events(block);
} else {
@ -414,6 +476,35 @@ class DeviceCachingAllocator {
std::scoped_lock<std::recursive_mutex> lock(mutex);
release_cached_blocks();
}
DeviceStats getStats() {
std::scoped_lock<std::recursive_mutex> lock(mutex);
return stats;
}
void resetAccumulatedStats() {
std::scoped_lock<std::recursive_mutex> lock(mutex);
for (const auto statType :
c10::irange(static_cast<size_t>(StatType::NUM_TYPES))) {
stats.allocated_bytes[statType].reset_accumulated();
stats.reserved_bytes[statType].reset_accumulated();
stats.active_bytes[statType].reset_accumulated();
stats.requested_bytes[statType].reset_accumulated();
}
}
void resetPeakStats() {
std::scoped_lock<std::recursive_mutex> lock(mutex);
for (const auto statType :
c10::irange(static_cast<size_t>(StatType::NUM_TYPES))) {
stats.allocated_bytes[statType].reset_peak();
stats.reserved_bytes[statType].reset_peak();
stats.active_bytes[statType].reset_peak();
stats.requested_bytes[statType].reset_peak();
}
}
};
void local_raw_delete(void* ptr);
@ -547,6 +638,30 @@ class XPUAllocator : public Allocator {
void copy_data(void* dest, const void* src, std::size_t count) const final {
xpu::getCurrentXPUStream().queue().memcpy(dest, src, count);
}
void assertValidDevice(DeviceIndex device) {
const auto device_num = device_allocators.size();
TORCH_CHECK(
0 <= device && device < static_cast<int64_t>(device_num),
"Invalid device argument ",
device,
": did you call init?");
}
DeviceStats getDeviceStats(DeviceIndex device) {
assertValidDevice(device);
return device_allocators[device]->getStats();
}
void resetPeakStats(DeviceIndex device) {
assertValidDevice(device);
device_allocators[device]->resetPeakStats();
}
void resetAccumulatedStats(DeviceIndex device) {
assertValidDevice(device);
device_allocators[device]->resetAccumulatedStats();
}
};
static XPUAllocator allocator;
@ -567,6 +682,18 @@ void emptyCache() {
return allocator.emptyCache();
}
void resetPeakStats(DeviceIndex device) {
return allocator.resetPeakStats(device);
}
void resetAccumulatedStats(DeviceIndex device) {
return allocator.resetAccumulatedStats(device);
}
DeviceStats getDeviceStats(DeviceIndex device) {
return allocator.getDeviceStats(device);
}
void* raw_alloc(size_t size) {
return allocator.raw_alloc(size);
}


@ -1,6 +1,6 @@
#pragma once
#include <c10/core/Allocator.h>
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/xpu/XPUStream.h>
namespace c10::xpu::XPUCachingAllocator {
@ -11,6 +11,13 @@ C10_XPU_API void init(DeviceIndex device_count);
C10_XPU_API void emptyCache();
C10_XPU_API void resetPeakStats(DeviceIndex device);
C10_XPU_API void resetAccumulatedStats(DeviceIndex device);
C10_XPU_API c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
DeviceIndex device);
C10_XPU_API void* raw_alloc(size_t size);
C10_XPU_API void raw_delete(void* ptr);


@ -3,6 +3,7 @@
#include <algorithm>
#include <sstream>
#include <vector>
#include <cstdint>
namespace caffe2 {


@ -43,7 +43,9 @@ IF(NOT MKLDNN_FOUND)
endif()
endif()
if(LINUX)
set(ABI_NEUTRAL_FLAGS -fpreview-breaking-changes)
set(DNNL_CXX_FLAGS "-DCMAKE_CXX_FLAGS=-fpreview-breaking-changes")
else()
set(DNNL_CXX_FLAGS "")
endif()
ExternalProject_Add(xpu_mkldnn_proj
SOURCE_DIR ${MKLDNN_ROOT}
@ -51,7 +53,7 @@ IF(NOT MKLDNN_FOUND)
BUILD_IN_SOURCE 0
CMAKE_ARGS -DCMAKE_C_COMPILER=icx
-DCMAKE_CXX_COMPILER=${SYCL_CXX_DRIVER}
-DCMAKE_CXX_FLAGS=${ABI_NEUTRAL_FLAGS}
${DNNL_CXX_FLAGS}
-DDNNL_GPU_RUNTIME=SYCL
-DDNNL_CPU_RUNTIME=THREADPOOL
-DDNNL_BUILD_TESTS=OFF
@ -85,13 +87,18 @@ IF(NOT MKLDNN_FOUND)
SET(ONEDNN_BUILD_GRAPH ON CACHE BOOL "" FORCE)
ENDIF(NOT APPLE AND NOT WIN32 AND NOT BUILD_LITE_INTERPRETER)
IF(EXISTS "${MKLDNN_ROOT}/include/oneapi/dnnl/dnnl_ukernel.hpp")
MESSAGE("-- Will build oneDNN UKERNEL")
SET(DNNL_EXPERIMENTAL_UKERNEL ON CACHE BOOL "" FORCE)
ENDIF(EXISTS "${MKLDNN_ROOT}/include/oneapi/dnnl/dnnl_ukernel.hpp")
FIND_PACKAGE(BLAS)
FIND_PATH(IDEEP_INCLUDE_DIR ideep.hpp PATHS ${IDEEP_ROOT} PATH_SUFFIXES include)
FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include/oneapi/dnnl)
FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h dnnl_ukernel.hpp dnnl_ukernel.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include/oneapi/dnnl)
IF(NOT MKLDNN_INCLUDE_DIR)
MESSAGE("MKLDNN_INCLUDE_DIR not found")
EXECUTE_PROCESS(COMMAND git${CMAKE_EXECUTABLE_SUFFIX} submodule update --init mkl-dnn WORKING_DIRECTORY ${IDEEP_ROOT})
FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include)
FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h dnnl_ukernel.hpp dnnl_ukernel.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include)
ENDIF(NOT MKLDNN_INCLUDE_DIR)
IF(BUILD_ONEDNN_GRAPH)
FIND_PATH(LLGA_INCLUDE_DIR dnnl_graph.hpp PATHS ${LLGA_ROOT} PATH_SUFFIXES include/oneapi/dnnl)


@ -283,9 +283,11 @@ The following ops are currently supported:
kron
meshgrid
narrow
nn.functional.unfold
ravel
select
split
stack
t
transpose
vsplit
@ -294,6 +296,7 @@ The following ops are currently supported:
Tensor.expand_as
Tensor.reshape
Tensor.reshape_as
Tensor.unfold
Tensor.view
.. This module needs to be documented. Adding here in the meantime
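
As a quick illustration of the newly listed ops, a sketch using the prototype masked-tensor API (exact import path and semantics may differ across releases):

```python
import torch
from torch.masked import masked_tensor

data = torch.arange(6.0).reshape(2, 3)
mask = torch.tensor([[True, False, True], [True, True, False]])
mt = masked_tensor(data, mask)

# Tensor.unfold and stack are among the ops listed as supported above.
windows = mt.unfold(1, 2, 1)            # sliding windows of size 2 along dim 1
stacked = torch.stack([mt, mt], dim=0)  # stack two masked tensors
print(windows.shape, stacked.shape)
```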


@ -398,3 +398,4 @@ The following utility functions are related to serialization:
.. autofunction:: clear_safe_globals
.. autofunction:: get_safe_globals
.. autoclass:: safe_globals
.. autoclass:: skip_data
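
For context, `skip_data` is documented here as a serialization helper; a minimal sketch of the expected usage (assuming it is a context manager around `torch.save` that writes the checkpoint structure without the tensor bytes):

```python
import torch

state = {"weight": torch.randn(4, 4)}

# Writes the archive layout and tensor metadata but skips the storage bytes,
# so the file can be filled in (or inspected) without materializing real data.
with torch.serialization.skip_data():
    torch.save(state, "skeleton.pt")
```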


@ -28,7 +28,6 @@ The exporter is designed to be modular and extensible. It is composed of the fol
- **FX Graph Extractor**: :class:`FXGraphExtractor` extracts the FX graph from the PyTorch model.
- **Fake Mode**: :class:`ONNXFakeContext` is a context manager that enables fake mode for large scale models.
- **ONNX Program**: :class:`ONNXProgram` is the output of the exporter that contains the exported ONNX graph and diagnostics.
- **ONNX Program Serializer**: :class:`ONNXProgramSerializer` serializes the exported model to a file.
- **ONNX Diagnostic Options**: :class:`DiagnosticOptions` has a set of options that control the diagnostics emitted by the exporter.
Dependencies
@ -144,15 +143,9 @@ API Reference
.. autoclass:: torch.onnx.ONNXProgram
:members:
.. autoclass:: torch.onnx.ONNXProgramSerializer
:members:
.. autoclass:: torch.onnx.ONNXRuntimeOptions
:members:
.. autoclass:: torch.onnx.InvalidExportOptionsError
:members:
.. autoclass:: torch.onnx.OnnxExporterError
:members:
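
With `ONNXProgramSerializer` removed from the public surface, serialization is expected to go through the `ONNXProgram` object itself. A sketch of that workflow (model, input, and file names are placeholders; assumes the ONNX export dependencies are installed):

```python
import torch

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x + 1)

model = TinyModel()
example_input = torch.randn(2, 3)

# The dynamo-based exporter returns an ONNXProgram; saving is a method on it
# rather than a separate serializer class.
onnx_program = torch.onnx.dynamo_export(model, example_input)
onnx_program.save("tiny_model.onnx")
```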


@ -13,7 +13,6 @@ torch.xpu
device
device_count
device_of
empty_cache
get_device_capability
get_device_name
get_device_properties
@ -51,7 +50,25 @@ Streams and events
Stream
Memory management
-----------------
.. autosummary::
:toctree: generated
:nosignatures:
empty_cache
max_memory_allocated
max_memory_reserved
memory_allocated
memory_reserved
memory_stats
memory_stats_as_nested_dict
reset_accumulated_memory_stats
reset_peak_memory_stats
.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.xpu.memory
.. py:module:: torch.xpu.random
.. py:module:: torch.xpu.streams


@ -1,6 +1,6 @@
# Python dependencies required for development
astunparse
expecttest!=0.2.0
expecttest>=0.2.1
hypothesis
numpy
psutil


@ -93,7 +93,7 @@ annotations from the example above one would write:
* `CHECK-COUNT-EXACTLY-<num>: <pattern>`
Scans the input and succeeds when a line containing exactly `NUM` entries of
`PATTERN` is found.
* `CHECK-DAG: pattern`
* `CHECK-DAG: <pattern>`
Works similar to the usual `CHECK` pragma, but also matches if there exists a
way to reorder the CHECK-DAG pragmas to satisfy all patterns.
For example the following pattern:
@ -110,3 +110,18 @@ annotations from the example above one would write:
bar
end
```
* `CHECK-SOURCE-HIGHLIGHTED: <pattern>`
Checks for highlighted source ranges. This is useful when writing tests about generated error messages that require source-code highlighting.
For example the following pattern:
```
# CHECK-SOURCE-HIGHLIGHTED: raise Exception("raised exception
```
would match the following input:
```
def method_that_raises() -> torch.Tensor:
raise Exception("raised exception") # noqa: TRY002
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
builtins.Exception: raised exception
```
* `CHECK-REGEX: <pattern>`
Scans the input until `PATTERN` is matched; accepts the regular-expression syntax of `std::regex`.
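For example, the following illustrative pattern (not taken from an existing test) would match input lines such as `dim=3` or `dim=128`:
```
# CHECK-REGEX: dim=[0-9]+
```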


@ -0,0 +1,111 @@
# Owner(s): ["oncall: distributed"]
import copy
import torch
import torch.nn as nn
from torch.amp.grad_scaler import GradScaler, OptState
from torch.distributed._composable.fsdp import fully_shard
from torch.distributed._tensor import init_device_mesh
from torch.distributed.tensor.parallel import (
ColwiseParallel,
parallelize_module,
RowwiseParallel,
)
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_fsdp import FSDPTest, MLP
from torch.testing._internal.common_utils import run_tests, skipIfRocm
class TestFullyShardGradientScaler(FSDPTest):
@skip_if_lt_x_gpu(4)
@skipIfRocm
def test_gradient_scaler(self):
self.run_subtests(
{"has_inf": [True, False], "test_2d": [True, False]},
self._test_gradient_scaler,
)
def _test_gradient_scaler(self, has_inf: bool, test_2d: bool):
torch.manual_seed(0)
model = nn.Sequential(
*[nn.Linear(4, 4, device="cuda", bias=False) for _ in range(2)]
)
for layer in model:
fully_shard(layer)
fully_shard(model)
input = torch.randn([4, 4], device="cuda")
if test_2d:
mesh_2d = init_device_mesh(
"cuda", (2, self.world_size // 2), mesh_dim_names=("dp", "tp")
)
dp_mesh, tp_mesh = mesh_2d["dp"], mesh_2d["tp"]
model = nn.Sequential(MLP(2), MLP(2), MLP(2))
tp_parallelize_plan = {
"0.in_proj": ColwiseParallel(),
"0.out_proj": RowwiseParallel(),
"1.in_proj": ColwiseParallel(),
"1.out_proj": RowwiseParallel(),
"2.in_proj": ColwiseParallel(),
"2.out_proj": RowwiseParallel(),
}
model = parallelize_module(
model,
device_mesh=tp_mesh,
parallelize_plan=tp_parallelize_plan,
)
for module in model:
fully_shard(module, mesh=dp_mesh)
fully_shard(model, mesh=dp_mesh)
input = torch.randn((2,), device="cuda")
loss = model(input).sum()
scaler = GradScaler(init_scale=2.0, enabled=True)
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
scaler.scale(loss).backward()
inv_scale = scaler._scale.double().reciprocal().float()
if (
has_inf is True
and opt.param_groups[0]["params"][0].grad._local_tensor.device.index == 1
):
opt.param_groups[0]["params"][0].grad._local_tensor[0, 0].fill_(
float("inf")
)
inital_grad = opt.param_groups[0]["params"][0].grad.to_local().clone()
scaler.unscale_(opt)
for found_inf in scaler._per_optimizer_states[id(opt)][
"found_inf_per_device"
].values():
self.assertEqual(found_inf, has_inf)
self.assertEqual(
scaler._per_optimizer_states[id(opt)]["stage"].value,
OptState.UNSCALED.value,
)
unscaled_grad = opt.param_groups[0]["params"][0].grad.to_local().clone()
self.assertEqual(unscaled_grad, inital_grad * inv_scale)
initial_scale = scaler.get_scale()
initial_state = copy.copy(opt.state)
scaler.step(opt)
steped_state = copy.copy(opt.state)
if has_inf:
# assert parameters are the same before/after
self.assertEqual(steped_state, initial_state)
else:
# new parameters here if no inf found during .unscale_()
self.assertNotEqual(steped_state.items(), initial_state.items())
scaler.update()
updated_scale = scaler.get_scale()
if has_inf:
# assert scale is updated
backoff_factor = scaler.get_backoff_factor()
self.assertEqual(updated_scale, initial_scale * backoff_factor)
else:
# scale is not updated
self.assertEqual(updated_scale, initial_scale)
if __name__ == "__main__":
run_tests()


@ -32,6 +32,7 @@ from torch.testing._internal.common_distributed import (
skip_if_rocm,
)
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.distributed.fake_pg import FakeStore
from torch.utils._triton import has_triton
from torch.utils.checkpoint import checkpoint
@ -367,35 +368,28 @@ class ReplicateTest(MultiProcessInductorTestCase):
fc.run(code)
class DDP_TP_Test(MultiProcessInductorTestCase):
@property
def world_size(self) -> int:
return min(4, torch.cuda.device_count())
class DDP_TP_Test(InductorTestCase):
def setUp(self):
self.rank = 0
self.world_size = 4
torch.cuda.set_device("cuda:0")
def setUp(self) -> None:
super().setUp()
self._spawn_processes()
store = FakeStore()
dist.init_process_group(
backend="fake",
world_size=self.world_size,
rank=self.rank,
store=store,
)
def tearDown(self):
super().tearDown()
try:
os.remove(self.file_name)
except OSError:
pass
dist.destroy_process_group()
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_rocm
@skip_if_lt_x_gpu(4)
def test_ddp_tp(self):
torch.cuda.set_device(f"cuda:{self.rank}")
dist.init_process_group(
backend="nccl",
rank=self.rank,
world_size=self.world_size,
store=dist.FileStore(self.file_name, self.world_size),
)
model = Net().cuda()
compiled_replicate_model = deepcopy(model)
ref_model = Net()
compiled_replicate_model = deepcopy(ref_model)
mesh_2d = init_device_mesh(
"cuda", (2, self.world_size // 2), mesh_dim_names=("dp", "tp")
)
@ -407,8 +401,8 @@ class DDP_TP_Test(MultiProcessInductorTestCase):
"fc3": ColwiseParallel(),
"fc4": RowwiseParallel(),
}
model = parallelize_module(model, tp_mesh, parallelize_plan)
model = replicate(model, device_mesh=dp_mesh)
ref_model = parallelize_module(ref_model, tp_mesh, parallelize_plan)
ref_model = replicate(ref_model, device_mesh=dp_mesh)
compiled_replicate_model = parallelize_module(
compiled_replicate_model, tp_mesh, parallelize_plan
)
@ -416,15 +410,23 @@ class DDP_TP_Test(MultiProcessInductorTestCase):
compiled_replicate_model, device_mesh=dp_mesh
)
compiled_replicate_model = torch.compile(compiled_replicate_model)
data = torch.randn([1, DIM]).cuda()
data = torch.randn([1, DIM])
with compiled_autograd.enable(compiler_fn()):
loss = compiled_replicate_model(data).sum()
loss.backward()
# TODO: We need "pre-dispatch tracing of backward graph" to make this work:
# https://github.com/pytorch/pytorch/issues/127797#issuecomment-2291695474
with self.assertRaisesRegex(
AssertionError,
"Expected ProxyTensor, got <class 'torch.distributed._tensor.api.DTensor'>",
):
loss.backward()
loss = model(data).sum()
loss.backward()
for p1, p2 in zip(model.parameters(), compiled_replicate_model.parameters()):
self.assertEqual(p1.grad, p2.grad)
# ref_loss = ref_model(data).sum()
# ref_loss.backward()
# for p1, p2 in zip(
# ref_model.parameters(), compiled_replicate_model.parameters()
# ):
# self.assertEqual(p1.grad, p2.grad)
if __name__ == "__main__":


@ -299,6 +299,70 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):
self.assertEqual(res, ref)
self.assertEqual(cnt.frame_count, 2)
def test_dynamo_dtensor_from_local_dynamic_shapes(self):
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
# Case 1: all dims dynamic
def fn(x):
dt = DTensor.from_local(
x,
mesh,
[Replicate()],
run_check=False,
shape=x.shape,
stride=x.stride(),
)
return dt.to_local() + 2
inp = torch.randn(4, 6, requires_grad=True)
ref = fn(inp)
cnt = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
res = torch.compile(fn, backend=cnt, fullgraph=True, dynamic=True)(inp)
res.sum().backward()
self.assertEqual(res, ref)
self.assertEqual(cnt.frame_count, 1)
# Case 2: only sizes are dynamic, strides are static
def fn(x):
dt = DTensor.from_local(
x, mesh, [Replicate()], run_check=False, shape=x.shape, stride=(1,)
)
return dt.to_local() + 2
inp = torch.randn(4, requires_grad=True)
torch._dynamo.mark_dynamic(inp, 0)
ref = fn(inp)
cnt = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
res = torch.compile(fn, backend=cnt, fullgraph=True)(inp)
res.sum().backward()
self.assertEqual(res, ref)
self.assertEqual(cnt.frame_count, 1)
# Case 3: both sizes and strides have a mix of dynamic and static dims
def fn(x):
dt = DTensor.from_local(
x,
mesh,
[Replicate()],
run_check=False,
shape=(x.shape[0], x.shape[1], 2),
stride=(x.stride()[0], 2, 1),
)
return dt.to_local() + 2
inp = torch.randn(4, 6, 2, requires_grad=True)
torch._dynamo.mark_dynamic(inp, 0)
torch._dynamo.mark_dynamic(inp, 1)
ref = fn(inp)
cnt = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
res = torch.compile(fn, backend=cnt, fullgraph=True)(inp)
res.sum().backward()
self.assertEqual(res, ref)
self.assertEqual(cnt.frame_count, 1)
def test_dynamo_dtensor_recompile(self):
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))


@ -7,7 +7,13 @@ from typing import cast, List
import torch
import torch.distributed as dist
from torch import rand, randn, Tensor
from torch.distributed._tensor import DeviceMesh, distribute_tensor, Replicate, Shard
from torch.distributed._tensor import (
DeviceMesh,
distribute_tensor,
init_device_mesh,
Replicate,
Shard,
)
from torch.distributed._tensor.debug import CommDebugMode
from torch.distributed._tensor.ops._view_ops import (
Broadcast,
@ -29,6 +35,10 @@ from torch.utils import _pytree as pytree
class TestViewOps(DTensorTestBase):
@property
def world_size(self) -> int:
return 6
def test_view_groups(self):
self.assertEqual(
view_groups([2, 3], [3, 2]),
@ -106,8 +116,8 @@ class TestViewOps(DTensorTestBase):
view_groups([1, 1, 3, 2, 1, 1], [6, 1, 1, 1]),
(
Flatten((InputDim(2), InputDim(3))),
Singleton(),
Singleton(),
InputDim(4),
InputDim(5),
Singleton(),
),
)
@ -116,7 +126,7 @@ class TestViewOps(DTensorTestBase):
(
Split(InputDim(2), (3, 4), 0),
Split(InputDim(2), (3, 4), 1),
Singleton(),
InputDim(3),
Flatten((InputDim(6), InputDim(7))),
),
)
@ -125,10 +135,6 @@ class TestViewOps(DTensorTestBase):
(InputDim(0), InputDim(1), InputDim(2)),
)
@property
def world_size(self) -> int:
return 6
def call_dt_test(self, op, args, kwargs, device_mesh: DeviceMesh):
dim_map = dim_maps[op]
rules = dim_map(*args, **kwargs)
@ -429,7 +435,7 @@ class TestViewOps(DTensorTestBase):
self.dimmap_test(
Tensor.view,
(randn(1, 1, 42, 1, 24, 1), -1),
(Flatten((InputDim(2), InputDim(4))),),
(Flatten((InputDim(2), InputDim(input_dim=3), InputDim(4))),),
)
self.dimmap_test(
@ -525,6 +531,46 @@ class TestViewOps(DTensorTestBase):
)
self.assertEqual(out, out_dt.full_tensor())
@with_comms
def test_dtensor_view_op_uneven(self):
"""
Test two uneven cases for view op:
1) the sharded tensor dim is 1 so that only the first rank has a non-empty shard.
2) the sharded tensor dim is uneven such that some ranks have full shards,
smaller non-empty shards, and empty shards.
"""
dim0_sizes = [1, self.world_size + 1]
for dim0_size in dim0_sizes:
p = torch.randn(dim0_size, 2, 2, 2)
mesh = init_device_mesh(self.device_type, (self.world_size,))
dtensor = distribute_tensor(p, mesh, [Shard(0)])
with CommDebugMode() as comm_mode:
view = dtensor.view(dim0_size, 2, 4)
self.assertEqual(len(comm_mode.get_comm_counts()), 0)
# when no communication happens, the data pointer should be the same.
self.assertEqual(
view.to_local().data_ptr(), dtensor.to_local().data_ptr()
)
view = dtensor.view(dim0_size, 4, 2)
self.assertEqual(
view.to_local().data_ptr(), dtensor.to_local().data_ptr()
)
self.assertEqual(len(comm_mode.get_comm_counts()), 0)
view = dtensor.view(dim0_size, 8)
self.assertEqual(
view.to_local().data_ptr(), dtensor.to_local().data_ptr()
)
self.assertEqual(len(comm_mode.get_comm_counts()), 0)
view = dtensor.view(dtensor.shape)
self.assertEqual(
view.to_local().data_ptr(), dtensor.to_local().data_ptr()
)
self.assertEqual(len(comm_mode.get_comm_counts()), 0)
if __name__ == "__main__":
run_tests()


@ -34,7 +34,7 @@ class FsdpModelStateCheckpoint(DTensorTestBase):
"model": model.state_dict(),
}
dist_cp.save_state_dict(
dist_cp.save(
state_dict=state_dict,
storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
planner=DefaultSavePlanner(),
@ -55,7 +55,7 @@ class FsdpModelStateCheckpoint(DTensorTestBase):
"model": model_2.state_dict(),
}
dist_cp.load_state_dict(
dist_cp.load(
state_dict=state_dict,
storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
planner=DefaultLoadPlanner(),


@ -40,7 +40,7 @@ class TestFsdpTpCheckpointConversion(DTensorTestBase):
fsdp_state_dict = fsdp_model.state_dict()
# save fsdp_state_dict to storage
dist_cp.save_state_dict(
dist_cp.save(
state_dict=fsdp_state_dict,
storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
)


@ -94,7 +94,7 @@ class TestHSDPCheckpoint(DTensorTestBase):
state_dict = {"model": model.state_dict()}
state_dict_to_save = deepcopy(state_dict)
dist_cp.save_state_dict(
dist_cp.save(
state_dict=state_dict_to_save,
storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
planner=DefaultSavePlanner(),
@ -113,7 +113,7 @@ class TestHSDPCheckpoint(DTensorTestBase):
self.assertEqual(v1.placements, v2.placements)
self.assertNotEqual(v1.to_local(), v2.to_local())
dist_cp.load_state_dict(
dist_cp.load(
state_dict=state_dict_to_save,
storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
planner=DefaultLoadPlanner(),


@ -6,6 +6,7 @@
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import asyncio
import ctypes
import multiprocessing
import os
@ -362,6 +363,9 @@ if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS):
self.assertTrue(pc._stderr_tail.stopped())
self.assertTrue(pc._stdout_tail.stopped())
def test_pcontext_wait_on_a_child_thread(self):
asyncio.run(asyncio.to_thread(self.test_pcontext_wait))
def test_multiprocess_context_close(self):
pc = start_processes(
name="sleep",


@ -25,6 +25,7 @@ from torch.distributed.elastic.rendezvous.c10d_rendezvous_backend import (
C10dRendezvousBackend,
create_backend,
)
from torch.distributed.elastic.utils.distributed import get_free_port
class TCPStoreBackendTest(TestCase, RendezvousBackendTestMixin):
@ -69,9 +70,11 @@ class CreateBackendTest(TestCase):
# For testing, the default parameters used are for tcp. If a test
# uses parameters for file store, we set the self._params to
# self._params_filestore.
port = get_free_port()
self._params = RendezvousParameters(
backend="dummy_backend",
endpoint="localhost:29300",
endpoint=f"localhost:{port}",
run_id="dummy_run_id",
min_nodes=1,
max_nodes=1,
@ -95,7 +98,7 @@ class CreateBackendTest(TestCase):
self._expected_temp_dir = tempfile.gettempdir()
self._expected_endpoint_host = "localhost"
self._expected_endpoint_port = 29300
self._expected_endpoint_port = port
self._expected_store_type = TCPStore
self._expected_read_timeout = timedelta(seconds=10)
@ -173,11 +176,14 @@ class CreateBackendTest(TestCase):
def test_create_backend_returns_backend_if_endpoint_port_is_not_specified(
self,
) -> None:
self._params.endpoint = self._expected_endpoint_host
# patch default port and pass endpoint with no port specified
with mock.patch(
"torch.distributed.elastic.rendezvous.c10d_rendezvous_backend.DEFAULT_PORT",
self._expected_endpoint_port,
):
self._params.endpoint = self._expected_endpoint_host
self._expected_endpoint_port = 29400
self._assert_create_backend_returns_backend()
self._assert_create_backend_returns_backend()
def test_create_backend_returns_backend_if_endpoint_file_is_not_specified(
self,


@ -1597,6 +1597,23 @@ class CreateHandlerTest(TestCase):
create_handler(self._store, self._backend, self._params)
record_mock.assert_called_once()
def test_create_handler_rdzv_local_addr(self) -> None:
params = RendezvousParameters(
backend=self._backend.name,
endpoint="dummy_endpoint",
run_id="dummy_run_id",
min_nodes=1,
max_nodes=1,
join_timeout="50",
last_call_timeout="60",
close_timeout="70",
local_addr="127.0.0.2",
)
store = HashStore()
handler = create_handler(store, self._backend, params)
rdzv_info = handler.next_rendezvous()
self.assertEqual(rdzv_info.bootstrap_store_info.master_addr, "127.0.0.2")
def _ignore_exception(exception_type: Exception, fn: Callable):
try:
@ -1656,7 +1673,7 @@ class IntegrationTest(TestCase):
"min_nodes": 2,
"max_nodes": 2,
"join_timeout": "5",
"local_addr": f"address_{len(self._handlers)}",
"local_addr": f"127.0.0.{len(self._handlers)}",
}
params.update(**kwargs)
@ -1714,7 +1731,7 @@ class IntegrationTest(TestCase):
state_and_token = self._backend.get_state()
state = pickle.loads(state_and_token[0])
addresses = [node.addr for node in state.redundancy_list]
self.assertListEqual(addresses, ["address_2"])
self.assertListEqual(addresses, ["127.0.0.2"])
def test_redundancy_transition_to_wait_list_then_join_rendezvous(self) -> None:
handler1 = self._create_handler(


@ -6,6 +6,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import multiprocessing as mp
import os
import signal
import time
import unittest
@ -37,7 +38,7 @@ if not (IS_WINDOWS or IS_MACOS):
def setUp(self):
super().setUp()
self.max_interval = 0.01
self.file_path = "/tmp/test_file_path_" + str(uuid.uuid4())
self.file_path = f"/tmp/test_file_path_{os.getpid()}_{uuid.uuid4()}"
self.server = timer.FileTimerServer(
self.file_path, "test", self.max_interval
)
@ -204,7 +205,7 @@ if not (IS_WINDOWS or IS_MACOS):
class FileTimerServerTest(TestCase):
def setUp(self):
super().setUp()
self.file_path = "/tmp/test_file_path_" + str(uuid.uuid4())
self.file_path = f"/tmp/test_file_path_{os.getpid()}_{uuid.uuid4()}"
self.max_interval = 0.01
self.server = timer.FileTimerServer(
self.file_path, "test", self.max_interval


@ -914,9 +914,6 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
torch._dynamo.utils.clear_compilation_metrics()
# TODO: This should be possible to do inside the function, but
device = f"cuda:{self.rank}"
@torch.compile()
def f(x, y):
zx = x.shape
@ -940,6 +937,28 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
for r in res[1:]:
self.assertEqual(res[0], r)
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@config.patch(enable_compiler_collectives=True)
def test_compiler_collectives_missing_source(self):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
torch._dynamo.utils.clear_compilation_metrics()
@torch.compile()
def f(rank, xs):
return xs[rank].sum()
xs = []
for _ in range(self.world_size):
xs.append(torch.randn(10, device=self.rank))
f(self.rank, xs)
metrics = torch._dynamo.utils.get_compilation_metrics()
res = [None] * self.world_size
torch.distributed.all_gather_object(res, len(metrics))
for r in res[1:]:
self.assertEqual(res[0], r)
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@patch.object(torch._inductor.config, "fx_graph_cache", False)
@patch.object(torch._inductor.config, "fx_graph_remote_cache", False)


@ -1013,22 +1013,17 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
return ar
input = torch.ones(4, 4, device="cuda", requires_grad=True)
# TODO implement backwards
with self.assertRaisesRegex(
RuntimeError,
"element 0 of tensors does not require grad and does not have a grad_fn",
):
compiled = torch.compile(
func, backend="aot_eager"
) # inductor bug with single-op allreduce graph
out = compiled(input)
out.sum().backward()
compiled = torch.compile(
func, backend="aot_eager"
) # inductor bug with single-op allreduce graph
out = compiled(input)
out.sum().backward()
correct_input = input.clone().detach().requires_grad_()
correct = func(correct_input)
correct.sum().backward()
self.assertTrue(same(out, correct))
self.assertTrue(same(input.grad, correct_input.grad))
correct_input = input.clone().detach().requires_grad_()
correct = func(correct_input)
correct.sum().backward()
self.assertTrue(same(out, correct))
self.assertTrue(same(input.grad, correct_input.grad))
def test_meta(self):
x = torch.rand((2, 3, 4), device="meta")


@ -107,7 +107,7 @@ due to:
Traceback (most recent call last):
File "test_exc.py", line N, in f
raise NotImplementedError
torch._dynamo.exc.InternalTorchDynamoError:
torch._dynamo.exc.InternalTorchDynamoError: NotImplementedError:
from user code:
File "test_exc.py", line N, in fn001


@ -239,6 +239,22 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
v = v + x
return v
def test_itertools_reconstruct(self):
def fn(a):
it1 = itertools.repeat(1)
it2 = itertools.count(2)
for _ in range(3):
a += next(it1)
a += next(it2)
return it1, it2, a
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
i1, i2, a = fn(torch.ones(3, 3))
it1, it2, b = opt_fn(torch.ones(3, 3))
self.assertEqual(next(i1), next(it1))
self.assertEqual(next(i2), next(it2))
self.assertEqual(a, b)
@make_test
def test_obj_eq(a, b):
v = a + b
@ -507,8 +523,7 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
empty = collections.deque()
d.extend(empty)
# dynamo same() util doesn't support deque so just return a list
return list(d)
return d
@make_test
def test_slice1(a):
@ -3115,6 +3130,199 @@ class GraphModule(torch.nn.Module):
fn(arr, np.s_[..., 1], np.array([3, 3])), np.array([[1, 3], [2, 3]])
)
def test_map_return(self):
def fn(a, b):
return map(lambda x: x + 1, [a, b])
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
m = opt_fn(torch.randn(3, 3), torch.randn(3, 3))
self.assertIsInstance(m, map)
@make_test
def test_map_max(a, b):
return max(map(lambda x: x.sum(), [a, b]))
# max(map(...)) graph breaks
@unittest.expectedFailure
@make_test
def test_map_max_const(a):
return max(map(lambda x: x, [1, 2, 3])), a + 1
@make_test
def test_map_list(a, b):
return list(map(lambda x: x + 1, [a, b]))
@make_test
def test_map_tuple(a, b):
return tuple(map(lambda x: x + 1, [a, b]))
@make_test
def test_map_iter(a, b):
it = iter(map(lambda x: x + 1, [a, b]))
return next(it)
@make_test
def test_map_zip_dict(a):
d = dict(
zip(
map(lambda x: x + 1, [0, 1, 2]),
[map(lambda x: x - 1, [y]) for y in [3, 4, 5]],
)
)
return list(d[3])[0], a + 1 # noqa: RUF015
@make_test
def test_map_dict_fromkeys(a):
return dict.fromkeys(map(lambda x: x + 1, [0, 1])), a + 1
@make_test
def test_map_set(a):
return set(map(lambda x: x + 1, [0, 1])), a + 1
# test_map_sum defined earlier
@make_test
def test_map_reduce(a, b):
return functools.reduce(lambda x, y: x + y, map(lambda x: x + 1, [a, b]))
@make_test
def test_map_sorted(a):
return sorted(map(lambda x: x + 1, [0, 4, 3, 1, 2])), a + 1
@make_test
def test_map_list_extend(a, b, c):
l = [a]
l.extend(map(lambda x: x + 1, [b, c]))
return l
@make_test
def test_map_list_slice_assign(a, b, c, d, e):
l = [a, b, c]
l[1:2] = map(lambda x: x + 1, [d, e])
return l
@make_test
def test_map_deque_extendleft(a, b, c):
d = collections.deque([a])
d.extendleft(map(lambda x: x + 1, [b, c]))
return d
@make_test
def test_map_str_join(a):
return "".join(map(lambda x: x, ["a", "b", "c"])), a + 1
def test_map_with_graph_break(self):
def f(a):
a += 1
def g(x):
nonlocal a
a += 1
return x + 1
m = map(g, [1, 2, 3, 4, 5])
a += next(m) # won't graph break
torch._dynamo.graph_break()
a += next(m) # will graph break
return a
cnts = torch._dynamo.testing.CompileCounter()
opt_f = torch.compile(f, backend=cnts)
self.assertEqual(f(torch.ones(3, 3)), opt_f(torch.ones(3, 3)))
self.assertEqual(cnts.frame_count, 3)
def test_map_reconstruct(self):
def fn(a):
return map(lambda x: x[0] + x[1], zip([1, 2, 3], [1, 2, 3])), a + 1
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
m = opt_fn(torch.ones(3, 3))[0]
self.assertIsInstance(m, map)
self.assertEqual(list(m), list(fn(torch.ones(3, 3))[0]))
def test_zip_reconstruct(self):
def fn(a):
return zip([1, 2, 3], map(lambda x: x + 1, [1, 2, 3])), a + 1
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
m = opt_fn(torch.ones(3, 3))[0]
self.assertIsInstance(m, zip)
self.assertEqual(list(m), list(fn(torch.ones(3, 3))[0]))
@make_test
def test_map_partial_unpack(a, b):
y = 1
def f(x):
nonlocal y
y += 1
return x
l = list(zip([a, b], map(f, [1, 2, 3, 4])))
return a + y
@make_test
def test_map_call_function_ex(a, b):
def f(x, y):
return x + y
return f(*map(lambda x: x + 1, [a, b]))
@make_test
def test_map_unpack_twice(a, b):
m = map(lambda x: x + 1, [a, b])
l1 = list(m)
l2 = list(m)
return l1, l2
@make_test
def test_enumerate(a, b):
return list(enumerate([a, b], start=1)), a + 1
@make_test
def test_map_enumerate(a, b):
return list(enumerate(map(lambda x: x + 1, [a, b]), start=1)), a + 1
@make_test
def test_map_infinite(a, b):
return list(map(lambda x, y: x + y, [a, b], itertools.count(3)))
@make_test
def test_map_unpack_vars(a, b):
x, y = map(lambda x: x + 1, [a, b])
return x + y
def test_enumerate_custom(self):
class MyClass:
def __iter__(self):
self.a = 1
return self
def __next__(self):
if self.a > 3:
raise StopIteration
self.a += 1
return self.a
def fn(x):
for i, it in enumerate(MyClass()):
x += i + it
return x
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
self.assertEqual(fn(torch.ones(3, 3)), opt_fn(torch.ones(3, 3)))
def test_enumerate_reconstruct(self):
def fn(a, b):
return enumerate([a, b], start=1)
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
inps = (torch.randn(3, 3), torch.randn(3, 3))
it1 = fn(*inps)
it2 = opt_fn(*inps)
self.assertIsInstance(it2, enumerate)
self.assertEqual(list(it1), list(it2))
def udf_mul(x, y):
return x * y
@ -3394,6 +3602,71 @@ class DefaultsTests(torch._dynamo.test_case.TestCase):
ref = opt_fn(x)
self.assertEqual(ref, res)
def test_frozenset_construction(self):
def fn(x):
s = frozenset({x})
t = frozenset(s)
return len(t)
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
x = torch.randn(4)
res = fn(x)
ref = opt_fn(x)
self.assertEqual(ref, res)
def test_frozenset_reconstruction(self):
d = {}
f = frozenset()
d[f] = torch.randn(4)
def fn(x):
k = frozenset()
torch._dynamo.graph_break()
return d[k] * x
opt_fn = torch.compile(fn, backend="eager")
x = torch.randn(4)
res = fn(x)
ref = opt_fn(x)
self.assertEqual(ref, res)
def test_frozenset_illegal_call_method(self):
def fn_add():
s = frozenset((1, 2, 3))
s.add({2})
return len(s)
def fn_pop():
s = frozenset((1, 2, 3))
s.pop()
return len(s)
def fn_update():
s = frozenset((1, 2, 3))
s.update({4, 5, 6})
return len(s)
def fn_remove():
s = frozenset((1, 2, 3))
s.remove(2)
return len(s)
def fn_discard():
s = frozenset((1, 2, 3))
s.discard(2)
return len(s)
def fn_clear():
s = frozenset((1, 2, 3))
s.clear()
return len(s)
for fn in [fn_add, fn_pop, fn_update, fn_remove, fn_discard, fn_clear]:
torch._dynamo.reset()
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
with self.assertRaises(torch._dynamo.exc.InternalTorchDynamoError):
opt_fn()
def test_is_tensor_tensor(self):
def fn(x, y):
if x is y:
@ -3605,10 +3878,16 @@ class DefaultsTests(torch._dynamo.test_case.TestCase):
with self.assertRaisesRegex(torch._dynamo.exc.UserError, "zip()"):
nopython_fn(x, ys[:1], zs)
with self.assertRaisesRegex(torch._dynamo.exc.UserError, "zip()"):
nopython_fn(x, ys, zs[:1])
# Should cause fallback if allow graph break
with self.assertRaisesRegex(ValueError, "zip()"):
opt_fn(x, ys[:1], zs)
with self.assertRaisesRegex(ValueError, "zip()"):
opt_fn(x, ys, zs[:1])
def test_fn_with_attr(self):
def fn(x):
if fn.pred:


@ -308,6 +308,19 @@ class MiscTests(torch._inductor.test_case.TestCase):
"Graph break for an optree C/C++ function optree._C.PyCapsule.flatten. Consider using torch.utils._pytree - https://github.com/pytorch/pytorch/blob/main/torch/utils/_pytree.py",
)
def test_scalar_device_movement(self):
if not torch._dynamo.config.assume_static_by_default:
self.skipTest("Doesn't work with symints")
def add_fn(a, b, out):
res = torch.add(a, b, out=out)
return res
res = add_fn(2, 3, torch.tensor(0.0))
add_fn = torch.compile(add_fn, backend="eager", fullgraph=True)
res_compiled = add_fn(2, 3, torch.tensor(0.0))
self.assertEqual(res, res_compiled)
@skipIfNNModuleInlined("fails internal CI")
@unittest.skipIf(IS_FBCODE, "inline cpp_extension doesn't work in fbcode")
def test_cpp_extension_recommends_custom_ops(self):
@ -3367,6 +3380,21 @@ utils_device.CURRENT_DEVICE == None""".split(
self.assertTrue(same(obj41.y, obj42.y))
self.assertEqual(cnts.frame_count, 1)
def test_thread_local_setattr(self):
from threading import local
loc = local()
@torch.compile(fullgraph=True)
def fn(x, l):
l.x = x
return x + 1
x = torch.ones(2, 2)
fn(x, loc)
self.assertTrue(loc.x is x)
def test_user_defined_class_name(self):
class MyClassFoo:
pass


@ -47,6 +47,21 @@ class TestHFPretrained(torch._dynamo.test_case.TestCase):
res = opt_fn(x, tmp)
self.assertTrue(same(ref, res))
@maybe_skip
def test_pretrained_non_const_attr(self):
def fn(a, tmp):
if tmp.pruned_heads:
return a + 1
else:
return a - 1
x = torch.randn(2)
tmp = PretrainedConfig()
ref = fn(x, tmp)
opt_fn = torch.compile(backend="eager", fullgraph=True)(fn)
res = opt_fn(x, tmp)
self.assertTrue(same(ref, res))
class TestModelOutput(torch._dynamo.test_case.TestCase):
@maybe_skip


@ -1,5 +1,4 @@
# Owner(s): ["module: dynamo"]
from unittest.mock import patch

import torch
import torch._dynamo.test_case
@@ -14,6 +13,17 @@ from torch.utils._device import DeviceContext
from torch.utils._python_dispatch import TorchDispatchMode


class TestMode(BaseTorchFunctionMode):
    def __torch_function__(self, func, types, args, kwargs=None):
        if not kwargs:
            kwargs = {}

        if func == torch.add:
            return torch.zeros(2, 2)

        return super().__torch_function__(func, types, args, kwargs)
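
# TestMode above short-circuits torch.add to torch.zeros(2, 2) and defers every
# other op to the base mode; the TorchFunctionModeTests below use it to detect
# whether the mode was honored inside compiled regions.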

class TorchDispatchModeTests(torch._dynamo.test_case.TestCase):
    @classmethod
    def setUpClass(cls):
@@ -57,9 +67,11 @@ class TorchFunctionModeTests(torch._dynamo.test_case.TestCase):
    def setUp(self):
        torch.set_default_device(None)
        torch._dynamo.reset()

    def tearDown(self):
        torch.set_default_device(None)
        torch._dynamo.reset()

    def _run_torch_function_mode_guard_test(self):
        class TestMode1(BaseTorchFunctionMode):
@@ -94,70 +106,6 @@ class TorchFunctionModeTests(torch._dynamo.test_case.TestCase):
            fn(inp)
        self.assertEqual(cnt.frame_count, 4)

    def _run_ignored_mode_types_test(self):
        class IgnoredMode(BaseTorchFunctionMode):
            pass

        cnt = torch._dynamo.testing.CompileCounter()

        @torch.compile(backend=cnt.__call__, fullgraph=True)
        def fn(x):
            return x + 1

        inp = torch.ones(2, 2)

        with patch(
            "torch._dynamo.variables.torch_function.IGNORED_MODES", {IgnoredMode}
        ):
            # initial compile
            fn(inp)

            # no recompile, mode ignored
            # note: the ref stack is length 0, and the stack we are checking against has length 2
            # we want to check both ref stack len > runtime stack, and ref stack len < runtime stack
            with IgnoredMode(), IgnoredMode():
                fn(inp)
            self.assertEqual(cnt.frame_count, 1)

            # recompile due to new mode on the stack
            with BaseTorchFunctionMode(), BaseTorchFunctionMode(), BaseTorchFunctionMode():
                fn(inp)
            self.assertEqual(cnt.frame_count, 2)

            # recompile
            # tests both ref stack len > runtime stack len for the above guard check
            # and ref stack len < runtime stack len for the initial zero mode case
            with BaseTorchFunctionMode(), IgnoredMode(), BaseTorchFunctionMode():
                fn(inp)
            self.assertEqual(cnt.frame_count, 3)

            # no recompile
            with IgnoredMode(), IgnoredMode(), BaseTorchFunctionMode(), BaseTorchFunctionMode():
                fn(inp)
            self.assertEqual(cnt.frame_count, 3)

            # This is tricky, basically the ignored modes are baked into the guard
            # IgnoredMode will be ignored forever by that guard.
            # This is okay since we don't expect to be modifying IGNORED_MODES
            # in the middle of execution except for the purposes of testing.
            torch._dynamo.reset()
            with IgnoredMode():
                fn(inp)
            self.assertEqual(cnt.frame_count, 4)

    @torch._dynamo.config.patch("enable_cpp_guard_manager", False)
    def test_torch_function_mode_guards_ignored_types_py(self):
        self._run_ignored_mode_types_test()

    def test_torch_function_mode_guards_ignored_types_cpp(self):
        self._run_ignored_mode_types_test()

    @torch._dynamo.config.patch("enable_cpp_guard_manager", False)
    def test_torch_function_mode_guards_py(self):
        self._run_torch_function_mode_guard_test()
@@ -324,6 +272,218 @@ class TorchFunctionModeTests(torch._dynamo.test_case.TestCase):
            fn(inp)
        self.assertEqual(cnt.frame_count, 2)
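
    # Two stacked modes below: TestMode1 intercepts torch.add and TestMode2 intercepts
    # torch.mul; the compiled and eager runs should agree on both the outputs and on
    # which modes were invoked.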
    def test_nested_torch_function_mode(self):
        mode_1_called = False
        mode_2_called = False

        def reset_state():
            nonlocal mode_1_called
            nonlocal mode_2_called
            mode_1_called = False
            mode_2_called = False

        ones = torch.ones(2, 2)
        zeros = torch.zeros(2, 2)

        class TestMode1(BaseTorchFunctionMode):
            def __torch_function__(self, func, types, args, kwargs=None):
                if not kwargs:
                    kwargs = {}

                nonlocal mode_1_called
                mode_1_called = True

                if func == torch.add:
                    return zeros

                return super().__torch_function__(func, types, args, kwargs)

        class TestMode2(BaseTorchFunctionMode):
            def __torch_function__(self, func, types, args, kwargs=None):
                if not kwargs:
                    kwargs = {}

                nonlocal mode_2_called
                mode_2_called = True

                if func == torch.mul:
                    return ones

                return super().__torch_function__(func, types, args, kwargs)

        def fn(x):
            return torch.add(x, 3)

        def fn_2(x):
            return torch.mul(x, 3) + torch.add(x, 3)

        inp = torch.ones(2, 2) + 1

        for fn_i in [fn, fn_2]:
            fn_opt = torch.compile(fn_i, fullgraph=True)
            with TestMode1(), TestMode2():
                expected = fn_i(inp), mode_1_called, mode_2_called
                reset_state()

                actual = fn_opt(inp), mode_1_called, mode_2_called
                reset_state()

            self.assertEqual(expected, actual)
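
    # DisableTorchFunctionSubclass and DisableTorchFunction entered outside the
    # compiled region should be respected: compiled and eager results must match
    # in both cases.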
    def test_torch_function_mode_disable(self):
        class TestSubclass(torch.Tensor):
            @classmethod
            def __torch_function__(cls, func, types, args, kwargs=None):
                if not kwargs:
                    kwargs = {}
                if func == torch.add:
                    return torch.ones(2, 2)
                return super().__torch_function__(func, types, args, kwargs)

        class TestMode(BaseTorchFunctionMode):
            def __torch_function__(self, func, types, args, kwargs=None):
                if not kwargs:
                    kwargs = {}
                if func == torch.add:
                    return torch.zeros(2, 2)
                return super().__torch_function__(func, types, args, kwargs)

        def fn(x):
            return torch.add(x, 3)

        inp = (torch.ones(2, 2) + 1).as_subclass(TestSubclass)

        fn_opt = torch.compile(fn, fullgraph=True)
        with TestMode(), torch._dynamo.config.patch(
            "traceable_tensor_subclasses", {TestSubclass}
        ):
            with torch._C.DisableTorchFunctionSubclass():
                expected = fn(inp)
                actual = fn_opt(inp)
                self.assertEqual(expected, actual)

            with torch._C.DisableTorchFunction():
                expected = fn(inp)
                actual = fn_opt(inp)
                self.assertEqual(expected, actual)
    def test_torch_function_mode_highest_priority(self):
        class TestSubclass(torch.Tensor):
            @classmethod
            def __torch_function__(cls, func, types, args, kwargs=None):
                if not kwargs:
                    kwargs = {}
                if func == torch.add:
                    return torch.ones(2, 2)
                return super().__torch_function__(func, types, args, kwargs)

        def fn(x):
            return torch.add(x, 3)

        inp = (torch.ones(2, 2) + 1).as_subclass(TestSubclass)

        fn_opt = torch.compile(fn, fullgraph=True)
        with TestMode(), torch._dynamo.config.patch(
            "traceable_tensor_subclasses", {TestSubclass}
        ):
            expected = fn(inp)
            actual = fn_opt(inp)
            self.assertEqual(expected, actual)
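
    # Constructing, entering, and exiting TestMode inside the compiled function is
    # traced end to end (fullgraph=True); the compiled result must match eager.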
    def test_torch_function_mode_enter_exit(self):
        def fn(x, y):
            with TestMode():
                o = torch.add(x, 3)

            return torch.add(o, y)

        inp = (torch.ones(2, 2) + 1, torch.ones(2, 2) + 2)
        fn_opt = torch.compile(fn, fullgraph=True)

        expected = fn(*inp)
        actual = fn_opt(*inp)

        self.assertEqual(expected, actual)

    def test_torch_function_mode_graph_break(self):
        def fn(x, y):
            with TestMode():
                torch._dynamo.graph_break()
                o = torch.add(x, 3)

            return torch.add(o, y)

        inp = (torch.ones(2, 2) + 1, torch.ones(2, 2) + 2)
        fn_opt = torch.compile(fn)

        expected = fn(*inp)
        actual = fn_opt(*inp)

        self.assertEqual(expected, actual)
    def test_torch_function_mode_and_pop_graph_break(self):
        def fn(x, y):
            with TestMode():
                z = _pop_torch_function_stack()
                torch._dynamo.graph_break()
                _push_on_torch_function_stack(z)

                o = torch.add(x, 3)

            return torch.add(o, y)

        inp = (torch.ones(2, 2) + 1, torch.ones(2, 2) + 2)
        fn_opt = torch.compile(fn)

        expected = fn(*inp)
        actual = fn_opt(*inp)

        self.assertEqual(expected, actual)
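
    # If an exception escapes the compiled region, the torch function mode stack must
    # still be unwound; the test expects the stack to be empty afterwards.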
    def test_torch_function_mode_restore_on_exc(self):
        @torch._dynamo.disable()
        def err():
            raise RuntimeError("test")

        @torch.compile()
        def fn(x):
            with TestMode():
                x += 1
                err()
                x += 2
                return x

        try:
            fn(torch.ones(2, 2))
        except RuntimeError:
            pass

        self.assertEqual(_len_torch_function_stack(), 0)
    def test_torch_function_mode_and_pop_graph_break_mutation(self):
        def fn(x, y):
            with TestMode():
                z = _pop_torch_function_stack()
                z.y = 5
                torch._dynamo.graph_break()
                _push_on_torch_function_stack(z)

                o = torch.add(x, 3)
                o = torch.mul(o, z.y)

            return torch.add(o, y)

        inp = (torch.ones(2, 2) + 1, torch.ones(2, 2) + 2)
        fn_opt = torch.compile(fn)

        expected = fn(*inp)
        actual = fn_opt(*inp)

        self.assertEqual(expected, actual)


if __name__ == "__main__":
    from torch._dynamo.test_case import run_tests

Some files were not shown because too many files have changed in this diff.