Compare commits


136 Commits

Author SHA1 Message Date
3d732de4c8 more wip 2025-03-03 13:36:11 -08:00
19a27343a0 wip [ca] single step 2025-02-28 19:25:39 -08:00
cabc560762 [ca] side-effect free initial trace: RAII PyCompilerInterface
ghstack-source-id: c6dbc9480b928aaa8147e20de73bf668d547040d
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147891
2025-02-26 20:17:18 -08:00
663565c84a [ca] side-effect free initial trace: compiled_args
ghstack-source-id: 4e3fa1d948411730c6e834ea4d996ac8c6d72bcb
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147804
2025-02-25 19:57:55 -08:00
95f317d9f8 [ca] side-effect free initial trace: GraphTask
ghstack-source-id: 1e13d1cd892800b1647482b42846a0b4c55912b3
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147796
2025-02-25 19:57:55 -08:00
643da4854f wip [ca] unpack hooks
ghstack-source-id: c2b4c7eedbb69db8e803949894f9f0dda29a5171
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147242
2025-02-25 19:57:54 -08:00
995b125cdd [CI] Build sm89 with more procs experiment (#147487)
Add a build that uses 4 out of the 8 processes available on a linux.2xlarge/c5.2xlarge.  Currently it's set to 2 because it would oom, but I'm curious as to how often people's builds oom.  I can't test this on my own because of caching, so it has to run on pull request

This might result in a failing job on many people's PRs and I'm not sure how to get around it. I named it stable to make it automatically get sorted into the stable group for Dr. CI, but it'll still show up
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147487
Approved by: https://github.com/huydhn
2025-02-21 22:07:00 +00:00
7c8c82cd64 [trymerge] Post initial starting merge comment on stacked PRs (#147028)
Post a small comment stating if a PR is being merged as part of a stack
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147028
Approved by: https://github.com/ZainRizvi
2025-02-21 22:05:00 +00:00
698f6f9fae specify only some dimensions in shapes collection (#147534)
Differential Revision: [D69936316](https://our.internmc.facebook.com/intern/diff/D69936316/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147534
Approved by: https://github.com/bobrenjc93
2025-02-21 22:02:42 +00:00
2fb9416e6f [inductor][cpu] Move VNNI weight packing into AMX GEMM kernel for contiguous BMM weights (#146843)
Currently, the bfloat16 microkernel that uses AMX vectorization requires that the weights are in an interleaved VNNI format. For GEMM code, this hasn't been an issue because GEMM currently only supports constant weights, so the VNNI weight packing is done during compile-time and saved as a constant tensor to the graph. But for BMM ops where weights are not required to be constant, current code does an expensive reshape/VNNI packing for all BMM weights.

This PR removes the need for the reshape/packing for non-constant inputs by moving VNNI packing inside the AMX microkernel. A new `K * block_n` buffer is used to store the temporary packed weights. Weight packing involves interleaving 2 rows of weights.
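For intuition, here is a minimal sketch (an illustrative assumption about the layout, not the actual C++ microkernel) of what VNNI packing of a `[K, N]` bf16 weight block does, interleaving pairs of K rows:

```python
import torch

def vnni_pack(w: torch.Tensor) -> torch.Tensor:
    # w: [K, N] with K even. The two values along K that feed one dot-product
    # step are placed next to each other: packed[k // 2, 2 * n + (k % 2)] == w[k, n]
    K, N = w.shape
    return w.reshape(K // 2, 2, N).transpose(1, 2).reshape(K // 2, 2 * N)

w = torch.randn(64, 32, dtype=torch.bfloat16)
w_packed = vnni_pack(w)  # shape [32, 64]
```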

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146843
Approved by: https://github.com/jgong5, https://github.com/sanchitintel, https://github.com/leslie-fang-intel, https://github.com/jansel
2025-02-21 21:46:00 +00:00
d91be786cb [cutlass backend] clear_on_fresh_inductor_cache when generating cutlass ops (#147586)
Differential Revision: [D69966732](https://our.internmc.facebook.com/intern/diff/D69966732/)

This is needed if we want to generate cutlass ops with different instantiation levels in one session.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147586
Approved by: https://github.com/ColinPeppler, https://github.com/chenyang78
2025-02-21 21:28:41 +00:00
ef6b16ea9d Revert "[trymerge] Post initial starting merge comment on stacked PRs (#147028)"
This reverts commit 0295aabf6071c7da62325e6a29e04ed09a3e34ef.

Reverted https://github.com/pytorch/pytorch/pull/147028 on behalf of https://github.com/clee2000 due to I think this broke merge for non ghstack prs ([comment](https://github.com/pytorch/pytorch/pull/147028#issuecomment-2675532017))
2025-02-21 21:02:19 +00:00
05e6f15966 Revert "[Inductor][Triton] Rework casting logic to avoid illegal bitcast (#147395)"
This reverts commit e758d8b4d1632ea765bf8bc8e87b6039ae708b9f.

Reverted https://github.com/pytorch/pytorch/pull/147395 on behalf of https://github.com/jeanschmidt due to Breaking internal builds, see D69890757 - servicelab_benchmark_pyper_local_runner, @eellison please help the author get this change landed ([comment](https://github.com/pytorch/pytorch/pull/147395#issuecomment-2675521966))
2025-02-21 20:56:40 +00:00
6eb795c9e8 [associative_scan] compile backend change to "eager" (#146973)
This PR fixes some issues with torch export discussed here: https://github.com/pytorch/pytorch/pull/140043#discussion_r1941932960

However, this backend change still does not resolve the failure for specific shapes mentioned here: https://github.com/pytorch/pytorch/issues/137943#issuecomment-2649564994

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146973
Approved by: https://github.com/ydwu4
2025-02-21 20:21:41 +00:00
5ed1e23e3a Fix type stubs for SymmetricMemory (#146310)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/146310
Approved by: https://github.com/yifuwang
2025-02-21 19:59:43 +00:00
fd8ae1aa04 [ROCm] gfx940 and gfx941 cleanup (#147394)
Removing gfx architectures not supported by ROCm.

NOTE: For users wanting to build PyTorch for gfx archs that are *not* supported by the official wheels on download.pytorch.org, you can build PyTorch from source for your desired gfx arch [using the PYTORCH_ROCM_ARCH env var](https://github.com/pytorch/pytorch/blob/main/README.md#amd-rocm-support).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147394
Approved by: https://github.com/jithunnair-amd, https://github.com/jeffdaily
2025-02-21 19:42:12 +00:00
c0ee62573a [Easy][optim] Add LBFGS params optional desc (#147579)
[LBFGS docs](https://pytorch.org/docs/stable/generated/torch.optim.LBFGS.html#torch.optim.LBFGS) are missing the `optional` description for params compared with other optimizer docs, like [Adam](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html)

## Test Result

### Before

![image](https://github.com/user-attachments/assets/34877490-16b4-4c68-bf6c-405bae563352)

### After

![image](https://github.com/user-attachments/assets/7fba94c8-7091-47b8-bdf1-ca7d779a027f)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147579
Approved by: https://github.com/janeyx99
2025-02-21 19:38:10 +00:00
b5c3bb6185 Add continuous run for cachebench (#147546)
This PR adds a continuous run for cache bench.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147546
Approved by: https://github.com/huydhn
ghstack dependencies: #147537
2025-02-21 19:02:17 +00:00
76ce194b8e For addmm and bmm, check if config.autotune_fallback_to_aten before using aten as a fallback. Also fix bmm cutlass backend (#147148)
This PR also fixes BMM, which was silently failing for a while.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147148
Approved by: https://github.com/eellison
2025-02-21 18:41:52 +00:00
0295aabf60 [trymerge] Post initial starting merge comment on stacked PRs (#147028)
Post a small comment stating if a PR is being merged as part of a stack
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147028
Approved by: https://github.com/ZainRizvi
2025-02-21 18:05:05 +00:00
2190ca7f47 Use __qualname__ in add_safe_globals and update Unpickling error raised for Unsupported GLOBAL (#146815)
- Fixes #146814

Change
```python
for f in _marked_safe_globals_set:
    module, name = f.__module__, f.__name__
```
to

```python
for f in _marked_safe_globals_set:
    module, name = f.__module__, f.__qualname__
```
to avoid overwriting entries whose key strings collide.
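For illustration, a minimal standalone example (hypothetical class names) of why `__name__` alone collides for nested classes while `__qualname__` does not:

```python
class Outer1:
    class Inner:
        pass

class Outer2:
    class Inner:
        pass

# __name__ produces the same key string for both, __qualname__ does not:
print(Outer1.Inner.__name__, Outer2.Inner.__name__)          # Inner Inner
print(Outer1.Inner.__qualname__, Outer2.Inner.__qualname__)  # Outer1.Inner Outer2.Inner
```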

A test is also added.
```
python test/test_serialization.py TestSerialization.test_serialization_nested_class
```

- Fixes #146886
Pull Request resolved: https://github.com/pytorch/pytorch/pull/146815
Approved by: https://github.com/mikaylagawarecki
2025-02-21 18:04:59 +00:00
f4e4cfcb91 [caffe2] Ignore compiler option when building using clang (#147556)
Summary:
Skip adding the unrecognized option optimize("-fno-tree-loop-vectorize") when building with clang.

This piece of code began to be compiled after armv9a was set as the default compilation profile.

Test Plan: buck2 run mode/opt -c python.package_style=inplace -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12 lego/scripts:lego_cli -- run-locally --model_entity_id ${MODEL} --config_version ${CONFIG_VERSION} --disable_generate_new_checkpoint --checkpoint_version 0 --publish_context OFFLINE_PUBLISH --lego_pipeline aiplatform.modelstore.model_generation.lego.lego_pipeline_builder.gmpp_lego_pipeline --gmpp_config '{"gmpp_pipeline_descriptor": "aiplatform.modelstore.model_generation.v1.ads_pipelines.aimp_pyper_pipeline.model_generation_pipeline", "worker_process_number":12, "worker_thread_per_process_number": 6, "use_work_assignment": true}' 2>&1 | tee aimp_697790515.log

Reviewed By: andrewjcg

Differential Revision: D69947027

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147556
Approved by: https://github.com/janeyx99
2025-02-21 17:46:04 +00:00
a0c7d96028 [Easy] Add Delimiter To Show Where Allocation Addr Begins (#147461)
Summary: When we print the addr, we append an "s" or a "b" to the beginning of the addr. Since the addr is in hex, a user might be confused and think the "b" is part of the address. Added an apostrophe to clear this up.

Test Plan: CI

Differential Revision: D69828538

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147461
Approved by: https://github.com/zdevito
2025-02-21 17:19:53 +00:00
784f64bb05 [inductor] triton support port-#5512, update cpp wrapper for gpu (#146917)
In short, this pull request enhances `constexprs` expression filtering.

Note: I tested the changes on xpu backend.

Part of https://github.com/pytorch/pytorch/issues/144103

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146917
Approved by: https://github.com/EikanWang, https://github.com/etaf, https://github.com/davidberard98, https://github.com/YUNQIUGUO
2025-02-21 17:10:53 +00:00
6a6de0e09d better error message (#147532)
Differential Revision: [D69939736](https://our.internmc.facebook.com/intern/diff/D69939736)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147532
Approved by: https://github.com/avikchaudhuri, https://github.com/zou3519
2025-02-21 17:08:47 +00:00
a8ce4d1846 Add cachebench (#147537)
This PR adds a new benchmark called cachebench in order to measure/demonstrate the prowess of PT2 caching.
```
python benchmarks/dynamo/cachebench.py --output="result.json"
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147537
Approved by: https://github.com/jamesjwu
2025-02-21 17:06:45 +00:00
af1072ffb6 [Intel GPU] Enable BUILD_GRAPH for xpu_mkldnn (#147608)
In preparation for enabling oneDNN-based XPU SDPA.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147608
Approved by: https://github.com/EikanWang, https://github.com/atalman
2025-02-21 16:12:30 +00:00
d6bb1d7f0a Delete Mixed MM Special Casing (#147151)
Now that torchinductor supports prologue fusion we can delete all the mixed mm code. When I benchmarked int8 weight only mm in the new path compared to int8mm in the old path in the [following benchmark](https://gist.github.com/eellison/46e321709572c11c077d0612cb3492b7) I got a 1.244x geomean speedup comparing Huggingface linear shapes with bias. There's a couple reasons for the speedup:

- prologue fusion is often unprofitable, even for int8 mm. Because the current mixed mm benchmarking only compares triton_int8_mm vs (dtype_conversion + cublas), we miss out on scenarios where the triton template is profitable but the prologue fusion is not.
- similarly, we miss out on potential epilogue fusions like bias if we dispatch to the [fallback mixed mm](5006932cbc/torch/_inductor/kernel/mm.py (L750-L751)) instead of the deferred epilogue tuning in the current path.

It's possible some of the speedups would be smaller on larger models where the epilogue might get fused into a following kernel. Nonetheless, even if this is perf neutral it is worth landing for code deduplication.

The one kernel that is a little special and would not fall out of prologue fusion is the uint4x2_mixed_mm kernel. It's still possible to generate it with prologue fusion, though not exactly as the current [impl](bd370c138a/torch/_inductor/kernel/unpack_mixed_mm.py (L43-L49)) does. But the current impl does not compare against a cublas baseline, so I found that it is making things slower (35% slower on a not particularly big 1024, 1024, 1024 mm shape on h100). This should be fine to delete.

Future optimizations could include:

- cutlass prologue path
- making prologue fusion support the persistent tma based mm template. from @drisspg's experience this led to nice wins with fp8 but not as nice wins with bf16 mm. I think similarly, lower memory bandwidth int8 mm would benefit.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147151
Approved by: https://github.com/drisspg, https://github.com/cpuhrsch
2025-02-21 16:02:40 +00:00
36c461af95 Support SymmetricMemory's signaling kernels on sm60 and sm70 (#146308)
By leveraging libcudacxx's utilities: https://nvidia.github.io/cccl/libcudacxx/extended_api/synchronization_primitives/atomic_ref.html

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146308
Approved by: https://github.com/yifuwang
2025-02-21 15:29:02 +00:00
7ce4974e50 Fix PEP585 update (#147536)
Summary: D69920347 causes a pyre failure due to changing a base object from typing.Iterable to abc.Iterable.  For now revert that change until it can be dealt with on its own.

Test Plan:
failures from D69920347 pass locally
unit tests pass

Reviewed By: oulgen

Differential Revision: D69936518

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147536
Approved by: https://github.com/jeanschmidt
2025-02-21 14:37:03 +00:00
654f2666d9 Increase memory for linux binary builds (#147542)
Recently I detected that some linux manywheels builds are flaky ([ex](https://github.com/pytorch/pytorch/actions/runs/13438309056/job/37555475510)).

After investigating the runner logs, available disk space, network usage, and CPU load, I could not find any issues. Unfortunately, memory information is not available.

But given the symptoms, the likelihood of this being an OOM problem is high.

So, moving those build jobs from a `linux.12xlarge.ephemeral` to `linux.12xlarge.memory.ephemeral`.

This change depends on https://github.com/pytorch/test-infra/pull/6316
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147542
Approved by: https://github.com/ZainRizvi, https://github.com/atalman
2025-02-21 14:15:40 +00:00
51748a5d1a Update OpenBLAS to 0.3.29 (#144857)
* Improvements for GEMM to GEMV kernels
 * Improvements for SVE kernels for SGEMV and DGEMV

Pull Request resolved: https://github.com/pytorch/pytorch/pull/144857
Approved by: https://github.com/malfet
2025-02-21 10:07:06 +00:00
71d2827eeb Code Refactoring for getting start and stride from global ranks (#147230)
Summary: Refactor the code for getting start and stride from global ranks; this function can be used by different collective backends.

Differential Revision: D69555405

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147230
Approved by: https://github.com/kwen2501
2025-02-21 10:02:50 +00:00
e7bf490c43 [ROCm] Implemented dropout usage for RNN with MIOpen backend (#144572)
This PR fixes https://github.com/pytorch/pytorch/issues/107183 for ROCm.

Implemented the use of a new RNN descriptor for the MIOpen backend that takes the dropout rate into account using a dropout descriptor. This fixes the associated test_RNN_dropout_state test.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/144572
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-02-21 10:01:27 +00:00
cffe7183f1 [cutlass backend] Fix standalone runner test after swizzle became a runtime parameter (#147554)
Differential Revision: [D69945114](https://our.internmc.facebook.com/intern/diff/D69945114/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147554
Approved by: https://github.com/mlazos
2025-02-21 09:27:44 +00:00
b61a556427 Turn onnx functions into static (#147598)
To avoid exposing ONNX symbols.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147598
Approved by: https://github.com/justinchuby
2025-02-21 07:40:28 +00:00
3395da7f7c Revert "Build a storage reader/writer to write checkpoints in HF format (#146352)"
This reverts commit c615b8c174c80936d365d19d8b8f4d9ad9a195f3.

Reverted https://github.com/pytorch/pytorch/pull/146352 on behalf of https://github.com/jeanschmidt due to Author ignored linting errors ([comment](https://github.com/pytorch/pytorch/pull/146352#issuecomment-2673789271))
2025-02-21 07:30:52 +00:00
e5da9df421 Revert "Increase memory for linux binary builds (#147542)"
This reverts commit 87e6e2924eb706b928cdfc4a11623b39259fa830.

Reverted https://github.com/pytorch/pytorch/pull/147542 on behalf of https://github.com/jeanschmidt due to seems that it is best to use another machine type ([comment](https://github.com/pytorch/pytorch/pull/147542#issuecomment-2673765724))
2025-02-21 07:14:57 +00:00
4986f0f52e [PT2]: allow empty dict to pass type check (#147167) (#147480)
Summary:

Seeing errors like the following when testing sigmoid for inline_cvr and perevent_cvr models:
```
terminate called after throwing an instance of 'c10::Error'
  what():  forward() Expected a value of type 'Dict[int, Tuple[Tensor, Tensor, Tensor]]' for argument 'event_based_features' but instead found type 'Dict[Any, Any]'.
```
Let an empty dict pass the type check.
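Conceptually, the relaxed check looks like this (a Python sketch under simple-type assumptions, not the actual C++ type checker):

```python
def dict_arg_matches(value: dict, key_type: type, value_type: type) -> bool:
    # A literal empty dict is typed Dict[Any, Any] at runtime; it has no
    # elements that could contradict the declared Dict[K, V], so accept it.
    if not value:
        return True
    return all(isinstance(k, key_type) and isinstance(v, value_type)
               for k, v in value.items())
```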


Test Plan:
```
MODEL_ENTITY_ID=691508446
SNAPSHOT_ID=0
OTHER_MODEL_ENTITY_ID=649645886
OTHER_SNAPSHOT_ID=0
MODULE=local

buck2 run mode/opt caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- \
    --loadMode=BenchmarkAB \
    --inputNetFile=/data/users/${USER}/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}${suffix} \
    --otherNetFile=/data/users/${USER}/models/${OTHER_MODEL_ENTITY_ID}/${OTHER_SNAPSHOT_ID}/${OTHER_MODEL_ENTITY_ID}_${OTHER_SNAPSHOT_ID}${suffix} \
    --moduleName=${module} \
    --submodToDevice "" \
    --benchmarkDontRebatchSamples=true \
    --sampleInputFilePath=/data/users/${USER}/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/archive_.predictor.disagg.gpu.local/data/sample_inputs/local.pt
```

Reviewed By: yjhao

Differential Revision: D69871393

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147480
Approved by: https://github.com/henryoier, https://github.com/jeanschmidt
2025-02-21 07:00:46 +00:00
c74b59fc1f [ROCm][TunableOp] resolve the rocBLAS version dynamically (#147363)
Dynamically gets rocBLAS version instead of relying on some preprocessing-time definitions which may be stale.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147363
Approved by: https://github.com/pruthvistony, https://github.com/naromero77amd, https://github.com/jeffdaily
2025-02-21 06:50:21 +00:00
86ae672b6a Use has_triton_package in _inductor.runtime.hints (#147442)
Use existing method for triton check

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147442
Approved by: https://github.com/Skylion007
2025-02-21 05:52:00 +00:00
533b884870 [cuDNN][SDPA][Nested Tensor] Experimental cuDNN Nested Tensor SDPA Support (forward only) (#141178)
Disabled by default for now behind `TORCH_CUDNN_SDPA_NESTED_TENSOR_ENABLED=1`

Just wanted to get this out before starting a series of SDPA cleanup PRs---the biggest thing is we don't need the boilerplate around all of the `build_graph_and_tensors*` functions anymore as we can now use the `UID`-style referencing of tensor nodes as was done for the Conv-V8 API backend.

CC @drisspg

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141178
Approved by: https://github.com/jbschlosser
2025-02-21 05:22:19 +00:00
a2c3a2c5c4 Support serialization for uintx/intx in weights_only (#147500)
Summary:
Fixing the issue reported by huggingface

Test Plan:
python test/test_serialization.py -k test_serialization_uintx_intx

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147500
Approved by: https://github.com/mikaylagawarecki
2025-02-21 04:38:44 +00:00
c615b8c174 Build a storage reader/writer to write checkpoints in HF format (#146352)
Summary: As the title says, we want to write checkpoints in HF format with DCP; this diff enables that for the non-distributed use case.

Test Plan:
buck2 test 'fbcode//mode/dev-nosan' fbcode//caffe2/test/distributed/checkpoint:test_hf_torchtune_storage

N6476188 --> able to save and load tensor in hf format

Differential Revision: D68444967

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146352
Approved by: https://github.com/saumishr
2025-02-21 03:31:21 +00:00
fe100c3c5b Add libtorch nightly build for CUDA 12.8 (#146265)
Try removing sm50 and sm60 to shrink binary size, and resolve the ld --relink error

"Architecture support for Maxwell, Pascal, and Volta is considered feature-complete and will be frozen in an upcoming release." from 12.8 release note.

Also updating the runner for cuda 12.8 test to g4dn (T4, sm75) due to the drop in sm50/60 support.

https://github.com/pytorch/pytorch/issues/145570

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146265
Approved by: https://github.com/atalman
2025-02-21 03:04:06 +00:00
ba214ab56c TCPStore: soft fail bind when agent store active (#147465)
This makes it easier to roll out `TORCHELASTIC_USE_AGENT_STORE` by opportunistically swallowing bind errors when the agent store is enabled and the port matches `MASTER_PORT`.

This should be very safe: if the store is somehow not up and the envs are set, the TCPStore client connections will fail to connect, so we end up with a slightly different error message but identical success/failure behavior.

This also pybinds `c10d::SocketError` into Python so we can assert on the error type in tests.

https://docs.google.com/document/d/1CzOn_N53AiFxWGgbyMWSnd2elCJd4lZ-ajPg2lzcxoM/edit?tab=t.0#heading=h.2j2f5dimrdau

Test plan:

```
pytest test/distributed/test_store.py
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147465
Approved by: https://github.com/fduwjj
2025-02-21 03:02:26 +00:00
8a5265cb37 [Intel GPU] qlinear_pointwise.binary[_tensor] XPU support (#135337)
# Motivation
This PR intends to enable the quantized fusion `qlinear+add` on the Intel GPU backend.

At the backend level, we register the op via the schemas `TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary")` and `TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor")`, which are the ones already defined in `x86InductorQuantizer`.

At the Inductor level, we have a small modification at `torch/_inductor/fx_passes/quantization.py` to allow the signed int8 data type (s8) during op lowering. As for the pattern matching, we greatly reuse the existing code in x86InductorQuantizer.

# UT verification
```bash
python test/inductor/test_mkldnn_pattern_matcher.py -v \
    -k test_qlinear_add_xpu
```

# Runtime Verification
```bash
onednn_verbose,primitive,exec,gpu:0,matmul,jit:gemm:any,undef,src_s8::blocked:ab::f0 wei_s8::blocked:ab::f0 bia_f32::blocked:ab::f0_mask2 dst_f32::blocked:ab::f0,attr-scratchpad:user attr-scales:src0:0:f32+dst:0:f32+wei:2:f32 attr-zero-points:src0:0:s32 attr-post-ops:eltwise_linear:1:0.654408+sum:0.00511256+eltwise_relu,,4x4:4x4,0.0319824
```
The verbose log is collected from the UT. We can see the attribute `attr-post-ops:eltwise_linear:1:0.654408+sum:0.00511256+eltwise_relu`: the post-op add and ReLU are successfully fused into the GEMM computation.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135337
Approved by: https://github.com/EikanWang, https://github.com/guangyey, https://github.com/liangan1, https://github.com/jerryzh168
ghstack dependencies: #133307, #135189

Co-authored-by: guangyey <guangye.yu@intel.com>
2025-02-21 02:09:28 +00:00
8b818ab58f Use float data type for Half sum in fallback implementation of batchnorm backward on CPU (#147353)
Fixes #147303.
Use float data type for Half sum in fallback implementation of batchnorm backward on CPU as the representation range of Half is small.
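A minimal illustration of the precision issue (not the kernel itself): a running Half sum stalls at 2048, where Half's spacing between representable values exceeds 1, while a float32 accumulator stays exact.

```python
import torch

acc_half = torch.zeros((), dtype=torch.half)
acc_float = torch.zeros((), dtype=torch.float32)
for v in torch.ones(4096, dtype=torch.half):
    acc_half += v           # rounds back to 2048.0 once the sum reaches 2048
    acc_float += v.float()  # exact

print(acc_half.item(), acc_float.item())  # 2048.0 4096.0
```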

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147353
Approved by: https://github.com/leslie-fang-intel, https://github.com/cpuhrsch
2025-02-21 01:33:33 +00:00
ac88a6c00d [fx] demote node prepend to self log from warning to debug (#147538)
FIXES https://github.com/pytorch/pytorch/issues/147175

This is harmless, not sure why this is a user warning. Writing reordering graph passes is more concise when we ignore this warning.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147538
Approved by: https://github.com/yanboliang
2025-02-21 01:32:34 +00:00
4b35139a46 [ROCm][TunableOp] Fix TunableOp warmup environment variable. (#147412)
This PR corrects the behavior of the TunableOp warmup variables:
```
PYTORCH_TUNABLEOP_MAX_WARMUP_DURATION_MS
PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS
```

See the updated comments which describe how the environment variables are intended to work. Previously, if you only set one of the two environment variables the warmup iters would always be zero.

Manually tested the four possible combinations to make sure things still behave as intended.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147412
Approved by: https://github.com/jeffdaily
2025-02-21 00:29:58 +00:00
fdb1305ace reland "[sigmoid] Test OSS model runner with test_export.py" (#147535)
Summary: There are ~260 tests covering all the corner cases of export in test_export.py; utilize them to test sigmoid in the OSS setting.

Test Plan: buck test mode/opt caffe2/test:test_export -- -r _sigmoid

Differential Revision: D69937387

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147535
Approved by: https://github.com/yiming0416
2025-02-20 23:45:13 +00:00
87e6e2924e Increase memory for linux binary builds (#147542)
Recently I detected that some linux manywheels builds are flaky ([ex](https://github.com/pytorch/pytorch/actions/runs/13438309056/job/37555475510)).

After investigating the runner logs, available disk space, network usage, and CPU load, I could not find any issues. Unfortunately, memory information is not available.

But given the symptoms, the likelihood of this being an OOM problem is high.

So, moving those build jobs from a `linux.12xlarge.ephemeral` to `linux.24xlarge.ephemeral`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147542
Approved by: https://github.com/ZainRizvi, https://github.com/atalman
2025-02-20 23:02:45 +00:00
be0df96b50 Fix c++ implementation of strip_function_call (#147436)
#143063 was missing handling for a couple of UCS cases and had some bugs in the way it dealt with errors.

- Fix all the UCS handling (and make some of the common code more common)
- Make sure all the error paths return `nullptr`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147436
Approved by: https://github.com/jansel
2025-02-20 20:41:21 +00:00
af31640391 [cutlass backend] enable mixed mm test (cutlass2x) for H100 (#147474)
I am okay with not landing this as well. The motivation is to make developing on H100 smoother.

The reason the current test works on A100 but not H100 is an alignment issue, which was caused by arch-specific filtering logic.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147474
Approved by: https://github.com/alexsamardzic, https://github.com/ColinPeppler
2025-02-20 20:28:44 +00:00
d068141c3b [cutlass backend] add subproc tests (#147173)
I want to separate subproc autotuning from the main tests. And I observed that for addmm, it can work without subproc.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147173
Approved by: https://github.com/ColinPeppler
ghstack dependencies: #147169
2025-02-20 20:07:42 +00:00
2565951f8a [cutlass backend] remove triton from most tests and add an integration test (#147169)
Removing aten and triton from the list of backends for the tests that have it. Instead, add a small integration test to make sure autotuning works fine.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147169
Approved by: https://github.com/ColinPeppler
2025-02-20 20:07:42 +00:00
fb1f7f6a09 [codemod] Fix unused-value issue in caffe2/aten/src/ATen/native/miopen/Conv_miopen.cpp +1 (#147496)
Summary:
LLVM has a warning `-Wunused-value` which we treat as an error because it's so often diagnostic of a code issue. Unused values often indicate a programming mistake, but can also just be unnecessary cruft that harms readability and performance.

For questions/comments, contact r-barnes.

 - If you approve of this diff, please use the "Accept & Ship" button :-)

Test Plan: Sandcastle

Differential Revision: D69755123

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147496
Approved by: https://github.com/Skylion007
2025-02-20 19:00:38 +00:00
6971b77510 [CPU Stream] Add noop for CPU stream record_event() and wait_event() (#145935)
Summary: Adds wait_event and record_event endpoints to CPU stream in order to facilitate device-agnostic code. Both methods are noops.

Test Plan: CI

Differential Revision: D68833927

Pull Request resolved: https://github.com/pytorch/pytorch/pull/145935
Approved by: https://github.com/Skylion007
2025-02-20 18:50:55 +00:00
863ac20659 [CI] Do not overwrite return code of test file when fails for rerun disabled tests (#147484)
Do not overwrite the return code of a single file when it fails. This will allow the log to be printed to stdout and to the GHA logs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147484
Approved by: https://github.com/ZainRizvi
2025-02-20 17:51:58 +00:00
83bb921a5a [ROCm] Update meta_registration for efficient attention (#146979)
Fixes a series of failing and skipped unit tests.

For NVIDIA hardware, the logsumexp last dimension is required to be a multiple of 32. This is not the case for ROCm.

A related issue: https://github.com/pytorch/pytorch/issues/146848

The unit tests in question:
```bash
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_prev_13_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_prev_14_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_prev_15_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_rewriter_11_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_rewriter_14_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_rewriter_15_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_rewriter_17_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_rewriter_1_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_rewriter_1_freezing
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_rewriter_2_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_rewriter_3_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_rewriter_4_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaDynamicTests	test_sdpa_rewriter_6_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_prev_13_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_prev_14_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_prev_15_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_rewriter_11_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_rewriter_14_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_rewriter_15_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_rewriter_17_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_rewriter_1_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_rewriter_1_freezing
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_rewriter_2_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_rewriter_3_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_rewriter_4_cuda
inductor.test_fused_attention	SDPAPatternRewriterCudaTests	test_sdpa_rewriter_6_cuda
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146979
Approved by: https://github.com/shunting314
2025-02-20 15:05:13 +00:00
382fbcc1e4 add the torch.float8_e8m0fnu dtype to PyTorch (#147466)
Summary:

Continuing the work from https://github.com/pytorch/pytorch/pull/146427

Adds the `torch.float8_e8m0fnu` dtype to PyTorch, as detailed in
https://github.com/pytorch/pytorch/issues/146414 . Please see the issue for a detailed definition of the format.  Example of basic functionality:

```python
import torch

# round trip
x0 = torch.randn(4, 4, dtype=torch.float32)
x1 = x0.to(torch.float8_e8m0fnu)  # RNE rounding
x2 = x1.to(torch.float32)  # 2 ** exponent

# creation with empty
x0 = torch.empty(4, 4, dtype=torch.float8_e8m0fnu)

# printing
print(x0)
```

Done in this PR:
* numerical correctness
* op coverage (except for `torch._scaled_mm`): create tensor, cast to/from float32
* printing a tensor works

For future PRs:
* performance optimizations for casting
* torch._scaled_mm
* PT2
* various cleanups (detailed in comments with issue numbers)

Test Plan:

```
pytest test/quantization/core/experimental/test_float8.py -s
```


Pull Request resolved: https://github.com/pytorch/pytorch/pull/147466
Approved by: https://github.com/drisspg
2025-02-20 13:55:42 +00:00
574371d828 Add current cuda device index to FXGraphCache key (#147464)
This PR intends to fix the cache related issues from https://github.com/pytorch/pytorch/issues/147405.
It does *not* handle the dynamo recompile case in process, because it does not introduce any extra guards. For FXGraphCache and AOTAutogradCache, we simply have to have the device context in the cache key.

Note that for any function that accepts tensor inputs, the device context is naturally already included in the cache key by the metadata of example inputs. However, for functions that return constants or have no arguments, the device context still needs to be in the cache key.
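A hypothetical repro of the failure mode this guards against (illustrative only; requires two GPUs):

```python
import torch

@torch.compile
def make_const():
    # No tensor inputs, so example-input metadata cannot carry the device;
    # the current CUDA device therefore has to be part of the cache key.
    return torch.ones(4, device="cuda")

torch.cuda.set_device(0)
a = make_const()  # compiled and cached while device 0 is current
torch.cuda.set_device(1)
b = make_const()  # without the device index in the key, this could reuse the
                  # artifact built for device 0
```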

A more robust fix for this would be to have inductor generate device guards that are dynamic, instead of specialized. This would also help us share more cache artifacts.

I've added unit tests for FXGraphCache and AOTAutogradCache, both of which would fail without this change.

Differential Revision: [D69875939](https://our.internmc.facebook.com/intern/diff/D69875939)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147464
Approved by: https://github.com/bdhirsh, https://github.com/anijain2305
2025-02-20 12:38:21 +00:00
ead970c8d0 Revert "Add ciflow/riscv64 label"
This reverts commit 5116b27792d37c38039459c922a466581e219fc2.
(I've pushed to the wrong branch by accident)
2025-02-20 11:55:52 +01:00
5116b27792 Add ciflow/riscv64 label 2025-02-20 11:09:06 +01:00
6beba8dcce Optimize graph.py typing (#147099)
Optimize the type annotations of `graph.py` methods.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147099
Approved by: https://github.com/cyyever, https://github.com/aorenste
2025-02-20 09:32:30 +00:00
f9b8121350 Make Inductor scheduler aware of _scaled_mm (#146992)
This is used, for example, to estimate runtime when doing comms overlap.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146992
Approved by: https://github.com/drisspg, https://github.com/eellison, https://github.com/shunting314
2025-02-20 09:02:31 +00:00
9da250aada type fully_shard so that the return value can be chained with typing enabled (#147489)
This allows for

```
fsdped = fully_shard(model)
fsdped.set_xyz()
```

same applies if `model` is actually a list of modules

Differential Revision: [D69888119](https://our.internmc.facebook.com/intern/diff/D69888119)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147489
Approved by: https://github.com/Skylion007
ghstack dependencies: #147488
2025-02-20 08:43:16 +00:00
6a72aaadae Fix torch.max optional args dim, keepdim description (#147177)
[`torch.max`](https://pytorch.org/docs/stable/generated/torch.max.html#torch.max) optional args `dim`, `keepdim` are not described in the document, but users can omit them.

```python
>>> import torch
>>> a = torch.randn(3,1,3)
>>> a.max()
tensor(1.9145)
>>> a.max(dim=1)
torch.return_types.max(
values=tensor([[ 1.1436, -0.0728,  1.3312],
        [-0.4049,  0.1792, -1.2247],
        [ 0.8767, -0.7888,  1.9145]]),
indices=tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]))

```

## Changes

- Add `optional` description for `dim`, `keepdim`
- Add example of using `dim`, `keepdim`

## Test Result

### Before

![image](https://github.com/user-attachments/assets/3391bc45-b636-4e64-9406-04d80af0c087)

### After

![image](https://github.com/user-attachments/assets/1d70e282-409c-4573-b276-b8219fd6ef0a)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147177
Approved by: https://github.com/colesbury
2025-02-20 08:18:09 +00:00
452315c84f Fix RuntimeError: value cannot be converted to type int64_t without overflow (#147492)
The exact call is coming from here:

78a94c9114/torch/_inductor/memory.py (L161)

I have no idea why this error is being thrown and what mode/modes might be failing for this

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147492
Approved by: https://github.com/eellison
2025-02-20 08:00:26 +00:00
a000c7e6d2 Add hint message for pack_padded_sequence (#146747)
Fixes #144207

Add truncate hint message in docs [torch.nn.utils.rnn.pack_padded_sequence](https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pack_padded_sequence.html)

## Test Result

![image](https://github.com/user-attachments/assets/46258f36-f6c7-4f11-9213-8513e52a9001)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146747
Approved by: https://github.com/mikaylagawarecki
2025-02-20 06:27:07 +00:00
db4ce78d46 PEP585: More UP006 fixes (#146392)
This should be the final PR before we can enable RUFF UP006.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146392
Approved by: https://github.com/justinchuby, https://github.com/albanD, https://github.com/Skylion007
2025-02-20 06:18:13 +00:00
76ad19a549 [dynamo][codegen] Implement CSE for pre-graph graph-arg bytecode reconstruction (#147425)
This reduces fixed overhead seen in a few internal models.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147425
Approved by: https://github.com/jansel, https://github.com/StrongerXi
2025-02-20 05:42:52 +00:00
8f6b9403c1 [audio hash update] update the pinned audio hash (#147423)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147423
Approved by: https://github.com/pytorchbot
2025-02-20 05:39:46 +00:00
77aa602871 [torchbind] Differentiate ScriptModule and ScriptObject with qualified name (#147399)
Summary:
This PR adds a _is_script_object method to differentiate ScriptModule and ScriptObject; the former inherits from ScriptObject in C++, so they both pass the isinstance(obj, torch.ScriptObject) check.

The qualified name of a ScriptObject (i.e., a custom class) starts with "__torch__.torch.classes"; this has been a widely used assumption for dealing with custom classes across our code base.

Test Plan: Add new test.

Differential Revision: D69685316

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147399
Approved by: https://github.com/yushangdi
2025-02-20 04:57:57 +00:00
7185ca8348 [Cutlass] Add test verifying number of precompiles (#147477)
As title

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147477
Approved by: https://github.com/henrylhtsang
2025-02-20 04:47:57 +00:00
5f5b44f6bf [ROCm] Update inductor-periodic.yml to use the correct label (#147473)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147473
Approved by: https://github.com/jeffdaily
2025-02-20 04:44:18 +00:00
0d56b7e665 Support size oblivious max equation (#147344)
Addresses https://github.com/pytorch/pytorch/issues/125914 by detecting when we have a sym_max between {0, 1} and a summation of size-like unbacked symints.

The basic idea is max(1, u0 + u1) can be simplified to u0 + u1 if both u0 and u1 are size-like since their value ranges are [2, inf].

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147344
Approved by: https://github.com/angelayi
2025-02-20 04:33:19 +00:00
0b0da81021 Support static method of torchbind attributes in torch.compile with inductor backend (#146927)
As title.

Many changes adapted from https://github.com/pytorch/pytorch/pull/129537.

Also, this diff only covers *static* methods of torchbind *attributes*. Some cases that are not supported/tested:
- dynamic torchbind objects
- torchbind objects as inputs to the module.

Note that in JIT Inductor, the attributes are lifted as inputs. So even if we just have torchbind objects as attributes, they will show up as inputs in the graph.

Example generated python code in torch.compile with inductor backend for the test case in `inductor/test_torchbind.py` (P1730554370):

```python
async_compile.wait(globals())
del async_compile

def call(args):
    arg1_1, arg2_1, arg3_1 = args
    args.clear()
    assert_size_stride(arg1_1, (2, 3), (3, 1))
    assert_size_stride(arg2_1, (2, 3), (3, 1))
    buf2 = empty_strided_cpu((2, 3), (3, 1), torch.float32)
    cpp_fused_add_0(arg1_1, arg2_1, buf2)
    del arg1_1
    del arg2_1
    # Topologically Sorted Source Nodes: [x, takes_foo_tuple_return], Original ATen: [aten.add]
    buf3 = torch.ops._TorchScriptTesting.takes_foo_tuple_return.default(arg3_1, buf2)
    buf4 = buf3[0]
    assert_size_stride(buf4, (2, 3), (3, 1))
    buf5 = buf3[1]
    assert_size_stride(buf5, (2, 3), (3, 1))
    buf6 = buf4; del buf4  # reuse
    cpp_fused_add_1(buf6, buf5)
    del buf5
    # Topologically Sorted Source Nodes: [y, b], Original ATen: [aten.add]
    buf7 = torch.ops._TorchScriptTesting.takes_foo.default(arg3_1, buf6)
    del buf3
    del buf6
    buf8 = buf7
    assert_size_stride(buf8, (2, 3), (3, 1))
    # Topologically Sorted Source Nodes: [c], Original ATen: []
    buf9 = torch.ops.higher_order.call_torchbind(arg3_1, 'add_tensor', buf2)
    del arg3_1
    del buf7
    buf10 = buf9
    assert_size_stride(buf10, (2, 3), (3, 1))
    del buf9
    buf11 = buf2; del buf2  # reuse
    cpp_fused_add_2(buf11, buf8, buf10)
    return (buf11, )

def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg1_1 = rand_strided((2, 3), (3, 1), device='cpu', dtype=torch.float32)
    arg2_1 = rand_strided((2, 3), (3, 1), device='cpu', dtype=torch.float32)
    import pickle
    global arg3_1
    arg3_1 = pickle.loads(b'\x80\x04\x95[\x00\x00\x00\x00\x00\x00\x00\x8c\x05torch\x94\x8c\x0cScriptObject\x94\x93\x94)\x81\x94]\x94(K\nK\x14e\x8c0__torch__.torch.classes._TorchScriptTesting._Foo\x94\x86\x94b.')
    fn = lambda: call([arg1_1, arg2_1, arg3_1])
    return print_performance(fn, times=times, repeat=repeat)

if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('None', benchmark_compiled_module)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146927
Approved by: https://github.com/angelayi
2025-02-20 03:33:19 +00:00
de1cb0f351 capture the return value in the contract typing (#147488)
----

* the existing typing makes the return type `Optional[nn.Module]`
* this doesn't seem to be what the decorator actually does as it does
  not alter the original return type
* This PR aims to fix the typing (a generic sketch of the pattern follows)
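A generic sketch of the pattern (hypothetical names, not the actual `contract` implementation): type the decorator so that the wrapped function's return type is preserved instead of being widened to `Optional[nn.Module]`.

```python
from typing import Callable, TypeVar

import torch.nn as nn

T = TypeVar("T", bound=nn.Module)

def contract_like(func: Callable[[T], T]) -> Callable[[T], T]:
    # The wrapper keeps the wrapped function's return type T.
    def wrapper(module: T) -> T:
        return func(module)
    return wrapper
```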

Differential Revision: [D69888120](https://our.internmc.facebook.com/intern/diff/D69888120)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147488
Approved by: https://github.com/Skylion007
2025-02-20 03:32:34 +00:00
fea718f062 [BaseHOP] change hop(subgraph, operands) to hop(subgraph, *operands) (#146730)
Our three main users are OK with this, with two of them (foreach_map,
invoke_quant) preferring it like this.

I was originally worried about BC issues (this now means you cannot add
any positional args) but I think that's not a concern -- one can always
add kwonly args.

Test Plan
- tests

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146730
Approved by: https://github.com/ydwu4, https://github.com/mlazos
2025-02-20 02:30:36 +00:00
f79b352f5a [Intel GPU] qconv_pointwise.binary XPU support (#135189)
# Motivation
This PR intends to enable the quantized fusions `qconv+add` and `qconv+add+relu` on the Intel GPU backend.

At the backend level, we register the op via the schema `TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary")`, which is the one already defined in `x86InductorQuantizer`.

At the Inductor level, we have a small modification at `torch/_inductor/fx_passes/quantization.py` to allow the signed int8 data type (s8) during op lowering. As for the pattern matching, we greatly reuse the existing code in x86InductorQuantizer.

# UT verification
```bash
python test/inductor/test_mkldnn_pattern_matcher.py -v \
   -k test_qconv2d_add_xpu \
   -k test_qconv2d_add_relu_xpu 2>&1
```

# Runtime exemplification
Following is the oneDNN verbose log collected from the UT:
```bash
onednn_verbose,primitive,exec,gpu:0,convolution,jit:ir,forward_training,src_s8::blocked:acdb::f0 wei_s8::blocked:abcd::f0 bia_f32::blocked:a::f0 dst_s8::blocked:acdb::f0,attr-scratchpad:user attr-scales:src0:0:f32+dst:0:f32+wei:1:f32 attr-zero-points:src0:0:s32+dst:0:s32 attr-post-ops:eltwise_linear:1:0.337704+sum:0.0241217+eltwise_relu,alg:convolution_direct,mb1_ic3oc6_ih8oh6kh3sh1dh0ph0_iw8ow6kw3sw1dw0pw0,0.151123
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135189
Approved by: https://github.com/liangan1, https://github.com/EikanWang, https://github.com/guangyey, https://github.com/jerryzh168
ghstack dependencies: #133307

Co-authored-by: guangyey <guangye.yu@intel.com>
2025-02-20 02:02:54 +00:00
93316cfe94 Move ir_pre_fusion.txt and ir_post_fusion.txt to TORCH_LOGS (#147248)
Fixes #147002

Moves ir_{pre, post}_fusion.txt to be controlled by TORCH_LOGS instead of TORCH_COMPILE_DEBUG.
Updated tests of these logs as well.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147248
Approved by: https://github.com/eellison
2025-02-20 00:26:17 +00:00
16e202a38e [dynamo] improved graph break messages for some common graph break sites [1/N] (#146525)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/146525
Approved by: https://github.com/jansel
2025-02-20 00:08:13 +00:00
1e94c7aaa4 [draft_export] only clear pending unbacked symbols for overwritten kernels (#147427)
This was wrong; we were doing this in all cases.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147427
Approved by: https://github.com/angelayi
2025-02-20 00:07:54 +00:00
3986c3e4a6 [reland][cutlass backend] Do not change dtype of GEMM template for cutlass 3x (#147434)
Reland of https://github.com/pytorch/pytorch/pull/146877
incorporate forward fix (didn't land): https://github.com/pytorch/pytorch/pull/147185

Summary:
I think this is a change in the right direction.

Right now, when we try to find a cutlass gemm, we generate a bunch of gemm templates and filter out those that don't fit. For example, if we are doing a bf16 x bf16 matmul, the gemm template for fp32 x fp32 is generated and filtered out.

However, for the dtype of bias, we would attempt to modify the dtype of the gemm template. I think this is a bad idea, since (1) the usable template is also being generated, and (2) this messes with the configuration name of the template.

I tested this offline. There isn't much difference in performance. However, with instantiation level 2222, I noticed way less "C++ compile error". This is probably due to using the right template?

Follow-ups are needed:
1. benchmark and dashboard
2. check our logic for setting alignment

with my change
https://www.internalfb.com/intern/paste/P1729604119/

without my change
https://www.internalfb.com/intern/paste/P1729624806/

Differential Revision: D69825865

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147434
Approved by: https://github.com/ColinPeppler
2025-02-20 00:07:07 +00:00
a88d7d4268 [util] fetch logical count cpu (#147413)
To match with Vcpu count with aws:

after (96), before (48)
Instance Ref: https://instances.vantage.sh/aws/ec2/g4dn.metal
before: https://hud.pytorch.org/utilization/13377376406/37360984234/1
after: https://hud.pytorch.org/utilization/13401543806/37435031356/1
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147413
Approved by: https://github.com/clee2000
2025-02-19 23:44:54 +00:00
004d65aeb0 Add type hints to cuda kernel (#147471)
Missed this in a previous PR

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147471
Approved by: https://github.com/eellison
2025-02-19 23:35:10 +00:00
48203bec63 [BE] remove sysconfig.get_config_var("LIBDIR") from cuda lib paths (#147409)
Summary: I think the path is not needed anymore. It was added in https://github.com/pytorch/pytorch/pull/126408, but  it has been a while since then. See if CI complains.

Differential Revision: D69573185

See also  https://github.com/pytorch/pytorch/pull/147158

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147409
Approved by: https://github.com/chenyang78
2025-02-19 23:04:22 +00:00
f63db6255f Re-land exclude upsample_bilinear2d.vec and nearest2d.vec from default export decomposition table (#147153)
Note: This is a re-land of https://github.com/pytorch/pytorch/pull/141791, which I reverted due to breaking some Meta-internal tests - an internal ET delegate did not handle the non-decomposed upsample_nearest2d, and it was not caught in CI. I've resolved that issue and should be ready to safely re-land.

Summary:
As upsample_bilinear2d.vec and upsample_nearest2d.vec are core ATen ops, they should not be decomposed by default in the export path. Because the operators have CompositeImplicitAutograd dispatch, their decomposition is registered by default. This change adds an override list for CIA decompositions being registered in the default decomp table.

In the long-term, we likely will want to exclude decompositions for all core-tagged CIA ops, but this will require all consumers to be ready to handle the remaining two ops, avg_pool1d, and adaptive_avg_pool1d. Until they are ready, I believe an explicit override list is the safest option.

Additionally, I've also removed the ExecuTorch XNNPACK delegate ConvertToUpsampleBilinear2d pass, as the pass breaks (and is not needed), given that the op is not decomposed. The purpose of this pass was originally to pattern match the decomposition and recompose it, but this is no longer necessary.

Test Plan:
Added a new test (`test_default_decomposition_core_cia_ops`) in test_export.py to verify that upsample_bilinear2d.vec (and in the future, other core-tagged CIA ops) are not decomposed by default. Also, I manually validated end to end with ExecuTorch that the op is not decomposed in to_edge (see N6238522).

```
buck test //caffe2/test:test_export -- test_default_decomposition_core_cia_ops
```

Differential Revision: D69625112

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147153
Approved by: https://github.com/manuelcandales
2025-02-19 23:03:29 +00:00
fb55bac3de [fr][fix] Split MatchState and dynamic info for fr analysis downstream (#147439)
The original MatchState type was declared as a Python Enum. Although we did make it callable, we consume it right away. There are downstream cases where we need it to be a Python class, which is not supported by a Python enum. So we did a small refactoring so that we keep both the enum state and the dynamic info (culprit) for the fr analysis script.
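A minimal sketch of the shape of such a refactor (hypothetical member names, not the actual flight-recorder code): keep an enum for the static state and wrap it in a small class that also carries the dynamic culprit info.

```python
from enum import Enum

class MatchState(Enum):
    FULLY_MATCHED = 1
    SIZE_OR_SYNTAX_MISMATCH = 2

class MatchInfo:
    """Pairs a static MatchState with per-instance dynamic info (the culprit)."""
    def __init__(self, state: MatchState, culprit: str = "") -> None:
        self.state = state
        self.culprit = culprit

    def __str__(self) -> str:
        return f"{self.state.name}: {self.culprit}" if self.culprit else self.state.name
```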

Differential Revision: [D69830994](https://our.internmc.facebook.com/intern/diff/D69830994)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147439
Approved by: https://github.com/fegin
2025-02-19 22:09:16 +00:00
41ae15faa3 [ONNX] Add scaffolding for onnx decomp and logic for op tests (#147392)
Create scaffold for onnx op test data and common logic. This PR creates the scaffolding for new onnx decomp functions described in https://github.com/pytorch/pytorch/issues/139301. It adds two ops: abs and add, and enables the related tests.

https://github.com/pytorch/pytorch/issues/139301
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147392
Approved by: https://github.com/titaiwangms
ghstack dependencies: #147396
2025-02-19 21:55:12 +00:00
24738768a8 more dist ops in non strict (#147417)
Summary: Previously we added support for `all_reduce` to non strict. This PR extends this support to other non-functional collectives that are remapped in Dynamo: `all_gather`, `all_gather_into_tensor`, `all_to_all_single`, `reduce_scatter_tensor`.

Test Plan: added unit tests

Differential Revision: D69813991

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147417
Approved by: https://github.com/angelayi
2025-02-19 21:29:16 +00:00
394676759d ci: Add h100 nightly perf testing (#146868)
This infrastructure has been up for a while so add a workflow to actually run things on it.

> [!IMPORTANT]
> We only have **14** linux.aws.h100 runners so it might be beneficial for us to actually pare this list down.
> Will leave it up to the compiler team to comment on this PR on which tests are actually important vs. what is not.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146868
Approved by: https://github.com/eellison, https://github.com/huydhn

Co-authored-by: Huy Do <huydhn@gmail.com>
2025-02-19 21:13:17 +00:00
8bea08e5bc [BE] Fix tensor stub (#147384)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147384
Approved by: https://github.com/albanD, https://github.com/janeyx99, https://github.com/atalman
2025-02-19 19:47:03 +00:00
e758d8b4d1 [Inductor][Triton] Rework casting logic to avoid illegal bitcast (#147395)
Triton introduced checks for bitcasts where the casted value does not fit into the casted type (e.g. https://github.com/triton-lang/triton/pull/5926, though in this instance I think the issue is related to the type for the broadcast). Some routines in Inductor now perform illegal bitcasts. I reworked the compare and swap w/ index routine used in sort to remove the illegal bitcast (~~I left the bitcast for now, but I think it could probably be removed assuming the reshape does not change the type~~). The explicit cast is correct, and I don't think there are performance issues, but because the cast on the sum is not a bitcast I suppose there could be.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147395
Approved by: https://github.com/eellison
2025-02-19 19:45:01 +00:00
279c7f262e [ONNX] Refactor dispatcher and registry (#147396)
This PR sets up the registry to accept onnx decomp functions to be moved into PyTorch (https://github.com/pytorch/pytorch/issues/139301).

The ops from onnx script are currently appended to the registry. When the ops are moved into PyTorch, the moved ops takes precedence because they appear first in the registry list.

After the migration, hooks for loading ops from onnx script will be removed.

1. Use a private field `_pt_onnx_signature` to store function signatures to avoid conflicts
2. Update the registry to record the signature in OnnxDecompMeta and update the dispatcher to leverage the data structure
3. Update the registry to prepare for onnx op registration, and update the onnx_impl decorator to support a no_compile option

Signed-off-by: Justin Chu <justinchuby@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147396
Approved by: https://github.com/titaiwangms
2025-02-19 19:38:28 +00:00
4f3c070b25 [inductor] GraphLowering code movement (#147335)
Moved these methods under `__init__` to be more idiomatic.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147335
Approved by: https://github.com/eellison
ghstack dependencies: #147331
2025-02-19 19:32:30 +00:00
5a3a50c791 Update Arm Compute Library (ACL) to v25.02 (#147454)
Among many things, this version of ACL fixes the redundant-declaration warning that we're blocked on in #145942, #146620, and #147337, and introduces better scheduling heuristics for GEMMs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147454
Approved by: https://github.com/malfet
2025-02-19 18:51:08 +00:00
9fee408daa [caffe2] disable warning for unused arguments (#147411)
Summary: Disable warnings on unused command line arguments for ukernels_asm.

Test Plan:
On top of D69602077:
```
$ buck2 build --flagfile fbsource//xplat/mode/arstudio/auto.py fbsource//xplat/caffe2/aten/src/ATen/native/quantized/cpu/qnnpack:ukernels_asmAppleMac
```

Differential Revision: D69807977

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147411
Approved by: https://github.com/kimishpatel
2025-02-19 17:54:31 +00:00
5220d402b5 [ROCm] TopK optimizations for AMD GPUs (#146387)
TopK on ROCm performs better across the test suite with the default config.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146387
Approved by: https://github.com/malfet, https://github.com/ngimel
2025-02-19 17:10:59 +00:00
e6c86952c6 Add CUDA 12.8 windows nightly build (#147037)
https://github.com/pytorch/pytorch/issues/145570

The Windows AMI was deployed to prod today; this preps the Windows CUDA 12.8 build.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147037
Approved by: https://github.com/atalman
2025-02-19 16:59:32 +00:00
8cbf7d0d6e [Inductor UT][XPU] Skip fft_c2c case since it's not implemented on XPU. (#147351)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147351
Approved by: https://github.com/jansel
2025-02-19 16:03:03 +00:00
ed83b0b70b [ddp] decouple python reducer from compilation mode (#147123)
The current implementation reads as: we will only actually use the "python_reducer" config if the DDP forward is compiled. Otherwise, we silently fall back to the C++ reducer with no DDPOptimizer.
I'm changing this behavior to always use the Python reducer if the config is specified.
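
A hedged sketch of opting in, assuming `optimize_ddp` still accepts the "python_reducer" string value:

```python
import torch

# With this change, the Python reducer is used whenever the config is set,
# regardless of whether the DDP forward itself is compiled.
torch._dynamo.config.optimize_ddp = "python_reducer"
```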

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147123
Approved by: https://github.com/fegin
2025-02-19 15:51:40 +00:00
303ad1916f [FlexAttention] Fix weird generate stride call in flex decode (#147435)
# Summary
Seems like we had a redundant tuple unpack, which doesn't appear to be supported in newer Triton.

Fixes https://github.com/pytorch/pytorch/issues/147373

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147435
Approved by: https://github.com/BoyuanFeng
2025-02-19 12:12:27 +00:00
77dbd28535 [Cutlass] Restore search space for swizzle (#147224)
This restores the previous search space. Since swizzle is now a runtime parameter, there shouldn't be extra compile-time overhead from searching it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147224
Approved by: https://github.com/eellison
ghstack dependencies: #147222, #147223
2025-02-19 09:22:51 +00:00
e9b3ff0570 [Cutlass] Add support for runtime param choices, starting with swizzle (#147223)
This PR adds support for swizzle as a runtime parameter choice. Future runtime parameter choices can be added to the [get_runtime_arg_info](2d40f9fb52/torch/_inductor/codegen/cuda/cuda_template.py (L282)) list method and then possible choices can be [looped over similarly to swizzle](933f921b36/torch/_inductor/codegen/cuda/gemm_template.py (L532)). For precompile, we now filter choices by hash to only compile each distinct kernel source once.
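
A minimal sketch (not the Inductor code) of deduplicating kernel sources by hash so each distinct source is compiled only once:

```python
import hashlib

def dedup_kernel_sources(choices):
    """Keep one representative per distinct kernel source; illustrative only."""
    seen: dict[str, str] = {}
    for source in choices:
        digest = hashlib.sha256(source.encode()).hexdigest()
        seen.setdefault(digest, source)
    return list(seen.values())

# Several swizzle values that produce the same kernel source collapse to one compile.
sources = ["__global__ void gemm_kernel(...) {}"] * 3 + ["__global__ void other(...) {}"]
print(len(dedup_kernel_sources(sources)))  # -> 2
```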

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147223
Approved by: https://github.com/Chillee, https://github.com/eellison
ghstack dependencies: #147222
2025-02-19 09:22:51 +00:00
81eb2a78ad [Inductor] Add autotuning artifact logging (#147222)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147222
Approved by: https://github.com/henrylhtsang, https://github.com/eellison
2025-02-19 09:22:42 +00:00
655b061ef0 [inductor] Freeze runtime asserts after shape prop but before codegen (#147331)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147331
Approved by: https://github.com/eellison
2025-02-19 06:29:13 +00:00
454fbd5bbe realize stride symbols in estimate_runtime (#146752)
Unfortunately I could not create a local repro or unit test.
Fixes https://github.com/pytorch/pytorch/issues/146686

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146752
Approved by: https://github.com/bobrenjc93, https://github.com/bdhirsh
2025-02-19 06:02:49 +00:00
2c3680ce38 [apf] Fix input adapter (#147238)
Summary: Add support for inputs that no longer exist in `input_fields` but are not actually used by the original program. In this case, we just give them a dummy input based on the node's metadata.
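
A hypothetical sketch of producing such a dummy input from a node's recorded metadata; the helper name and the use of a real tensor as stand-in metadata are illustrative, not the actual adapter code:

```python
import torch

def make_dummy_input(node_meta_val):
    """Build a placeholder tensor from an FX node's 'val' metadata
    (a FakeTensor-like object carrying shape/dtype/device) when the
    original input no longer exists in `input_fields`."""
    return torch.zeros(node_meta_val.shape, dtype=node_meta_val.dtype,
                       device=node_meta_val.device)

# Illustration with a real tensor standing in for the recorded metadata.
recorded = torch.empty(2, 3, dtype=torch.float16)
print(make_dummy_input(recorded).shape)  # torch.Size([2, 3])
```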

Test Plan: Verified for S488841

Differential Revision: D69328093

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147238
Approved by: https://github.com/pianpwk
2025-02-19 04:49:58 +00:00
465930ee81 Revert "[ROCm] ROCm-specific gemm tuning parameters" (#147388)
Summary:
This diff reverts D69573225 / https://github.com/pytorch/pytorch/pull/143286

15% cold compile time regression, see https://fb.workplace.com/groups/1075192433118967/permalink/1608559059782299/

Test Plan: NA

Differential Revision: D69790102

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147388
Approved by: https://github.com/yanboliang
2025-02-19 04:47:35 +00:00
4ece056791 Nccl update to 2.25.1 for cuda 12.4-12.8 (#146073)
Should resolve: https://github.com/pytorch/pytorch/issues/144768
We use one common NCCL version for CUDA builds 12.4-12.8: ``NCCL_VERSION=v2.25.1-1``
For CUDA 11.8 we use the legacy ``NCCL_VERSION=v2.21.1-1``
We use a pinned version of NCCL rather than a submodule.
Move nccl location from ``third_party/nccl/nccl`` to ``third_party/nccl``

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146073
Approved by: https://github.com/Skylion007, https://github.com/malfet, https://github.com/kwen2501, https://github.com/fduwjj
2025-02-19 03:52:26 +00:00
bd370c138a fix pt2e block wise quantization unit test (#147406)
Differential Revision: D69806596

https://github.com/pytorch/pytorch/pull/146946 breaks the unit test, because the quant nodes are folded by default now.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147406
Approved by: https://github.com/andrewor14, https://github.com/jerryzh168
2025-02-19 02:40:27 +00:00
5006932cbc [cutlass backend] forward fix of standalone runner for fbcode (#147158)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147158
Approved by: https://github.com/chenyang78
2025-02-19 02:02:10 +00:00
f16d30137c [OSS] Update FileSystem methods to properly handle a string argument (#145751)
Summary: When testing, I tried to pass a string argument to the FileSystem class' methods, which is a valid input, but the cast() that converted the string to a path wasn't working as expected, causing all the methods to fail with a string arg. Instead of a cast, a proper constructor should be used.

Test Plan: N6475361; the methods no longer throw an error with a string arg as they did previously.

Differential Revision: D68713937

Pull Request resolved: https://github.com/pytorch/pytorch/pull/145751
Approved by: https://github.com/pradeepfn
2025-02-19 01:50:24 +00:00
953f7834cc [ONNX] Pick up missing types in dynamic shapes renaming (#147407)
Found in `_check_dynamic_shapes` that int and None are valid input types for dynamic_shapes.
This PR adds support for these two types and adds tests to keep the ONNX flatten logic in sync with the one in export.
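
A hedged example of the accepted entry types, assuming the dynamo-based `torch.onnx.export` path; `None` marks a static dimension and, per this PR, an int pinning a dimension to a fixed size is also accepted:

```python
import torch
from torch.export import Dim

class M(torch.nn.Module):
    def forward(self, x):
        return x + 1

x = torch.randn(2, 4)
onnx_program = torch.onnx.export(
    M(),
    (x,),
    dynamic_shapes={"x": {0: Dim("batch"), 1: None}},  # None keeps dim 1 static
    dynamo=True,
)
```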
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147407
Approved by: https://github.com/justinchuby
2025-02-19 01:49:53 +00:00
757d7f28d1 [CD] Increase timeout for windows binary builds (#147390)
Mitigates https://github.com/pytorch/pytorch/issues/147376

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147390
Approved by: https://github.com/huydhn, https://github.com/jeanschmidt, https://github.com/malfet
2025-02-19 01:15:04 +00:00
959d79f85f [ONNX] Move and improve error reproduction logic in test (#147391)
https://github.com/pytorch/pytorch/issues/139301

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147391
Approved by: https://github.com/titaiwangms
2025-02-19 00:00:11 +00:00
babb2dc2af Revert "Add torch._scaled_mm for CPU (#139975)"
This reverts commit 6f7e67c43c13b5675b4ff60cbaa71e5083a22481.

Reverted https://github.com/pytorch/pytorch/pull/139975 on behalf of https://github.com/wdvr due to failing inductor mkldnn_pattern_matcher_cpu tests ([comment](https://github.com/pytorch/pytorch/pull/139975#issuecomment-2667186865))
2025-02-18 23:58:31 +00:00
525ca80f53 add unbacked strict mode (#147333)
fixes #145775

This is the first step in introducing a "strict" mode where we don't silently specialize and don't silently graph break. At a high level, when we do mark_unbacked(..., strict=True), any time we specialize an unbacked symint we will explicitly error and tell the user their unbacked dimension was specialized to a single value.
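
A hedged sketch of the intended usage; the exact failure mode may differ, but under strict mode the specialization below should surface as an explicit error rather than silently pinning the dimension:

```python
import torch

x = torch.randn(8, 4)
# Mark dim 0 as unbacked; strict=True is the new flag added by this PR.
torch._dynamo.mark_unbacked(x, 0, strict=True)

@torch.compile(fullgraph=True)
def f(t):
    if t.size(0) == 8:   # data-dependent branch that specializes the unbacked dim
        return t + 1
    return t - 1

f(x)  # expected to raise loudly instead of silently specializing to 8
```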

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147333
Approved by: https://github.com/laithsakka
2025-02-18 23:33:55 +00:00
5d547d82e6 Add no_data_dependent_graph_break mode (#147342)
This adds a strict mode `TORCHDYNAMO_UNBACKED_STRICT` to prevent graph breaking when we guard on data dependent. This is a better UX for those who are actively trying to make their model more dynamic, but aren't close enough to full graph to use that flag directly.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147342
Approved by: https://github.com/laithsakka
2025-02-18 23:33:47 +00:00
bae049b439 Update addr doc (#146482)
Fixes https://github.com/pytorch/pytorch/issues/146399

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146482
Approved by: https://github.com/janeyx99
2025-02-18 23:25:38 +00:00
ca397d82a6 [Sigmoid] Fix issues with constant folding and fba_ops (#146948)
Summary:
There are 2 issues:

- `skip_folding_node_fn` isn't considered when propagating constant values. Given a skipped node with constant inputs, it outputs a constant, so its users can also output constants and then be included in the constant graph, yet the skipped node itself is not included when the constant graph is extracted. This is fixed by checking for skipped nodes when propagating constant values and making a skipped node output an unknown (non-constant) value so that its users cannot output constants.

- The `fba_linear` op can be included in the constant graph, but it is not implemented for CPU, so the constant graph cannot be executed. This is fixed by converting `fba_linear` to `aten.addmm`.

- A refactor to allow more fba_ops to be included in the constant graph (via mapping fba_ops to aten ops).

Reviewed By: StellarrZ

Differential Revision: D68716393

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146948
Approved by: https://github.com/zhxchen17
2025-02-18 23:17:47 +00:00
c9a15d980f [FSDP2] Simplify shard_placement_fn in test (#146847)
Summary: Found this while checking `shard_placement_fn` for Shampoo shard independent implementation.

Test Plan: OSS CI & tests

Differential Revision: D69412878

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146847
Approved by: https://github.com/awgu
2025-02-18 23:01:26 +00:00
c8433c2c6c [BE] correct docs for clock_rate to MHz, fixes #147098 (#147393)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147393
Approved by: https://github.com/andrewor14
2025-02-18 22:59:58 +00:00
a21a123fd5 Add fqn_modifier at loading_state_dict and unit test (#146557)
In a Fusion model, users might change the state_dict keys via a state_dict hook.
The load_state_dict APIs here won't call model.state_dict(), so the hooks won't be called to change the keys, causing a mismatch between FQNs and state_dict keys.

This PR suggests users declare how they change the state_dict key prefix (by default we look for an attribute named "fqn_modifiers").
During state_dict loading, we apply the prefix change when computing the FQN so the keys can be processed the same way as if they had gone through the state_dict hook.

For example:
There's a state_dict_hook:

```
def _state_dict_hook(self, destination, prefix, keep_vars):
    """Remove "embedding" from the original embedding in the state_dict
    name. This keeps the original state dict name for the embedding
    from before fusing with the FusionEmbedding.

    [!Note] This update changes the order of the OrderedDict
    """
    key = prefix + "embedding.weight"
    new_key = prefix + "weight"
    destination[new_key] = destination[key]
    del destination[key]
```

In DSD after this PR, we skip the "embedding." prefix before "weight" if we find the "fqn_modifiers" attribute on that module:
```
def fqn_modifiers(self) -> Dict[str, str]:
    return {
        "weight": "embedding",
    }
```
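
A hypothetical sketch of how a loader could apply such a mapping when resolving FQNs; the helper name and logic are illustrative, not the actual distributed checkpoint implementation:

```python
def apply_fqn_modifiers(module, fqn: str) -> str:
    """Drop the prefix recorded in a module's fqn_modifiers() mapping so the
    FQN matches the key produced by the state_dict hook."""
    modifiers = getattr(module, "fqn_modifiers", lambda: {})()
    for suffix, dropped_prefix in modifiers.items():
        hooked = f"{dropped_prefix}.{suffix}"
        if fqn.endswith(hooked):
            return fqn[: -len(hooked)] + suffix
    return fqn

class Fused:
    def fqn_modifiers(self):
        return {"weight": "embedding"}

print(apply_fqn_modifiers(Fused(), "layer.embedding.weight"))  # -> "layer.weight"
```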

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146557
Approved by: https://github.com/fegin
2025-02-18 22:54:41 +00:00
7622e29a37 Revert "Nccl update to 2.25.1 for cuda 12.4-12.8 (#146073)"
This reverts commit eecee5863e698d19458b33df7bfecbda0a04557a.

Reverted https://github.com/pytorch/pytorch/pull/146073 on behalf of https://github.com/atalman due to breaks Locally building benchmarks ([comment](https://github.com/pytorch/pytorch/pull/146073#issuecomment-2667054179))
2025-02-18 22:23:35 +00:00
3f35664ee8 More precise check for shared storage check in inductor/reinplace pass (#147050)
Currently, if two tensors share storage we have some logic to avoid re-inplacing. Before this PR, two tensors were considered to share storage if they used the same underlying storage even when they did not overlap. This diff enhances the checks to skip cases where we can easily tell the tensors do not overlap.
Mitigates https://github.com/pytorch/pytorch/issues/139628 but does not fix the Inductor issue in it.
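
For intuition, a quick standalone illustration (not from the PR) of two views that share a storage but cannot alias each other's elements:

```python
import torch

base = torch.zeros(10)
left, right = base[:5], base[5:]

# Same underlying storage...
assert left.untyped_storage().data_ptr() == right.untyped_storage().data_ptr()

# ...but the views cover disjoint ranges, so writing one in place can never
# clobber the other. The enhanced check recognizes this kind of case.
left.add_(1)
print(right)  # still all zeros
```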

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147050
Approved by: https://github.com/zou3519
2025-02-18 21:55:34 +00:00
63e8ad49b8 [dynamo] replace hardcoded eval frame control flags skip_code_recursive_flag/cache_limit_hit_flag (#146355)
This PR and the previous:
- Moves parts of `eval_frame.c` to C++.
- Reduces code duplication in `dynamo__custom_eval_frame` and makes the control flow more clear.
- Enables `convert_frame` to signal to `eval_frame.cpp` in a general manner how to evaluate this frame, recursive frames, and future frames with the same code object (default/compile, skip, run-only). e.g. this will allow us to change skipping/cache limit hit eval_frame behavior directly from convert_frame without requiring changes to C/C++.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146355
Approved by: https://github.com/jansel
ghstack dependencies: #145603
2025-02-18 21:37:12 +00:00
75db0fd8a0 [dynamo] refactor dynamo__custom_eval_frame to C++, refactor SKIP_CODE[_RECURSIVE] (#145603)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/145603
Approved by: https://github.com/jansel, https://github.com/anijain2305
2025-02-18 21:37:12 +00:00
eb892cd768 [codegen] enable SORT and TUPLE_REDUCTION for AMD Triton (#147340)
Looks like Triton's AMD backend supports multiple inputs already.
Let's enable SORT and TUPLE_REDUCTION for it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147340
Approved by: https://github.com/Skylion007, https://github.com/jansel, https://github.com/eellison
2025-02-18 21:15:23 +00:00
1b047d5d7a Add link to non_blocking/pinmem tutorial in Tensor.to docstrings (#145651)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/145651
Approved by: https://github.com/svekars
2025-02-18 20:38:01 +00:00
clr
166419b9c1 dynamo: Don't crash when encountering a object with no __name__ (#147246)
This was triggering on ScriptFunctions. Note that other than badly implemented C functions, this seems to be almost impossible to trigger, so I wrote a smaller unit test rather than a full repro. Let me know if people feel strongly and want a full reproduction.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147246
Approved by: https://github.com/anijain2305, https://github.com/jansel, https://github.com/Skylion007
2025-02-18 20:35:49 +00:00
12v
74682e8595 Fix typo (#147330)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147330
Approved by: https://github.com/srinivasreddy, https://github.com/Skylion007
2025-02-18 20:20:34 +00:00
d9b3d76b85 Fix linter warnings (#147386)
https://github.com/pytorch/pytorch/pull/145866 accidentally introduced a warning about const casts as well as a comparison of unsigned long int with signed long int.

This PR fixes both of those warnings.

Tested by running:

```
/usr/local/cuda/bin/nvcc -forward-unknown-to-host-compiler -DAT_PER_OPERATOR_HEADERS -DFLASHATTENTION_DISABLE_ALIBI -DFMT_HEADER_ONLY=1 -DHAVE_MALLOC_USABLE_SIZE=1 -DHAVE_MMAP=1 -DHAVE_SHM_OPEN=1 -DHAVE_SHM_UNLINK=1 -DMINIZ_DISABLE_ZIP_READER_CRC32_CHECKS -DONNXIFI_ENABLE_EXT=1 -DONNX_ML=1 -DONNX_NAMESPACE=onnx_torch -DTORCH_CUDA_BUILD_MAIN_LIB -DTORCH_CUDA_USE_NVTX3 -DUSE_C10D_GLOO -DUSE_C10D_NCCL -DUSE_CUDA -DUSE_CUFILE -DUSE_DISTRIBUTED -DUSE_EXTERNAL_MZCRC -DUSE_FLASH_ATTENTION -DUSE_MEM_EFF_ATTENTION -DUSE_NCCL -DUSE_RPC -DUSE_TENSORPIPE -D_FILE_OFFSET_BITS=64 -Dtorch_cuda_EXPORTS -I/home/ahmads/personal/pytorch/build/aten/src -I/home/ahmads/personal/pytorch/aten/src -I/home/ahmads/personal/pytorch/build -I/home/ahmads/personal/pytorch -I/home/ahmads/personal/pytorch/cmake/../third_party/benchmark/include -I/home/ahmads/personal/pytorch/third_party/onnx -I/home/ahmads/personal/pytorch/build/third_party/onnx -I/home/ahmads/personal/pytorch/nlohmann -I/home/ahmads/personal/pytorch/aten/src/THC -I/home/ahmads/personal/pytorch/aten/src/ATen/cuda -I/home/ahmads/personal/pytorch/third_party/fmt/include -I/home/ahmads/personal/pytorch/aten/src/ATen/../../../third_party/cutlass/include -I/home/ahmads/personal/pytorch/aten/src/ATen/../../../third_party/cutlass/tools/util/include -I/home/ahmads/personal/pytorch/build/caffe2/aten/src -I/home/ahmads/personal/pytorch/aten/src/ATen/.. -I/home/ahmads/personal/pytorch/build/nccl/include -I/home/ahmads/personal/pytorch/c10/cuda/../.. -I/home/ahmads/personal/pytorch/c10/.. -I/home/ahmads/personal/pytorch/third_party/tensorpipe -I/home/ahmads/personal/pytorch/build/third_party/tensorpipe -I/home/ahmads/personal/pytorch/third_party/tensorpipe/third_party/libnop/include -I/home/ahmads/personal/pytorch/torch/csrc/api -I/home/ahmads/personal/pytorch/torch/csrc/api/include -isystem /home/ahmads/personal/pytorch/build/third_party/gloo -isystem /home/ahmads/personal/pytorch/cmake/../third_party/gloo -isystem /home/ahmads/personal/pytorch/cmake/../third_party/tensorpipe/third_party/libuv/include -isystem /home/ahmads/personal/pytorch/cmake/../third_party/googletest/googlemock/include -isystem /home/ahmads/personal/pytorch/cmake/../third_party/googletest/googletest/include -isystem /home/ahmads/personal/pytorch/third_party/protobuf/src -isystem /home/ahmads/personal/pytorch/third_party/XNNPACK/include -isystem /home/ahmads/personal/pytorch/third_party/ittapi/include -isystem /home/ahmads/personal/pytorch/cmake/../third_party/eigen -isystem /usr/local/cuda/include -isystem /home/ahmads/personal/pytorch/third_party/ideep/mkl-dnn/include/oneapi/dnnl -isystem /home/ahmads/personal/pytorch/third_party/ideep/include -isystem /home/ahmads/personal/pytorch/INTERFACE -isystem /home/ahmads/personal/pytorch/third_party/nlohmann/include -isystem /home/ahmads/personal/pytorch/third_party/NVTX/c/include -isystem /home/ahmads/personal/pytorch/cmake/../third_party/cudnn_frontend/include -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS -D_GLIBCXX_USE_CXX11_ABI=1 -Xfatbin -compress-all -DONNX_NAMESPACE=onnx_torch -gencode arch=compute_90,code=sm_90 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=bad_friend_decl --expt-relaxed-constexpr --expt-extended-lambda  -Wno-deprecated-gpu-targets --expt-extended-lambda -DCUB_WRAPPED_NAMESPACE=at_cuda_detail 
-DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -O3 -DNDEBUG -std=c++17 -Xcompiler=-fPIC -DTORCH_USE_LIBUV -DCAFFE2_USE_GLOO -Xcompiler -Wall -Wextra -Wdeprecated -Wno-unused-parameter -Wno-missing-field-initializers -Wno-array-bounds -Wno-unknown-pragmas -Wno-strict-overflow -Wno-strict-aliasing -Wunused-function -Wunused-variable -Wunused-but-set-variable -Wno-maybe-uninitialized -MD -MT caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/SoftMax.cu.o -MF caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/SoftMax.cu.o.d -x cu -c /home/ahmads/personal/pytorch/aten/src/ATen/native/cuda/SoftMax.cu -o caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/SoftMax.cu.o
```

And I got no warnings or errors. Same with `python setup.py develop`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147386
Approved by: https://github.com/Skylion007, https://github.com/ngimel
2025-02-18 20:03:16 +00:00
302f56a1f2 Revert "Fix non-bitwise type annotations for Tensor operators (see #145838) (#146845)"
This reverts commit 59b7e52ad8f6146b4364515a7f3e54d6f3edd6da.

Reverted https://github.com/pytorch/pytorch/pull/146845 on behalf of https://github.com/jeanschmidt due to Seems to break a few code dependencies in multiple places ([comment](https://github.com/pytorch/pytorch/pull/146845#issuecomment-2666656834))
2025-02-18 19:01:27 +00:00
404 changed files with 13745 additions and 5651 deletions

View File

@ -39,7 +39,7 @@ def build_ArmComputeLibrary() -> None:
"clone",
"https://github.com/ARM-software/ComputeLibrary.git",
"-b",
"v24.09",
"v25.02",
"--depth",
"1",
"--shallow-submodules",

View File

@ -329,7 +329,7 @@ def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None
]
)
host.run_cmd(
f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v24.09 {git_clone_flags}"
f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}"
)
host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")

View File

@ -1,6 +1,6 @@
set -euo pipefail
readonly version=v24.04
readonly version=v25.02
readonly src_host=https://github.com/ARM-software
readonly src_repo=ComputeLibrary

View File

@ -66,7 +66,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
# Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
if [[ $(uname -m) == "aarch64" ]]; then
conda_install "openblas==0.3.28=*openmp*"
conda_install "openblas==0.3.29=*openmp*"
else
conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
fi

View File

@ -4,7 +4,7 @@
set -ex
cd /
git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.28 --depth 1 --shallow-submodules
git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.29 --depth 1 --shallow-submodules
OPENBLAS_BUILD_FLAGS="

View File

@ -54,7 +54,7 @@ cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
case ${CUDA_VERSION} in
12.8)
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0;10.0;12.0+PTX" #Ripping out 5.0 and 6.0 due to ld error
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX" #Ripping out 5.0 and 6.0 due to ld error
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;;
12.6)

View File

@ -191,7 +191,7 @@ fi
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
# memory to build and will OOM
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]] && [ -z "$MAX_JOBS_OVERRIDE" ]; then
echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
@ -377,8 +377,10 @@ else
# This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
# is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
# 16 CPUs
MAX_JOBS=$(nproc --ignore=4)
export MAX_JOBS
if [ -z "$MAX_JOBS_OVERRIDE" ]; then
MAX_JOBS=$(nproc --ignore=4)
export MAX_JOBS
fi
# NB: Install outside of source directory (at the same level as the root
# pytorch folder) so that it doesn't get cleaned away prior to docker push.

View File

@ -490,6 +490,13 @@ else
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
fi
test_cachebench() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
$TASKSET python "benchmarks/dynamo/cachebench.py" --output "$TEST_REPORTS_DIR/cachebench.json"
}
test_perf_for_dashboard() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
@ -518,6 +525,8 @@ test_perf_for_dashboard() {
test_inductor_set_cpu_affinity
elif [[ "${TEST_CONFIG}" == *cuda_a10g* ]]; then
device=cuda_a10g
elif [[ "${TEST_CONFIG}" == *h100* ]]; then
device=cuda_h100
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
device=rocm
fi
@ -1507,6 +1516,11 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
install_torchvision
id=$((SHARD_NUMBER-1))
test_dynamo_benchmark timm_models "$id"
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
install_torchaudio cuda
install_torchvision
checkout_install_torchbench nanogpt BERT_pytorch resnet50
PYTHONPATH=$(pwd)/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
install_torchaudio cpu

View File

@ -38,6 +38,7 @@ per-file-ignores =
torchgen/api/types/__init__.py: F401,F403
torchgen/executorch/api/types/__init__.py: F401,F403
test/dynamo/test_higher_order_ops.py: B950
test/dynamo/test_graph_break_messages.py: B950
torch/testing/_internal/dynamo_test_failures.py: B950
# TOR901 is only for test, we want to ignore it for everything else.
# It's not easy to configure this without affecting other per-file-ignores,

View File

@ -10,7 +10,6 @@ self-hosted-runner:
- linux.9xlarge.ephemeral
- am2.linux.9xlarge.ephemeral
- linux.12xlarge
- linux.12xlarge.ephemeral
- linux.24xlarge
- linux.24xlarge.ephemeral
- linux.arm64.2xlarge

View File

@ -1 +1 @@
f084f34bbb743fada85f66b0ed8041387565e69c
c670ad81fda266b6598aeeef434583eb98197ae8

View File

@ -246,14 +246,8 @@ def generate_libtorch_matrix(
if os == "linux":
arches += CUDA_ARCHES
arches += ROCM_ARCHES
# skip CUDA 12.8 builds for libtorch
if "12.8" in arches:
arches.remove("12.8")
elif os == "windows":
arches += CUDA_ARCHES
# skip CUDA 12.8 builds on Windows
if "12.8" in arches:
arches.remove("12.8")
if libtorch_variants is None:
libtorch_variants = [
"shared-with-deps",
@ -318,9 +312,6 @@ def generate_wheels_matrix(
arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
elif os == "windows":
arches += CUDA_ARCHES + XPU_ARCHES
# skip CUDA 12.8 builds on Windows until available
if "12.8" in arches:
arches.remove("12.8")
elif os == "linux-aarch64":
# Separate new if as the CPU type is different and
# uses different build/test scripts

View File

@ -1507,6 +1507,36 @@ def checks_to_markdown_bullets(
]
def post_starting_merge_comment(
repo: GitRepo,
pr: GitHubPR,
explainer: TryMergeExplainer,
dry_run: bool,
ignore_current_checks_info: Optional[
list[tuple[str, Optional[str], Optional[int]]]
] = None,
) -> None:
"""Post the initial merge starting message on the PR. Also post a short
message on all PRs in the stack."""
gh_post_pr_comment(
pr.org,
pr.project,
pr.pr_num,
explainer.get_merge_message(ignore_current_checks_info),
dry_run=dry_run,
)
if pr.is_ghstack_pr():
for additional_prs, _ in get_ghstack_prs(repo, pr):
if additional_prs.pr_num != pr.pr_num:
gh_post_pr_comment(
additional_prs.org,
additional_prs.project,
additional_prs.pr_num,
f"Starting merge as part of PR stack under #{pr.pr_num}",
dry_run=dry_run,
)
def manually_close_merged_pr(
pr: GitHubPR,
additional_merged_prs: list[GitHubPR],
@ -2130,13 +2160,7 @@ def merge(
check_for_sev(pr.org, pr.project, skip_mandatory_checks)
if skip_mandatory_checks:
gh_post_pr_comment(
pr.org,
pr.project,
pr.pr_num,
explainer.get_merge_message(),
dry_run=dry_run,
)
post_starting_merge_comment(repo, pr, explainer, dry_run)
return pr.merge_into(
repo,
dry_run=dry_run,
@ -2159,12 +2183,12 @@ def merge(
)
ignore_current_checks_info = failing
gh_post_pr_comment(
pr.org,
pr.project,
pr.pr_num,
explainer.get_merge_message(ignore_current_checks_info),
dry_run=dry_run,
post_starting_merge_comment(
repo,
pr,
explainer,
dry_run,
ignore_current_checks_info=ignore_current_checks_info,
)
start_time = time.time()

View File

@ -4,6 +4,7 @@
{%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%}
{%- set timeout_minutes = 240 -%}
{%- set timeout_minutes_windows_binary = 300 -%}
{%- macro concurrency(build_environment) -%}
concurrency:

View File

@ -111,7 +111,10 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
{%- elif config["gpu_arch_type"] == "rocm" %}
runs_on: linux.rocm.gpu
{%- elif config["gpu_arch_type"] == "cuda" %}
{%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] == "12.8" %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
{%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] != "12.8"%}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
{%- else %}

View File

@ -71,7 +71,7 @@ jobs:
{%- else %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
{%- endif %}
timeout-minutes: !{{ common.timeout_minutes }}
timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
!{{ upload.binary_env(config, True) }}
{%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
@ -110,7 +110,7 @@ jobs:
{%- else %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
{%- endif %}
timeout-minutes: !{{ common.timeout_minutes }}
timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
!{{ upload.binary_env(config, True) }}
steps:
!{{ common.setup_ec2_windows() }}

View File

@ -18,7 +18,7 @@ on:
description: prefix for runner label
runs_on:
required: false
default: linux.12xlarge.ephemeral
default: linux.12xlarge.memory.ephemeral
type: string
description: Hardware to run this "build" job on, linux.12xlarge or linux.arm64.2xlarge.
timeout-minutes:

View File

@ -76,6 +76,11 @@ on:
required: false
type: boolean
default: false
max-jobs:
description: |
Overwrite the number of jobs to use for the build
required: false
type: string
secrets:
HUGGING_FACE_HUB_TOKEN:
@ -211,6 +216,7 @@ jobs:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
MAX_JOBS_OVERRIDE: ${{ inputs.max-jobs }}
run: |
START_TIME=$(date +%s)
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
@ -230,6 +236,12 @@ jobs:
DOCKER_SHELL_CMD=
fi
if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then
MAX_JOBS="$(nproc --ignore=2)"
else
MAX_JOBS="${MAX_JOBS_OVERRIDE}"
fi
# Leaving 1GB for the runner and other things
TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
# https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
@ -241,7 +253,8 @@ jobs:
# shellcheck disable=SC2086
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e MAX_JOBS=${MAX_JOBS} \
-e MAX_JOBS_OVERRIDE \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
-e SHA1 \

View File

@ -301,6 +301,71 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda12_8-shared-with-deps-cxx11-abi-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.8-main
LIBTORCH_VARIANT: shared-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: libtorch-cuda12_8-shared-with-deps-cxx11-abi
build_environment: linux-binary-libtorch-cxx11-abi
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-cuda12_8-shared-with-deps-cxx11-abi-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda12_8-shared-with-deps-cxx11-abi-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.8-main
LIBTORCH_VARIANT: shared-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
build_name: libtorch-cuda12_8-shared-with-deps-cxx11-abi
build_environment: linux-binary-libtorch-cxx11-abi
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-cuda12_8-shared-with-deps-cxx11-abi-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_8-shared-with-deps-cxx11-abi-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.8-main
LIBTORCH_VARIANT: shared-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
build_name: libtorch-cuda12_8-shared-with-deps-cxx11-abi
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm6_2_4-shared-with-deps-cxx11-abi-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml

View File

@ -223,6 +223,6 @@ jobs:
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -424,7 +424,7 @@ jobs:
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-upload: # Uploading
@ -1122,7 +1122,7 @@ jobs:
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-upload: # Uploading
@ -1885,7 +1885,7 @@ jobs:
build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-upload: # Uploading
@ -2583,7 +2583,7 @@ jobs:
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-upload: # Uploading
@ -3281,7 +3281,7 @@ jobs:
build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-upload: # Uploading
@ -3979,7 +3979,7 @@ jobs:
build_name: manywheel-py3_13t-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_8-upload: # Uploading

View File

@ -37,7 +37,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -151,7 +151,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -44,7 +44,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -158,7 +158,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -290,7 +290,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -405,7 +405,7 @@ jobs:
- libtorch-cuda11_8-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -539,7 +539,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -654,7 +654,7 @@ jobs:
- libtorch-cuda12_4-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -788,7 +788,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -903,7 +903,7 @@ jobs:
- libtorch-cuda12_6-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -1033,3 +1033,252 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda12_8-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda12_8-shared-with-deps-debug
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_8-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda12_8-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda12_8-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_8-shared-with-deps-debug-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_8-shared-with-deps-debug-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_8-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -37,7 +37,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -151,7 +151,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch

View File

@ -44,7 +44,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -158,7 +158,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -290,7 +290,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -405,7 +405,7 @@ jobs:
- libtorch-cuda11_8-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -539,7 +539,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -654,7 +654,7 @@ jobs:
- libtorch-cuda12_4-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -788,7 +788,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -903,7 +903,7 @@ jobs:
- libtorch-cuda12_6-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -1033,3 +1033,252 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda12_8-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda12_8-shared-with-deps-release
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_8-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda12_8-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even if the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda12_8-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_8-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_8-shared-with-deps-release-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_8-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

File diff suppressed because it is too large

View File

@ -0,0 +1,155 @@
name: inductor-perf-nightly-h100
on:
schedule:
- cron: 0 7 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let's try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: true
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used by the benchmark
required: false
type: string
default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
# NB: Keep this in sync with trunk.yml
build:
name: cuda12.4-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm90
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 4, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 5, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 6, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 1, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 2, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 3, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 4, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 5, num_shards: 6, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 6, num_shards: 6, runner: "linux.aws.h100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
secrets: inherit
test-nightly:
name: cuda12.4-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests for more investigation
disable-monitor: true
secrets: inherit
test-weekly:
name: cuda12.4-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 1440
# disable monitor in perf tests for more investigation
disable-monitor: true
secrets: inherit
test:
name: cuda12.4-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests for more investigation
disable-monitor: true
secrets: inherit

View File

@ -57,7 +57,7 @@ on:
description: The list of configs used by the benchmark
required: false
type: string
default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf
default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@ -105,6 +105,7 @@ jobs:
{ config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
secrets: inherit

View File

@ -77,21 +77,21 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c1" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2.c2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c1" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c1" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2.c1" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c1" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c1" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2.c2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c1" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c2" },
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit

View File

@ -449,6 +449,26 @@ jobs:
]}
secrets: inherit
unstable-linux-focal-cuda12_4-py3_10-gcc9-sm89-build-xfail:
# A version of the build that sets a larger number of jobs for a build. May
# OOM
name: unstable-linux-focal-cuda12.4-py3.10-gcc9-sm89-xfail
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm89
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
cuda-arch-list: 8.9
max-jobs: 4
# Doesn't actually run tests, but we need this in order to prevent the build
# from being skipped
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
linux-focal-cuda12_4-py3_10-gcc9-sm89-test:
name: linux-focal-cuda12.4-py3.10-gcc9-sm89
uses: ./.github/workflows/_linux-test.yml

View File

@ -2,7 +2,7 @@ name: Upload torch dynamo performance stats
on:
workflow_run:
workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, perf-nightly-macos, inductor-perf-nightly-rocm]
workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, perf-nightly-macos, inductor-perf-nightly-rocm, inductor-perf-nightly-h100]
types:
- completed

View File

@ -330,7 +330,7 @@ at::BlasBackend Context::blasPreferredBackend() {
if (blas_preferred_backend == at::BlasBackend::Cublaslt) {
static const bool hipblaslt_unsupported = []() {
static const std::vector<std::string> archs = {
"gfx90a", "gfx940", "gfx941", "gfx942",
"gfx90a", "gfx942",
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101"
#endif

View File

@ -63,10 +63,12 @@ DLDataType getDLDataType(const Tensor& t) {
case ScalarType::BFloat16:
dtype.code = DLDataTypeCode::kDLBfloat;
break;
// TODO(#146647): use macro here instead of spelling out each shell dtype
case ScalarType::Float8_e5m2:
case ScalarType::Float8_e5m2fnuz:
case ScalarType::Float8_e4m3fn:
case ScalarType::Float8_e4m3fnuz:
case ScalarType::Float8_e8m0fnu:
TORCH_CHECK(false, "float8 types are not supported by dlpack");
break;
case ScalarType::QInt8:

View File

@ -87,7 +87,7 @@
#define AT_FLOAT8_TYPES \
c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \
c10::kFloat8_e4m3fnuz
c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu
#define AT_INTEGRAL_TYPES \
c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort

View File

@ -13,6 +13,9 @@ inline void FunctionSchema::checkArg(
// Fast-path for the common case
return;
}
if (value.isGenericDict() && value.toGenericDict().empty()) {
return;
}
if (!value.type<T>()->isSubtypeOf(*argument.type())) {
TORCH_CHECK(
false,

View File

@ -197,7 +197,7 @@ public:
return vector;
}
// Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117001
#if __GNUC__ <= 12 && defined(__ARM_FEATURE_SVE)
#if __GNUC__ <= 12 && !defined(__clang__) && defined(__ARM_FEATURE_SVE)
static Vectorized<T> __attribute__ ((optimize("-fno-tree-loop-vectorize"))) blendv(const Vectorized<T>& a,
#else
static Vectorized<T> blendv(const Vectorized<T>& a,

View File

@ -1,6 +1,7 @@
#pragma once
#include <ATen/cpu/vec/intrinsics.h>
#include <c10/util/Exception.h>
namespace at::vec {
// See Note [CPU_CAPABILITY namespace]
@ -46,5 +47,105 @@ static inline float half2float_scalar(uint16_t val) {
#endif
// Transpose a [2, 32] matrix to [32, 2]
// Note: the output leading dimension should be 2,
// that is, the output must be contiguous
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 2>>
static inline void transpose_pad_2x32_block(
const scalar_t* src,
scalar_t* dst,
int64_t ld_src,
int krem = 2,
int nrem = 32) {
#if defined(CPU_CAPABILITY_AVX512)
__m512i r0, r1;
__m512i d0, d1;
// load
if (nrem < 32) {
__mmask32 mask_krem_v = (1LL << nrem) - 1;
r0 = _mm512_maskz_loadu_epi16(mask_krem_v, src);
// if krem is not 2, pad with zeros
if (krem == 2) {
r1 = _mm512_maskz_loadu_epi16(mask_krem_v, src + ld_src);
} else {
r1 = _mm512_setzero_si512();
}
} else {
r0 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src));
if (krem == 2) {
r1 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + ld_src));
} else {
r1 = _mm512_setzero_si512();
}
}
// transpose
d0 = _mm512_unpacklo_epi16(r0, r1);
d1 = _mm512_unpackhi_epi16(r0, r1);
r0 = _mm512_shuffle_i32x4(d0, d1, 0x88);
r1 = _mm512_shuffle_i32x4(d0, d1, 0xdd);
d0 = _mm512_shuffle_i32x4(r0, r1, 0x88);
d1 = _mm512_shuffle_i32x4(r0, r1, 0xdd);
// store
if (nrem < 16) {
__mmask32 mask_rem_v = (1LL << (nrem * 2)) - 1;
_mm512_mask_storeu_epi16(dst, mask_rem_v, d0);
} else if (nrem == 16) {
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0);
} else if (nrem < 32) {
__mmask32 mask_rem_v = (1LL << (nrem * 2 - 32)) - 1;
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0);
_mm512_mask_storeu_epi16(
reinterpret_cast<__m512i*>(dst + 32), mask_rem_v, d1);
} else {
// normal store
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0);
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 32), d1);
}
#else
TORCH_CHECK(false, "transpose_pad_2x32_block is only supported when avx512 is supported")
#endif
}
// To use AMX to accelerate GEMM,
// reorder the memory format [K, N] -> [K/2, N, 2]
// Note: If K % 2 != 0, pad K implicitly
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 2>>
static inline void pack_vnni2(
const scalar_t* src,
scalar_t* dst,
int64_t ld_src,
int64_t K,
int64_t N) {
#if defined(CPU_CAPABILITY_AVX512)
int64_t bk = 0;
int64_t _K = K / 2 * 2;
int64_t _N = N / 32 * 32;
for (; bk < _K; bk += 2) {
int64_t bn = 0;
for (; bn < _N; bn += 32) {
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src);
}
int64_t nrem = N - bn;
if (nrem > 0) {
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 2, nrem);
}
}
if (K % 2 == 1) {
int64_t bn = 0;
for (; bn < _N; bn += 32) {
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1);
}
int64_t nrem = N - bn;
if (nrem > 0) {
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1, nrem);
}
}
#else
TORCH_CHECK(false, "pack_vnni2 is only supported when avx512 is supported")
#endif
}
} // namespace CPU_CAPABILITY
} // namespace at::vec
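The new pack_vnni2 helper above reorders a [K, N] matrix of 2-byte elements into the [K/2, N, 2] VNNI layout consumed by the AMX microkernel, zero-padding the trailing row when K is odd. As a reading aid, here is a minimal scalar sketch of the same layout transform; pack_vnni2_reference is a hypothetical name, it is not part of this change, and it deliberately omits the AVX-512 fast path.

// Minimal scalar sketch of the VNNI2 layout used above (illustration only;
// pack_vnni2_reference is a hypothetical helper, not part of this change).
// It reorders a [K, N] matrix of 2-byte elements into [ceil(K/2), N, 2],
// zero-padding the final row pair when K is odd.
#include <cstdint>
#include <vector>

template <typename T>
std::vector<T> pack_vnni2_reference(const T* src, int64_t ld_src, int64_t K, int64_t N) {
  int64_t Kp = (K + 1) / 2;              // number of 2-row groups after implicit padding
  std::vector<T> dst(Kp * N * 2, T(0));  // zero-initialized so the padded row stays zero
  for (int64_t k = 0; k < K; ++k) {
    for (int64_t n = 0; n < N; ++n) {
      // element (k, n) lands in row group k/2, column n, interleave slot k%2
      dst[(k / 2) * N * 2 + n * 2 + (k % 2)] = src[k * ld_src + n];
    }
  }
  return dst;
}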

View File

@ -25,4 +25,10 @@ unpack(at::PhiloxCudaState arg) {
}
}
// Adapted from TE
// extract seed and offset from PhiloxCudaState
__global__ void unpack_cudnn(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr);
void unpack_cudnn_wrapper(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr, cudaStream_t stream);
} // namespace at::cuda::philox

View File

@ -227,15 +227,10 @@ TuningResultsValidator::TuningResultsValidator() {
}
// rocblas
{
#define STRINGIFY(s) #s
#define XSTRINGIFY(s) STRINGIFY(s)
std::string rocblas_version = c10::str(
XSTRINGIFY(ROCBLAS_VERSION_MAJOR), ".",
XSTRINGIFY(ROCBLAS_VERSION_MINOR), ".",
XSTRINGIFY(ROCBLAS_VERSION_PATCH), "-",
XSTRINGIFY(ROCBLAS_VERSION_TWEAK));
#undef XSTRINGIFY
#undef STRINGIFY
size_t rocblas_version_size;
rocblas_get_version_string_size(&rocblas_version_size);
std::string rocblas_version(rocblas_version_size - 1, '\0');
rocblas_get_version_string(rocblas_version.data(), rocblas_version_size);
RegisterValidator(
"ROCBLAS_VERSION",
[rocblas_version]() { return rocblas_version; },

View File

@ -288,20 +288,23 @@ class TunableOp {
}
// for warmup, did the user set max duration, max iters, or both?
// warmup is allowed to be skipped by setting either iterations or duration to 0
// warmup is skipped by default, i.e. warmup_iter = 0
// warmup will be set to the non-zero value of max_warmup_duration
// or max_warmup_iter
// if both are non-zero, we take the smaller of the two.
double max_warmup_duration = ctx->GetMaxWarmupDurationMs();
int max_warmup_iter = ctx->GetMaxWarmupIterations();
int warmup_iter = 1; // default
if (max_warmup_duration >= 0) {
int warmup_iter = 0; // default
if (max_warmup_duration > 0) {
int duration_iters = max_warmup_duration / approx_duration;
if (max_warmup_iter >= 0) {
if (max_warmup_iter > 0) {
warmup_iter = std::min(max_warmup_iter, duration_iters);
}
else {
warmup_iter = duration_iters;
}
}
else if (max_warmup_iter >= 0) {
else if (max_warmup_iter > 0) {
warmup_iter = max_warmup_iter;
}
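The comments above describe how the warmup iteration count is now chosen once the >= 0 checks become > 0. As a reading aid, a minimal standalone sketch of that selection rule; select_warmup_iters is a hypothetical helper and is not part of this change.

// Hypothetical standalone sketch of the warmup selection rule described above;
// not part of TunableOp itself.
#include <algorithm>

int select_warmup_iters(double max_warmup_duration_ms,
                        int max_warmup_iter,
                        double approx_duration_ms) {
  int warmup_iter = 0;  // warmup is skipped by default
  if (max_warmup_duration_ms > 0) {
    int duration_iters = static_cast<int>(max_warmup_duration_ms / approx_duration_ms);
    warmup_iter = (max_warmup_iter > 0) ? std::min(max_warmup_iter, duration_iters)
                                        : duration_iters;
  } else if (max_warmup_iter > 0) {
    warmup_iter = max_warmup_iter;
  }
  // e.g. (0, 0, 1.0)    -> 0  (both unset: skip warmup)
  //      (10.0, 0, 2.5) -> 4  (duration only)
  //      (10.0, 3, 2.5) -> 3  (both set: take the smaller)
  return warmup_iter;
}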

View File

@ -121,6 +121,16 @@ struct ConvolutionDescriptor
}
};
struct DropoutDescriptor
: public Descriptor<miopenDropoutDescriptor,
&miopenCreateDropoutDescriptor,
&miopenDestroyDropoutDescriptor>
{
void set(miopenHandle_t handle, float dropout, void* states, size_t stateSizeInBytes,
unsigned long long seed, bool use_mask, bool state_evo, miopenRNGType_t rng_mode) {
MIOPEN_CHECK(miopenSetDropoutDescriptor(mut_desc(), handle, dropout, states, stateSizeInBytes, seed, use_mask, state_evo, rng_mode));
}
};
struct RNNDescriptor
: public Descriptor<miopenRNNDescriptor,
@ -128,9 +138,14 @@ struct RNNDescriptor
&miopenDestroyRNNDescriptor>
{
void set(int64_t hidden_size, int64_t num_layers, miopenRNNInputMode_t input_mode, miopenRNNDirectionMode_t direction, miopenRNNMode_t rnn_mode,
miopenRNNBiasMode_t bias_mode, miopenRNNAlgo_t algorithm, miopenDataType_t datatype) {
miopenRNNBiasMode_t bias_mode, miopenRNNAlgo_t algorithm, miopenDataType_t datatype) {
MIOPEN_CHECK(miopenSetRNNDescriptor(mut_desc(), hidden_size, num_layers, input_mode, direction, rnn_mode, bias_mode, algorithm, datatype));
}
void setWithDropout(DropoutDescriptor& dropout_desc, int64_t hidden_size, int64_t num_layers, miopenRNNInputMode_t input_mode, miopenRNNDirectionMode_t direction,
miopenRNNMode_t rnn_mode, miopenRNNBiasMode_t bias_mode, miopenRNNAlgo_t algorithm, miopenDataType_t datatype) {
MIOPEN_CHECK(miopenSetRNNDescriptor_V2(mut_desc(), hidden_size, num_layers, dropout_desc.mut_desc(), input_mode, direction, rnn_mode, bias_mode, algorithm, datatype));
}
};
union Constant

View File

@ -7,11 +7,6 @@
#include <ATen/Config.h>
#include <ATen/native/mkldnn/Matmul.h>
#include <ATen/native/mkldnn/Linear.h>
#include <ATen/native/Resize.h>
#if !defined(__s390x__) && !defined(__powerpc__)
#include <cpuinfo.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/CPUFunctions.h>
@ -29,12 +24,6 @@
#include <ATen/ops/mv_native.h>
#include <ATen/ops/scalar_tensor_native.h>
#include <ATen/ops/vdot_native.h>
#include <ATen/ops/_scaled_mm_native.h>
#include <ATen/ops/mul.h>
#include <ATen/ops/matmul.h>
#endif
#if AT_MKLDNN_ENABLED()
#include <ideep.hpp>
#endif
namespace at::meta {
@ -233,79 +222,4 @@ Tensor vdot(const Tensor &self, const Tensor &other){
}
static Tensor&
_scaled_mm_out_cpu_emulated(const Tensor& mat1, const Tensor& mat2,
const Tensor& scale_a,
const Tensor& scale_b,
const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum,
Tensor& out) {
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix");
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix");
TORCH_CHECK(
mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
TORCH_INTERNAL_ASSERT((scale_a.numel() == 1 && scale_b.numel() == 1), "Now _scaled_mm only supports per-tensor scaling for CPU backend.");
TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1],
" but got ", bias->numel());
// Check types
TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
TORCH_CHECK(isFloat8Type(mat1.scalar_type()), "Expected mat1 to be Float8 matrix got ", mat1.scalar_type());
TORCH_CHECK(isFloat8Type(mat2.scalar_type()), "Expected mat2 to be Float8 matrix got ", mat2.scalar_type());
auto mat1_c = mat1.contiguous();
auto mat2_c = mat2.contiguous();
IntArrayRef mat1_sizes = mat1_c.sizes();
IntArrayRef mat2_sizes = mat2_c.sizes();
at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
float input_scale = scale_a.item<float>();
float weight_scale = scale_b.item<float>();
auto fp32_mat1 = at::mul(mat1.to(kFloat), input_scale);
auto fp32_mat2 = at::mul(mat2_c.to(kFloat), weight_scale);
auto out_tmp = at::matmul(fp32_mat1, fp32_mat2);
if (bias) {
out_tmp.add_(bias.value());
}
out_tmp = out_tmp.to(out.scalar_type());
out.copy_(out_tmp);
return out;
}
Tensor&
_scaled_mm_out_cpu(const Tensor& mat1, const Tensor& mat2,
const Tensor& scale_a,
const Tensor& scale_b,
const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum,
Tensor& out) {
#if AT_MKLDNN_ENABLED() && !(IDEEP_VERSION_MAJOR <= 2 || (IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR < 5))
if (at::globalContext().userEnabledMkldnn() && cpuinfo_has_x86_amx_int8()) {
return mkldnn_scaled_mm(mat1, mat2, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
} else
#endif
{
return _scaled_mm_out_cpu_emulated(mat1, mat2, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
}
}
Tensor
_scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b,
const Tensor& scale_a,
const Tensor& scale_b,
const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum) {
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
}
} // namespace at::native

View File

@ -59,8 +59,8 @@ bool copy_transpose_valid(const Tensor& self, const Tensor& src) {
#if !defined(C10_MOBILE)
#define _AT_DISPATCH_CP_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_V2( \
TYPE, NAME, AT_WRAP(__VA_ARGS__), kComplexHalf, kHalf, kBool, kBFloat16, kFloat8_e5m2, \
kFloat8_e4m3fn, kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
TYPE, NAME, AT_WRAP(__VA_ARGS__), kComplexHalf, kHalf, kBool, kBFloat16, \
AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
#else
#define _AT_DISPATCH_CP_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \

View File

@ -365,9 +365,13 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
for (const auto i : c10::irange(2, ndim)) {
reduce_dims[i - 1] = i;
}
auto sum = at::sum(grad_out_, /*dim=*/reduce_dims);
auto sum_a = sum.accessor<scalar_t, 1>();
// Using float data type for Half sum to avoid overflow
// since the representation range of Half is small.
auto sum = grad_out_.scalar_type() == kHalf
? at::sum(grad_out_.to(ScalarType::Float), /*dim=*/reduce_dims)
: at::sum(grad_out_, /*dim=*/reduce_dims);
using sum_t = std::conditional_t<std::is_same_v<scalar_t, at::Half>, float, scalar_t>;
auto sum_a = sum.accessor<sum_t, 1>();
auto reduce_iter = TensorIteratorConfig()
.add_const_input(input)
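As the comment above notes, Half saturates early: its largest finite value is 65504, so a Half-typed sum of many gradients overflows even when each summand is small. A small illustrative sketch of that failure mode, assuming a standalone program linked against libtorch; it is not part of this change.

// Illustration of why the Half reduction above is promoted to Float
// (standalone sketch, assumes libtorch is available; not part of this change).
#include <ATen/ATen.h>
#include <iostream>

int main() {
  auto g = at::full({100000}, 1.0, at::kHalf);
  // The true sum (100000) exceeds Half's largest finite value (65504),
  // so a Half-typed result overflows to inf.
  std::cout << g.sum() << std::endl;
  // Accumulating and storing in Float keeps the exact value.
  std::cout << g.to(at::kFloat).sum() << std::endl;
  return 0;
}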

View File

@ -460,7 +460,8 @@ Tensor isinf(const Tensor& self) {
Tensor isfinite(const Tensor& self) {
// Note: Integral tensor values are always finite
if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true)) {
if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true) ||
self.scalar_type() == kFloat8_e8m0fnu) {
return at::ones_like(self, at::kBool, at::MemoryFormat::Preserve);
}

View File

@ -204,12 +204,12 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne
#define _AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \
kComplexHalf, kHalf, kBool, \
kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, \
kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), \
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
#define _AT_DISPATCH_ALL_TYPES_NO_CF(TYPE, NAME, ...) \
AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \
kBool, kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, \
kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
kBool, kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), \
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
#else
#define _AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \

View File

@ -51,6 +51,9 @@ void fill_kernel(TensorIterator& iter, const Scalar& value_scalar) {
fill_non_native_type<at::Float8_e4m3fnuz>(iter, value_scalar);
} else if (iter.dtype() == ScalarType::Float8_e5m2fnuz) {
fill_non_native_type<at::Float8_e5m2fnuz>(iter, value_scalar);
} else if (iter.dtype() == ScalarType::Float8_e8m0fnu) {
// TODO(#146647): use macro here instead of spelling out each float8 dtype
fill_non_native_type<at::Float8_e8m0fnu>(iter, value_scalar);
} else {
AT_DISPATCH_V2(
iter.dtype(), "fill_cpu", AT_WRAP([&]() {

View File

@ -4,6 +4,7 @@
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/cpu/vec/vec.h>
#include <ATen/cpu/vec/vec_half.h>
#include <ATen/cpu/vec/functional.h>
#include <ATen/native/CPUBlas.h>
#include <ATen/native/cpu/utils.h>
@ -293,103 +294,6 @@ inline void pad_remain_row_col_zero(
}
// Transpose a [2, 32] matrix to [32, 2]
// Note: the output leading dimension should be 2,
// that is, the output must be contiguous
static inline void transpose_pad_2x32_block(
const uint16_t* src,
uint16_t* dst,
int64_t ld_src,
int krem = 2,
int nrem = 32) {
#if defined(CPU_CAPABILITY_AVX512)
__m512i r0, r1;
__m512i d0, d1;
// load
if (nrem < 32) {
__mmask32 mask_krem_v = (1LL << nrem) - 1;
r0 = _mm512_maskz_loadu_epi16(mask_krem_v, src);
// if krem is not 2, pad with zeros
if (krem == 2) {
r1 = _mm512_maskz_loadu_epi16(mask_krem_v, src + ld_src);
} else {
r1 = _mm512_setzero_si512();
}
} else {
r0 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src));
if (krem == 2) {
r1 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + ld_src));
} else {
r1 = _mm512_setzero_si512();
}
}
// transpose
d0 = _mm512_unpacklo_epi16(r0, r1);
d1 = _mm512_unpackhi_epi16(r0, r1);
r0 = _mm512_shuffle_i32x4(d0, d1, 0x88);
r1 = _mm512_shuffle_i32x4(d0, d1, 0xdd);
d0 = _mm512_shuffle_i32x4(r0, r1, 0x88);
d1 = _mm512_shuffle_i32x4(r0, r1, 0xdd);
// store
if (nrem < 16) {
__mmask32 mask_rem_v = (1LL << (nrem * 2)) - 1;
_mm512_mask_storeu_epi16(dst, mask_rem_v, d0);
} else if (nrem == 16) {
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0);
} else if (nrem < 32) {
__mmask32 mask_rem_v = (1LL << (nrem * 2 - 32)) - 1;
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0);
_mm512_mask_storeu_epi16(
reinterpret_cast<__m512i*>(dst + 32), mask_rem_v, d1);
} else {
// normal store
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0);
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 32), d1);
}
#else
TORCH_CHECK(false, "transpose_pad_2x32_block is only supported when avx512 is supported")
#endif
}
// To use AMX to accelerate GEMM,
// reorder the memory format [K, N] -> [K/2, N, 2]
// Note: If K % 2 != 0, pad K implicitly
static inline void pack_vnni2(
const uint16_t* src,
uint16_t* dst,
int64_t ld_src,
int64_t K,
int64_t N) {
#if defined(CPU_CAPABILITY_AVX512)
int64_t bk = 0;
int64_t _K = K / 2 * 2;
int64_t _N = N / 32 * 32;
for (; bk < _K; bk += 2) {
int64_t bn = 0;
for (; bn < _N; bn += 32) {
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src);
}
int64_t nrem = N - bn;
if (nrem > 0) {
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 2, nrem);
}
}
if (K % 2 == 1) {
int64_t bn = 0;
for (; bn < _N; bn += 32) {
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1);
}
int64_t nrem = N - bn;
if (nrem > 0) {
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1, nrem);
}
}
#else
TORCH_CHECK(false, "pack_vnni2 is only supported when avx512 is supported")
#endif
}
template <typename scalar_t, typename mask_t, int64_t q_split_size, int64_t kv_split_size, bool with_pack=false>
void cpu_flash_attention(
const Tensor& output,
@ -576,7 +480,7 @@ void cpu_flash_attention(
/* ld_dst */ kvBlockSize);
// Pack [headSize, kvBlockSize]
pack_vnni2(
at::vec::pack_vnni2(
/* src */ reinterpret_cast<const uint16_t*>(transpose_ptr),
/* dst */ reinterpret_cast<uint16_t*>(key_reorder_ptr + i * num_head * eheadSize * kvSize +
j * eheadSize * kvSize + n * eheadSize),
@ -585,7 +489,7 @@ void cpu_flash_attention(
/* N */ kvBlockSize);
// Pack [kvBlockSize, headSize]
pack_vnni2(
at::vec::pack_vnni2(
/* src */ reinterpret_cast<const uint16_t*>(v_data + i * vStrideB + j * vStrideH + n * vStrideN),
/* dst */ reinterpret_cast<uint16_t*>(value_reorder_ptr +
i * num_head * kv_padding_size * headSize +

View File

@ -184,7 +184,13 @@ void index_put_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef
}
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
AT_EXPAND(AT_FLOAT8_TYPES),
// AT_EXPAND(AT_FLOAT8_TYPES),
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
// should not be supported here, then reenable AT_FLOAT8_DTYPES
kFloat8_e4m3fn,
kFloat8_e5m2,
kFloat8_e4m3fnuz,
kFloat8_e5m2fnuz,
kComplexHalf,
kHalf,
kBool,

View File

@ -191,7 +191,7 @@ static bool isSupportedHipLtROCmArch(int index) {
hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index);
std::string device_arch = prop->gcnArchName;
static const std::vector<std::string> archs = {
"gfx90a", "gfx940", "gfx941", "gfx942",
"gfx90a", "gfx942",
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101"
#endif
@ -862,7 +862,7 @@ static bool _scaled_mm_allowed_device() {
auto dprops = at::cuda::getCurrentDeviceProperties();
#ifdef USE_ROCM
std::string device_arch = dprops->gcnArchName;
static const std::vector<std::string> archs = {"gfx940", "gfx941", "gfx942"};
static const std::vector<std::string> archs = {"gfx942"};
for (std::string arch : archs) {
size_t substring = device_arch.find(arch);
if (substring != std::string::npos) {
@ -879,7 +879,7 @@ static bool _scaled_mm_allowed_device() {
static bool _scaled_mm_is_fnuz() {
auto dprops = at::cuda::getCurrentDeviceProperties();
std::string device_arch = dprops->gcnArchName;
static const std::vector<std::string> archs = {"gfx940", "gfx941", "gfx942"};
static const std::vector<std::string> archs = {"gfx942"};
for (std::string arch : archs) {
size_t substring = device_arch.find(arch);
if (substring != std::string::npos) {

View File

@ -144,6 +144,28 @@ void float8_copy_kernel_cuda(TensorIteratorBase &iter) {
gpu_kernel(iter, [] GPU_LAMBDA(Float8_e5m2fnuz x) { return x; });
break;
}
} else if (dtype == kFloat8_e8m0fnu) {
// TODO(#146647): clean this up, too much copy-pasta
switch (other_dtype) {
case kFloat:
gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) {
return Float8_e8m0fnu(value);
});
break;
case kHalf:
gpu_kernel_nocast(iter, [] GPU_LAMBDA(Half value) {
return Float8_e8m0fnu(value);
});
break;
case kBFloat16:
gpu_kernel_nocast(iter, [] GPU_LAMBDA(BFloat16 value) {
return Float8_e8m0fnu(value);
});
break;
default:
gpu_kernel(iter, [] GPU_LAMBDA(Float8_e8m0fnu x) { return x; });
break;
}
} else {
TORCH_CHECK(false, "This supposed ot be called only for Float8 types");
}
@ -157,7 +179,7 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) {
AT_DISPATCH_QINT_TYPES(dtype, "copy_", [&] {
gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return x; });
});
} else if (dtype == kFloat8_e5m2 || dtype == kFloat8_e4m3fn || dtype == kFloat8_e5m2fnuz || dtype == kFloat8_e4m3fnuz) {
} else if (isFloat8Type(dtype)) {
float8_copy_kernel_cuda(iter);
} else if (iter.dtype(1) == kFloat && (dtype == kBFloat16 || dtype == kHalf)) {
if (dtype == kBFloat16) {

View File

@ -582,7 +582,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
C10_CUDA_KERNEL_LAUNCH_CHECK();
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
AT_EXPAND(AT_FLOAT8_TYPES),
// AT_EXPAND(AT_FLOAT8_TYPES),
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
// should not be supported here, then reenable AT_FLOAT8_DTYPES
kFloat8_e4m3fn,
kFloat8_e5m2,
kFloat8_e4m3fnuz,
kFloat8_e5m2fnuz,
kComplexHalf,
kHalf,
kBool,
@ -606,7 +612,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
C10_CUDA_KERNEL_LAUNCH_CHECK();
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
AT_EXPAND(AT_FLOAT8_TYPES),
// AT_EXPAND(AT_FLOAT8_TYPES),
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
// should not be supported here, then reenable AT_FLOAT8_DTYPES
kFloat8_e4m3fn,
kFloat8_e5m2,
kFloat8_e4m3fnuz,
kFloat8_e5m2fnuz,
kComplexHalf,
kHalf,
kBool,
@ -630,7 +642,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
C10_CUDA_KERNEL_LAUNCH_CHECK();
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
AT_EXPAND(AT_FLOAT8_TYPES),
// AT_EXPAND(AT_FLOAT8_TYPES),
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
// should not be supported here, then reenable AT_FLOAT8_DTYPES
kFloat8_e4m3fn,
kFloat8_e5m2,
kFloat8_e4m3fnuz,
kFloat8_e5m2fnuz,
kComplexHalf,
kHalf,
kBool,
@ -652,7 +670,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
C10_CUDA_KERNEL_LAUNCH_CHECK();
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
AT_EXPAND(AT_FLOAT8_TYPES),
// AT_EXPAND(AT_FLOAT8_TYPES),
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
// should not be supported here, then reenable AT_FLOAT8_DTYPES
kFloat8_e4m3fn,
kFloat8_e5m2,
kFloat8_e4m3fnuz,
kFloat8_e5m2fnuz,
kComplexHalf,
kHalf,
kBool,
@ -677,7 +701,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
C10_CUDA_KERNEL_LAUNCH_CHECK();
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
AT_EXPAND(AT_FLOAT8_TYPES),
// AT_EXPAND(AT_FLOAT8_TYPES),
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
// should not be supported here, then reenable AT_FLOAT8_DTYPES
kFloat8_e4m3fn,
kFloat8_e5m2,
kFloat8_e4m3fnuz,
kFloat8_e5m2fnuz,
kComplexHalf,
kHalf,
kBool,

View File

@ -11,7 +11,7 @@
#include <hip/hip_bf16.h>
__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
#if (defined(__gfx942__)) && \
__has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));
@ -39,7 +39,7 @@ __device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* addre
}
__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) {
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
#if (defined(__gfx942__)) && \
__has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16)
// The api expects an ext_vector_type of half
typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162;

View File

@ -982,7 +982,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
smem_reduction_sz) / sizeof(scalar_t);
bool can_use_smem = (size_t) dim_size < max_elements_per_smem;
bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
can_use_smem &= !(dim_size % ILP);
@ -1061,7 +1061,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
smem_reduction_sz) / sizeof(scalar_t);
bool can_use_smem = (size_t) dim_size < max_elements_per_smem;
bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
can_use_smem &= !(dim_size % ILP);
@ -1125,10 +1125,10 @@ void dispatch_host_softmax_backward(int64_t dim_size, dim3 grid, Tensor &grad, T
size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
smem_reduction_sz) / sizeof(output_t);
bool can_use_smem = dim_size < max_elements_per_smem;
can_use_smem &= (!(reinterpret_cast<const uintptr_t>(gI.const_data_ptr<input_t>()) % ALIGN_BYTES));
can_use_smem &= (!(reinterpret_cast<const uintptr_t>(output.const_data_ptr<output_t>()) % ALIGN_BYTES));
can_use_smem &= !(reinterpret_cast<const uintptr_t>(grad.const_data_ptr<output_t>()) % ALIGN_BYTES);
bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
can_use_smem &= (!(reinterpret_cast<uintptr_t>(gI.const_data_ptr<input_t>()) % ALIGN_BYTES));
can_use_smem &= (!(reinterpret_cast<uintptr_t>(output.const_data_ptr<output_t>()) % ALIGN_BYTES));
can_use_smem &= !(reinterpret_cast<uintptr_t>(grad.const_data_ptr<output_t>()) % ALIGN_BYTES);
can_use_smem &= !(dim_size % ILP);
// This should not be needed on current generation GPUs because the size of shared memory is so low.
// But we add this check to be defensive and future-proof just in case shared memory size goes up
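For a sense of scale behind the can_use_smem checks above, a rough worked example of max_elements_per_smem under assumed values (48 KiB of shared memory per block, block.x = 128, warp size 32, float accumulators, 2-byte scalar_t); the numbers are illustrative and not taken from this change.

// Rough worked example of max_elements_per_smem under assumed values
// (48 KiB smem per block, block.x = 128, warp size 32, float accumulator,
// 2-byte scalar_t); illustrative only, not part of this change.
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t shared_mem_per_block = 48 * 1024;
  const std::size_t smem_reduction_sz = 128 / 32 * sizeof(float);  // 16 bytes of warp partials
  const std::size_t max_elements = (shared_mem_per_block - smem_reduction_sz) / 2;  // half is 2 bytes
  std::printf("dim_size must be < %zu to use the shared-memory path\n", max_elements);  // 24568
  return 0;
}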

View File

@ -34,6 +34,10 @@ void topk_out_with_sort(
// TODO: remove this when CUDA <11.6 is no longer supported
bool disable_sort_for_topk();
bool should_use_sort(const Tensor& self, int64_t dim) {
#if defined(USE_ROCM)
if (self.dtype() == kBool) return false; // Bool sort not supported in ROCm: https://github.com/pytorch/pytorch/issues/139972
return (self.numel() >= 10000 && self.numel() == self.size(dim)); // based on the experiments in https://github.com/pytorch/pytorch/pull/146387
#else
if (disable_sort_for_topk()) return false;
// This heuristic is based on the experiment in https://github.com/pytorch/pytorch/pull/68632
if (self.dim() == 0) return false;
@ -42,6 +46,7 @@ bool should_use_sort(const Tensor& self, int64_t dim) {
if (slice_size == 0) return false;
int64_t num_slices = self.numel() / slice_size;
return num_slices <= 10 && slice_size >= 100000;
#endif
}
TORCH_IMPL_FUNC(topk_out_cuda)

View File

@ -596,17 +596,8 @@ int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) {
constexpr int REGS_PER_BLOCK = REGS_PER_THREAD * BLOCK_THREADS;
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
int mpc = prop->multiProcessorCount;
#if defined(USE_ROCM)
int regs_per_mp = prop->regsPerBlock;
int max_blocks_per_mp = 32;
#else
int regs_per_mp = prop->regsPerMultiprocessor;
#if !defined(USE_ROCM)
int max_blocks_per_mp = prop->maxBlocksPerMultiProcessor;
#else
int max_blocks_per_mp = 32;
#endif
#endif
int blocks_per_mp = std::min(regs_per_mp / REGS_PER_BLOCK, max_blocks_per_mp);
int64_t items_per_thread = at::ceil_div((int64_t)(slice_size * num_slices), (int64_t)(mpc * blocks_per_mp * BLOCK_THREADS));
items_per_thread = std::max(MIN_ITEMS_PER_THREAD, std::min((int)items_per_thread, MAX_ITEMS_PER_THREAD)); // clamp to (4, 64)

View File

@ -137,7 +137,7 @@ using VecT = T __attribute__((ext_vector_type(Rank)));
static bool isCDNA2orLater(int index) {
hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index);
std::string device_arch = prop->gcnArchName;
static const std::vector<std::string> archs = {"gfx90a", "gfx940", "gfx941", "gfx942"};
static const std::vector<std::string> archs = {"gfx90a", "gfx942"};
for (std::string arch : archs) {
size_t substring = device_arch.find(arch);
if (substring != std::string::npos) {
@ -151,7 +151,7 @@ static bool isCDNA2orLater(int index) {
constexpr int32_t kWarpSize = 32;
#endif
#if defined (__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined (__gfx90a__) || defined(__gfx942__)
#define CDNA2_OR_LATER 1
#else
#define CDNA2_OR_LATER 0

View File

@ -228,6 +228,10 @@ template <> inline std::string typeName<at::Float8_e5m2fnuz>() {
template <> inline std::string typeName<at::Float8_e4m3fnuz>() {
return "at::Float8_e4m3fnuz";
}
template <> inline std::string typeName<at::Float8_e8m0fnu>() {
// TODO(#146647): Can the code here be made generic for any scalartype?
return "at::Float8_e8m0fnu";
}
#define TYPE_NAME_CASE(ctype, scalartype) \
case ScalarType::scalartype: return typeName<ctype>();

View File

@ -31,6 +31,33 @@ void run_cudnn_SDP_fprop(
false, "PyTorch was not compiled with cuDNN Flash Attention enabled!");
}
void run_cudnn_SDP_fprop_nestedtensor(
int64_t b,
int64_t h_q,
int64_t h_k,
int64_t h_v,
int64_t s_q,
int64_t s_kv,
int64_t d_qk,
int64_t d_v,
float scaling_factor,
bool return_softmaxstats,
bool is_causal,
double dropout_probability,
const Tensor& cum_seqlen_q,
const Tensor& cum_seqlen_kv,
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
Tensor& dropoutoffset) {
TORCH_CHECK(
false, "PyTorch was not compiled with cuDNN Flash Attention enabled!");
}
void run_cudnn_SDP_bprop(
int64_t b,
int64_t h,
@ -461,16 +488,6 @@ auto build_graph_and_tensors(
.set_stride(attn_bias.value().strides().vec()));
scaled_dot_product_flash_attention_options.set_bias(bias.value());
}
auto seq_q = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seq_q")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto seq_kv = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seq_kv")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto [O, Stats] =
mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options);
@ -500,6 +517,201 @@ auto build_graph_and_tensors(
std::move(Stats));
}
auto build_graph_and_tensors_nestedtensor(
int64_t b,
int64_t h_q,
int64_t h_k,
int64_t h_v,
int64_t s_q,
int64_t s_kv,
int64_t d_qk,
int64_t d_v,
float scaling_factor,
bool return_softmaxstats,
bool is_causal,
double dropout_probability,
const Tensor& cum_seqlen_q,
const Tensor& cum_seqlen_kv,
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
Tensor& dropoutoffset,
cudnnHandle_t& handle) {
auto dtype = fe::DataType_t::HALF;
if (q.scalar_type() == kBFloat16) {
dtype = fe::DataType_t::BFLOAT16;
}
auto mha_graph = std::make_shared<fe::graph::Graph>();
// We're baking in float accumulation and scale types
// in theory the graph may support other types, but they
// have not been tested
mha_graph->set_io_data_type(dtype)
.set_intermediate_data_type(fe::DataType_t::FLOAT)
.set_compute_data_type(fe::DataType_t::FLOAT);
auto attn_scale =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Attn_scale")
.set_dim({1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_is_pass_by_value(true)
.set_data_type(fe::DataType_t::FLOAT));
auto seed = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seed")
.set_dim({1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto offset = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Offset")
.set_dim({1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto SEQ_LEN_Q = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seq_q")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto SEQ_LEN_KV =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seq_kv")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA_NESTEDTENSOR")
.set_is_inference(return_softmaxstats == false)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_dropout(dropout_probability, seed, offset)
.set_seq_len_q(SEQ_LEN_Q)
.set_seq_len_kv(SEQ_LEN_KV)
.set_padding_mask(true);
// We hardcode BSHD to cuDNN even though the underlying layout is THD
auto q_strides = q.strides();
auto k_strides = k.strides();
auto v_strides = v.strides();
constexpr int strideidx0 = 1;
constexpr int strideidx1 = 0;
constexpr int strideidx2 = 2;
auto Q = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Q")
.set_dim({b, h_q, s_q, d_qk})
.set_stride(
{INT_MAX,
q_strides[strideidx0],
q_strides[strideidx1],
q_strides[strideidx2]}));
auto K = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("K")
.set_dim({b, h_k, s_kv, d_qk})
.set_stride(
{INT_MAX,
k_strides[strideidx0],
k_strides[strideidx1],
k_strides[strideidx2]}));
auto V = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("V")
.set_dim({b, h_v, s_kv, d_v})
.set_stride(
{INT_MAX,
v_strides[strideidx0],
v_strides[strideidx1],
v_strides[strideidx2]}));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
bias =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("bias")
.set_dim(attn_bias.value().sizes().vec())
.set_stride(attn_bias.value().strides().vec()));
scaled_dot_product_flash_attention_options.set_bias(bias.value());
}
auto RAG_Q_OFF = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("cum_seq_q")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_K_OFF = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("cum_seq_k")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_V_OFF = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("cum_seq_v")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_O_OFF = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("cum_seq_o")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
// auto RAG_STATS_OFF = mha_graph->tensor(fe::graph::Tensor_attributes()
// .set_name("cum_seq_stats")
// .set_dim({b + 1, 1, 1, 1})
// .set_stride({1, 1, 1, 1})
// .set_data_type(fe::DataType_t::INT32));
auto RAG_STATS_OFF = nullptr;
Q->set_ragged_offset(RAG_Q_OFF);
K->set_ragged_offset(RAG_K_OFF);
V->set_ragged_offset(RAG_V_OFF);
auto [O, Stats] =
mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options);
auto o_strides = o.strides();
O->set_output(true)
.set_dim({b, h_q, s_q, d_v})
.set_stride(
{INT_MAX,
o_strides[strideidx0],
o_strides[strideidx1],
o_strides[strideidx2]});
O->set_ragged_offset(RAG_O_OFF);
if (Stats) {
TORCH_CHECK(
false,
"cuDNN SDPA Nested Tensor does not yet handle backwards/logsumexp computation");
// TODO(eqy): fix when stats (backward) support is added
Stats->set_output(true)
.set_data_type(fe::DataType_t::FLOAT)
.set_dim({b, h_q, s_q, 1})
.set_stride({h_q * s_q * d_v, d_v, s_q * d_v, 1});
Stats->set_ragged_offset(RAG_STATS_OFF);
}
AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle));
AT_CUDNN_FRONTEND_CHECK(
mha_graph->create_execution_plans({fe::HeurMode_t::A}));
AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle));
AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle));
return std::make_tuple(
std::move(mha_graph),
std::move(Q),
std::move(K),
std::move(V),
std::move(bias),
std::move(attn_scale),
std::move(seed),
std::move(offset),
std::move(O),
std::move(Stats),
std::move(RAG_Q_OFF),
std::move(RAG_K_OFF),
std::move(RAG_V_OFF),
std::move(RAG_O_OFF),
std::move(RAG_STATS_OFF),
std::move(SEQ_LEN_Q),
std::move(SEQ_LEN_KV));
}
auto build_graph_and_tensors_backward(
int64_t b,
int64_t h,
@ -737,6 +949,119 @@ void run_cudnn_SDP_fprop(
mhagraphcache.update(key, graph_and_tensors_values);
}
void run_cudnn_SDP_fprop_nestedtensor(
int64_t b,
int64_t h_q,
int64_t h_k,
int64_t h_v,
int64_t s_q,
int64_t s_kv,
int64_t d_qk,
int64_t d_v,
float scaling_factor,
bool return_softmaxstats,
bool is_causal,
double dropout_probability,
const Tensor& cum_seqlen_q,
const Tensor& cum_seqlen_kv,
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
Tensor& dropoutoffset) {
cudnnHandle_t handle = getCudnnHandle();
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel()) {
return;
}
if (!o.defined()) {
o = at::empty({q.size(0), h_q, d_v}, q.options());
}
if (return_softmaxstats && !softmaxstats.defined()) {
softmaxstats = at::empty({q.size(0), h_q, 1}, q.options().dtype(kFloat));
}
auto
[mha_graph,
Q,
K,
V,
bias,
attn_scale,
seed,
offset,
O,
Stats,
RAG_Q_OFF,
RAG_K_OFF,
RAG_V_OFF,
RAG_O_OFF,
RAG_STATS_OFF,
SEQ_LEN_Q,
SEQ_LEN_KV] =
build_graph_and_tensors_nestedtensor(
b,
h_q,
h_k,
h_v,
s_q,
s_kv,
d_qk,
d_v,
scaling_factor,
return_softmaxstats,
is_causal,
dropout_probability,
cum_seqlen_q,
cum_seqlen_kv,
q,
k,
v,
attn_bias,
softmaxstats,
o,
dropoutseed,
dropoutoffset,
handle);
auto seqlen_q = at::diff(cum_seqlen_q, 1, 0);
auto seqlen_kv = at::diff(cum_seqlen_kv, 1, 0);
auto rag_q_off = cum_seqlen_q.mul(h_q * d_qk);
auto rag_k_off = cum_seqlen_kv.mul(h_k * d_qk);
auto rag_v_off = cum_seqlen_kv.mul(h_v * d_v);
auto rag_o_off = cum_seqlen_q.mul(h_q * d_v);
auto rag_stats_off = cum_seqlen_q.mul(h_q);
std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
variant_pack = {
{Q, q.data_ptr()},
{K, k.data_ptr()},
{V, v.data_ptr()},
{attn_scale, &scaling_factor},
{seed, dropoutseed.data_ptr()},
{offset, dropoutoffset.data_ptr()},
{O, o.data_ptr()},
{RAG_Q_OFF, rag_q_off.data_ptr()},
{RAG_O_OFF, rag_o_off.data_ptr()},
{RAG_K_OFF, rag_k_off.data_ptr()},
{RAG_V_OFF, rag_v_off.data_ptr()},
{SEQ_LEN_Q, seqlen_q.data_ptr()},
{SEQ_LEN_KV, seqlen_kv.data_ptr()}};
if (return_softmaxstats) {
variant_pack[Stats] = softmaxstats.data_ptr();
variant_pack[RAG_STATS_OFF] = rag_stats_off.data_ptr();
}
if (attn_bias.has_value()) {
TORCH_CHECK("bias not supported with nestedtensor");
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
TORCH_CHECK(
mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good());
}
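The packed (ragged) layout bookkeeping above can be summarized with a small worked example; the numbers below are made up and only illustrate how the per-batch lengths and element offsets fed to cuDNN are derived from the cumulative sequence lengths.

    # hypothetical values: 3 sequences of lengths 3, 4, 5; h_q = 2 heads, d_qk = 4
    import torch
    cum_seqlen_q = torch.tensor([0, 3, 7, 12], dtype=torch.int32)
    seqlen_q = torch.diff(cum_seqlen_q)    # tensor([3, 4, 5]), per-batch lengths fed as SEQ_LEN_Q
    rag_q_off = cum_seqlen_q * (2 * 4)     # tensor([0, 24, 56, 96]), element offsets into the packed Q buffer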
void run_cudnn_SDP_bprop(
int64_t b,
int64_t h,

View File

@ -23,6 +23,30 @@ void run_cudnn_SDP_fprop(
Tensor& dropoutseed,
Tensor& dropoutoffset);
void run_cudnn_SDP_fprop_nestedtensor(
int64_t b,
int64_t h_q,
int64_t h_k,
int64_t h_v,
int64_t max_s_q,
int64_t max_s_kv,
int64_t d_k,
int64_t d_v,
float scaling_factor,
bool isTraining,
bool is_causal,
double dropout_probability,
const Tensor& cum_seqlen_q,
const Tensor& cum_seqlen_kv,
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
Tensor& dropoutoffset);
void run_cudnn_SDP_bprop(
int64_t b,
int64_t h,

View File

@ -619,7 +619,7 @@ Workspace chooseAlgorithm(
try {
return Workspace(workspace_size);
} catch (const std::exception& e) {
hipGetLastError(); // clear OOM error
std::ignore = hipGetLastError(); // clear OOM error
// switch to default algorithm and record it in the cache to prevent
// further OOM errors
@ -640,7 +640,7 @@ Workspace chooseSolution(const ConvolutionArgs& args, uint64_t* solution_id)
*solution_id = solution.solution_id;
return Workspace(solution.workspace_size);
} catch (const std::exception& e) {
hipGetLastError(); // clear OOM error
std::ignore = hipGetLastError(); // clear OOM error
// switch to default algorithm
solution = search::getSolution(args, true);

View File

@ -57,6 +57,10 @@ namespace at::native {
#include <ATen/TensorUtils.h>
#include <c10/hip/HIPCachingAllocator.h>
#include <rocrand/rocrand_xorwow.h>
#include <functional>
#include <iterator>
#include <sstream>
@ -68,10 +72,36 @@ namespace at::native {
namespace at { namespace native {
// Workspace copied from Conv_miopen.cpp but is put here inside anonymous namespace
// to avoid duplicate symbols and to avoid the need to expose as a public struct.
namespace {
struct Workspace {
Workspace(size_t size) : size(size), data(NULL) {
data = c10::hip::HIPCachingAllocator::raw_alloc(size);
}
Workspace(const Workspace&) = delete;
Workspace(Workspace&&) = default;
Workspace& operator=(Workspace&&) = default;
~Workspace() {
if (data) {
c10::hip::HIPCachingAllocator::raw_delete(data);
}
}
size_t size;
void* data;
};
} // anonymous
//RNNDescriptor.
struct RNNDescriptorParams {
int64_t hidden_size;
int64_t num_layers;
double dropout_rate;
uint64_t dropout_seed;
miopenRNNDirectionMode_t direction;
miopenRNNMode_t rnn_mode;
miopenDataType_t datatype;
@ -114,6 +144,16 @@ struct RNNDescriptorParams {
}
}
void set_dropout(double dropout_rate, uint64_t dropout_seed = 0) {
this->dropout_rate = dropout_rate;
if (dropout_seed == 0) {
// rand() returns 32 bit values so we combine two of them
this->dropout_seed = (static_cast<uint64_t>(rand()) << 32) | static_cast<uint32_t>(rand());
} else {
this->dropout_seed = dropout_seed;
}
}
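A quick illustration of the 32-bit/32-bit composition performed above (values are arbitrary); in the C++ code the left operand has to be widened to 64 bits before the shift, since shifting a 32-bit int by 32 is undefined behaviour.

    hi, lo = 0x12345678, 0x9ABCDEF0   # two hypothetical 32-bit draws from rand()
    seed = (hi << 32) | lo            # full 64-bit dropout seed
    assert seed == 0x123456789ABCDEF0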
void set(int64_t mode, int64_t hidden_size, int64_t num_layers, bool bidirectional, miopenDataType_t datatype, miopenRNNBiasMode_t bias_mode) {
this->set_mode(mode);
this->hidden_size = hidden_size;
@ -128,6 +168,12 @@ struct RNNDescriptorParams {
rnn_desc.set(hidden_size, num_layers, input_mode, direction, rnn_mode, bias_mode, algo, datatype);
return rnn_desc;
}
RNNDescriptor descriptorWithDropout(DropoutDescriptor& dropout_desc) const {
RNNDescriptor rnn_desc;
rnn_desc.setWithDropout(dropout_desc, hidden_size, num_layers, input_mode, direction, rnn_mode, bias_mode, algo, datatype);
return rnn_desc;
}
};
//TensorDescriptor list.
@ -204,6 +250,8 @@ struct RNNParams {
struct RNNDescriptors {
RNNDescriptor rnn_desc;
DropoutDescriptor dropout_desc;
std::unique_ptr<Workspace> dropout_states;
std::vector<TensorDescriptor> x_descs;
std::vector<TensorDescriptor> y_descs;
TensorDescriptor hx_desc;
@ -212,7 +260,25 @@ struct RNNDescriptors {
TensorDescriptor cy_desc;
RNNDescriptors(const RNNParams& fn, miopenHandle_t handle, Tensor x, Tensor y, Tensor hx, Tensor cx) {
rnn_desc = fn.rnn.descriptor();
if (fn.rnn.dropout_rate == 0.0) {
rnn_desc = fn.rnn.descriptor();
} else {
size_t statesSizeInBytes = 0;
miopenDropoutGetStatesSize(handle, &statesSizeInBytes);
size_t states_size = statesSizeInBytes / sizeof(rocrand_state_xorwow);
dropout_states = std::make_unique<Workspace>(states_size * sizeof(rocrand_state_xorwow));
dropout_desc.set(handle,
fn.rnn.dropout_rate,
dropout_states->data,
dropout_states->size,
fn.rnn.dropout_seed,
false,
false,
miopenRNGType_t::MIOPEN_RNG_PSEUDO_XORWOW);
rnn_desc = fn.rnn.descriptorWithDropout(dropout_desc);
}
x_descs = fn.tensors.descriptors(x);
y_descs = fn.tensors.descriptors(y);
hx_desc.set(hx, 5);
@ -492,7 +558,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> miopen_rnn(
auto handle = getMiopenHandle();
miopenRNNAlgo_t algo = miopenRNNdefault;
fn.rnn.set_algo(algo);
fn.rnn.set_dropout(fn_dropout);
RNNDescriptors descs(fn, handle, x, y, hx, cx);
FilterDescriptor w_desc;
@ -551,7 +617,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> miopen_rnn(
}
return std::make_tuple(output, hy, cy, reserve, weight_buf);
}
std::tuple<Tensor, Tensor, Tensor, Tensor> miopen_rnn_backward_input(
@ -626,6 +691,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> miopen_rnn_backward_input(
miopenRNNAlgo_t algo = miopenRNNdefault;
fn.rnn.set_algo(algo);
fn.rnn.set_dropout(fn_dropout);
RNNDescriptors descs(fn, handle, x, y, hx, cx);
FilterDescriptor w_desc;
@ -720,6 +786,7 @@ std::vector<Tensor> miopen_rnn_backward_weight(
miopenRNNAlgo_t algo = miopenRNNdefault;
fn.rnn.set_algo(algo);
fn.rnn.set_dropout(fn_dropout);
RNNDescriptors descs(fn, handle, x, y, hx, cx);
FilterDescriptor w_desc;
@ -909,6 +976,6 @@ REGISTER_CUDA_DISPATCH(lstm_miopen_stub, &lstm_miopen)
REGISTER_CUDA_DISPATCH(lstm_packed_miopen_stub, &lstm_packed_miopen)
} // anonymous namespace
}} //namespace native.
}} // namespace native
#endif
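A hedged usage sketch of what the dropout plumbing above enables: on a ROCm build, a multi-layer LSTM with a non-zero dropout rate should now reach miopen_rnn with a real dropout descriptor rather than the default one. The snippet is illustrative only and assumes a ROCm device is available.

    import torch

    # dropout only applies between stacked layers, so num_layers must be >= 2
    lstm = torch.nn.LSTM(input_size=32, hidden_size=64, num_layers=2,
                         dropout=0.5, batch_first=True).to("cuda")  # "cuda" is HIP on ROCm builds
    x = torch.randn(4, 16, 32, device="cuda")
    out, (h, c) = lstm(x)  # routed through the MIOpen RNN path when available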

View File

@ -4,7 +4,6 @@
#include <ATen/core/Tensor.h>
#include <torch/library.h>
#include <ATen/native/mkldnn/Linear.h>
#include <ATen/native/Resize.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -47,20 +46,9 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_linear_backward(
TORCH_CHECK(false, "mkldnn_linear_backward: ATen not compiled with MKLDNN support");
}
Tensor&
mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2,
const Tensor& scale_a,
const Tensor& scale_b,
const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum,
Tensor& out) {
TORCH_INTERNAL_ASSERT(false, "mkldnn_scaled_mm: ATen not compiled with MKLDNN support");
}
} // namespace at::native
#else // AT_MKLDNN_ENABLED
#include <ATen/native/mkldnn/MKLDNNCommon.h>
@ -459,119 +447,6 @@ TORCH_LIBRARY_IMPL(mkldnn, MkldnnCPU, m) {
TORCH_FN(mkldnn_linear_pointwise_binary));
}
Tensor&
mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2,
const Tensor& scale_a,
const Tensor& scale_b,
const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum,
Tensor& out) {
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix");
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix");
TORCH_CHECK(
mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
TORCH_INTERNAL_ASSERT((scale_a.numel() == 1 && scale_b.numel() == 1), "Now _scaled_mm only supports per-tensor scaling for CPU backend.");
TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1],
" but got ", bias->numel());
// Check types
TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
TORCH_CHECK(isFloat8Type(mat1.scalar_type()), "Expected mat1 to be Float8 matrix got ", mat1.scalar_type());
TORCH_CHECK(isFloat8Type(mat2.scalar_type()), "Expected mat2 to be Float8 matrix got ", mat2.scalar_type());
// TODO: This check of mat1 and mat2 must have the same data type will be removed after oneDNN v3.6.
TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "Expected mat1 and mat2 must have the same data type");
// Validation checks have passed lets resize the output to actual size
auto mat1_c = mat1.contiguous();
auto mat2_c = mat2.contiguous();
IntArrayRef mat1_sizes = mat1_c.sizes();
IntArrayRef mat2_sizes = mat2_c.sizes();
at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
float input_scale = scale_a.item<float>();
float weight_scale = scale_b.item<float>();
auto src = at::native::itensor_view_from_dense(mat1_c);
auto weight_t = at::native::itensor_view_from_dense(mat2_c);
bool with_bias = bias.has_value();
int64_t K = mat1_sizes[1], M = mat1_sizes[0],
N = mat2_sizes[1];
std::vector<int64_t> src_dims = {M, K};
std::vector<int64_t> weight_dims = {K, N};
std::vector<int64_t> dst_dims = {M, N};
ideep::tensor dst = at::native::itensor_view_from_dense(out);
auto src_desc = ideep::tensor::desc(
src_dims,
get_mkldnn_dtype(mat1.scalar_type()),
ideep::format_tag::any);
auto weights_desc = ideep::tensor::desc(
weight_dims,
get_mkldnn_dtype(mat2.scalar_type()),
ideep::format_tag::any);
auto dst_desc = ideep::tensor::desc(
dst_dims,
get_mkldnn_dtype(out.scalar_type()),
ideep::format_tag::any);
ideep::tensor onednn_bias;
if (with_bias) {
auto bias_value = bias.value();
if (bias_value.dim() == 1) {
auto b_reshape = bias_value.reshape({1, bias_value.size(0)});
onednn_bias = at::native::itensor_view_from_dense(b_reshape);
} else {
onednn_bias = at::native::itensor_view_from_dense(bias_value);
}
}
auto bias_desc = ideep::tensor::desc();
if (with_bias) {
bias_desc = ideep::tensor::desc(onednn_bias.get_dims(),
get_mkldnn_dtype(bias.value().scalar_type()),
ideep::format_tag::any);
}
auto op_attr = ideep::attr_t();
if (input_scale != 1.0f) {
op_attr.set_scales_mask(DNNL_ARG_SRC, 0);
}
if (weight_scale != 1.0f) {
op_attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
}
op_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
auto engine = ideep::engine::cpu_engine();
dnnl::matmul::primitive_desc primitive_desc = with_bias
? dnnl::matmul::primitive_desc(
engine, src_desc, weights_desc, bias_desc, dst_desc, op_attr)
: dnnl::matmul::primitive_desc(
engine, src_desc, weights_desc, dst_desc, op_attr);
auto primitive = dnnl::matmul(primitive_desc);
// Prepare args and execute primitive
ideep::tensor scratchpad(primitive_desc.scratchpad_desc());
ideep::exec_args args;
args.insert({DNNL_ARG_SRC, src});
args.insert({DNNL_ARG_WEIGHTS, weight_t});
args.insert({DNNL_ARG_DST, dst});
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
if (with_bias) {
args.insert({DNNL_ARG_BIAS, onednn_bias});
}
ideep::tensor src_scales_t = ideep::tensor(ideep::scale_t(1, input_scale));
ideep::tensor wei_scales_t = ideep::tensor(ideep::scale_t(1, weight_scale));
if (input_scale != 1.0f) {
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales_t});
}
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_scales_t});
primitive.execute(ideep::stream::default_stream(), args);
return out;
}
} // namespace at
#endif // AT_MKLDNN_ENABLED

View File

@ -35,15 +35,3 @@ C10_API Tensor mkl_linear(
} // namespace at
#endif // AT_MKLDNN_ENABLED()
namespace at::native {
Tensor&
mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2,
const Tensor& scale_a,
const Tensor& scale_b,
const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum,
Tensor& out);
} // namespace at::native

View File

@ -54,15 +54,9 @@ ideep::tensor::data_type get_mkldnn_dtype(ScalarType type) {
case ScalarType::Byte:
return ideep::tensor::data_type::u8;
case ScalarType::BFloat16:
#if !(IDEEP_VERSION_MAJOR <= 2 || (IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR < 5))
return ideep::tensor::data_type::bf16;
case ScalarType::Half:
return ideep::tensor::data_type::f16;
case ScalarType::Float8_e4m3fn:
return ideep::tensor::data_type::f8_e4m3;
case ScalarType::Float8_e5m2:
return ideep::tensor::data_type::f8_e5m2;
#endif
default:
TORCH_CHECK(false, "get_mkldnn_dtype: unsupported data type");
}
@ -167,26 +161,8 @@ ideep::tensor itensor_view_from_dense(const Tensor& tensor, bool from_const_data
const_cast<void*>(tensor.const_data_ptr()) :
tensor.data_ptr()};
}
#if !(IDEEP_VERSION_MAJOR <= 2 || (IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR < 5))
else if (tensor.scalar_type() == ScalarType::Float8_e4m3fn) {
return {{tensor.sizes().vec(),
ideep::tensor::data_type::f8_e4m3,
tensor.strides().vec()},
from_const_data_ptr ?
const_cast<void*>(tensor.const_data_ptr()) :
tensor.data_ptr()};
}
else if (tensor.scalar_type() == ScalarType::Float8_e5m2) {
return {{tensor.sizes().vec(),
ideep::tensor::data_type::f8_e5m2,
tensor.strides().vec()},
from_const_data_ptr ?
const_cast<void*>(tensor.const_data_ptr()) :
tensor.data_ptr()};
}
#endif
else {
TORCH_CHECK(false, "itensor_view_from_dense expects float/bfloat16/half/int8/fp8 tensor input");
TORCH_CHECK(false, "itensor_view_from_dense expects float/bfloat16/half/int8 tensor input");
}
}

View File

@ -177,7 +177,7 @@ class Attr {
float sum_q_scale = 1.f,
int64_t zp = 0) {
ops_params_.push_back(
PostOpParam(/*scale_sum*/ sum_scale * sum_q_scale, kind_t::sum));
PostOpParam(/*scale_sum*/ sum_scale * sum_q_scale, zp, kind_t::sum));
return *this;
}
@ -261,10 +261,7 @@ class Attr {
return *this;
}
dnnl::post_ops extract_post_ops(
const at::Tensor& dst,
bool is_quantized = false,
bool int8_output = false) {
dnnl::post_ops extract_post_ops(const at::Tensor& dst) {
// this function is used to extract post ops params from the ops_params_
// and put them into onednn post ops
for (size_t i = 0; i < ops_params_.size(); ++i) {
@ -303,11 +300,6 @@ class Attr {
}
}
// if output is quantized, then append the eltwise linear to adjust the
// output scale/zero_point
if (is_quantized && int8_output) {
dnnl_post_ops_.append_eltwise(kind_with_linear, q_scale_, q_zero_point_);
}
return dnnl_post_ops_;
}
@ -410,6 +402,7 @@ static inline void construct_attr_by_post_op(
double binary_alpha,
double input1_scale,
int64_t input1_zero_point,
std::optional<at::Tensor> accum,
const std::string_view& unary_post_op,
const torch::List<std::optional<at::Scalar>>& unary_post_op_args,
const std::string_view& unary_post_op_algorithm,
@ -418,11 +411,46 @@ static inline void construct_attr_by_post_op(
(binary_post_op == "none" && unary_post_op == "none"); // not post-ops
bool is_unary_post_op_only =
(binary_post_op == "none" && unary_post_op != "none"); // ex., conv + relu
bool is_valid_binary_combination =
(binary_post_op == "add" || binary_post_op == "sum") &&
(unary_post_op == "none" || unary_post_op == "relu");
TORCH_INTERNAL_ASSERT(
is_unary_post_op_only || is_none_post_op,
"Currently, quantization backend for Intel GPU only supports convolution or convolution with unary post operation like ReLU");
construct_attr_for_unary(
unary_post_op, unary_post_op_args, unary_post_op_algorithm, attr);
is_unary_post_op_only || is_none_post_op || is_valid_binary_combination,
"Please provide valid combination of unary post operators and binary post operators");
if (binary_post_op == "none") {
construct_attr_for_unary(
unary_post_op, unary_post_op_args, unary_post_op_algorithm, attr);
} else if (binary_post_op == "sum") {
if (unary_post_op == "none") {
if (input1_zero_point != 0)
attr = attr.append_post_eltwise(
/*scale*/ 1.f,
/*alpha*/ 1.f,
-input1_zero_point * input1_scale,
attr.kind_with_linear);
attr = attr.append_post_sum(1, input1_scale, /*input1_zero_point*/ 0);
} else if (unary_post_op == "relu") {
if (input1_zero_point != 0)
attr = attr.append_post_eltwise(
/*scale*/ 1.f,
/*alpha*/ 1.f,
-input1_zero_point * input1_scale,
attr.kind_with_linear);
attr = attr.append_post_sum(1, input1_scale, /*input1_zero_point*/ 0);
attr = attr.append_post_eltwise(
/* scale */ 1.f,
/* alpha */ 0.f,
/* beta */ 0.f,
attr.kind_with_relu);
}
} else if (binary_post_op == "add") {
TORCH_CHECK(accum.has_value());
attr = attr.append_post_binary(attr.kind_with_binary_add, accum.value());
if (unary_post_op == "relu") {
attr = attr.append_post_eltwise(1.f, 0.f, 0.f, attr.kind_with_relu);
}
}
}
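The "sum" branch above folds the accumulator's zero point into an eltwise linear post-op (beta = -zp * scale) that runs before the scaled sum, so the pair is equivalent to adding the dequantized accumulator. A quick numeric check of that identity, with made-up values:

    scale, zp = 0.05, 3          # hypothetical accumulator quant params
    accum_q = 17                 # a quantized accumulator element
    conv_out = 1.25              # convolution result before post-ops
    after_linear = 1.0 * conv_out + (-zp * scale)   # eltwise linear: alpha=1, beta=-zp*scale
    after_sum = after_linear + scale * accum_q      # sum post-op with scale, zero point already removed
    assert abs(after_sum - (conv_out + scale * (accum_q - zp))) < 1e-12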
} // namespace at::native::onednn

View File

@ -11,14 +11,19 @@
namespace at::native::onednn {
static std::tuple<dnnl::memory::desc, dnnl::memory::desc, dnnl::memory::desc>
static std::tuple<
dnnl::memory::desc,
dnnl::memory::desc,
dnnl::memory::desc,
dnnl::memory::desc>
qconv_get_md(
const at::Tensor& src,
const at::Tensor& wgh,
std::optional<at::Tensor> bias,
const at::Tensor& dst,
int64_t groups) {
// create dnnl::memory desc from the src/wgh/dst tensors
dnnl::memory::desc src_usr_md, wgh_usr_md, dst_usr_md;
dnnl::memory::desc src_usr_md, wgh_usr_md, dst_usr_md, bias_usr_md;
auto ndim = src.ndimension();
bool src_is_cl =
(src.suggest_memory_format() == at::MemoryFormat::ChannelsLast) ||
@ -44,7 +49,14 @@ qconv_get_md(
auto fmt_wgh = conv_weight_fmt(ndim, groups != 1, wgh_is_cl);
wgh_usr_md = dnnl::memory::desc(wgh_tz, wei_data_t, fmt_wgh);
return {src_usr_md, wgh_usr_md, dst_usr_md};
if (bias.has_value()) {
bias_usr_md = dnnl::memory::desc(
bias.value().sizes().vec(),
dnnl::memory::data_type::f32,
dnnl::memory::format_tag::x);
}
return {src_usr_md, wgh_usr_md, bias_usr_md, dst_usr_md};
}
at::Tensor quantized_convolution(
@ -76,14 +88,12 @@ at::Tensor quantized_convolution(
Attr(/*q_scale=*/1.0 / inv_output_scale, /*zp=*/output_zero_point);
auto ndim = act.ndimension();
if (bias.has_value()) {
attr = attr.append_bias(bias.value(), ndim - 2);
}
construct_attr_by_post_op(
binary_attr.has_value() ? binary_attr.value() : "none",
binary_alpha.has_value() ? binary_alpha.value().to<double>() : 1.0,
accum_scale,
accum_zero_point,
accum,
unary_attr.has_value() ? unary_attr.value() : "none",
unary_scalars,
unary_algorithm.has_value() ? unary_algorithm.value() : "",
@ -110,10 +120,7 @@ at::Tensor quantized_convolution(
dnnl::memory::dims _dilation = compatible_dilation(dilation);
dnnl::post_ops po;
// extract post ops
po = attr.extract_post_ops(
output,
/*is_quantized*/ true,
output.scalar_type() == at::kByte || output.scalar_type() == at::kChar);
po = attr.extract_post_ops(output);
int mask_ac = 0, mask_weight;
// [Note: Per-channel quantization mask setting]
// Per-channel quantization is on weight output channel mostly, mask_weight=
@ -127,10 +134,11 @@ at::Tensor quantized_convolution(
dnnl::primitive_attr pattr;
bool src_need_zp = (act_scale != 0);
bool dst_need_zp = (output_zero_point != 0);
// create usr_md for tensors, and md for conv primitive
auto [src_md, weight_md, output_md] =
qconv_get_md(act, weight, output, groups);
auto [src_md, weight_md, bias_md, output_md] =
qconv_get_md(act, weight, bias, output, groups);
// get tensor md
auto ic = act.size(1);
@ -139,11 +147,14 @@ at::Tensor quantized_convolution(
compatible_weight_dims(ndim, groups, oc, ic, weight.sizes());
pattr.set_scales_mask(DNNL_ARG_SRC, mask_ac);
pattr.set_scales_mask(DNNL_ARG_DST, mask_ac);
pattr.set_scales_mask(DNNL_ARG_WEIGHTS, mask_weight);
pattr.set_post_ops(po);
if (src_need_zp)
pattr.set_zero_points_mask(DNNL_ARG_SRC, mask_ac);
if (dst_need_zp)
pattr.set_zero_points_mask(DNNL_ARG_DST, mask_ac);
pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
// create primitive
@ -153,7 +164,7 @@ at::Tensor quantized_convolution(
dnnl::algorithm::convolution_direct,
src_md,
weight_md,
dnnl::memory::desc(),
bias.has_value() ? bias_md : dnnl::memory::desc(),
output_md,
_stride,
_dilation,
@ -164,11 +175,14 @@ at::Tensor quantized_convolution(
dnnl::convolution_forward conv_forward =
dnnl::convolution_forward(conv_fwd_pd);
dnnl::memory src_m, weight_m, output_m;
dnnl::memory src_m, weight_m, output_m, bias_m;
src_m = make_onednn_memory(src_md, engine, act.data_ptr());
output_m = make_onednn_memory(output_md, engine, output.data_ptr());
weight_m = make_onednn_memory(weight_md, engine, weight.data_ptr());
if (bias.has_value()) {
bias_m = make_onednn_memory(bias_md, engine, bias.value().data_ptr());
}
std::unordered_map<int, dnnl::memory> args;
if (attr.with_binary())
@ -176,6 +190,9 @@ at::Tensor quantized_convolution(
args.insert({DNNL_ARG_SRC, src_m});
args.insert({DNNL_ARG_WEIGHTS, weight_m});
args.insert({DNNL_ARG_DST, output_m});
if (bias.has_value()) {
args.insert({DNNL_ARG_BIAS, bias_m});
}
dnnl::memory src_sc_m, src_zp_m;
Tensor src_sc_tensor, src_zp_tensor;
@ -188,7 +205,17 @@ at::Tensor quantized_convolution(
args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, src_zp_m});
}
// dst scale is no need for setting, since it is fused in postop via linear
dnnl::memory dst_sc_m, dst_zp_m;
Tensor dst_sc_tensor, dst_zp_tensor;
dst_sc_m = dnnl_memory_from_host_scalar(
static_cast<float>(inv_output_scale), dst_sc_tensor, engine);
dst_zp_m = dnnl_memory_from_host_scalar(
static_cast<int32_t>(output_zero_point), dst_zp_tensor, engine);
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_sc_m});
if (dst_need_zp) {
args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zp_m});
}
size_t scratchpad_size = conv_fwd_pd.scratchpad_desc().get_size();
Tensor scratchpad_tensor = at::empty(
{static_cast<int64_t>(scratchpad_size)},

View File

@ -118,6 +118,7 @@ void quantized_matmul(
binary_alpha,
input_scale,
input_zero_point,
other,
unary_post_op,
unary_post_op_args,
unary_post_op_algorithm,
@ -210,11 +211,9 @@ void quantized_matmul(
std::unordered_map<int, dnnl::memory> args;
dnnl::post_ops po;
po = attr.extract_post_ops(
dst,
true,
dst.scalar_type() == at::kByte || dst.scalar_type() == at::kChar);
po = attr.extract_post_ops(dst);
bool m1_need_zp = (input_zero_point != 0);
bool dst_need_zp = (output_zero_point != 0);
bool wgh_is_per_channel = weight_scales.numel() > 1;
dnnl::matmul matmul_p;
@ -242,6 +241,10 @@ void quantized_matmul(
if (m1_need_zp) {
pattr.set_zero_points_mask(DNNL_ARG_SRC, mask_ac);
}
pattr.set_scales_mask(DNNL_ARG_DST, mask_ac);
if (dst_need_zp) {
pattr.set_zero_points_mask(DNNL_ARG_DST, mask_ac);
}
if (with_bias) {
b_md = dnnl::memory::desc(bias_dims, bias_dt, bias_strides);
@ -309,6 +312,17 @@ void quantized_matmul(
args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, m1_zp_m});
}
dnnl::memory dst_sc_m, dst_zp_m;
Tensor dst_sc_tensor, dst_zp_tensor;
dst_sc_m = dnnl_memory_from_host_scalar(
static_cast<float>(output_scale), dst_sc_tensor, engine);
dst_zp_m = dnnl_memory_from_host_scalar(
static_cast<int32_t>(output_zero_point), dst_zp_tensor, engine);
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_sc_m});
if (dst_need_zp) {
args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zp_m});
}
auto qmatmul_event = dnnl::sycl_interop::execute(matmul_p, stream, args);
if (!dst.is_same(result))

View File

@ -54,7 +54,7 @@ class QConvoneDNNXPU final {
TORCH_CHECK(
attr == "none" || attr == "relu" || attr == "hardtanh" ||
attr == "hardswish" || attr == "swish",
"We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, and Hardswish are supported. However, encountered unsupported post operation:",
"We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:",
attr,
".");
}
@ -76,7 +76,7 @@ class QConvoneDNNXPU final {
dilation.vec());
Tensor output = at::empty(
dst_tz, device(c10::kXPU).dtype(output_dtype).memory_format(mfmt));
dst_tz, act.options().dtype(output_dtype).memory_format(mfmt));
return quantized_convolution(
act,
@ -104,6 +104,95 @@ class QConvoneDNNXPU final {
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
static at::Tensor run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
TORCH_CHECK(
act.dim() == 4 && binary_attr == "sum" &&
(!unary_attr.has_value() ||
(unary_attr.has_value() &&
(unary_attr.value() == "none" || unary_attr.value() == "relu"))),
"post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ",
binary_attr,
" unary_post_op: ",
unary_attr.has_value() ? unary_attr.value() : "none",
".")
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested
? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
bool has_accum_postop_sum = binary_attr == "sum";
Tensor output = has_accum_postop_sum
? accum
: at::empty(
dst_tz, act.options().dtype(output_dtype).memory_format(mfmt));
output = quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
output_scale,
output_zero_point,
/*accum*/ accum,
/*accum_scale*/ accum_scale,
/*accum_zero_point*/ accum_zero_point,
/*output_dtype*/ output_dtype,
/*binary_attr*/ binary_attr,
/*binary_alpha*/ alpha,
/*unary_attr*/ unary_attr,
/*unary_scalars*/ unary_scalars,
/*unary_algorithm*/ unary_algorithm);
if (!has_accum_postop_sum) {
return output;
} else {
return accum;
}
}
};
TORCH_LIBRARY_IMPL(onednn, XPU, m) {
@ -119,6 +208,9 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv3d_pointwise"),
QConvoneDNNXPU::run_pointwise);
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary"),
QConvoneDNNXPU::run_pointwise_binary);
}
} // namespace at::native::xpu

View File

@ -125,6 +125,132 @@ Tensor q_linear_pointwise_tensor(
return qout;
}
Tensor q_linear_pointwise_binary(
Tensor act,
double act_scale,
int64_t act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<at::Tensor> other,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double other_scale,
int64_t other_zero_point,
c10::string_view binary_post_op,
double binary_alpha,
c10::string_view unary_post_op,
torch::List<std::optional<at::Scalar>> unary_post_op_args,
c10::string_view unary_post_op_algorithm) {
TORCH_CHECK(
act.device() == weight.device() &&
act.device() == weight_scales.device() &&
act.device() == weight_zero_points.device(),
"qlinear xpu: input tensors(act, weight, weight scale, weight zero-points) should be on the same device");
Tensor b_raw = bias.has_value() ? bias.value() : at::Tensor();
const int64_t dim = act.dim();
int64_t K = act.size(dim - 1);
int64_t M = act.numel() / K;
// [M, K] x [K, N]
int64_t N = weight.size(1);
std::vector<int64_t> src_dims = {M, K};
std::vector<int64_t> dst_dims = {M, N};
auto out_dtype =
output_dtype.has_value() ? output_dtype.value() : act.scalar_type();
Tensor qout = at::empty(dst_dims, act.options().dtype(out_dtype));
quantized_matmul(
act.contiguous(),
act_scale,
act_zero_point,
weight.contiguous(),
weight_scales,
weight_zero_points,
b_raw,
qout,
output_scale,
output_zero_point,
output_dtype,
/*other*/ other,
/*other scale*/ other_scale,
/*other zp*/ other_zero_point,
/*binary post op*/ binary_post_op,
/*binary alpha*/ binary_alpha,
unary_post_op,
unary_post_op_args,
unary_post_op_algorithm,
/*m2_trans*/ true);
return qout;
}
Tensor q_linear_pointwise_binary_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<at::Tensor> other,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double other_scale,
int64_t other_zero_point,
c10::string_view binary_post_op,
double binary_alpha,
c10::string_view unary_post_op,
torch::List<std::optional<at::Scalar>> unary_post_op_args,
c10::string_view unary_post_op_algorithm) {
TORCH_CHECK(
act.device() == weight.device() &&
act.device() == weight_scales.device() &&
act.device() == weight_zero_points.device(),
"qlinear xpu: input tensors(act, weight, weight scale, weight zero-points) should be on the same device");
Tensor b_raw = bias.has_value() ? bias.value() : at::Tensor();
const int64_t dim = act.dim();
int64_t K = act.size(dim - 1);
int64_t M = act.numel() / K;
// [M, K] x [K, N]
int64_t N = weight.size(1);
std::vector<int64_t> src_dims = {M, K};
std::vector<int64_t> dst_dims = {M, N};
auto out_dtype =
output_dtype.has_value() ? output_dtype.value() : act.scalar_type();
Tensor qout = at::empty(dst_dims, act.options().dtype(out_dtype));
quantized_matmul(
act.contiguous(),
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight.contiguous(),
weight_scales,
weight_zero_points,
b_raw,
qout,
output_scale,
output_zero_point,
output_dtype,
/*other*/ other,
/*other scale*/ other_scale,
/*other zp*/ other_zero_point,
/*binary post op*/ binary_post_op,
/*binary alpha*/ binary_alpha,
unary_post_op,
unary_post_op_args,
unary_post_op_algorithm,
/*m2_trans*/ true);
return qout;
}
at::Tensor q_linear_prepack_onednn(
at::Tensor weight,
std::optional<torch::List<int64_t>> input_shape) {
@ -142,6 +268,12 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"),
TORCH_FN(q_linear_prepack_onednn));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"),
TORCH_FN(q_linear_pointwise_binary));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"),
TORCH_FN(q_linear_pointwise_binary_tensor));
}
} // namespace at::native::xpu

View File

@ -7066,13 +7066,11 @@
- func: _scaled_mm(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False) -> Tensor
variants: function
dispatch:
CPU: _scaled_mm_cpu
CUDA: _scaled_mm_cuda
- func: _scaled_mm.out(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False, *, Tensor(a!) out) -> Tensor(a!)
variants: function
dispatch:
CPU: _scaled_mm_out_cpu
CUDA: _scaled_mm_out_cuda
# NOTE [ Sparse: autograd and API ]
@ -14904,6 +14902,7 @@
- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
dispatch:
CUDA: _scaled_dot_product_cudnn_attention_cuda
NestedTensorCUDA: _scaled_dot_product_cudnn_attention_nestedtensor_cuda
tags: nondeterministic_seeded
- func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor attn_bias, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, *, float? scale=None) -> (Tensor, Tensor, Tensor)
@ -14936,6 +14935,11 @@
dispatch:
CUDA: _efficient_attention_backward
- func: _cudnn_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
dispatch:
CUDA: _cudnn_attention_forward
tags: nondeterministic_seeded
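A hedged sketch of calling the newly registered op for the dense path (cum_seq_q/cum_seq_k left unset), mirroring the schema above; shapes and dtype are illustrative and a cuDNN-capable GPU is assumed.

    import torch
    q = torch.randn(2, 4, 128, 64, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)
    out, logsumexp, *rest = torch.ops.aten._cudnn_attention_forward(
        q, k, v, None, None, None, 128, 128,
        True,    # compute_log_sumexp
        0.0,     # dropout_p
        False,   # is_causal
        False,   # return_debug_mask
        scale=None)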
- func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor
variants: function
dispatch:

View File

@ -1,4 +1,5 @@
#include <ATen/ATen.h>
#pragma once
#include <ATen/core/Tensor.h>
namespace at::native::preprocessing {

View File

@ -18,6 +18,8 @@
#include <ATen/native/transformers/cuda/sdp_utils.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/CUDAGraphsUtils.cuh>
namespace at::native {
namespace {
@ -320,6 +322,33 @@ _scaled_dot_product_efficient_attention_nestedtensor_cuda(
return std::make_tuple(std::move(attention), std::move(log_sumexp), std::move(seed), std::move(offset));
}
std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Tensor, Tensor>
_scaled_dot_product_cudnn_attention_nestedtensor_cuda(
const Tensor& query,
const Tensor& key,
const Tensor& value,
const std::optional<Tensor>& attn_bias,
bool compute_logsumexp,
double dropout_p,
bool is_causal,
bool return_debug_mask,
std::optional<double> scale) {
auto [
query_buffer_reshaped,
key_buffer_reshaped,
value_buffer_reshaped,
cumulative_sequence_length_q,
cumulative_sequence_length_kv,
max_seqlen_batch_q,
max_seqlen_batch_kv,
output_shape] = preprocessing::sdpa_nested_preprocessing(query, key, value);
auto [attention, log_sumexp, ignore1, ignore2, ignore3, ignore4, cudnn_seed, cudnn_offset, ignore5] = at::_cudnn_attention_forward(query_buffer_reshaped, key_buffer_reshaped, value_buffer_reshaped, attn_bias, cumulative_sequence_length_q, cumulative_sequence_length_kv, max_seqlen_batch_q, max_seqlen_batch_kv, compute_logsumexp, dropout_p, is_causal, return_debug_mask, scale);
attention = wrap_buffer(attention.view(-1), output_shape).transpose(1, 2);
return std::make_tuple(std::move(attention), std::move(log_sumexp), cumulative_sequence_length_q, cumulative_sequence_length_kv, max_seqlen_batch_q, max_seqlen_batch_kv, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
}
std::tuple<at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_flash_attention_backward_nested(
const at::Tensor& grad_out_,
const at::Tensor& query,

View File

@ -464,6 +464,7 @@ def define_qnnpack(third_party, labels = []):
apple_sdks = (IOS, MACOSX, APPLETVOS),
compiler_flags = [
"-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION",
"-Wno-unused-command-line-argument",
],
fbobjc_preprocessor_flags = [
"-DQNNP_PRIVATE=",

View File

@ -26,6 +26,8 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_cudnn_attention_forward.h>
#include <ATen/ops/_cudnn_attention_forward_native.h>
#include <ATen/ops/_efficient_attention_forward.h>
#include <ATen/ops/_efficient_attention_forward_native.h>
#include <ATen/ops/_fill_mem_eff_dropout_mask_native.h>
@ -63,6 +65,7 @@
#include <ATen/native/transformers/attention.h>
#include <ATen/native/nested/NestedTensorUtils.h>
#include <ATen/native/nested/NestedTensorTransformerUtils.h>
#include <ATen/native/nested/NestedTensorTransformerFunctions.h>
#include <ATen/native/transformers/cuda/sdp_utils.h>
#include <ATen/native/transformers/sdp_utils_cpp.h>
@ -87,6 +90,25 @@
namespace at {
namespace cuda::philox {
__global__ void unpack_cudnn(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr) {
if (arg.captured_) {
*seed_ptr = static_cast<int64_t>(*arg.seed_.ptr);
*offset_ptr = static_cast<int64_t>(
*(arg.offset_.ptr) + static_cast<int64_t>(arg.offset_intragraph_));
} else {
*seed_ptr = static_cast<int64_t>(arg.seed_.val);
*offset_ptr = static_cast<int64_t>(arg.offset_.val);
}
}
void unpack_cudnn_wrapper(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr, cudaStream_t stream) {
at::cuda::philox::unpack_cudnn<<<1, 1, 0, stream>>>(arg, seed_ptr, offset_ptr);
}
} // namespace cuda::philox
namespace native {
namespace {
@ -732,16 +754,177 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Ten
return std::make_tuple(attention, logsumexp, Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_k, philox_seed, philox_offset, debug_attn_mask);
}
// Adapted from TE
// extract seed and offset from PhiloxCudaState
__global__ void unpack_cudnn(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr) {
if (arg.captured_) {
*seed_ptr = static_cast<int64_t>(*arg.seed_.ptr);
*offset_ptr = static_cast<int64_t>(
*(arg.offset_.ptr) + static_cast<int64_t>(arg.offset_intragraph_));
std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Tensor, Tensor> _cudnn_attention_forward(
const Tensor& query,
const Tensor& key,
const Tensor& value,
const std::optional<Tensor>& attn_bias,
const std::optional<Tensor>& cumulative_sequence_length_q,
const std::optional<Tensor>& cumulative_sequence_length_kv,
long max_seqlen_batch_q,
long max_seqlen_batch_kv,
bool compute_logsumexp,
double dropout_p,
bool is_causal,
bool return_debug_mask,
std::optional<double> scale) {
// TODO(eqy): debug mask support
// Query (Batch x Num_heads x Q_seq_len x Dim_per_head)
// Key (Batch x Num_heads x KV_seq_len x Dim_per_head)
// Value (Batch x Num_heads x KV_seq_len x Dim_per_head)
const bool is_nested = cumulative_sequence_length_q.has_value();
if (!is_nested) {
const int64_t batch_size = query.size(0);
const int64_t num_heads = query.size(1);
const int64_t head_dim_qk = query.size(3);
const int64_t head_dim_v = value.size(3);
auto attn_bias_ = attn_bias;
if (attn_bias_.has_value()) {
const auto bias_dim = attn_bias_.value().dim();
if (bias_dim == 2) {
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_kv});
} else if (bias_dim == 3) {
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_kv});
} else {
TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_kv});
}
}
Tensor attention, log_sumexp;
at::Tensor cudnn_seed, cudnn_offset;
cudnn_seed = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
cudnn_offset = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
// See Note [Seed and Offset Device] in _efficient_attention_forward
at::PhiloxCudaState philox_state;
const bool in_capture_stream =
at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None;
if (use_dropout) {
// Device
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
// if using dropout, we produce 1 random number for each element of the
// attention tensor
// TODO(eqy): should state be advanced per thread (local) amount or per call/launch (global) amount
philox_state = gen->philox_cuda_state(batch_size * num_heads * max_seqlen_batch_q * max_seqlen_batch_kv);
at::cuda::philox::unpack_cudnn_wrapper(
philox_state, static_cast<int64_t*>(cudnn_seed.data_ptr()), static_cast<int64_t*>(cudnn_offset.data_ptr()), at::cuda::getCurrentCUDAStream());
}
const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float();
Tensor debugmask;
run_cudnn_SDP_fprop(batch_size/*int64_t b*/,
num_heads/*int64_t h*/,
max_seqlen_batch_q/*int64_t s_q*/,
max_seqlen_batch_kv/*int64_t s_kv*/,
head_dim_qk/*int64_t d_qk*/,
head_dim_v/*int64_t d_v*/,
softmax_scale/*float scaling_factor*/,
compute_logsumexp/* bool */,
is_causal/* bool */,
dropout_p/*double dropout_probability*/,
query/* Tensor q*/,
key/* Tensor k*/,
value/* Tensor v*/,
attn_bias_ /* std::optional<Tensor> */,
log_sumexp/*Tensor softmaxstats*/,
attention/*Tensor o*/,
cudnn_seed/*Tensor dropoutseed*/,
cudnn_offset/*Tensor dropoutoffset*/);
// TODO(eqy): support debug_attn_mask
return std::make_tuple(std::move(attention), std::move(log_sumexp), Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_kv, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
} else {
*seed_ptr = static_cast<int64_t>(arg.seed_.val);
*offset_ptr = static_cast<int64_t>(arg.offset_.val);
//auto [
// query_buffer_reshaped,
// key_buffer_reshaped,
// value_buffer_reshaped,
// cumulative_sequence_length_q,
// cumulative_sequence_length_kv,
// max_seqlen_batch_q,
// max_seqlen_batch_kv,
// output_shape] = preprocessing::sdpa_nested_preprocessing(query, key, value);
// C10_LOG_API_USAGE_ONCE("torch.sdpa.flash_attention_cudnn");
// TODO(eqy): debug mask support
// BHSD ...
const int64_t batch_size = cumulative_sequence_length_q.value().size(0) - 1;
const int64_t num_heads_q = query.size(-2);
const int64_t num_heads_k = key.size(-2);
const int64_t num_heads_v = value.size(-2);
const int64_t head_dim_qk = query.size(-1);
const int64_t head_dim_v = value.size(-1);
auto attn_bias_ = attn_bias;
if (attn_bias_.has_value()) {
const auto bias_dim = attn_bias_.value().dim();
if (bias_dim == 2) {
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_kv});
} else if (bias_dim == 3) {
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_kv});
} else {
TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_kv});
}
}
Tensor attention, log_sumexp;
at::Tensor cudnn_seed, cudnn_offset;
cudnn_seed = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
cudnn_offset = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
// See Note [Seed and Offset Device] in _efficient_attention_forward
at::PhiloxCudaState philox_state;
const bool in_capture_stream =
at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None;
if (use_dropout) {
// Device
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
// if using dropout, we produce 1 random number for each element of the
// attention tensor
// TODO(eqy): should state be advanced per thread (local) amount or per call/launch (global) amount
philox_state = gen->philox_cuda_state(batch_size * num_heads_q * max_seqlen_batch_q * max_seqlen_batch_kv);
at::cuda::philox::unpack_cudnn_wrapper(philox_state, static_cast<int64_t*>(cudnn_seed.data_ptr()), static_cast<int64_t*>(cudnn_offset.data_ptr()), at::cuda::getCurrentCUDAStream());
}
const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked();
run_cudnn_SDP_fprop_nestedtensor(batch_size/*int64_t b*/,
num_heads_q/*int64_t h*/,
num_heads_k,
num_heads_v,
max_seqlen_batch_q/*int64_t s_q*/,
max_seqlen_batch_kv/*int64_t s_kv*/,
head_dim_qk/*int64_t d_qk*/,
head_dim_v/*int64_t d_v*/,
softmax_scale/*float scaling_factor*/,
compute_logsumexp/* bool */,
is_causal/* bool */,
dropout_p/*double dropout_probability*/,
cumulative_sequence_length_q.value(),
cumulative_sequence_length_kv.value(),
query/* Tensor q*/,
key/* Tensor k*/,
value/* Tensor v*/,
attn_bias_ /* std::optional<Tensor> */,
log_sumexp/*Tensor softmaxstats*/,
attention/*Tensor o*/,
cudnn_seed/*Tensor dropoutseed*/,
cudnn_offset/*Tensor dropoutoffset*/);
//attention = wrap_buffer(attention.view(-1), output_shape).transpose(1, 2);
return std::make_tuple(std::move(attention), std::move(log_sumexp), cumulative_sequence_length_q.value(), cumulative_sequence_length_kv.value(), max_seqlen_batch_q, max_seqlen_batch_kv, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
}
}
@ -757,84 +940,88 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Ten
std::optional<double> scale) {
// Used for tracking usage statistics
C10_LOG_API_USAGE_ONCE("torch.sdpa.flash_attention_cudnn");
// TODO(eqy): debug mask support
// Query (Batch x Num_heads x Q_seq_len x Dim_per_head)
// Key (Batch x Num_heads x KV_seq_len x Dim_per_head)
// Value (Batch x Num_heads x KV_seq_len x Dim_per_head)
const int64_t batch_size = query.size(0);
const int64_t num_heads = query.size(1);
const int64_t max_seqlen_batch_q = query.size(2);
const int64_t head_dim_qk = query.size(3);
const int64_t head_dim_v = value.size(3);
const int64_t max_seqlen_batch_k = key.size(2);
const int64_t max_seqlen_batch_v = value.size(2);
TORCH_CHECK(
max_seqlen_batch_k == max_seqlen_batch_v,
"Key and Value must have the same sequence length");
auto attn_bias_ = attn_bias;
if (attn_bias_.has_value()) {
const auto bias_dim = attn_bias_.value().dim();
if (bias_dim == 2) {
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
} else if (bias_dim == 3) {
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
} else {
TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
}
}
Tensor attention, log_sumexp;
return at::_cudnn_attention_forward(query, key, value, attn_bias, std::nullopt, std::nullopt, max_seqlen_batch_q, max_seqlen_batch_k, compute_logsumexp, dropout_p, is_causal, return_debug_mask, scale);
//// TODO(eqy): debug mask support
//// Query (Batch x Num_heads x Q_seq_len x Dim_per_head)
//// Key (Batch x Num_heads x KV_seq_len x Dim_per_head)
//// Value (Batch x Num_heads x KV_seq_len x Dim_per_head)
//const int64_t batch_size = query.size(0);
//const int64_t num_heads = query.size(1);
//const int64_t max_seqlen_batch_q = query.size(2);
//const int64_t head_dim_qk = query.size(3);
//const int64_t head_dim_v = value.size(3);
//const int64_t max_seqlen_batch_k = key.size(2);
//const int64_t max_seqlen_batch_v = value.size(2);
//TORCH_CHECK(
// max_seqlen_batch_k == max_seqlen_batch_v,
// "Key and Value must have the same sequence length");
//auto attn_bias_ = attn_bias;
//if (attn_bias_.has_value()) {
// const auto bias_dim = attn_bias_.value().dim();
// if (bias_dim == 2) {
// attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
// } else if (bias_dim == 3) {
// attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
// } else {
// TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
// attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
// }
//}
at::Tensor cudnn_seed, cudnn_offset;
cudnn_seed = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
cudnn_offset = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
//Tensor attention, log_sumexp;
const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
//at::Tensor cudnn_seed, cudnn_offset;
//cudnn_seed = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
//cudnn_offset = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
// See Note [Seed and Offset Device] in _efficient_attention_forward
at::PhiloxCudaState philox_state;
const bool in_capture_stream =
at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None;
if (use_dropout) {
// Device
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
//const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
// if using dropout, we produce 1 random number for each element of the
// attention tensor
// TODO(eqy): should state be advanced per thread (local) amount or per call/launch (global) amount
philox_state = gen->philox_cuda_state(batch_size * num_heads * max_seqlen_batch_q * max_seqlen_batch_k);
at::cuda::philox::unpack_cudnn_wrapper(
philox_state, static_cast<int64_t*>(cudnn_seed.data_ptr()), static_cast<int64_t*>(cudnn_offset.data_ptr()), at::cuda::getCurrentCUDAStream());
}
//// See Note [Seed and Offset Device] in _efficient_attention_forward
//at::PhiloxCudaState philox_state;
//const bool in_capture_stream =
// at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None;
//if (use_dropout) {
// // Device
// auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
// std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float();
Tensor debugmask;
// // See Note [Acquire lock when using random generators]
// std::lock_guard<std::mutex> lock(gen->mutex_);
// // if using dropout, we produce 1 random number for each element of the
// // attention tensor
// // TODO(eqy): should state be advanced per thread (local) amount or per call/launch (global) amount
// philox_state = gen->philox_cuda_state(batch_size * num_heads * max_seqlen_batch_q * max_seqlen_batch_k);
// at::cuda::philox::unpack_cudnn_wrapper(
// philox_state, static_cast<int64_t*>(cudnn_seed.data_ptr()), static_cast<int64_t*>(cudnn_offset.data_ptr()), at::cuda::getCurrentCUDAStream());
//}
run_cudnn_SDP_fprop(batch_size/*int64_t b*/,
num_heads/*int64_t h*/,
max_seqlen_batch_q/*int64_t s_q*/,
max_seqlen_batch_k/*int64_t s_kv*/,
head_dim_qk/*int64_t d_qk*/,
head_dim_v/*int64_t d_v*/,
softmax_scale/*float scaling_factor*/,
compute_logsumexp/* bool */,
is_causal/* bool */,
dropout_p/*double dropout_probability*/,
query/* Tensor q*/,
key/* Tensor k*/,
value/* Tensor v*/,
attn_bias_ /* std::optional<Tensor> */,
log_sumexp/*Tensor softmaxstats*/,
attention/*Tensor o*/,
cudnn_seed/*Tensor dropoutseed*/,
cudnn_offset/*Tensor dropoutoffset*/);
//const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float();
//Tensor debugmask;
// TODO(eqy): support debug_attn_mask
return std::make_tuple(std::move(attention), std::move(log_sumexp), Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_k, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
//run_cudnn_SDP_fprop(batch_size/*int64_t b*/,
// num_heads/*int64_t h*/,
// max_seqlen_batch_q/*int64_t s_q*/,
// max_seqlen_batch_k/*int64_t s_kv*/,
// head_dim_qk/*int64_t d_qk*/,
// head_dim_v/*int64_t d_v*/,
// softmax_scale/*float scaling_factor*/,
// compute_logsumexp/* bool */,
// is_causal/* bool */,
// dropout_p/*double dropout_probability*/,
// query/* Tensor q*/,
// key/* Tensor k*/,
// value/* Tensor v*/,
// attn_bias_ /* std::optional<Tensor> */,
// log_sumexp/*Tensor softmaxstats*/,
// attention/*Tensor o*/,
// cudnn_seed/*Tensor dropoutseed*/,
// cudnn_offset/*Tensor dropoutoffset*/);
//// TODO(eqy): support debug_attn_mask
//return std::make_tuple(std::move(attention), std::move(log_sumexp), Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_k, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
}
std::tuple<Tensor, Tensor, Tensor, Tensor> _scaled_dot_product_efficient_attention_cuda(

View File

@ -505,10 +505,23 @@ bool check_cudnn_hardware_support(sdp_params const& params, bool debug) {
}
bool check_for_nested_inputs(sdp_params const& params, bool debug) {
// Check that the input is nested
if (has_for_nested_inputs(params)) {
static const bool enable_cudnn_nested = c10::utils::check_env("TORCH_CUDNN_SDPA_NESTED_TENSOR_ENABLED") == true;
if (has_for_nested_inputs(params) && !enable_cudnn_nested) {
if (debug) {
TORCH_WARN("CuDNN currently does not support nested inputs.");
TORCH_WARN("Experimental cuDNN SDPA nested tensor support is not enabled.");
}
return false;
} else if (has_for_nested_inputs(params) && (params.query.requires_grad() || params.key.requires_grad() || params.value.requires_grad())) {
if (debug) {
TORCH_WARN("Experimental cuDNN SDPA nested tensor support does not support backward.");
}
return false;
}
const auto dprop = at::cuda::getCurrentDeviceProperties();
// cuDNN SDPA nested tensor support is currently limited to SM 9.0
if (dprop->major != 9 && has_for_nested_inputs(params)) {
if (debug) {
TORCH_WARN("cuDNN SDPA nested tensor support currently requires SM 9.0.");
}
return false;
}
@ -574,7 +587,6 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
check_runtime_disabled_cudnn,
check_for_nested_inputs,
check_nonzero_sequence_lengths_dense,
check_last_dim_stride_equals_1_dense<true /*ignore_singleton_dim>*/>,
check_all_tensors_on_device,
check_tensor_shapes,
check_cudnn_tensor_shapes,
@ -588,6 +600,18 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
return false;
}
}
constexpr auto dense_constraints =
c10::array_of<bool (*)(sdp_params const&, bool)>(
check_last_dim_stride_equals_1_dense</*ignore_singleton_dim=*/true>
);
if (has_only_dense_inputs(params)) {
for (auto& constraint : dense_constraints) {
if (!constraint(params, debug)) {
return false;
}
}
}
return true;
}
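Putting the gate above together with the nested-tensor forward path, a hedged end-to-end sketch might look like the following. The environment variable name comes from check_for_nested_inputs; the rest is a plausible call pattern, not a tested recipe (SM 9.0 hardware assumed, no autograd on the inputs).

    import os
    os.environ["TORCH_CUDNN_SDPA_NESTED_TENSOR_ENABLED"] = "1"  # set before the first SDPA dispatch
    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    # two sequences of different lengths, layout (B, S*, H, D) -> transpose to (B, H, S*, D)
    q = torch.nested.nested_tensor(
        [torch.randn(3, 8, 64), torch.randn(5, 8, 64)],
        dtype=torch.bfloat16, device="cuda").transpose(1, 2)
    k, v = q.clone(), q.clone()
    with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):
        out = F.scaled_dot_product_attention(q, k, v)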

View File

@ -0,0 +1,172 @@
import argparse
import dataclasses
import json
import logging
import os
import subprocess
import sys
import tempfile
from torch._inductor.utils import fresh_inductor_cache
logger: logging.Logger = logging.getLogger(__name__)
TIMEOUT: int = 2000
MODELS: list[str] = ["nanogpt", "BERT_pytorch", "resnet50"]
@dataclasses.dataclass
class RunResult:
model: str
mode: str # inference or training
dynamic: bool
device: str # cuda or cpu
cold_compile_s: float
warm_compile_s: float
speedup: float
def get_compile_time(file: tempfile._TemporaryFileWrapper) -> float:
lines = file.readlines()
# Decode from byte string, remove new lines, parse csv
lines = [line.decode("utf-8").strip().split(",") for line in lines]
compilation_time_idx = lines[0].index("compilation_latency")
compilation_time = lines[1][compilation_time_idx]
return float(compilation_time)
def _run_torchbench_from_args(model: str, args: list[str]) -> tuple[float, float]:
with fresh_inductor_cache():
env = os.environ.copy()
with tempfile.NamedTemporaryFile(suffix=".csv") as file:
args.append("--output=" + file.name)
logger.info(f"Performing cold-start run for {model}") # noqa: G004
subprocess.check_call(args, timeout=TIMEOUT, env=env)
cold_compile_time = get_compile_time(file)
args.pop()
with tempfile.NamedTemporaryFile(suffix=".csv") as file:
args.append("--output=" + file.name)
logger.info(f"Performing warm-start run for {model}") # noqa: G004
subprocess.check_call(args, timeout=TIMEOUT, env=env)
warm_compile_time = get_compile_time(file)
return cold_compile_time, warm_compile_time
def _run_torchbench_model(results: list[RunResult], model: str, device: str) -> None:
cur_file = os.path.abspath(__file__)
torchbench_file = os.path.join(os.path.dirname(cur_file), "torchbench.py")
assert os.path.exists(
torchbench_file
), f"Torchbench does not exist at {torchbench_file}"
base_args = [
sys.executable,
torchbench_file,
f"--only={model}",
"--repeat=1",
"--performance",
"--backend=inductor",
f"--device={device}",
]
for mode, mode_args in [
("inference", ["--inference", "--bfloat16"]),
("training", ["--training", "--amp"]),
]:
for dynamic, dynamic_args in [
(False, []),
(True, ["--dynamic-shapes", "--dynamic-batch-only"]),
]:
args = list(base_args)
args.extend(mode_args)
args.extend(dynamic_args)
logger.info(f"Command: {args}") # noqa: G004
try:
cold_compile_t, warm_compile_t = _run_torchbench_from_args(model, args)
results.append(
RunResult(
"model",
mode,
dynamic,
device,
cold_compile_t,
warm_compile_t,
cold_compile_t / warm_compile_t,
)
)
except Exception as e:
print(e)
return None
def _write_results_to_json(results: list[RunResult], output_filename: str) -> None:
records = []
for result in results:
for metric_name, value in [
("cold_compile_time(s)", result.cold_compile_s),
("warm_compile_time(s)", result.warm_compile_s),
("speedup", result.speedup),
]:
records.append(
{
"benchmark": {
"name": "cache_benchmarks",
"mode": result.mode,
"extra_info": {
"is_dynamic": result.dynamic,
"device": result.device,
},
},
"model": {
"name": result.model,
"backend": "inductor",
},
"metric": {
"name": metric_name,
"type": "OSS model",
"benchmark_values": [value],
},
}
)
with open(output_filename, "w") as f:
json.dump(records, f)
def parse_cmd_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Run a TorchBench ServiceLab benchmark."
)
parser.add_argument(
"-m",
"--model",
help="Name of the model to run",
)
parser.add_argument("-d", "--device", default="cuda", help="cpu or cuda")
parser.add_argument(
"--output",
required=True,
help="The output filename (json)",
)
args, _ = parser.parse_known_args()
return args
def main() -> None:
args = parse_cmd_args()
results: list[RunResult] = []
if args.model is not None:
_run_torchbench_model(results, args.model, args.device)
else:
for model in MODELS:
_run_torchbench_model(results, model, args.device)
_write_results_to_json(results, args.output)
if __name__ == "__main__":
main()
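get_compile_time above assumes the torchbench --output CSV has a header row that names a compilation_latency column and a single data row below it. A small self-contained check of that parsing logic on a synthetic CSV; the column names and values here are made up for illustration:

import os
import tempfile

def get_compile_time_from_path(path: str) -> float:
    # Same parsing as get_compile_time, but on a path instead of an open
    # binary temp file: the header row names the columns, the second row holds values.
    with open(path, "rb") as f:
        lines = [line.decode("utf-8").strip().split(",") for line in f.readlines()]
    idx = lines[0].index("compilation_latency")
    return float(lines[1][idx])

with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f:
    f.write(b"dev,name,batch_size,speedup,compilation_latency\n")
    f.write(b"cuda,nanogpt,1,1.23,45.6\n")
    path = f.name

print(get_compile_time_from_path(path))  # 45.6
os.remove(path)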

View File

@ -38,7 +38,7 @@ update_hint_regression,compile_time_instruction_count,1787000000,0.02
sum_floordiv_regression,compile_time_instruction_count,1055000000,0.015
sum_floordiv_regression,compile_time_instruction_count,1076000000,0.015


View File

@ -841,6 +841,7 @@ libtorch_python_core_sources = [
"torch/csrc/dynamo/cpp_shim.cpp",
"torch/csrc/dynamo/cpython_defs.c",
"torch/csrc/dynamo/eval_frame.c",
"torch/csrc/dynamo/eval_frame_cpp.cpp",
"torch/csrc/dynamo/extra_state.cpp",
"torch/csrc/dynamo/framelocals_mapping.cpp",
"torch/csrc/dynamo/guards.cpp",

View File

@ -49,16 +49,9 @@ class C10_API Scalar {
#define DEFINE_IMPLICIT_CTOR(type, name) \
Scalar(type vv) : Scalar(vv, true) {}
AT_FORALL_SCALAR_TYPES_AND7(
Half,
BFloat16,
Float8_e5m2,
Float8_e4m3fn,
Float8_e5m2fnuz,
Float8_e4m3fnuz,
ComplexHalf,
DEFINE_IMPLICIT_CTOR)
AT_FORALL_SCALAR_TYPES_AND3(Half, BFloat16, ComplexHalf, DEFINE_IMPLICIT_CTOR)
AT_FORALL_COMPLEX_TYPES(DEFINE_IMPLICIT_CTOR)
AT_FORALL_FLOAT8_TYPES(DEFINE_IMPLICIT_CTOR)
// Helper constructors to allow Scalar creation from long and long long types
// As std::is_same_v<long, long long> is false(except Android), one needs to

View File

@ -222,6 +222,9 @@ std::pair<std::string, std::string> getDtypeNames(c10::ScalarType scalarType) {
return std::make_pair("float8_e5m2fnuz", "");
case c10::ScalarType::Float8_e4m3fnuz:
return std::make_pair("float8_e4m3fnuz", "");
case c10::ScalarType::Float8_e8m0fnu:
// TODO(#146647): macroify all of this
return std::make_pair("float8_e8m0fnu", "");
default:
throw std::runtime_error("Unimplemented scalar type");
}

View File

@ -7,6 +7,7 @@
#include <c10/util/Float8_e4m3fnuz.h>
#include <c10/util/Float8_e5m2.h>
#include <c10/util/Float8_e5m2fnuz.h>
#include <c10/util/Float8_e8m0fnu.h>
#include <c10/util/Half.h>
#include <c10/util/bits.h>
#include <c10/util/complex.h>
@ -102,7 +103,8 @@ struct dummy_int1_7_t {};
_(c10::dummy_int1_7_t<4>, Int4) /* 40 */ \
_(c10::dummy_int1_7_t<5>, Int5) /* 41 */ \
_(c10::dummy_int1_7_t<6>, Int6) /* 42 */ \
_(c10::dummy_int1_7_t<7>, Int7) /* 43 */
_(c10::dummy_int1_7_t<7>, Int7) /* 43 */ \
_(c10::Float8_e8m0fnu, Float8_e8m0fnu) /* 44 */
// If you want to support ComplexHalf for real, add ComplexHalf
// into this macro (and change the name). But beware: convert()
@ -146,7 +148,8 @@ struct dummy_int1_7_t {};
_(at::Float8_e5m2, Float8_e5m2) \
_(at::Float8_e4m3fn, Float8_e4m3fn) \
_(at::Float8_e5m2fnuz, Float8_e5m2fnuz) \
_(at::Float8_e4m3fnuz, Float8_e4m3fnuz)
_(at::Float8_e4m3fnuz, Float8_e4m3fnuz) \
_(at::Float8_e8m0fnu, Float8_e8m0fnu)
enum class ScalarType : int8_t {
#define DEFINE_ST_ENUM_VAL_(_1, n) n,
@ -317,6 +320,13 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
_(c10::quint4x2, QUInt4x2) \
_(c10::quint2x4, QUInt2x4)
#define AT_FORALL_FLOAT8_TYPES(_) \
_(at::Float8_e5m2, Float8_e5m2) \
_(at::Float8_e4m3fn, Float8_e4m3fn) \
_(at::Float8_e5m2fnuz, Float8_e5m2fnuz) \
_(at::Float8_e4m3fnuz, Float8_e4m3fnuz) \
_(at::Float8_e8m0fnu, Float8_e8m0fnu)
#define AT_FORALL_COMPLEX_TYPES(_) \
_(c10::complex<float>, ComplexFloat) \
_(c10::complex<double>, ComplexDouble)
@ -372,7 +382,8 @@ inline bool isIntegralType(ScalarType t) {
inline bool isFloat8Type(ScalarType t) {
return t == ScalarType::Float8_e5m2 || t == ScalarType::Float8_e5m2fnuz ||
t == ScalarType::Float8_e4m3fn || t == ScalarType::Float8_e4m3fnuz;
t == ScalarType::Float8_e4m3fn || t == ScalarType::Float8_e4m3fnuz ||
t == ScalarType::Float8_e8m0fnu;
}
inline bool isReducedFloatingType(ScalarType t) {
@ -446,6 +457,10 @@ inline bool isSignedType(ScalarType t) {
return std::numeric_limits< \
::c10::impl::ScalarTypeToCPPTypeT<ScalarType::name>>::is_signed;
// TODO(#146647): If we expect to have numeric_limits for everything,
// let's just have a big macro for the whole thing.
// If we're hardcoding it, let's just use the macro and a "true"/"false"
// below?
switch (t) {
case ScalarType::QInt8:
case ScalarType::QUInt8:
@ -467,6 +482,7 @@ inline bool isSignedType(ScalarType t) {
CASE_ISSIGNED(Float8_e5m2fnuz);
CASE_ISSIGNED(Float8_e4m3fn);
CASE_ISSIGNED(Float8_e4m3fnuz);
CASE_ISSIGNED(Float8_e8m0fnu);
CASE_ISSIGNED(Byte);
CASE_ISSIGNED(Char);
CASE_ISSIGNED(Short);
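For reference, a quick way to see which of these float8 dtypes a given torch build actually exposes on the Python side; float8_e8m0fnu only appears in builds that include this change, so the hasattr check keeps the snippet safe on older versions:

import torch

# Names mirror the AT_FORALL_FLOAT8_TYPES list above; availability depends on the build.
float8_names = [
    "float8_e5m2",
    "float8_e4m3fn",
    "float8_e5m2fnuz",
    "float8_e4m3fnuz",
    "float8_e8m0fnu",
]
available = [name for name in float8_names if hasattr(torch, name)]
print("float8 dtypes in this build:", available)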

View File

@ -0,0 +1,112 @@
#pragma once
#include <c10/macros/Macros.h>
#include <c10/util/floating_point_utils.h>
#include <cstring>
#include <limits>
// TODO(#146647): Can we remove the below warning?
C10_CLANG_DIAGNOSTIC_PUSH()
#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion")
C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
#endif
namespace c10 {
/// Constructors
inline C10_HOST_DEVICE Float8_e8m0fnu::Float8_e8m0fnu(float value)
: x(detail::fp8e8m0fnu_from_fp32_value(value)) {}
/// Implicit conversions
inline C10_HOST_DEVICE Float8_e8m0fnu::operator float() const {
// TODO(#146647): maybe rewrite without control flow
// if exponent is zero, need to special case to return 2^-127 instead of zero
if (x == 0) {
return c10::detail::fp32_from_bits(0x00400000);
}
// if exponent is NaN, need to special case to return properly encoded NaN
if (isnan()) {
return c10::detail::fp32_from_bits(0x7f800001);
}
// leave sign at 0, set the exponent bits, leave stored mantissa at 0
uint32_t res = x << 23;
return c10::detail::fp32_from_bits(res);
}
/// Special values helper
inline C10_HOST_DEVICE bool Float8_e8m0fnu::isnan() const {
return x == 0b11111111;
}
/// NOTE: we do not define comparisons directly and instead rely on the implicit
/// conversion from c10::Float8_e8m0fnu to float.
} // namespace c10
namespace std {
template <>
class numeric_limits<c10::Float8_e8m0fnu> {
public:
static constexpr bool is_specialized = true;
static constexpr bool is_signed = false;
static constexpr bool is_integer = false;
static constexpr bool is_exact = false;
static constexpr bool has_infinity = false;
static constexpr bool has_quiet_NaN = true;
static constexpr bool has_signaling_NaN = false;
static constexpr auto has_denorm = false;
static constexpr auto has_denorm_loss = false;
static constexpr auto round_style = numeric_limits<float>::round_style;
static constexpr bool is_iec559 = false;
static constexpr bool is_bounded = true;
static constexpr bool is_modulo = false;
static constexpr int digits = 1;
static constexpr int digits10 = 0;
static constexpr int max_digits10 = 1; // just a 2!
static constexpr int radix = 2;
static constexpr int min_exponent = -126;
static constexpr int min_exponent10 = -38;
static constexpr int max_exponent = 128;
static constexpr int max_exponent10 = 38;
static constexpr auto traps = numeric_limits<float>::traps;
static constexpr auto tinyness_before = false;
static constexpr c10::Float8_e8m0fnu min() {
// 2^-127
return c10::Float8_e8m0fnu(0b00000000, c10::Float8_e8m0fnu::from_bits());
}
static constexpr c10::Float8_e8m0fnu lowest() {
// 2^-127
return c10::Float8_e8m0fnu(0b00000000, c10::Float8_e8m0fnu::from_bits());
}
static constexpr c10::Float8_e8m0fnu max() {
// 254 biased, which is 127 unbiased, so 2^127
return c10::Float8_e8m0fnu(0b11111110, c10::Float8_e8m0fnu::from_bits());
}
static constexpr c10::Float8_e8m0fnu epsilon() {
// according to https://en.cppreference.com/w/cpp/types/numeric_limits, this
// is "the difference between 1.0 and the next representable value of the
// given floating-point type". The next representable value is 2.0, so the
// difference is 1.0 which is 2^0. 0 unbiased is 127 biased.
return c10::Float8_e8m0fnu(0b01111111, c10::Float8_e8m0fnu::from_bits());
}
static constexpr c10::Float8_e8m0fnu round_error() {
// 0.5 in float, which is 2^-1, and -1 + 127 = 126
return c10::Float8_e8m0fnu(0b01111110, c10::Float8_e8m0fnu::from_bits());
}
static constexpr c10::Float8_e8m0fnu quiet_NaN() {
return c10::Float8_e8m0fnu(0b11111111, c10::Float8_e8m0fnu::from_bits());
}
};
} // namespace std
C10_CLANG_DIAGNOSTIC_POP()
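The float conversion above has only three cases: biased exponent 0 decodes to 2^-127 (e8m0 has no zeros or denormals), 0xFF decodes to NaN, and everything else is 2^(x - 127). A small Python sketch of the same decode, handy for sanity-checking the bit patterns chosen for the numeric_limits specialization:

import math

def e8m0_to_float(x: int) -> float:
    # Mirror of Float8_e8m0fnu::operator float(): no sign, no mantissa,
    # the byte is just a biased power-of-two exponent.
    assert 0 <= x <= 0xFF
    if x == 0xFF:
        return math.nan              # the single NaN encoding
    return math.ldexp(1.0, x - 127)  # 2**(x - 127); x == 0 gives 2**-127

print(e8m0_to_float(0b00000000))              # 2**-127 == min() and lowest()
print(e8m0_to_float(0b01111111))              # 1.0 -> epsilon() is the gap to 2.0
print(e8m0_to_float(0b11111110))              # 2**127 == max()
print(math.isnan(e8m0_to_float(0b11111111)))  # True, quiet_NaN()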

View File

@ -0,0 +1,12 @@
#include <c10/macros/Macros.h>
#include <c10/util/Float8_e8m0fnu.h>
namespace c10 {
// TODO(#146647): Can we have these in a single shared cpp file
// built with macro to remove the need for a new cpp file?
static_assert(
std::is_standard_layout_v<Float8_e8m0fnu>,
"c10::Float8_e8m0fnu must be standard layout.");
} // namespace c10

c10/util/Float8_e8m0fnu.h Normal file
View File

@ -0,0 +1,120 @@
#pragma once
/// Defines the Float8_e8m0fnu type (8-bit floating-point) including
/// conversions to standard C types
/// Binary configuration:
/// eeeeeeee
/// no sign bits
/// 8 exponent bits
/// no mantissa bits
///
/// This is the E8M0 dtype from the OCP MX format spec
/// (https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf,
/// Section 5.4.1)
#include <c10/macros/Export.h>
#include <c10/macros/Macros.h>
#include <c10/util/floating_point_utils.h>
#include <type_traits>
// TODO(#146647): do we need to special case OPENCL?
#if defined(__cplusplus)
#include <cstdint>
#elif !defined(__OPENCL_VERSION__)
#include <math.h>
#include <stdint.h>
#endif
#include <iosfwd>
#include <ostream>
namespace c10 {
namespace detail {
/*
* Convert a 32-bit floating-point number in IEEE single-precision format to a
* 8-bit floating-point number in fp8 e8m0fnu format, in bit representation.
*/
inline C10_HOST_DEVICE uint8_t fp8e8m0fnu_from_fp32_value(float f) {
// TODO(#146647): maybe rewrite without control flow
uint32_t f_bits = c10::detail::fp32_to_bits(f);
// extract the exponent
uint32_t exponent = (f_bits >> 23) & 0b11111111;
// special case float32 NaN and +-inf to map to e8m0 nan
if (exponent == 0b11111111) {
return exponent;
}
// next, we use guard, round, sticky bits and the LSB to implement round to
// nearest, with ties to even
// guard bit - bit 23, or 22 zero-indexed
uint8_t g = (f_bits & 0x400000) > 0;
// round bit - bit 22, or 21 zero-indexed
uint8_t r = (f_bits & 0x200000) > 0;
// sticky bit - bits 21 to 1, or 20 to 0 zero-indexed
uint8_t s = (f_bits & 0x1FFFFF) > 0;
// in casting to e8m0, the LSB is the implied mantissa bit. It is 0 if the
// original float32 is denormal, and 1 if the original float32 is normal.
uint8_t lsb = exponent > 0;
// implement the RNE logic
bool round_up = false;
// if g == 0, round down (no-op)
if (g == 1) {
if ((r == 1) || (s == 1)) {
// round up
round_up = true;
} else {
if (lsb == 1) {
// round up
round_up = true;
}
// if lsb == 0, round down (no-op)
}
}
if (round_up) {
// adjust exponent
// note that if exponent was 255 we would have already returned earlier, so
// we know we can add one safely without running out of bounds
exponent++;
}
return exponent;
}
} // namespace detail
struct alignas(1) Float8_e8m0fnu {
uint8_t x;
struct from_bits_t {};
C10_HOST_DEVICE static constexpr from_bits_t from_bits() {
return from_bits_t();
}
Float8_e8m0fnu() = default;
constexpr C10_HOST_DEVICE Float8_e8m0fnu(uint8_t bits, from_bits_t)
: x(bits) {}
inline C10_HOST_DEVICE Float8_e8m0fnu(float value);
inline C10_HOST_DEVICE operator float() const;
inline C10_HOST_DEVICE bool isnan() const;
};
C10_API inline std::ostream& operator<<(
std::ostream& out,
const Float8_e8m0fnu& value) {
out << (float)value;
return out;
}
} // namespace c10
#include <c10/util/Float8_e8m0fnu-inl.h> // IWYU pragma: keep
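The conversion above rounds a float32 to the nearest power of two using the guard, round and sticky bits of the discarded mantissa, with ties resolved by the kept LSB (the implicit leading mantissa bit). A Python sketch of the same bit manipulation, written as an illustration rather than a verbatim port:

import struct

def fp32_bits(f: float) -> int:
    return struct.unpack("<I", struct.pack("<f", f))[0]

def e8m0_from_fp32(f: float) -> int:
    # Mirrors fp8e8m0fnu_from_fp32_value: keep only the biased exponent,
    # rounding the 23 dropped mantissa bits to nearest-even.
    bits = fp32_bits(f)
    exponent = (bits >> 23) & 0xFF
    if exponent == 0xFF:            # float32 NaN / +-inf map to the e8m0 NaN
        return exponent
    g = (bits & 0x400000) != 0      # guard bit (mantissa MSB)
    r = (bits & 0x200000) != 0      # round bit
    s = (bits & 0x1FFFFF) != 0      # sticky: any remaining mantissa bit
    lsb = exponent > 0              # implicit leading bit: 1 for normals, 0 for denormals
    if g and (r or s or lsb):       # round-to-nearest, ties to even
        exponent += 1               # exponent < 255 here, so no uint8 overflow
    return exponent

print(e8m0_from_fp32(1.0))   # 127 -> 2**0
print(e8m0_from_fp32(1.5))   # 128 -> tie rounds up to 2**1
print(e8m0_from_fp32(1.25))  # 127 -> rounds down to 2**0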

View File

@ -5,6 +5,7 @@
#include <c10/util/Float8_e4m3fnuz.h>
#include <c10/util/Float8_e5m2.h>
#include <c10/util/Float8_e5m2fnuz.h>
#include <c10/util/Float8_e8m0fnu.h>
#include <c10/util/Half.h>
#include <c10/util/complex.h>
#include <c10/util/overflows.h>
@ -151,6 +152,19 @@ struct static_cast_with_inter_type<
}
};
// TODO(#146647): Can we make all these template specialization happen
// based off our apply macros?
template <>
struct static_cast_with_inter_type<
c10::complex<c10::Half>,
c10::Float8_e8m0fnu> {
C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex<
c10::Half>
apply(c10::Float8_e8m0fnu src) {
return static_cast<c10::complex<c10::Half>>(c10::complex<float>{src});
}
};
template <>
struct static_cast_with_inter_type<c10::complex<c10::Half>, c10::Half> {
C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex<

View File

@ -58,7 +58,7 @@ IF(NOT MKLDNN_FOUND)
-DDNNL_CPU_RUNTIME=THREADPOOL
-DDNNL_BUILD_TESTS=OFF
-DDNNL_BUILD_EXAMPLES=OFF
-DONEDNN_BUILD_GRAPH=OFF
-DONEDNN_BUILD_GRAPH=ON
-DDNNL_LIBRARY_TYPE=STATIC
-DDNNL_DPCPP_HOST_COMPILER=${DNNL_HOST_COMPILER} # Use global cxx compiler as host compiler
-G ${CMAKE_GENERATOR} # Align Generator to Torch

hack.py Normal file
View File

@ -0,0 +1,118 @@
import torch
from torch._inductor.ir import NoneAsConstantBuffer
import torch.nn as nn
import torch.nn.functional as F
import depyf
depyf.install()
def fn(loss):
gm = None
args = None
def noop(_gm):
nonlocal gm
gm = _gm
def _noop(*_args, **_kwargs):
assert not _kwargs
nonlocal args
args = _args
return []
return _noop
with torch._dynamo.compiled_autograd._enable(noop):
loss.backward()
return gm, args
result = torch._dynamo.compiled_autograd.Op("FunctionalCompiledAutograd", fn, is_custom_function=False)
setattr(torch._dynamo.compiled_autograd.ops, "FunctionalCompiledAutograd", torch._dynamo.allow_in_graph(result))
x = torch.randn(64, 3)
t = torch.randn(64, 1)
model = nn.Linear(3, 1)
torch._dynamo.config.compiled_autograd = True
torch._dynamo.config.do_not_emit_runtime_asserts = True
@torch.compile(backend="eager")
def train(model, x, t):
y = model(x)
loss = F.mse_loss(y, t)
gm, args = torch._dynamo.compiled_autograd.ops.FunctionalCompiledAutograd(loss)
gm(*args)
return ()
# with torch._dynamo.compiled_autograd._enable(noop):
train(model, x, t)
for p in model.parameters():
assert p.grad is not None
"""
# this kinda works, but not ideal
===== __compiled_fn_1 =====
/home/xmfan/core/a/pytorch/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module):
def forward(self, L_model_parameters_weight_: "f32[1, 3][3, 1]cpu", L_model_parameters_bias_: "f32[1][1]cpu", L_x_: "f32[64, 3][3, 1]cpu", L_t_: "f32[64, 1][1, 1]cpu"):
l_model_parameters_weight_ = L_model_parameters_weight_
l_model_parameters_bias_ = L_model_parameters_bias_
l_x_ = L_x_
l_t_ = L_t_
# File: /home/xmfan/core/a/pytorch/hack.py:44 in train, code: y = model(x)
y: "f32[64, 1][1, 1]cpu" = torch._C._nn.linear(l_x_, l_model_parameters_weight_, l_model_parameters_bias_); l_x_ = l_model_parameters_weight_ = l_model_parameters_bias_ = None
# File: /home/xmfan/core/a/pytorch/hack.py:45 in train, code: loss = F.mse_loss(y, t)
loss: "f32[][]cpu" = torch.nn.functional.mse_loss(y, l_t_); y = l_t_ = None
# File: /home/xmfan/core/a/pytorch/hack.py:46 in train, code: gm, args = torch._dynamo.compiled_autograd.ops.FunctionalCompiledAutograd(loss)
functional_compiled_autograd = torch__dynamo_compiled_autograd_ops_FunctionalCompiledAutograd(loss); loss = None
getitem = functional_compiled_autograd[1]; functional_compiled_autograd = None
getitem_1 = getitem[0]; getitem = None
getitem_8: "f32[][]cpu" = getitem_1[0]
getitem_9: "f32[64, 1][1, 1]cpu" = getitem_1[1]
getitem_10: "f32[64, 1][1, 1]cpu" = getitem_1[2]
getitem_11: "f32[64, 3][3, 1]cpu" = getitem_1[3]; getitem_1 = None
# File: <eval_with_key>.0:11 in forward, code: validate_outputs = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem], [((None, None, device(type='cpu'), 6, 0, None), [], True)]); getitem = None
validate_outputs = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_8], [((None, None, device(type='cpu'), 6, 0, None), [], True)]); getitem_8 = None
getitem_15: "f32[][]cpu" = validate_outputs[0]; validate_outputs = None
# File: <eval_with_key>.0:13 in forward, code: mse_loss_backward0 = torch__dynamo_compiled_autograd_ops_MseLossBackward0([getitem_6], [True, False], 1, getitem_1, getitem_2); getitem_6 = getitem_1 = getitem_2 = None
mse_loss_backward0 = torch__dynamo_compiled_autograd_ops_MseLossBackward0([getitem_15], [True, False], 1, getitem_9, getitem_10); getitem_15 = getitem_9 = getitem_10 = None
getitem_17: "f32[64, 1][1, 1]cpu" = mse_loss_backward0[0]; mse_loss_backward0 = None
# File: <eval_with_key>.0:16 in forward, code: validate_outputs_1 = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_7, getitem_8], [((None, None, device(type='cpu'), 6, 0, None), [64, 1], True), None]); getitem_7 = getitem_8 = None
validate_outputs_1 = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_17, None], [((None, None, device(type='cpu'), 6, 0, None), [64, 1], True), None]); getitem_17 = None
getitem_19: "f32[64, 1][1, 1]cpu" = validate_outputs_1[0]; validate_outputs_1 = None
# File: <eval_with_key>.0:18 in forward, code: addmm_backward0 = torch__dynamo_compiled_autograd_ops_AddmmBackward0([getitem_9], [True, False, True], 1, 1, getitem_3, 0, [64, 3], [], None, 0, [3, 1], [1, 3]); getitem_9 = getitem_3 = None
addmm_backward0 = torch__dynamo_compiled_autograd_ops_AddmmBackward0([getitem_19], [True, False, True], 1, 1, getitem_11, 0, [64, 3], [], None, 0, [3, 1], [1, 3]); getitem_19 = getitem_11 = None
getitem_22: "f32[64, 1][1, 1]cpu" = addmm_backward0[0]
getitem_23: "f32[3, 1][1, 3]cpu" = addmm_backward0[2]; addmm_backward0 = None
# File: <eval_with_key>.0:22 in forward, code: validate_outputs_2 = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_11, getitem_12, getitem_13], [((None, None, device(type='cpu'), 6, 0, None), [1], True), None, ((None, None, device(type='cpu'), 6, 0, None), [3, 1], True)]); getitem_11 = getitem_12 = getitem_13 = None
validate_outputs_2 = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_22, None, getitem_23], [((None, None, device(type='cpu'), 6, 0, None), [1], True), None, ((None, None, device(type='cpu'), 6, 0, None), [3, 1], True)]); getitem_22 = getitem_23 = None
getitem_26: "f32[1][1]cpu" = validate_outputs_2[0]
getitem_27: "f32[3, 1][1, 3]cpu" = validate_outputs_2[2]; validate_outputs_2 = None
# File: /home/xmfan/core/a/pytorch/torch/_dynamo/polyfills/__init__.py:80 in accumulate_grad, code: new_grad = torch.clone(new_grad)
new_grad: "f32[1][1]cpu" = torch.clone(getitem_26); getitem_26 = new_grad = None
# File: <eval_with_key>.0:26 in forward, code: tbackward0 = torch__dynamo_compiled_autograd_ops_TBackward0([getitem_16], [True]); getitem_16 = None
tbackward0 = torch__dynamo_compiled_autograd_ops_TBackward0([getitem_27], [True]); getitem_27 = None
getitem_29: "f32[1, 3][3, 1]cpu" = tbackward0[0]; tbackward0 = None
# File: <eval_with_key>.0:28 in forward, code: validate_outputs_3 = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_17], [((None, None, device(type='cpu'), 6, 0, None), [1, 3], True)]); getitem_17 = None
validate_outputs_3 = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_29], [((None, None, device(type='cpu'), 6, 0, None), [1, 3], True)]); getitem_29 = None
getitem_31: "f32[1, 3][3, 1]cpu" = validate_outputs_3[0]; validate_outputs_3 = None
# File: /home/xmfan/core/a/pytorch/torch/_dynamo/polyfills/__init__.py:80 in accumulate_grad, code: new_grad = torch.clone(new_grad)
new_grad_1: "f32[1, 3][3, 1]cpu" = torch.clone(getitem_31); getitem_31 = new_grad_1 = None
return ()
"""

View File

@ -1,7 +1,6 @@
# Owner(s): ["oncall: distributed"]
import sys
from typing import List
import torch
import torch.distributed as dist
@ -119,7 +118,7 @@ def _find_name_param_mappings(module: torch.nn.Module, prefix: str):
def _discover_ddp_ignored_params(module: torch.nn.Module, prefix: str):
ddp_ignore_parameters: List[str] = []
ddp_ignore_parameters: list[str] = []
if isinstance(module, FSDP2):
ddp_ignore_parameters = [name for name, _ in module.named_parameters(prefix)]
else:

View File

@ -1089,8 +1089,7 @@ class TestHSDPWithCustomHook(FSDPTestMultiThread):
torch.nn.init.constant_(model.in_proj.weight, 1.0 * rank_group)
torch.nn.init.constant_(model.out_proj.weight, 2.0 * rank_group)
fully_shard(model, mesh=mesh)
model = cast(FSDPModule, model)
model = fully_shard(model, mesh=mesh)
hook_called: bool = False

View File

@ -292,15 +292,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
def _shard_placement_fn(param: nn.Parameter) -> Optional[Shard]:
largest_dim = -1
largest_dim_size = -1
for dim, dim_size in enumerate(param.shape):
if dim_size > largest_dim_size:
largest_dim = dim
largest_dim_size = dim_size
assert largest_dim >= 0, f"{param.shape}"
assert largest_dim < param.ndim, f"{largest_dim=} {param.shape}"
return Shard(largest_dim)
return Shard(param.shape.index(max(param.shape)))
shard_placement_fn = _shard_placement_fn if use_shard_placement_fn else None
fully_shard(model, shard_placement_fn=shard_placement_fn)
@ -769,13 +761,7 @@ class TestFullyShardShardPlacementFnMultiProcess(FSDPTest):
ref_optim = torch.optim.AdamW(ref_model.parameters(), lr=1e-2)
def shard_placement_fn(param: nn.Parameter) -> Optional[Shard]:
largest_dim = -1
largest_dim_size = -1
for dim, dim_size in enumerate(param.shape):
if dim_size > largest_dim_size:
largest_dim = dim
largest_dim_size = dim_size
return Shard(largest_dim)
return Shard(param.shape.index(max(param.shape)))
for layer in model.layers:
fully_shard(layer, shard_placement_fn=shard_placement_fn)
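The one-liner replacing the loop relies on torch.Size being a tuple subclass, so shape.index(max(shape)) returns the first dimension of maximal size, exactly like the old largest_dim scan. A quick check of that equivalence; the shapes below are arbitrary examples:

import torch

def largest_dim_loop(shape: torch.Size) -> int:
    # Old behaviour: explicit scan keeping the first maximal dimension.
    largest_dim, largest_dim_size = -1, -1
    for dim, dim_size in enumerate(shape):
        if dim_size > largest_dim_size:
            largest_dim, largest_dim_size = dim, dim_size
    return largest_dim

for shape in [torch.Size([4, 16]), torch.Size([16, 4]), torch.Size([8, 8, 2])]:
    assert largest_dim_loop(shape) == shape.index(max(shape))
print("loop and shape.index(max(shape)) agree")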

View File

@ -97,11 +97,7 @@ class ReplicateTest(MultiProcessInductorTestCase):
device: Union[str, torch.device],
):
self.create_pg(device)
torch._dynamo.config.optimize_ddp = (
"python_reducer_without_compiled_forward"
if no_compile_forward
else "python_reducer"
)
torch._dynamo.config.optimize_ddp = "python_reducer"
torch.manual_seed(123)
model = Net(checkpoint=checkpoint).to(device)
input = torch.randn([1, DIM], device=device)

View File

@ -47,7 +47,12 @@ from torch.testing._internal.distributed._tensor.common_dtensor import (
MultiProcessTestCase,
with_comms,
)
from torch.testing._internal.distributed.common_state_dict import VerifyStateDictMixin
from torch.testing._internal.distributed.common_state_dict import (
FusionEmbedding,
FusionEmbeddingWithHook,
FusionEmbeddingWithModifier,
VerifyStateDictMixin,
)
from torch.utils._pytree import tree_all, tree_all_only
@ -919,6 +924,20 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
),
)
@with_comms
@skip_if_lt_x_gpu(2)
def test_state_dict_with_hook_on_keys(self) -> None:
with torch.device("meta"):
metamodel = FusionEmbedding(4, 4, 4)
with torch.device("cuda"):
gpumodel = FusionEmbeddingWithHook(4, 4, 4)
gpumodel_state_dict = get_model_state_dict(gpumodel)
with self.assertRaisesRegex(RuntimeError, "Missing key"):
set_model_state_dict(metamodel, gpumodel_state_dict)
with torch.device("meta"):
metamodel_modified = FusionEmbeddingWithModifier(4, 4, 4)
set_model_state_dict(metamodel_modified, gpumodel_state_dict)
class TestNoComm(MultiProcessTestCase):
def setUp(self) -> None:

View File

@ -170,7 +170,7 @@ class DistributedUtilTest(TestCase):
server_port=pick_free_port,
timeout=1,
)
with self.assertRaises(RuntimeError):
with self.assertRaises(DistNetworkError):
create_c10d_store(
is_server=True, server_addr=server_addr, server_port=store1.port
)

View File

@ -8,7 +8,7 @@ import sys
REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent
sys.path.insert(0, str(REPO_ROOT))
from tools.flight_recorder.components.types import COLLECTIVES, MatchState
from tools.flight_recorder.components.types import COLLECTIVES, MatchInfo, MatchState
from tools.flight_recorder.components.utils import match_one_event
@ -50,14 +50,14 @@ class FlightRecorderEventTest(TestCase):
)
membership = {"0": {0, 1}}
self.assertEqual(
match_one_event(e1, e1, membership, "0"), MatchState.FULLY_MATCHED
match_one_event(e1, e1, membership, "0").state, MatchState.FULLY_MATCHED
)
e2 = create_one_event(
"all_gather", ("0", "default"), [[4, 4]], [[4, 4]], "scheduled", 1
)
self.assertEqual(
match_one_event(e1, e2, membership, "0"),
match_one_event(e1, e2, membership, "0").state,
MatchState.COLLECTIVE_TYPE_MISMATCH,
)
@ -67,34 +67,39 @@ class FlightRecorderEventTest(TestCase):
e4 = create_one_event(
"all_to_all", ("0", "default"), [[4, 4]], [[4, 4]], "scheduled", 1
)
self.assertEqual(match_one_event(e3, e4, membership, "0"), MatchState.UNDECIDED)
self.assertEqual(
match_one_event(e3, e4, membership, "0").state, MatchState.UNDECIDED
)
e5 = create_one_event(
"all_reduce", ("0", "default"), [[5, 4]], [[4, 4]], "scheduled", 1, 1
)
self.assertEqual(
match_one_event(e1, e5, membership, "0"), MatchState.SIZE_OR_SYNTAX_MISMATCH
match_one_event(e1, e5, membership, "0").state,
MatchState.SIZE_OR_SYNTAX_MISMATCH,
)
e6 = create_one_event(
"all_reduce", ("0", "default"), [[4, 4]], [[5, 4]], "scheduled", 1, 2
)
self.assertEqual(
match_one_event(e1, e6, membership, "0"), MatchState.SIZE_OR_SYNTAX_MISMATCH
match_one_event(e1, e6, membership, "0").state,
MatchState.SIZE_OR_SYNTAX_MISMATCH,
)
e7 = create_one_event(
"all_reduce", ("0", "default"), [[4, 4]], [[5, 4]], "scheduled", 2
)
self.assertEqual(
match_one_event(e7, e7, membership, "0"), MatchState.SIZE_OR_SYNTAX_MISMATCH
match_one_event(e7, e7, membership, "0").state,
MatchState.SIZE_OR_SYNTAX_MISMATCH,
)
e9 = create_one_event(
"all_reduce", ("0", "default"), [[4, 4]], [[4, 4]], "completed", 1
)
self.assertEqual(
match_one_event(e1, e9, membership, "0"),
match_one_event(e1, e9, membership, "0").state,
MatchState.COLLECTIVE_STATE_MISMATCH,
)
@ -108,7 +113,7 @@ class FlightRecorderEventTest(TestCase):
output_dtypes="float16",
)
self.assertEqual(
match_one_event(e10, e9, membership, "0"),
match_one_event(e10, e9, membership, "0").state,
MatchState.COLLECTIVE_DTYPE_MISMATCH,
)
@ -128,9 +133,19 @@ class FlightRecorderEventTest(TestCase):
collective, ("0", "default"), input_sizes, output_sizes, "scheduled", 1
)
membership = {"0": {0, 1}}
result = match_one_event(event, event, membership, "0")
result = match_one_event(event, event, membership, "0").state
self.assertEqual(result, expectedState)
class FlightMatchInfoTest(TestCase):
def test_match_info(self):
m1 = MatchInfo(MatchState.FULLY_MATCHED, "rank 0")
m2 = MatchInfo(MatchState.FULLY_MATCHED, "rank 1")
self.assertEqual(m1.state, MatchState.FULLY_MATCHED)
self.assertEqual(m1.state, m2.state)
self.assertEqual(str(m1), "Error type: FULLY_MATCHED, rank 0")
self.assertEqual(str(m2), "Error type: FULLY_MATCHED, rank 1")
if __name__ == "__main__":
run_tests()
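The updated tests read .state off the return value and compare its string form, which suggests match_one_event now returns a small wrapper holding a MatchState plus a per-rank message. A hedged sketch of what such a wrapper could look like, inferred from the test expectations rather than taken from the actual tools.flight_recorder implementation:

from dataclasses import dataclass
from enum import Enum

class MatchState(Enum):          # illustrative subset of the real enum
    FULLY_MATCHED = 1
    COLLECTIVE_TYPE_MISMATCH = 2

@dataclass
class MatchInfo:
    state: MatchState
    detail: str = ""

    def __str__(self) -> str:
        return f"Error type: {self.state.name}, {self.detail}"

m = MatchInfo(MatchState.FULLY_MATCHED, "rank 0")
print(m.state is MatchState.FULLY_MATCHED)  # True
print(str(m))                               # Error type: FULLY_MATCHED, rank 0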

View File

@ -3,7 +3,7 @@ import gc
import threading
import unittest
from datetime import timedelta
from typing import List, Optional
from typing import Optional
import torch
import torch.distributed as dist
@ -576,24 +576,24 @@ class ProcessGroupDummy(dist.ProcessGroup):
self.waits = 0
self.dels = 0
def broadcast(self, tensor_list: List[torch.Tensor], opts: object) -> dist.Work:
def broadcast(self, tensor_list: list[torch.Tensor], opts: object) -> dist.Work:
return _DummyWork(self)
def allgather_into_tensor_coalesced(
self,
output_lists: List[torch.Tensor],
input_list: List[torch.Tensor],
output_lists: list[torch.Tensor],
input_list: list[torch.Tensor],
opts: object,
) -> dist.Work:
return _DummyWork(self)
def allreduce(self, tensors: List[torch.Tensor], opts: object) -> dist.Work:
def allreduce(self, tensors: list[torch.Tensor], opts: object) -> dist.Work:
return _DummyWork(self)
def reduce_scatter_tensor_coalesced(
self,
outputTensors: List[torch.Tensor],
inputTensors: List[torch.Tensor],
outputTensors: list[torch.Tensor],
inputTensors: list[torch.Tensor],
opts: object,
) -> dist.Work:
return _DummyWork(self)

View File

@ -289,7 +289,7 @@ class TCPStoreTest(TestCase, StoreTestBase):
port = common.find_free_port()
err_msg_reg = f"^The server socket has failed to listen on any local .*{port}"
with self.assertRaisesRegex(RuntimeError, err_msg_reg):
with self.assertRaisesRegex(dist.DistNetworkError, err_msg_reg):
# Use noqa to silence flake8.
# Need to store in an unused variable here to ensure the first
# object is not destroyed before the second object is created.
@ -521,6 +521,38 @@ class TCPStoreTest(TestCase, StoreTestBase):
with self.assertRaisesRegex(ValueError, "TCPStore world size cannot be 0"):
dist.TCPStore("localhost", 0, world_size=0, is_master=False)
def test_agent_store(self) -> None:
store = self._create_store()
with self.assertRaisesRegex(
dist.DistNetworkError,
"The server socket has failed to listen on any local network address",
):
dist.TCPStore(
host_name="localhost",
port=store.port,
world_size=1,
is_master=True,
use_libuv=self._use_libuv,
)
USE_AGENT_STORE = "TORCHELASTIC_USE_AGENT_STORE"
MASTER_PORT = "MASTER_PORT"
os.environ[USE_AGENT_STORE] = "1"
os.environ[MASTER_PORT] = str(store.port)
second_server = dist.TCPStore(
host_name="localhost",
port=store.port,
world_size=1,
is_master=True,
use_libuv=self._use_libuv,
)
del os.environ[USE_AGENT_STORE]
del os.environ[MASTER_PORT]
self.assertEqual(second_server.port, store.port)
class LibUvTCPStoreTest(TCPStoreTest):
_use_libuv = True

View File

@ -1251,7 +1251,7 @@ Non-primal fwd outputs from model w/o backward hook: {mod_no_hook_fwd_outputs_no
x = torch.randn(4, 4).to(device)
opt_fn = torch.compile(fn, fullgraph=True)
with self.assertRaisesRegex(
torch._dynamo.exc.Unsupported, "skip function graph_break in file"
torch._dynamo.exc.Unsupported, "User-inserted graph break"
):
opt_fn(x)

View File

@ -27,7 +27,7 @@ from torch._inductor.utils import fresh_inductor_cache
from torch._subclasses import FakeTensorMode
from torch.compiler._cache import CacheArtifactManager
from torch.fx.experimental.symbolic_shapes import ShapeEnv
from torch.testing._internal.common_cuda import SM80OrLater
from torch.testing._internal.common_cuda import SM80OrLater, TEST_MULTIGPU
from torch.testing._internal.common_device_type import largeTensorTest
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
@ -693,6 +693,36 @@ class AOTAutogradCacheTests(InductorTestCase):
self.assertNotEqual(res1, res3)
self.assertEqual(res1, res3.sub(torch.ones(2, 2)))
@inductor_config.patch("fx_graph_cache", True)
@inductor_config.patch("fx_graph_remote_cache", False)
@functorch_config.patch({"enable_autograd_cache": True})
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_constant_tensor_device_guards(self):
"""
Usually the device index of the example inputs is enough to keep us from
cache-hitting on results compiled for a different cuda device.
When the compiled function takes no arguments, the cuda device index
still needs to be part of the cache key.
"""
@torch.compile
def f():
y = torch.tensor([5], device="cuda")
return (y,)
with torch.cuda._DeviceGuard(0):
torch.cuda.set_device(0)
result = f()
self.assertEqual(result[0].device, torch.device("cuda:0"))
self._clear_dynamo_and_codecache()
with torch.cuda._DeviceGuard(1):
torch.cuda.set_device(1)
result = f()
self.assertEqual(result[0].device, torch.device("cuda:1"))
@inductor_config.patch("fx_graph_cache", True)
class AOTAutogradCachePicklerTests(torch._dynamo.test_case.TestCase):

View File

@ -271,7 +271,10 @@ class AutogradFunctionTests(torch._dynamo.test_case.TestCase):
model = CustomFuncBwdPrintModule()
opt_model = torch.compile(model, backend="eager", fullgraph=True)
x = torch.randn(2, 2, dtype=torch.double, requires_grad=True)
with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "builtin: print"):
with self.assertRaisesRegex(
torch._dynamo.exc.Unsupported,
"Dynamo does not know how to trace builtin operator `print`",
):
opt_model(x)
def test_stride_in_bwd(self):

Some files were not shown because too many files have changed in this diff.