Compare commits

1 Commit

Author SHA1 Message Date
84dea7642c [wip][flex_attention]: adding a heuristic for USE_TMA
Summary:
Adds a USE_TMA heuristic that captures almost all of the available perf.

The best perf improvements are on document_mask with large shapes, which are very important for training.

Mean: +15.8% improvement overall

Only a single shape (see the raw data below) sees a perf degradation of >5%.

Document Mask: 37.8% average improvement
* Strong performance on variable-length sequences
* One case of regression on a short sequence (-5.72%)

Sliding Window: 58.7% average improvement
* 30-97% improvements

Head Bias: 2.5% average improvement
* Consistent 2-3% gains across all sequence lengths

Others:
* Softcap: 0.15% average improvement (minimal impact)
* Prefix_LM: -0.4% average change (slight degradation)
* Full Attention (noop): 1.6% average improvement
* Causal: 0.6% average improvement

Note: the sliding window uplift is due to the test-side change.
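
For illustration only, a shape-based heuristic of this kind might look like the sketch below; the function name, thresholds, and capability check are assumptions for this note, not the actual implementation in the diff:

```
# Hypothetical sketch of a shape-based USE_TMA heuristic (all names and
# thresholds here are illustrative assumptions, not the code in this diff).
def should_use_tma(seq_len_q: int, seq_len_kv: int, head_dim: int,
                   device_supports_tma: bool) -> bool:
    # TMA requires hardware with a TMA engine, so bail out everywhere else.
    if not device_supports_tma:
        return False
    # Prefer TMA on large tiles, where the asynchronous bulk copies can
    # amortize their setup cost; small shapes keep the regular load path.
    return seq_len_q >= 256 and seq_len_kv >= 256 and head_dim >= 64
```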

Test Plan:
Ran the following across the different variations:
```
buck2 run @//mode/opt -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 //pytorch/tritonbench:run -- --op flex_attention --metrics tflops,latency
```

Here is the uplift plot:
 {F1982793932}

Raw Data:
```
Shape	Mask Type	Post-Changes	Pre-Changes	Percent Change
(8, 16, 128, 16, 128, 128)	noop	12.2404	11.672	4.869773818
(8, 16, 128, 16, 128, 128)	causal	19.6761	18.9992	3.562781591
(8, 16, 128, 16, 128, 128)	rel	12.64	12.1127	4.353282092
(8, 16, 128, 16, 128, 128)	head_bias	12.3075	12.0091	2.484782373
(8, 16, 128, 16, 128, 128)	alibi	19.0314	18.2775	4.124743537
(8, 16, 128, 16, 128, 128)	sliding_window	19.1613	18.5902	3.072048714
(1, 16, 1024, 16, 1024, 128)	document_mask	28.8403	30.5914	-5.72415777
(8, 16, 128, 16, 128, 128)	prefix_lm	18.3073	18.8398	-2.826463126
(8, 16, 128, 16, 128, 128)	softcap	18.2974	18.3773	-0.4347755111
(8, 16, 256, 16, 256, 128)	noop	48.6082	47.8148	1.659318872
(8, 16, 256, 16, 256, 128)	causal	57.6149	56.2675	2.394632781
(8, 16, 256, 16, 256, 128)	rel	47.4945	47.2779	0.4581421764
(8, 16, 256, 16, 256, 128)	head_bias	48.3813	47.014	2.908282639
(8, 16, 256, 16, 256, 128)	alibi	56.9332	54.2731	4.901323123
(8, 16, 256, 16, 256, 128)	sliding_window	56.4246	52.6885	7.090921169
(1, 16, 2048, 16, 2048, 128)	document_mask	76.3089	74.3736	2.602133015
(8, 16, 256, 16, 256, 128)	prefix_lm	58.2794	55.5256	4.95951417
(8, 16, 256, 16, 256, 128)	softcap	55.2826	53.8968	2.571210165
(8, 16, 512, 16, 512, 128)	noop	183.26	187.142	-2.074360646
(8, 16, 512, 16, 512, 128)	causal	174.899	172.658	1.297941596
(8, 16, 512, 16, 512, 128)	rel	189.178	185.085	2.211416376
(8, 16, 512, 16, 512, 128)	head_bias	187.142	181.349	3.194393131
(8, 16, 512, 16, 512, 128)	alibi	164.641	164.48	0.09788424125
(8, 16, 512, 16, 512, 128)	sliding_window	169.957	122.94	38.24385879
(1, 16, 4096, 16, 4096, 128)	document_mask	146.112	107.093	36.43468761
(8, 16, 512, 16, 512, 128)	prefix_lm	175.354	175.354	0
(8, 16, 512, 16, 512, 128)	softcap	162.262	162.183	0.04871040738
(8, 16, 1024, 16, 1024, 128)	noop	451.681	446.073	1.257193329
(8, 16, 1024, 16, 1024, 128)	causal	252.694	252.641	0.02097838435
(8, 16, 1024, 16, 1024, 128)	rel	446.166	443.139	0.6830813808
(8, 16, 1024, 16, 1024, 128)	head_bias	454.921	446.258	1.941253714
(8, 16, 1024, 16, 1024, 128)	alibi	236.666	236.712	-0.01943289736
(8, 16, 1024, 16, 1024, 128)	sliding_window	249.215	167.812	48.50844993
(1, 16, 8192, 16, 8192, 128)	document_mask	210.169	149.254	40.81297654
(8, 16, 1024, 16, 1024, 128)	prefix_lm	254.604	254.284	0.125843545
(8, 16, 1024, 16, 1024, 128)	softcap	229.892	229.848	0.01914308587
(8, 16, 2048, 16, 2048, 128)	noop	608.016	597.526	1.755572142
(8, 16, 2048, 16, 2048, 128)	causal	321.017	321.693	-0.2101382374
(8, 16, 2048, 16, 2048, 128)	rel	587.111	586.792	0.054363386
(8, 16, 2048, 16, 2048, 128)	head_bias	598.687	587.111	1.971688488
(8, 16, 2048, 16, 2048, 128)	alibi	295.773	295.773	0
(8, 16, 2048, 16, 2048, 128)	sliding_window	318.097	183.105	73.72381967
(1, 16, 16384, 16, 16384, 128)	document_mask	285.551	195.233	46.26164634
(8, 16, 2048, 16, 2048, 128)	prefix_lm	322.304	321.806	0.1547516205
(8, 16, 2048, 16, 2048, 128)	softcap	286.219	285.88	0.1185812229
(8, 16, 4096, 16, 4096, 128)	noop	623.445	625.253	-0.2891629468
(8, 16, 4096, 16, 4096, 128)	causal	369.057	369.057	0
(8, 16, 4096, 16, 4096, 128)	rel	626.718	630.683	-0.6286835066
(8, 16, 4096, 16, 4096, 128)	head_bias	619.192	617.717	0.2387824845
(8, 16, 4096, 16, 4096, 128)	alibi	336.842	336.849	-0.002078082464
(8, 16, 4096, 16, 4096, 128)	sliding_window	367.357	192.262	91.07103848
(1, 16, 32768, 16, 32768, 128)	document_mask	352.813	236.054	49.46283478
(8, 16, 4096, 16, 4096, 128)	prefix_lm	369.309	369.309	0
(8, 16, 4096, 16, 4096, 128)	softcap	324.088	324.112	-0.007404847707
(8, 16, 8192, 16, 8192, 128)	noop	668.267	644.41	3.702146149
(8, 16, 8192, 16, 8192, 128)	causal	393.945	397.241	-0.8297230145
(8, 16, 8192, 16, 8192, 128)	rel	663.352	649.078	2.199119366
(8, 16, 8192, 16, 8192, 128)	head_bias	653.654	654.78	-0.1719661566
(8, 16, 8192, 16, 8192, 128)	alibi	363.893	360.898	0.8298743689
(8, 16, 8192, 16, 8192, 128)	sliding_window	385.163	197.756	94.76678331
(1, 16, 65536, 16, 65536, 128)	document_mask	405.668	264.619	53.3026729
(8, 16, 8192, 16, 8192, 128)	prefix_lm	393.872	397.458	-0.9022336951
(8, 16, 8192, 16, 8192, 128)	softcap	345.572	346.346	-0.223475946
(8, 16, 16384, 16, 16384, 128)	noop	681.404	642.487	6.057243182
(8, 16, 16384, 16, 16384, 128)	causal	416.768	413.811	0.7145774279
(8, 16, 16384, 16, 16384, 128)	rel	676.936	660.778	2.445299329
(8, 16, 16384, 16, 16384, 128)	head_bias	676.907	657.428	2.962910007
(8, 16, 16384, 16, 16384, 128)	alibi	382.453	373.863	2.29763309
(8, 16, 16384, 16, 16384, 128)	sliding_window	395.434	200.448	97.27510377
(1, 16, 131072, 16, 131072, 128)	document_mask	439.601	282.557	55.57958217
(8, 16, 16384, 16, 16384, 128)	prefix_lm	416.927	412.476	1.079093087
(8, 16, 16384, 16, 16384, 128)	softcap	363.194	359.052	1.153593351
```
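
As a sanity check on the table, Percent Change is the relative delta between the two metric columns (presumably the tritonbench TFLOPS numbers); a minimal reproduction of the first row:

```
# Recompute "Percent Change" for the first row of the raw data above.
post, pre = 12.2404, 11.672
print((post - pre) / pre * 100)  # ~4.8698, matching 4.869773818
```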

Differential Revision: D84373821
2025-10-17 15:31:36 -07:00
546 changed files with 3297 additions and 6408 deletions

View File

@@ -83,6 +83,10 @@ function build_cpython {
py_suffix=${py_ver::-1}
py_folder=$py_suffix
fi
# Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4
if [ "$py_suffix" == "3.14.0" ]; then
py_suffix="3.14.0rc2"
fi
wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
do_cpython_build $py_ver Python-$py_suffix

View File

@@ -57,8 +57,8 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
logger.info("Successfully cloned %s", target)
return r, commit
except GitCommandError:
logger.exception("Git operation failed")
except GitCommandError as e:
logger.error("Git operation failed: %s", e)
raise
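
For context on this change: `logger.exception` always logs at ERROR level and appends the full traceback of the active exception, while the new form logs only the formatted message. A minimal sketch of the difference:

```
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    raise RuntimeError("clone failed")
except RuntimeError as e:
    # Old form: message plus the full traceback of the active exception.
    logger.exception("Git operation failed")
    # New form: only the formatted message, no traceback.
    logger.error("Git operation failed: %s", e)
```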

View File

@@ -7,12 +7,14 @@ max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910
# these ignores are from flake8-logging-format; please fix!
G100,G101,G200
# these ignores are from flake8-simplify. please fix or ignore with commented reason
SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
# SIM104 is already covered by pyupgrade ruff
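
The newly ignored E721 check flags direct type comparisons. A minimal example of code it would report:

```
# E721: do not compare types; for exact checks use `is`, otherwise isinstance().
x = 1.0

if type(x) == float:      # flagged by E721
    pass

if isinstance(x, float):  # preferred spelling
    pass
```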

View File

@@ -1 +1 @@
69bbe7363897764f9e758d851cd0340147d27f94
1b013f5b5a87a1882eb143c26d79d091150d6a37

.github/labeler.yml vendored (29 changes)
View File

@@ -133,32 +133,3 @@
"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt
"ciflow/b200":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
"ciflow/h100":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
"ciflow/rocm":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm

View File

@@ -33,7 +33,6 @@ ciflow_push_tags:
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/rocm-mi355
- ciflow/rocm-navi31
- ciflow/s390
- ciflow/slow
- ciflow/torchbench

View File

@@ -79,21 +79,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
),
"12.9": (
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | "
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | "
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | "
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"13.0": (
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "

View File

@@ -26,8 +26,9 @@ name: !{{ build_environment }}
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "!{{ py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0') }}"
python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
{%- endmacro %}
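
The fix above changes the expression's grouping: the old version appended '.0' for 3.14 (producing '3.14.0', presumably not yet resolvable before the final release), while the new one pins the release candidate explicitly. A quick check of both expressions in plain Python, standing in for the template engine:

```
# Plain-Python stand-ins for the old and new template expressions.
def old_expr(py_ver: str) -> str:
    return py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0')

def new_expr(py_ver: str) -> str:
    return (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2'

for v in ("3.10", "3.13t", "3.14", "3.14t"):
    print(v, old_expr(v), new_expr(v))
# 3.10  -> 3.10.4  3.10.4
# 3.13t -> 3.13.4  3.13.4
# 3.14  -> 3.14.0  3.14.0-rc.2
# 3.14t -> 3.14.0  3.14.0-rc.2
```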

View File

@@ -224,7 +224,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -473,7 +473,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -722,7 +722,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -971,7 +971,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1220,7 +1220,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1469,7 +1469,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1718,7 +1718,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
@@ -925,7 +925,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
@@ -1591,7 +1591,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
@@ -2257,7 +2257,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
@@ -2923,7 +2923,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing
@@ -3589,7 +3589,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-test: # Testing
@@ -4255,7 +4255,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-test: # Testing

View File

@@ -63,6 +63,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false

View File

@@ -59,6 +59,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
@@ -168,6 +169,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.11.4"
freethreaded: false
@@ -277,6 +279,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.12.4"
freethreaded: false
@@ -386,6 +389,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: false
@@ -495,6 +499,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: true
@@ -604,8 +609,9 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0"
python-version: "3.14.0-rc.2"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
@@ -713,8 +719,9 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0"
python-version: "3.14.0-rc.2"
freethreaded: true
- name: Checkout PyTorch
uses: actions/checkout@v4

View File

@@ -1,63 +0,0 @@
name: rocm-navi31
on:
push:
tags:
- ciflow/rocm-navi31/*
workflow_dispatch:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 */2 * * 1-5
- cron: 45 4,12 * * 0,6
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
target-determination:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/target_determination.yml
permissions:
id-token: write
contents: read
linux-jammy-rocm-py3_10-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: >-
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune' || '' }}
secrets: inherit

View File

@@ -59,3 +59,29 @@ jobs:
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-gfx1100-test:
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10-gfx1100
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
tests-to-include: >
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune
secrets: inherit

View File

@@ -190,40 +190,6 @@ jobs:
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
secrets: inherit
linux-jammy-rocm-py3_10-build:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
secrets: inherit
inductor-build:
name: inductor-build
uses: ./.github/workflows/_linux-build.yml

.gitignore vendored (1 change)
View File

@@ -374,7 +374,6 @@ third_party/ruy/
third_party/glog/
# Virtualenv
.venv/
venv/
# Log files

View File

@@ -201,17 +201,3 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99
# FlexAttention
/torch/nn/attention/flex_attention.py @drisspg
/torch/_higher_order_ops/flex_attention.py @drisspg
/torch/_inductor/kernel/flex/ @drisspg
/torch/_inductor/codegen/cpp_flex_attention_template.py @drisspg
/test/inductor/test_flex_attention.py @drisspg
/test/inductor/test_flex_decoding.py @drisspg
# Low Precision GEMMs
/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
/test/test_scaled_matmul_cuda.py @drisspg @slayton58

View File

@@ -313,14 +313,13 @@ IF(USE_FBGEMM_GENAI)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0")
list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1)
endif()
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(

View File

@ -183,6 +183,11 @@ struct CUDACachingHostAllocatorImpl
return true;
}
bool pinned_use_background_threads() override {
return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
pinned_use_background_threads();
}
EventPool::Event create_event_internal(DeviceIndex idx) {
// Leak the event pool to avoid shutdown issue.
static auto* event_pool = new EventPool();

View File

@ -177,6 +177,7 @@ inline void segmented_sort_pairs(
}
}
#if CUB_SUPPORTS_UNIQUE_BY_KEY()
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename NumSelectedIteratorT>
inline void unique_by_key(
KeysInputIteratorT keys_in, ValuesInputIteratorT values_in,
@ -192,6 +193,7 @@ inline void unique_by_key(
CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey,
keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream());
}
#endif
namespace impl {
@ -577,6 +579,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
#endif
}
#if CUB_SUPPORTS_SCAN_BY_KEY()
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT>
inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) {
@ -604,6 +607,7 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT
#endif
}
#endif
template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT>
void unique(InputIteratorT input, OutputIteratorT output,

View File

@ -28,6 +28,22 @@
#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
#endif
// cub support for UniqueByKey is added to cub 1.16 in:
// https://github.com/NVIDIA/cub/pull/405
#if CUB_VERSION >= 101600
#define CUB_SUPPORTS_UNIQUE_BY_KEY() true
#else
#define CUB_SUPPORTS_UNIQUE_BY_KEY() false
#endif
// cub support for scan by key is added to cub 1.15
// in https://github.com/NVIDIA/cub/pull/376
#if CUB_VERSION >= 101500
#define CUB_SUPPORTS_SCAN_BY_KEY() 1
#else
#define CUB_SUPPORTS_SCAN_BY_KEY() 0
#endif
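// Note: CUB_VERSION packs major*100000 + minor*100 + patch, so the 101500 and
// 101600 thresholds used here correspond to cub 1.15.0 and 1.16.0 respectively.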
// cub support for cub::FutureValue is added to cub 1.15 in:
// https://github.com/NVIDIA/cub/pull/305
#if CUB_VERSION >= 101500

View File

@ -160,10 +160,6 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({
DispatchKey::CUDA,
DispatchKey::CPU,
DispatchKey::PrivateUse1,
DispatchKey::SparseCPU,
DispatchKey::SparseCUDA,
DispatchKey::SparseCsrCPU,
DispatchKey::SparseCsrCUDA,
});
inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) {

View File

@ -141,8 +141,6 @@ void compute_triu_tril(const Tensor& self, int64_t k, const Tensor &result) {
return;
}
checkTrilTriuMemoryOverlap(result, self);
bool inplace_op = self.is_same(result);
bool inplace_update = false;

View File

@ -1,4 +1,3 @@
#include <ATen/MemoryOverlap.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/LinearAlgebraUtils.h>
@ -55,13 +54,4 @@ static inline std::tuple<bool, Tensor> checkTrilTriuBatchContiguous(const Tensor
return std::make_tuple(true, tensor);
}
static inline void checkTrilTriuMemoryOverlap(const Tensor& result, const Tensor& self) {
if (result.is_same(self)) {
at::assert_no_internal_overlap(result);
} else {
at::assert_no_internal_overlap(result);
at::assert_no_overlap(result, self);
}
}
} // namespace at::native

View File

@ -120,7 +120,7 @@ static void pow_tensor_scalar_kernel(
} else if (dtype == ScalarType::Half) {
[&]() {
using scalar_t =
decltype(c10::impl::ScalarTypeToCPPType<ScalarType::Half>::t);
c10::impl::ScalarTypeToCPPTypeT<ScalarType::Half>;
const auto exp = exp_scalar.to<scalar_t>();
using Vec = Vectorized<scalar_t>;
cpu_kernel_vec(iter,

View File

@ -2322,23 +2322,12 @@ _scaled_nvfp4_nvfp4(
const Tensor& scale_b, const SwizzleType swizzle_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
Tensor& out,
const std::optional<Tensor>& global_scale_a = std::nullopt,
const std::optional<Tensor>& global_scale_b = std::nullopt) {
const bool single_scale,
Tensor& out) {
#ifdef USE_ROCM
TORCH_CHECK_NOT_IMPLEMENTED(false, "NVFP4 scaling not supported on ROCM");
#endif
std::optional<Tensor> alpha = std::nullopt;
// Note: "Or" here means that if only one scale is passed, we check for the other. Otherwise,
// if this is "And" we would silently do nothing in the case where one global scale is
// passed and not the other.
if (global_scale_a.has_value() || global_scale_b.has_value()) {
TORCH_CHECK_VALUE(global_scale_a.has_value(),
"For two-level-scaled NVFP4, global_scale_a must have a value");
TORCH_CHECK_VALUE(global_scale_b.has_value(),
"For two-level-scaled NVFP4, global_scale_b must have a value");
alpha = global_scale_a.value().mul(global_scale_b.value());
}
TORCH_CHECK_VALUE(single_scale, "Only single-scaled NVFP4 currently supported");
// Restrictions:
// A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
// Scales must be swizzled
@ -2360,7 +2349,7 @@ _scaled_nvfp4_nvfp4(
auto scaling_choice_a = ScalingType::BlockWise1x16;
auto scaling_choice_b = ScalingType::BlockWise1x16;
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out, alpha);
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
}
@ -2566,10 +2555,9 @@ _scaled_mm_cuda_v2_out(
} else if (gemm_impl == ScaledGemmImplementation::MXFP8_MXFP8) {
return _scaled_mxfp8_mxfp8(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4) {
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out,
scale_a[1], scale_b[1]);
TORCH_CHECK_NOT_IMPLEMENTED(false, "Only single-scale NVFP4 currently supported");
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE) {
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, true /* single_scale */, out);
} else if (gemm_impl == ScaledGemmImplementation::MXFP4_MXFP4) {
return _scaled_mxfp4_mxfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
} else {

View File

@ -856,9 +856,13 @@ struct type_specialized_kernel_launcher {
out_calc_t output_offset_calculator,
loader_t loader,
storer_t storer) {
if (ret_t == rt_binary_specializations[arg_index][0] &&
arg0_t == rt_binary_specializations[arg_index][1] &&
arg1_t == rt_binary_specializations[arg_index][2])
constexpr ScalarType sret_t = rt_binary_specializations[arg_index][0];
constexpr ScalarType sarg0_t = rt_binary_specializations[arg_index][1];
constexpr ScalarType sarg1_t = rt_binary_specializations[arg_index][2];
if (ret_t == sret_t && arg0_t == sarg0_t && arg1_t == sarg1_t) {
using cret_t = c10::impl::ScalarTypeToCPPTypeT<sret_t>;
using carg0_t = c10::impl::ScalarTypeToCPPTypeT<sarg0_t>;
using carg1_t = c10::impl::ScalarTypeToCPPTypeT<sarg1_t>;
launch_vectorized_templated_kernel<
func_t,
array_t,
@ -866,12 +870,9 @@ struct type_specialized_kernel_launcher {
out_calc_t,
loader_t,
storer_t,
decltype(c10::impl::ScalarTypeToCPPType<
rt_binary_specializations[arg_index][0]>::t),
decltype(c10::impl::ScalarTypeToCPPType<
rt_binary_specializations[arg_index][1]>::t),
decltype(c10::impl::ScalarTypeToCPPType<
rt_binary_specializations[arg_index][2]>::t)>(
cret_t,
carg0_t,
carg1_t>(
numel,
f,
data,
@ -879,6 +880,7 @@ struct type_specialized_kernel_launcher {
output_offset_calculator,
loader,
storer);
}
}
};
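The rewrite above hoists each specialization triple into constexpr ScalarType values so the corresponding C++ types can be named once via ScalarTypeToCPPTypeT. A minimal standalone sketch of that runtime-to-compile-time dispatch pattern (hypothetical helper names; assumes only the c10 header already used here):

```
#include <c10/core/ScalarType.h>

// Run a callable with the C++ type corresponding to a compile-time ScalarType.
template <c10::ScalarType S, typename F>
void with_ctype(F&& f) {
  using T = c10::impl::ScalarTypeToCPPTypeT<S>;  // e.g. at::Half for ScalarType::Half
  f(T{});
}

// Compare the runtime dtype against constexpr candidates, as the launcher does.
void dispatch_example(c10::ScalarType runtime_dtype) {
  if (runtime_dtype == c10::ScalarType::Half) {
    with_ctype<c10::ScalarType::Half>([](auto v) { /* body specialized on decltype(v) */ });
  } else if (runtime_dtype == c10::ScalarType::Float) {
    with_ctype<c10::ScalarType::Float>([](auto v) { /* ... */ });
  }
}
```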

View File

@ -15,7 +15,9 @@
#include <ATen/native/cuda/block_reduce.cuh>
#include <ATen/native/cuda/thread_constants.h>
#if CUB_SUPPORTS_SCAN_BY_KEY()
#include <thrust/iterator/reverse_iterator.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -238,6 +240,10 @@ __global__ void renorm_kernel(
} // anonymous namespace
#if !CUB_SUPPORTS_SCAN_BY_KEY()
template<typename index_t>
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
#endif
Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_,
int64_t num_weights, int64_t padding_idx,
@ -300,6 +306,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
if (scale_grad_by_freq) {
count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
#if CUB_SUPPORTS_SCAN_BY_KEY()
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -326,6 +333,11 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
num_indices
);
});
#else
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
});
#endif
}
return embedding_backward_cuda_kernel(grad, orig_indices,

View File

@ -10,7 +10,9 @@
#include <c10/macros/Macros.h>
#if CUB_SUPPORTS_UNIQUE_BY_KEY()
#include <thrust/iterator/counting_iterator.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -194,9 +196,18 @@ __global__ void compute_num_of_partial_segments(const index_t *partials_per_segm
partials_per_segment_offset[num_of_segments-1];
}
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
__global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) {
*num_of_segments_ptr = num_of_segments;
}
#endif
} // anon namespace
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
template<typename index_t>
int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets);
#endif
Tensor embedding_backward_cuda_kernel(
const Tensor &grad,
@ -223,12 +234,20 @@ Tensor embedding_backward_cuda_kernel(
auto segment_offsets = at::empty({numel}, orig_indices.options());
auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong));
int64_t *num_of_segments_ptr = num_of_segments_tensor.mutable_data_ptr<int64_t>();
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key<index_t>(sorted_indices, segment_offsets);
write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
#else
AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
cuda::cub::unique_by_key(
sorted_indices.const_data_ptr<index_t>(), thrust::make_counting_iterator(0),
segment_offsets.mutable_data_ptr<index_t>(),
num_of_segments_ptr, sorted_indices.numel());
});
#endif
int64_t max_segments = std::min<int64_t>(numel, num_weights);
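For intuition, unique_by_key with a counting iterator reduces to recording the starting offset of each run of equal indices. A host-side analogue in plain C++ (illustrative only, not the CUDA path; sample data, not from this diff):

```
#include <cstdio>
#include <vector>

int main() {
  // sorted indices with repeats, e.g. the embedding rows touched in backward
  std::vector<long> sorted = {2, 5, 5, 5, 7, 7, 8, 9, 9};
  std::vector<long> segment_offsets;  // first position of each run of equal keys
  for (size_t i = 0; i < sorted.size(); ++i)
    if (i == 0 || sorted[i] != sorted[i - 1]) segment_offsets.push_back(i);
  for (long o : segment_offsets) std::printf("%ld ", o);              // 0 1 4 6 7
  std::printf("\nnum_of_segments = %zu\n", segment_offsets.size());   // 5
}
```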

View File

@ -31,10 +31,16 @@
#include <c10/macros/Macros.h>
#if CUB_SUPPORTS_SCAN_BY_KEY()
#include <thrust/iterator/reverse_iterator.h>
#endif
namespace at::native {
#if !CUB_SUPPORTS_SCAN_BY_KEY()
template<typename index_t>
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
#endif
namespace {
@ -193,6 +199,7 @@ Tensor embedding_bag_backward_cuda_sum_avg(
if (scale_grad_by_freq) {
count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
#if CUB_SUPPORTS_SCAN_BY_KEY()
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -219,6 +226,11 @@ Tensor embedding_bag_backward_cuda_sum_avg(
num_indices
);
});
#else
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
});
#endif
}
return embedding_backward_cuda_kernel(grad, orig_indices, sorted_indices,
count, num_weights, padding_idx, mode == EmbeddingBagMode::MEAN, offset2bag,

View File

@ -0,0 +1,90 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/native/cuda/SortingCommon.cuh>
#include <ATen/cuda/cub_definitions.cuh>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty_like.h>
#endif
#include <ATen/cuda/ThrustAllocator.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/constant_iterator.h>
namespace at::native {
#if !CUB_SUPPORTS_SCAN_BY_KEY()
template<typename index_t>
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
at::cuda::ThrustAllocator allocator;
auto policy = thrust::cuda::par(allocator).on(stream);
auto num_indices = count.numel();
// Compute an increasing sequence per unique item in sortedIndices:
// sorted: 2 5 5 5 7 7 8 9 9
// count: 1 1 2 3 1 2 1 1 2
auto sorted_data = thrust::device_ptr<const index_t>(sorted_indices.const_data_ptr<index_t>());
auto count_data = thrust::device_ptr<index_t>(count.mutable_data_ptr<index_t>());
thrust::inclusive_scan_by_key(
policy,
sorted_data,
sorted_data + num_indices,
thrust::make_constant_iterator(1),
count_data
);
// Take the maximum of each count per unique key in reverse:
// sorted: 2 5 5 5 7 7 8 9 9
// count: 1 3 3 3 2 2 1 2 2
thrust::inclusive_scan_by_key(
policy,
thrust::make_reverse_iterator(sorted_data + num_indices),
thrust::make_reverse_iterator(sorted_data),
thrust::make_reverse_iterator(count_data + num_indices),
thrust::make_reverse_iterator(count_data + num_indices),
thrust::equal_to<index_t>(),
thrust::maximum<index_t>()
);
}
template
void embedding_dense_backward_cuda_scan<int>(Tensor &sorted_indices, Tensor &count);
template
void embedding_dense_backward_cuda_scan<int64_t>(Tensor &sorted_indices, Tensor &count);
#endif
template<typename index_t>
int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets) {
auto stream = at::cuda::getCurrentCUDAStream();
at::cuda::ThrustAllocator allocator;
auto policy = thrust::cuda::par(allocator).on(stream);
const ptrdiff_t numel = sorted_indices.numel();
auto sorted_indices_dev = thrust::device_ptr<const index_t>(sorted_indices.const_data_ptr<index_t>());
auto dummy = at::empty_like(sorted_indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto dummy_dev = thrust::device_ptr<index_t>(dummy.mutable_data_ptr<index_t>());
auto ends = thrust::unique_by_key_copy(
policy,
sorted_indices_dev,
sorted_indices_dev + numel,
thrust::make_counting_iterator(0),
dummy_dev,
thrust::device_ptr<index_t>(segment_offsets.mutable_data_ptr<index_t>()));
return thrust::get<0>(ends) - dummy_dev;
}
template
int64_t embedding_backward_cuda_kernel_unique_by_key<int>(const Tensor &sorted_indices, Tensor &segment_offsets);
template
int64_t embedding_backward_cuda_kernel_unique_by_key<int64_t>(const Tensor &sorted_indices, Tensor &segment_offsets);
} // namespace at::native
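The two inclusive_scan_by_key passes above implement a common trick: the forward pass gives each element its running count within its run, and the reverse max-scan broadcasts each run's total back across the run. A host-side analogue in plain C++ (a sketch mirroring the worked example in the comments, not the thrust path):

```
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> sorted = {2, 5, 5, 5, 7, 7, 8, 9, 9};
  std::vector<int> count(sorted.size());
  // Forward pass: running count per run -> 1 1 2 3 1 2 1 1 2
  for (size_t i = 0; i < sorted.size(); ++i)
    count[i] = (i > 0 && sorted[i] == sorted[i - 1]) ? count[i - 1] + 1 : 1;
  // Reverse pass: propagate each run's maximum backwards -> 1 3 3 3 2 2 1 2 2
  for (size_t i = sorted.size() - 1; i-- > 0;)
    if (sorted[i] == sorted[i + 1]) count[i] = count[i + 1];
  for (int c : count) std::printf("%d ", c);
  std::printf("\n");
}
```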

View File

@ -146,7 +146,6 @@ __global__ void nll_loss2d_backward_no_reduce_kernel(
int64_t batch_size = target.size(0);
int64_t H = target.size(1);
int64_t W = target.size(2);
int64_t n_classes = grad_input.size(1);
CUDA_KERNEL_LOOP(index, n_threads) {
const int64_t b = index % batch_size;
@ -157,7 +156,6 @@ __global__ void nll_loss2d_backward_no_reduce_kernel(
if (cur_target == ignore_index) {
continue;
}
CUDA_KERNEL_ASSERT(cur_target >= 0 && cur_target < n_classes);
scalar_t value = -(weight != nullptr ? weight[cur_target] : static_cast<scalar_t>(1));
grad_input[b][cur_target][h][w] = value * grad_output[b][h][w];
}

View File

@ -413,12 +413,14 @@ struct ReduceOp {
value = thread_reduce<output_vec_size>(input_slice);
}
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}
if (config.should_block_y_reduce()) {
value = block_y_reduce<output_vec_size>(value, shared_memory);
}
__syncthreads();
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}
using out_ptr_vec_t = std::array<out_scalar_t*, output_vec_size>;
using offset_vec_t = std::array<index_t, output_vec_size>;
offset_vec_t base_offsets;
@ -655,8 +657,8 @@ struct ReduceOp {
__syncthreads();
// Intra-warp reduction, fix CUDA to have offset decreasing for better numerics
// matching Triton, etc.
// TODO(PaulZhang12): AMD and internal
#if defined(USE_ROCM) || defined(FBCODE_CAFFE2)
// todo for AMD
#ifdef USE_ROCM
for (int offset = 1; offset < dim_x; offset <<= 1) {
#else
for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {
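The two loop headers are the increasing- and decreasing-offset variants of the same shuffle reduction; both sum the warp, but in different pairing orders and hence with different rounding. A minimal CUDA-style sketch (illustrative; assumes a full warp of floats and a power-of-two warpSize):

```
__device__ float warp_sum_increasing(float v) {  // the ROCm/FBCODE ordering above
  for (int offset = 1; offset < warpSize; offset <<= 1)
    v += __shfl_down_sync(0xffffffff, v, offset);
  return v;  // lane 0 holds the total
}

__device__ float warp_sum_decreasing(float v) {  // the default CUDA ordering above
  for (int offset = warpSize / 2; offset > 0; offset >>= 1)
    v += __shfl_down_sync(0xffffffff, v, offset);
  return v;  // same total, summed in the opposite pairing order
}
```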

View File

@ -19,6 +19,7 @@
namespace at::native {
// TODO: remove this when CUDA <11.6 is no longer supported
void topk_out_with_sort(
const Tensor& self,
int64_t k, int64_t dim, bool largest,
@ -30,12 +31,21 @@ void topk_out_with_sort(
indices.copy_(sorted_indices.narrow(dim, 0, k));
}
// TODO: remove this when CUDA <11.6 is no longer supported
bool disable_sort_for_topk();
bool should_use_sort(const Tensor& self, int64_t dim) {
#if defined(USE_ROCM)
if (self.dtype() == kBool) return false; // Bool sort not supported in ROCm: https://github.com/pytorch/pytorch/issues/139972
return (self.numel() >= 10000 && self.numel() == self.size(dim)); // based on the experiments in https://github.com/pytorch/pytorch/pull/146387
#else
return false;
if (disable_sort_for_topk()) return false;
// This heuristic is based on the experiment in https://github.com/pytorch/pytorch/pull/68632
if (self.dim() == 0) return false;
if (self.dtype() == kBool) return false; // Bool is not supported by topk
int64_t slice_size = self.size(dim);
if (slice_size == 0) return false;
int64_t num_slices = self.numel() / slice_size;
return num_slices <= 10 && slice_size >= 100000;
#endif
}

View File

@ -21,6 +21,11 @@ using namespace at::native;
namespace at::native {
// TODO: remove this when CUDA <11.6 is no longer supported
bool disable_sort_for_topk() {
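// When the CUB scan-by-key multiblock path is available (cub >= 1.15), the
// sort-based fallback is unnecessary, so it is disabled.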
return CUB_SUPPORTS_SCAN_BY_KEY();
}
namespace sbtopk { // single_block_topk
template <typename T>
@ -413,6 +418,10 @@ __global__ void computeBlockwiseWithinKCounts(
}
__syncthreads();
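// When the scan-by-key path is compiled out, the counts computed below are never
// consumed (launch() leaves withinKCounts null), so the kernel can stop here.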
#if !CUB_SUPPORTS_SCAN_BY_KEY()
return;
#endif
Bitwise desired_digit = at::cuda::Bitfield<Bitwise>::getBitfield(desired, current_bit, RADIX_BITS);
// if largest, then only threads that have tidx > desired_digit are active
@ -468,6 +477,7 @@ __global__ void computeBlockwiseWithinKCounts(
}
}
#if CUB_SUPPORTS_SCAN_BY_KEY()
// Assumption: slice_size can not be larger than UINT32_MAX
template <typename Bitwise>
__global__ void computeBlockwiseKthCounts(
@ -599,6 +609,7 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo<const T, IndexType> inpu
}
}
}
#endif
int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) {
// occupancy of this kernel is limited by registers per threads
@ -676,12 +687,16 @@ void launch(
uint32_t* digit_cum_sum = reinterpret_cast<uint32_t*>(digit_cum_sum_buffer.get());
AT_CUDA_CHECK(cudaMemsetAsync(digit_cum_sum, 0, numInputSlices * RADIX_DIGITS * sizeof(uint32_t), stream));
#if CUB_SUPPORTS_SCAN_BY_KEY()
auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
uint32_t* withinKCounts = reinterpret_cast<uint32_t*>(withinKCounts_buffer.get());
AT_CUDA_CHECK(cudaMemsetAsync(withinKCounts, 0, num_blocks * sizeof(uint32_t), stream));
auto kthCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
uint32_t* kthCounts = reinterpret_cast<uint32_t*>(kthCounts_buffer.get());
#else
uint32_t* withinKCounts = nullptr;
#endif
Bitwise desiredMask = 0;
dim3 grid;
@ -728,6 +743,7 @@ void launch(
}
desired = desired_in;
#if CUB_SUPPORTS_SCAN_BY_KEY()
computeBlockwiseKthCounts<Bitwise><<<std::min(((int64_t)numInputSlices + 255) / 256, (int64_t)1073741824), 256, 0, stream>>>(
desired, counts, num_blocks, blocks_per_slice, kthCounts);
C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -743,6 +759,28 @@ void launch(
topK, topKWithinSliceStride, indices, indicesWithinSliceStride, items_per_thread,
blocks_per_slice, kthValues, withinKCounts, kthCounts, num_blocks);
C10_CUDA_KERNEL_LAUNCH_CHECK();
#else
// Find topk values based on kth values
{
dim3 grid;
TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices for topk");
int warp_size = at::cuda::warp_size();
dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024));
sbtopk::gatherTopK<T, IndexType, Dim, /* WithKthValues= */true><<<grid, block, 0, stream>>>(
input,
inputSliceSize,
outputSliceSize,
largest,
numInputSlices,
inputWithinSliceStride,
topK,
topKWithinSliceStride,
indices,
indicesWithinSliceStride,
kthValues);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
#endif
}
} // namespace mbtopk
@ -750,6 +788,7 @@ void launch(
bool should_use_multiblock(int64_t num_slices, int64_t slice_size) {
if (num_slices > std::numeric_limits<uint32_t>::max() ||
slice_size > std::numeric_limits<uint32_t>::max()) return false;
#if CUB_SUPPORTS_SCAN_BY_KEY()
// This heuristic is based on the experiment in https://github.com/pytorch/pytorch/pull/74267
return (num_slices <= 20 && slice_size >= 20000) ||
(num_slices > 20 && num_slices <= 40 && slice_size >= 10000) ||
@ -758,6 +797,12 @@ bool should_use_multiblock(int64_t num_slices, int64_t slice_size) {
(num_slices >= 200 && num_slices < 800 && slice_size >= 3000) ||
(num_slices >= 800 && num_slices <= 4000 && slice_size >= 800) ||
(num_slices > 4000 && slice_size >= 400);
#else
// This heuristic is based on the experiment in https://github.com/pytorch/pytorch/pull/71081
return (num_slices <= 400 && slice_size >= 5000) ||
(num_slices > 400 && num_slices < 4000 && slice_size >= 1000) ||
(num_slices >= 4000 && slice_size >= 300);
#endif
}
void launch_gather_topk_kernel(

View File

@ -5,7 +5,6 @@
#include <ATen/Dispatch.h>
#include <ATen/MemoryOverlap.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TriangularOpsUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -45,7 +44,7 @@ __global__ void triu_tril_kernel(
const int64_t k,
const int64_t N_padded,
const IndexType last_dim_padded) {
int64_t linear_idx = (((int64_t)blockIdx.x) * blockDim.x + threadIdx.x) * elements_per_thread;
int64_t linear_idx = (blockIdx.x * blockDim.x + threadIdx.x) * elements_per_thread;
if (linear_idx >= N_padded) {
return;
}
@ -111,8 +110,6 @@ __global__ void triu_tril_kernel(
template <bool upper>
void triu_tril_cuda_template(const Tensor& result, const Tensor& self, int64_t k, const char* name) {
checkTrilTriuMemoryOverlap(result, self);
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
at::ScalarType::ComplexHalf,
at::ScalarType::Half,

View File

@ -466,7 +466,7 @@ struct ReduceJitOp {
__syncthreads();
#if defined(USE_ROCM) || defined(FBCODE_CAFFE2)
#ifdef USE_ROCM
for (int offset = 1; offset < dim_x; offset <<= 1) {
#else
for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {

View File

@ -487,7 +487,9 @@ std::unique_ptr<fe::graph::Graph> build_graph(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA")
.set_generate_stats(return_softmaxstats)
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
@ -705,7 +707,9 @@ std::unique_ptr<fe::graph::Graph> build_graph_nestedtensor(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA_NESTEDTENSOR")
.set_generate_stats(return_softmaxstats)
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_seq_len_q(SEQ_LEN_Q_)

View File

@ -441,7 +441,7 @@ kernel void applySYRK(
uint3 tid [[thread_position_in_threadgroup]],
uint3 tgid [[threadgroup_position_in_grid]],
uint3 tpg [[threads_per_threadgroup]],
uint warp_id [[simdgroup_index_in_threadgroup]]) {
uint sgitg [[simdgroup_index_in_threadgroup]]) {
const uint tx = tid.x;
const uint ty = tid.y;
const uint simdGroupsPerThreadgroup = (tpg.x * tpg.y + 31) / 32;
@ -474,8 +474,11 @@ kernel void applySYRK(
(actSize_j % 8 == 0) && (actSize_h % 8 == 0) && (actSize_k % 8 == 0);
if (use_simdgroup) {
uint warp_id = sgitg;
simdgroup_matrix<float, 8, 8> negative_identity =
simdgroup_matrix<float, 8, 8>(-1.0);
simdgroup_matrix<float, 8, 8> identity = simdgroup_matrix<float, 8, 8>(1.0);
simdgroup_matrix<float, 8, 8> Prod;
simdgroup_matrix<float, 8, 8> Afrag;
simdgroup_matrix<float, 8, 8> Bfrag;
@ -518,7 +521,8 @@ kernel void applySYRK(
/* transpose = */ upper);
simdgroup_multiply(Prod, Afrag, Bfrag);
simdgroup_multiply_accumulate(Cfrag, Prod, negative_identity, Cfrag);
simdgroup_multiply(Prod, Prod, negative_identity);
simdgroup_multiply_accumulate(Cfrag, Cfrag, identity, Prod);
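// Both orderings compute the same mathematical update C <- C - A*B: previously it
// was fused as Cfrag = Prod*(-I) + Cfrag; now the product is negated first
// (Prod = Prod*(-I)) and then accumulated as Cfrag = Cfrag*I + Prod.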
}
simdgroup_store(

View File

@ -706,7 +706,6 @@
variants: function, method
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_all
tags: reduction
- func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
@ -716,7 +715,6 @@
cpp_no_default_args: ['dim']
dispatch:
CompositeExplicitAutograd: all_dims_default
tags: reduction
- func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -725,7 +723,6 @@
CPU, CUDA: all_out
MPS: all_out_mps
MTIA: all_out_mtia
tags: reduction
- func: all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -734,16 +731,13 @@
CPU, CUDA: all_dims_out
CompositeExplicitAutograd: all_dims_out_default
cpp_no_default_args: ['dim']
tags: reduction
- func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool
variants: function, method
@ -755,14 +749,14 @@
device_check: NoCheck # TensorIterator
structured_delegate: any.out
variants: function, method
tags: [core, reduction]
tags: core
- func: any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: any.dims_out
variants: function, method
cpp_no_default_args: ['dim']
tags: [core, reduction]
tags: core
dispatch:
CompositeExplicitAutograd: any_dims_default
@ -772,7 +766,6 @@
dispatch:
CPU, CUDA: any_out
MPS: any_out_mps
tags: reduction
- func: any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -781,16 +774,13 @@
CPU, CUDA: any_dims_out
CompositeExplicitAutograd: any_dims_out_default
cpp_no_default_args: ['dim']
tags: reduction
- func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
@ -836,27 +826,25 @@
structured_delegate: argmax.out
device_check: NoCheck # TensorIterator
variants: function, method
tags: [core, reduction]
tags: core
- func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA: argmax_out
MPS: argmax_out_mps
tags: reduction
- func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
structured_delegate: argmin.out
device_check: NoCheck # TensorIterator
variants: function, method
tags: [core, reduction]
tags: core
- func: argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA: argmin_out
MPS: argmin_out_mps
tags: reduction
- func: acosh(Tensor self) -> Tensor
variants: function, method
@ -1382,7 +1370,6 @@
dispatch:
SparseCPU: bmm_sparse_cpu
SparseCUDA: bmm_sparse_cuda
SparseMPS: bmm_sparse_mps
NestedTensorCPU: bmm_nested
NestedTensorCUDA: bmm_nested_cuda
tags: core
@ -1398,7 +1385,6 @@
MTIA: bmm_out_mtia
SparseCPU: bmm_out_sparse_cpu
SparseCUDA: bmm_out_sparse_cuda
SparseMPS: bmm_out_sparse_mps
SparseCsrCUDA: bmm_out_sparse_csr_cuda
- func: bmm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
@ -1881,14 +1867,12 @@
CUDA: count_nonzero_cuda
MPS: count_nonzero_mps
autogen: count_nonzero.dim_IntList_out
tags: reduction
- func: count_nonzero(Tensor self, int? dim=None) -> Tensor
variants: function, method
dispatch:
CompositeExplicitAutograd: count_nonzero
autogen: count_nonzero.out
tags: reduction
- func: cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor
variants: function, method
@ -3809,23 +3793,19 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: logsumexp
tags: reduction
- func: logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
# calls squeeze
CompositeExplicitAutogradNonFunctional: logsumexp_out
tags: reduction
- func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
@ -3875,7 +3855,6 @@
device_check: NoCheck # TensorIterator
structured_delegate: aminmax.out
variants: function, method
tags: reduction
- func: aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max)
device_check: NoCheck # TensorIterator
@ -3883,7 +3862,6 @@
dispatch:
CPU, CUDA, MTIA: aminmax_out
MPS: aminmax_out_mps
tags: reduction
- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
dispatch:
@ -3899,7 +3877,7 @@
variants: function, method
dispatch:
QuantizedCPU, QuantizedCUDA: qmax
tags: [core, reduction]
tags: core
- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
device_check: NoCheck # TensorIterator
@ -3909,16 +3887,13 @@
dispatch:
CPU, CUDA, MTIA: max_out
MPS: max_out_mps
tags: reduction
- func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
device_check: NoCheck # TensorIterator
tags: reduction
- func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, SymInt[] sizes, bool keepdim) -> Tensor
variants: function
@ -3931,14 +3906,13 @@
- func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
variants: function, method
structured_delegate: amax.out
tags: [core, reduction]
tags: core
- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA, MTIA: amax_out
MPS: amax_out_mps
tags: reduction
# Return: (Tensor output, Tensor indices)
- func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
@ -4000,14 +3974,13 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: mean
tags: [core, reduction]
tags: core
# For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this.
- func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CompositeExplicitAutograd: mean_dtype_out
tags: reduction
- func: mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
structured_delegate: mean.out
@ -4015,7 +3988,7 @@
variants: function, method
dispatch:
QuantizedCPU: mean_quantized_cpu
tags: [core, reduction]
tags: core
- func: mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -4024,16 +3997,13 @@
CPU, CUDA: mean_out
MPS: mean_out_mps
QuantizedCPU: mean_out_quantized_cpu
tags: reduction
- func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: nanmean(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # Composite
@ -4096,7 +4066,7 @@
variants: function, method
dispatch:
QuantizedCPU, QuantizedCUDA: qmin
tags: [core, reduction]
tags: core
- func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
device_check: NoCheck # TensorIterator
@ -4106,28 +4076,24 @@
dispatch:
CPU, CUDA, MTIA: min_out
MPS: min_out_mps
tags: reduction
- func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
device_check: NoCheck # TensorIterator
tags: reduction
- func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
variants: function, method
structured_delegate: amin.out
tags: [core, reduction]
tags: core
- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA, MTIA: amin_out
MPS: amin_out_mps
tags: reduction
# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
# native_functions.yaml
@ -4207,7 +4173,7 @@
structured_delegate: mm.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: _sparse_mm
SparseCPU, SparseCUDA: _sparse_mm
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm
tags: core
@ -5892,7 +5858,6 @@
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
autogen: sum.out
tags: reduction
- func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
# TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype
@ -5903,12 +5868,11 @@
NestedTensorCPU: NestedTensor_sum_dim_CPU
SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed
tags: [core, reduction]
tags: core
- func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -5916,11 +5880,9 @@
dispatch:
CPU, CUDA: sum_out
MPS: sum_out_mps
tags: reduction
- func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
# TODO: this function will be replaced once nested expand semantics have been settled on
- func: _nested_sum_backward(Tensor grad, Tensor self, int[1]? dim, bool keepdim=False) -> Tensor
@ -5932,13 +5894,11 @@
dispatch:
CPU, CUDA: nansum
MPS: nansum_mps
tags: reduction
- func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA: nansum_out
MPS: nansum_out_mps
tags: reduction
- func: hash_tensor(Tensor self, int[1] dim=[], *, bool keepdim=False, int mode=0) -> Tensor
variants: function, method
@ -6002,13 +5962,11 @@
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
@ -6017,19 +5975,16 @@
CPU, CUDA: std
MPS: std_mps
QuantizedCPU: std_quantized_cpu
tags: reduction
- func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
@ -6038,51 +5993,42 @@
CPU, CUDA: std_mean
MPS: std_mean_mps
autogen: std_mean.correction_out
tags: reduction
- func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
tags: reduction
- func: std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: std_out
QuantizedCPU: std_out_quantized_cpu
tags: reduction
- func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
tags: reduction
- func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
@ -6091,13 +6037,13 @@
CPU, CUDA: prod
MPS: prod_mps
autogen: prod.out
tags: [core, reduction]
tags: core
- func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
structured_delegate: prod.int_out
device_check: NoCheck # TensorIterator
variants: function, method
tags: [core, reduction]
tags: core
- func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -6105,16 +6051,13 @@
dispatch:
CPU, CUDA: prod_out
MPS: prod_out_mps
tags: reduction
- func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: t(Tensor(a) self) -> Tensor(a)
device_check: NoCheck
@ -6575,12 +6518,11 @@
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: [core, reduction]
tags: core
cpp_no_default_args: ["unbiased"]
- func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
@ -6590,51 +6532,43 @@
CPU, CUDA: var
MPS: var_mps
MTIA: var_mtia
tags: [core, reduction]
tags: core
- func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: var_out
tags: reduction
- func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
tags: reduction
- func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
@ -6643,18 +6577,15 @@
CPU, CUDA: var_mean
MPS: var_mean_mps
autogen: var_mean.correction_out
tags: reduction
- func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
tags: reduction
- func: view_as(Tensor(a) self, Tensor other) -> Tensor(a)
variants: method
@ -6914,7 +6845,6 @@
dispatch:
CompositeExplicitAutograd: norm
autogen: norm.ScalarOpt_dtype_out
tags: reduction
- func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor
device_check: NoCheck # TensorIterator
@ -6922,7 +6852,6 @@
dispatch:
CompositeExplicitAutograd: norm
autogen: norm.Scalar_out
tags: reduction
- func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
structured_delegate: norm.dtype_out
@ -6930,7 +6859,6 @@
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sparse_dtype_norm
tags: reduction
- func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
structured_delegate: norm.out
@ -6938,7 +6866,6 @@
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sparse_norm
tags: reduction
- func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -6946,7 +6873,6 @@
dispatch:
CPU, CUDA: norm_dtype_out
MPS: norm_dtype_out_mps
tags: reduction
- func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -6954,26 +6880,21 @@
dispatch:
CPU, CUDA: norm_out
MPS: norm_out_mps
tags: reduction
# These four redispatch in their implementation, so OK to be CompositeImplicitAutograd
- func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent)
variants: method, function
@ -7191,7 +7112,6 @@
MTIA: addmm_out_mtia
SparseCPU: addmm_out_sparse_dense_cpu
SparseCUDA: addmm_out_sparse_dense_cuda
SparseMPS: addmm_out_sparse_dense_mps
SparseCsrCPU: addmm_out_sparse_compressed_cpu
SparseCsrCUDA: addmm_out_sparse_compressed_cuda
@ -7201,7 +7121,6 @@
dispatch:
SparseCPU: addmm_sparse_dense_cpu
SparseCUDA: addmm_sparse_dense_cuda
SparseMPS: addmm_sparse_dense_mps
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: addmm_sparse_compressed_dense
tags: core
@ -10159,14 +10078,12 @@
CPU, CUDA: min
MPS: min_mps
QuantizedCPU: min_quantized_cpu
tags: [reduction]
- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: min_unary_out
QuantizedCPU: min_quantized_unary_out
tags: [reduction]
- func: fmin(Tensor self, Tensor other) -> Tensor
structured_delegate: fmin.out
@ -10189,7 +10106,6 @@
CPU, CUDA: max
MPS: max_mps
QuantizedCPU: max_quantized_cpu
tags: [reduction]
- func: fmax(Tensor self, Tensor other) -> Tensor
structured_delegate: fmax.out
@ -10236,7 +10152,6 @@
dispatch:
CPU, CUDA: max_unary_out
QuantizedCPU: max_quantized_unary_out
tags: [reduction]
- func: minimum(Tensor self, Tensor other) -> Tensor
structured_delegate: minimum.out
@ -10356,7 +10271,6 @@
device_check: NoCheck # TensorIterator
structured_delegate: all.all_out
variants: method, function
tags: reduction
- func: all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck
@ -10365,7 +10279,6 @@
CPU, CUDA: all_all_out
MTIA: all_all_out_mtia
MPS: all_all_out_mps
tags: reduction
- func: any(Tensor self) -> Tensor
device_check: NoCheck # TensorIterator
@ -10373,7 +10286,7 @@
variants: method, function
dispatch:
SparseCPU, SparseCUDA, SparseMPS: any_sparse
tags: [core, reduction]
tags: core
- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck
@ -10381,7 +10294,6 @@
dispatch:
CPU, CUDA: any_all_out
MPS: any_all_out_mps
tags: reduction
- func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -14433,7 +14345,6 @@
python_module: linalg
variants: function
structured_delegate: linalg_vector_norm.out
tags: reduction
- func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
python_module: linalg
@ -14441,7 +14352,6 @@
dispatch:
CPU, CUDA: linalg_vector_norm_out
MPS: linalg_vector_norm_out_mps
tags: reduction
- func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
python_module: linalg

View File

@ -1,6 +1,5 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/SparseTensorUtils.h>
#include <ATen/ExpandUtils.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/SparseBinaryOpIntersectionCommon.h>
@ -19,8 +18,6 @@
#include <ATen/ops/ones_like.h>
#include <ATen/ops/argsort.h>
#include <ATen/ops/result_type.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/addmm_native.h>
#include <ATen/ops/copy_sparse_to_sparse.h>
#include <ATen/ops/mul.h>
#endif
@ -36,305 +33,6 @@ static auto& lib = MetalShaderLibrary::getBundledLibrary();
#include <ATen/native/mps/Mul_metallib.h>
#endif
static Tensor& s_addmm_out_sparse_dense_mps(
Tensor& r,
const Tensor& t,
const SparseTensor& sparse_,
const Tensor& dense,
const Scalar& beta,
const Scalar& alpha) {
TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: sparse_dim must be 2, got ", sparse_.sparse_dim());
TORCH_CHECK(sparse_.dense_dim() == 0, "addmm: sparse values must be 0-dense-dim, got ", sparse_.dense_dim());
TORCH_CHECK(dense.dim() == 2, "addmm: 'dense' must be 2D, got ", dense.dim());
TORCH_CHECK(t.dim() == 2, "addmm: 't' must be 2D, got ", t.dim());
const int64_t I = sparse_.size(0);
const int64_t J = sparse_.size(1);
const int64_t K = dense.size(1);
TORCH_CHECK(dense.size(0) == J,
"addmm: dense (mat2) dim0 must be ", J, ", got ", dense.size(0));
TORCH_CHECK(t.size(0) == I && t.size(1) == K,
"addmm: 't' shape must be (", I, ", ", K, "), got (", t.size(0), ", ", t.size(1), ")");
r.resize_({I, K});
auto sparse = sparse_.coalesce();
const int64_t nnz = sparse._nnz();
if (nnz == 0 || I == 0 || K == 0) {
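// No stored values (or an empty output): the product term vanishes and
// addmm reduces to r = beta * t.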
at::mul_out(r, t, beta);
return r;
}
const auto v_dtype = sparse._values().scalar_type();
const auto d_dtype = dense.scalar_type();
const auto t_dtype = t.scalar_type();
auto compute_dtype = c10::promoteTypes(c10::promoteTypes(v_dtype, d_dtype), t_dtype);
TORCH_CHECK(canCast(compute_dtype, r.scalar_type()),
"Can't convert computed type ", compute_dtype, " to output ", r.scalar_type());
auto indices2d = sparse._indices().contiguous();
auto values = sparse._values().to(compute_dtype);
auto dense_c = dense.to(compute_dtype).contiguous();
auto t_c = t.to(compute_dtype).contiguous();
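// Accumulate in compute_dtype; if the output tensor differs in dtype or layout,
// stage the result in a temporary buffer and copy it back at the end.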
const bool out_needs_cast = (r.scalar_type() != compute_dtype) || !r.is_contiguous();
Tensor out_buf = out_needs_cast
? at::empty({I, K}, r.options().dtype(compute_dtype))
: r;
auto out_contig = out_buf.contiguous();
auto device = r.device();
auto stream = getCurrentMPSStream();
const float alpha_f = alpha.to<float>();
const float beta_f = beta.to<float>();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
const std::string func = "spmm_addmm_coo_" + mps::scalarToMetalTypeString(values);
auto pso = lib.getPipelineStateForFunc(func);
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
const uint32_t gridX = static_cast<uint32_t>(K);
const uint32_t gridZ = static_cast<uint32_t>(I);
const uint32_t tgW = std::min<uint32_t>(gridX, tew);
MTLSize grid = MTLSizeMake(gridX, 1, gridZ);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
indices2d,
values,
dense_c,
t_c,
out_contig,
std::array<uint32_t, 3>{static_cast<uint32_t>(I),
static_cast<uint32_t>(J),
static_cast<uint32_t>(K)},
std::array<float, 2>{alpha_f, beta_f},
static_cast<uint32_t>(nnz));
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
if (out_needs_cast) {
r.copy_(out_contig.to(r.scalar_type()));
}
return r;
}
static void build_batch_ptr_mps(
const Tensor& indices_dim0,
int64_t B,
Tensor& batch_ptr
) {
// Builds an array of pointers which point to each batch's elements. Example:
// idx_b = [0, 0, 0, 1, 1, 2, 2, 2, 2] // 9 non-zero elements
// └─────┘ └──┘ └─────────┘
// batch 0 batch 1 batch 2
// batch_ptr = [0, 3, 5, 9]
// │ │ │ └─ end of batch 2 (total nnz)
// │ │ └──── batch 2 starts at index 5
// │ └─────── batch 1 starts at index 3
// └────────── batch 0 starts at index 0
TORCH_CHECK(indices_dim0.is_mps() && batch_ptr.is_mps(), "MPS device expected");
auto device = indices_dim0.device();
auto stream = getCurrentMPSStream();
const int64_t nnz = indices_dim0.numel();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("build_batch_ptr_from_sorted_batches");
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
const uint32_t Q = static_cast<uint32_t>(B + 1);
const uint32_t tgW = std::min<uint32_t>(Q, tew);
MTLSize grid = MTLSizeMake(Q, 1, 1);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
indices_dim0,
batch_ptr,
std::array<uint32_t, 2>{static_cast<uint32_t>(nnz),
static_cast<uint32_t>(B)});
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
}
static void build_row_ptr_per_batch_mps(
const Tensor& rows,
const Tensor& batch_ptr,
int64_t B,
int64_t I,
Tensor& row_ptr
) {
// Build per-batch CSR-style row pointer arrays from row indices sorted by batch
// Given:
// rows: 1-D array of length nnz with row ids in [0, I), sorted within each batch
// batch_ptr: length B+1, where [batch_ptr[b], batch_ptr[b+1]) is the subrange for batch b
// Produces:
// - row_ptr: shape [B, I+1]
//
// Example (B = 2, I = 4):
// rows = [0, 0, 1, 3, 0, 2, 2] // 7 non-zero elements
// └─── batch 0 ──┘ └─ batch 1 ─┘
// batch_ptr = [0, 4, 7]
// │ │ └─ end of batch 1 (total nnz)
// │ └──── end of batch 0/start of batch 1
// └─────── start of batch 0
//
// per-batch row pointers (I+1 entries each):
// row_ptr[0] = [0, 2, 3, 3, 4]
// row_ptr[1] = [0, 1, 1, 3, 3]
// laid out in memory: [0, 2, 3, 3, 4, 0, 1, 1, 3, 3]
TORCH_CHECK(rows.is_mps() && batch_ptr.is_mps() && row_ptr.is_mps(), "MPS device expected");
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("build_row_ptr_from_sorted_rows_by_batch");
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
const uint32_t Qx = static_cast<uint32_t>(I + 1);
const uint32_t Qy = static_cast<uint32_t>(B);
const uint32_t tgW = std::min<uint32_t>(Qx, tew);
MTLSize grid = MTLSizeMake(Qx, Qy, 1);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
rows,
batch_ptr,
row_ptr,
std::array<uint32_t, 2>{static_cast<uint32_t>(I),
static_cast<uint32_t>(B)});
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
}
Tensor& bmm_out_sparse_mps(const SparseTensor& self_, const Tensor& mat2_, Tensor& result_) {
TORCH_CHECK(result_.is_mps(), "bmm_sparse: expected 'out' to be MPS, got ", result_.device());
TORCH_CHECK(self_.is_mps(), "bmm_sparse: expected 'self' to be MPS, got ", self_.device());
TORCH_CHECK(mat2_.is_mps(), "bmm_sparse: expected 'mat2' to be MPS, got ", mat2_.device());
TORCH_CHECK(self_.dense_dim() == 0, "bmm_sparse: Tensor 'self' must have 0 dense dims, but has ", self_.dense_dim());
TORCH_CHECK(self_.sparse_dim() == 3, "bmm_sparse: Tensor 'self' must have 3 sparse dims, but has ", self_.sparse_dim());
TORCH_CHECK(mat2_.dim() == 3, "bmm_sparse: Tensor 'mat2' must have 3 dims, but has ", mat2_.dim());
TORCH_CHECK(self_.size(0) == mat2_.size(0), "bmm_sparse: 'self.size(0)' and 'mat2.size(0)' must match");
TORCH_CHECK(self_.size(2) == mat2_.size(1), "bmm_sparse: 'self.size(2)' and 'mat2.size(1)' must match");
const int64_t B = self_.size(0);
const int64_t I = self_.size(1);
const int64_t J = self_.size(2);
const int64_t K = mat2_.size(2);
auto self = self_.coalesce();
const int64_t nnz = self._nnz();
if (nnz == 0) {
return result_.zero_();
}
const auto computeDtype = at::kFloat;
auto indices = self._indices();
auto values = self._values();
auto values_c = values.scalar_type() == computeDtype ? values : values.to(computeDtype);
auto mat2_c = mat2_.scalar_type() == computeDtype ? mat2_ : mat2_.to(computeDtype);
auto mat2_contig = mat2_c.contiguous();
auto idx_b = indices.select(0, 0).contiguous();
auto idx_i = indices.select(0, 1).contiguous();
auto idx_j = indices.select(0, 2).contiguous();
// build batch_ptr (length B + 1): [batch_ptr[b], batch_ptr[b + 1]) is the
// subrange of nnz entries belonging to batch b; see build_batch_ptr_mps
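// e.g. (hypothetical) idx_b = [0, 0, 0, 0, 1, 1, 1] with B = 2 gives
// batch_ptr = [0, 4, 7]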
auto batch_ptr = at::empty({B + 1}, at::device(result_.device()).dtype(kLong));
build_batch_ptr_mps(idx_b, B, batch_ptr);
// build row_ptr per batch: for each (b, i) get [start, end) into rows/cols/vals
auto row_ptr = at::empty({B * (I + 1)}, at::device(result_.device()).dtype(kLong));
build_row_ptr_per_batch_mps(idx_i, batch_ptr, B, I, row_ptr);
const bool out_needs_cast = (result_.scalar_type() != computeDtype) || !result_.is_contiguous();
Tensor out_buf = out_needs_cast
? at::empty({B, I, K}, result_.options().dtype(computeDtype))
: result_;
auto out_contig = out_buf.contiguous();
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("spmm_bmm_coo_rows_grouped_" + mps::scalarToMetalTypeString(values));
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
const uint32_t tgW = std::min<uint32_t>((uint32_t)K, tew);
// One threadgroup per (row i, batch b), lanes cover K
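// e.g. (hypothetical) B = 2, I = 4, K = 96, tew = 32 gives tgW = 32 and
// grid = (32, 4, 2); each lane then strides over k in steps of tgW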
MTLSize grid = MTLSizeMake(tgW, (uint32_t)I, (uint32_t)B);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
idx_i,
idx_j,
values_c,
mat2_contig,
out_contig,
row_ptr,
std::array<uint32_t, 4>{(uint32_t)B, (uint32_t)I, (uint32_t)J, (uint32_t)K});
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
if (out_needs_cast) {
result_.copy_(out_contig.to(result_.scalar_type()));
}
return result_;
}
Tensor bmm_sparse_mps(const Tensor& self, const Tensor& mat2) {
Tensor result = at::zeros({self.size(0), self.size(1), mat2.size(2)}, mat2.options());
return bmm_out_sparse_mps(self, mat2, result);
}
Tensor& addmm_out_sparse_dense_mps(
const Tensor& self,
const SparseTensor& mat1,
const Tensor& mat2,
const Scalar& beta,
const Scalar& alpha,
Tensor& result) {
c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out");
return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha);
}
Tensor addmm_sparse_dense_mps(
const Tensor& self,
const SparseTensor& mat1,
const Tensor& mat2,
const Scalar& beta,
const Scalar& alpha
) {
c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out");
Tensor result = at::empty({0}, self.options());
return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha);
}
static SparseTensor& mul_out_dense_sparse_mps(
const Tensor& dense,
const Tensor& sparse,

View File

@ -1,105 +1,10 @@
#include <metal_stdlib>
#include <c10/metal/indexing.h>
#include <c10/metal/utils.h>
using namespace c10::metal;
using namespace metal;
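// Binary-search helpers over an int64 array sorted on [lo, hi):
// lower_bound_i64 returns the first index whose value is >= key and
// upper_bound_i64 the first index whose value is > key; both return hi when
// no such index exists. e.g. arr = [0, 0, 2], lo = 0, hi = 3, key = 0
// yields lower_bound 0 and upper_bound 2.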
inline uint lower_bound_i64(device const long* arr, uint lo, uint hi, long key) {
uint l = lo, r = hi;
while (l < r) {
uint m = (l + r) >> 1;
long v = arr[m];
if (v < key) {
l = m + 1;
} else {
r = m;
}
}
return l;
}
inline uint upper_bound_i64(device const long* arr, uint lo, uint hi, long key) {
uint l = lo, r = hi;
while (l < r) {
uint m = (l + r) >> 1;
long v = arr[m];
if (v <= key) {
l = m + 1;
} else {
r = m;
}
}
return l;
}
kernel void build_row_ptr_from_sorted_rows_by_batch(
device const long* rows [[buffer(0)]],
device const long* batch_ptr [[buffer(1)]],
device long* row_ptr [[buffer(2)]],
constant uint2& dims [[buffer(3)]],
uint3 tid [[thread_position_in_grid]])
{
const uint I = dims.x;
const uint B = dims.y;
const uint i = tid.x;
const uint b = tid.y;
if (b >= B || i > I) return;
const uint base = (uint)batch_ptr[b];
const uint lim = (uint)batch_ptr[b + 1];
const ulong out_base = (ulong)b * (ulong)(I + 1);
if (i == I) {
row_ptr[out_base + (ulong)I] = (long)lim;
} else {
const long key = (long)i;
const uint pos = lower_bound_i64(rows, base, lim, key);
row_ptr[out_base + (ulong)i] = (long)pos;
}
}
template <typename T>
kernel void spmm_bmm_coo_rows_grouped(
device const long* rows [[buffer(0)]],
device const long* cols [[buffer(1)]],
device const T* vals [[buffer(2)]],
device const T* dense [[buffer(3)]],
device T* out [[buffer(4)]],
device const long* row_ptr [[buffer(5)]],
constant uint4& dims [[buffer(6)]],
uint3 tid [[thread_position_in_grid]],
uint3 ltid [[thread_position_in_threadgroup]],
uint3 tptg [[threads_per_threadgroup]])
{
const uint B = dims.x;
const uint I = dims.y;
const uint J = dims.z;
const uint K = dims.w;
const uint b = tid.z;
const uint i = tid.y;
const uint lane = ltid.x;
const uint tgW = tptg.x;
const ulong rp_base = (ulong)b * (ulong)(I + 1);
const uint start = (uint)row_ptr[rp_base + (ulong)i];
const uint end = (uint)row_ptr[rp_base + (ulong)i + 1];
for (uint k = lane; k < K; k += tgW) {
auto acc = static_cast<accum_t<T>>(T(0));
for (uint p = start; p < end; ++p) {
const uint c = (uint)cols[p];
const auto v = static_cast<accum_t<T>>(vals[p]);
const uint d_off = ((b * J) + c) * K + k;
const auto d = static_cast<accum_t<T>>(dense[d_off]);
acc += mul(v, d);
}
const uint y_off = ((b * I) + i) * K + k;
out[y_off] = static_cast<T>(acc);
}
}
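// Per (b, i, k) the kernel above computes
//   out[b][i][k] = sum over p in [row_ptr[b][i], row_ptr[b][i + 1]) of
//                  vals[p] * dense[b][cols[p]][k]
// i.e. one row of a batched sparse-dense matmul, with the lanes of a
// threadgroup covering the K dimension.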
template <typename T> struct MulAccum { using type = float; };
template <> struct MulAccum<float2> { using type = float2; };
template <typename T>
kernel void dense_sparse_mul_kernel(
@ -127,9 +32,10 @@ kernel void dense_sparse_mul_kernel(
ulong dense_idx = (ulong)key * (ulong)view_cols + (ulong)col;
ulong val_idx = (ulong)i * (ulong)view_cols + (ulong)col;
const auto a = static_cast<accum_t<T>>(values[val_idx]);
const auto b = static_cast<accum_t<T>>(dense[dense_idx]);
out_values[val_idx] = static_cast<T>(mul(a, b));
using accum_t = typename MulAccum<T>::type;
const accum_t a = static_cast<accum_t>(values[val_idx]);
const accum_t b = static_cast<accum_t>(dense[dense_idx]);
out_values[val_idx] = static_cast<T>(a * b);
}
kernel void intersect_binary_search(
@ -214,76 +120,6 @@ kernel void fused_gather_mul_kernel(
}
}
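// One thread per batch_ptr entry: thread b below lower-bounds the first
// occurrence of batch id b in the sorted `batches`, while thread B writes
// the total nnz. e.g. (hypothetical) batches = [0, 0, 2, 2, 2] with B = 3
// gives batch_ptr = [0, 2, 2, 5] (batch 1 is empty).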
kernel void build_batch_ptr_from_sorted_batches(
device const long* batches [[buffer(0)]],
device long* batch_ptr [[buffer(1)]],
constant uint2& nnz_B [[buffer(2)]],
uint3 tid [[thread_position_in_grid]])
{
uint b = tid.x;
uint nnz = nnz_B.x;
uint batch = nnz_B.y;
if (b == batch) {
batch_ptr[b] = (long)nnz;
return;
}
uint lo = 0;
uint hi = nnz;
long key = (long)b;
while (lo < hi) {
uint mid = (lo + hi) >> 1;
long v = batches[mid];
if (v < key) lo = mid + 1;
else hi = mid;
}
batch_ptr[b] = (long)lo;
}
template <typename T>
kernel void spmm_addmm_coo(
device const long* indices2d [[buffer(0)]],
device const T* vals [[buffer(1)]],
device const T* dense [[buffer(2)]],
device const T* t_in [[buffer(3)]],
device T* out [[buffer(4)]],
constant uint3& dims [[buffer(5)]],
constant float2& alpha_beta [[buffer(6)]],
constant uint& nnz [[buffer(7)]],
uint3 tid [[thread_position_in_grid]])
{
const uint K = dims.z;
const uint k = tid.x;
const uint i = tid.z;
const float alpha = alpha_beta.x;
const float beta = alpha_beta.y;
device const long* rows = indices2d;
device const long* cols = indices2d + nnz;
const uint start = lower_bound_i64(rows, 0u, nnz, (long)i);
const uint end = upper_bound_i64(rows, 0u, nnz, (long)i);
// accumulator is float for scalar/half/bfloat and float2 for float2
auto acc = static_cast<accum_t<T>>(T(0));
for (uint p = start; p < end; ++p) {
const uint c = (uint)cols[p];
const auto v = static_cast<accum_t<T>>(vals[p]);
const uint dense_off = c * K + k;
const auto d = static_cast<accum_t<T>>(dense[dense_off]);
acc += mul(v, d);
}
const uint off = i * K + k;
const auto base = (beta != 0.0f) ? (static_cast<accum_t<T>>(t_in[off]) * beta) : static_cast<accum_t<T>>(T(0));
const auto y = base + alpha * acc;
out[off] = static_cast<T>(y);
}
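// Per (i, k) the kernel above computes
//   out[i][k] = beta * t_in[i][k]
//             + alpha * sum over {p : rows[p] == i} of
//                       vals[p] * dense[cols[p]][k]
// with row i's nnz range located by binary search over the sorted rows.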
#define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE) \
template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void \
dense_sparse_mul_kernel<DTYPE>( \
@ -315,36 +151,6 @@ INSTANTIATE_DENSE_SPARSE_MUL(float2);
constant uint2& dims_output [[buffer(8)]], \
uint3 gid [[thread_position_in_grid]]);
INSTANTIATE_FOR_FLOAT_TYPES(INSTANTIATE_FUSED_GATHER_MUL);
#define INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED(DTYPE) \
template [[host_name("spmm_bmm_coo_rows_grouped_" #DTYPE)]] kernel void \
spmm_bmm_coo_rows_grouped<DTYPE>( \
device const long* rows [[buffer(0)]], \
device const long* cols [[buffer(1)]], \
device const DTYPE* vals [[buffer(2)]], \
device const DTYPE* dense [[buffer(3)]], \
device DTYPE* out [[buffer(4)]], \
device const long* row_ptr [[buffer(5)]], \
constant uint4& dims [[buffer(6)]], \
uint3 tid [[thread_position_in_grid]], \
uint3 ltid [[thread_position_in_threadgroup]], \
uint3 tptg [[threads_per_threadgroup]]);
INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED);
#define INSTANTIATE_SPMM_ADDMM_COO(DTYPE) \
template [[host_name("spmm_addmm_coo_" #DTYPE)]] kernel void \
spmm_addmm_coo<DTYPE>( \
device const long* indices2d [[buffer(0)]], \
device const DTYPE* vals [[buffer(1)]], \
device const DTYPE* dense [[buffer(2)]], \
device const DTYPE* t_in [[buffer(3)]], \
device DTYPE* out [[buffer(4)]], \
constant uint3& dims [[buffer(5)]], \
constant float2& alpha_beta [[buffer(6)]], \
constant uint& nnz [[buffer(7)]], \
uint3 tid [[thread_position_in_grid]]);
INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_ADDMM_COO);
INSTANTIATE_FUSED_GATHER_MUL(float);
INSTANTIATE_FUSED_GATHER_MUL(half);
INSTANTIATE_FUSED_GATHER_MUL(bfloat);

View File

@ -93,7 +93,3 @@
This operator does not support cudagraphs. The presence of this tag on an operator will cause
Inductor to split the graph around this operator. Note that operators without this tag may still
not support CUDAGraphs. Inductor may have other hardcoded lists around that.
- tag: reduction
desc: |
This tag indicates that an operator performs a reduction operation, computing aggregate values
(sum, mean, max, min, etc.) across one or more dimensions of the input tensor(s).

View File

@ -1751,8 +1751,8 @@ def maybe_snapshot_memory(should_snapshot_memory, suffix):
f"{output_filename.rstrip('.csv')}_{suffix}.pickle",
)
)
except Exception:
log.exception("Failed to save memory snapshot")
except Exception as e:
log.error("Failed to save memory snapshot, %s", e)
torch.cuda.memory._record_memory_history(enabled=None)

View File

@ -124,7 +124,7 @@ with open(MODELS_FILENAME) as fh:
continue
batch_size = int(batch_size)
BATCH_SIZE_KNOWN_MODELS[model_name] = batch_size
assert BATCH_SIZE_KNOWN_MODELS
assert len(BATCH_SIZE_KNOWN_MODELS)
try:

View File

@ -296,8 +296,8 @@ class OperatorInputsLoader:
for key in self.operator_db.keys():
try:
op = eval(key)
except AttributeError:
log.warning("Evaluating an op name into an OpOverload", exc_info=True)
except AttributeError as ae:
log.warning("Evaluating an op name into an OpOverload: %s", ae)
continue
yield op

View File

@ -3,7 +3,6 @@ import sys
from benchmark_base import BenchmarkBase
import torch
from torch._dynamo.utils import CompileTimeInstructionCounter
class Benchmark(BenchmarkBase):
@ -33,11 +32,7 @@ class Benchmark(BenchmarkBase):
def _work(self):
# enable_cpp_symbolic_shape_guards has impact on this benchmark
# Keep using False value for consistency.
with (
torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False),
torch._export.config.patch(use_new_tracer_experimental=True),
CompileTimeInstructionCounter.record(),
):
with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False):
torch.export.export(self.m, (self.input,), strict=True)

View File

@ -38,7 +38,7 @@ update_hint_regression,compile_time_instruction_count,1719000000,0.1
sum_floordiv_regression,compile_time_instruction_count,3686995725,0.1
sum_floordiv_regression,compile_time_instruction_count,966100000,0.1


View File

@ -127,7 +127,7 @@ def trainbench(
bwd_time = bwd_start_event.elapsed_time(bwd_end_event)
return fwd_time, bwd_time
creator_args = {
creator_args = creator_args = {
"seqLength": seqLength,
"numLayers": numLayers,
"inputSize": inputSize,

View File

@ -12,7 +12,7 @@ def modeldef(request, net_name, executor, fuser):
# Given a 'net_name' provided by generate_tests, build the thing
name, rnn_creator, context = get_nn_runners(net_name)[0]
creator_args = {
creator_args = creator_args = {
"seqLength": 100,
"numLayers": 1,
"inputSize": 512,

View File

@ -85,7 +85,7 @@ class WeightOnlyInt8QuantHandler:
cur_state_dict[f"{fqn}.weight"] = int8_weight
cur_state_dict[f"{fqn}.scales"] = scales.to(mod.weight.dtype)
elif isinstance(mod, ConditionalFeedForward):
for weight_idx in range(3):
for weight_idx in range(0, 3):
weight_name = f"w{weight_idx + 1}"
scales_name = f"scales{weight_idx + 1}"
weight = getattr(mod, weight_name)

View File

@ -1729,8 +1729,10 @@ def define_buck_targets(
"torch/csrc/jit/backends/backend_debug_info.cpp",
"torch/csrc/jit/backends/backend_interface.cpp",
],
compiler_flags = get_pt_compiler_flags(),
fbandroid_compiler_flags = c2_fbandroid_xplat_compiler_flags,
compiler_flags = get_pt_compiler_flags() + select({
"DEFAULT": [],
"ovr_config//os:android": c2_fbandroid_xplat_compiler_flags
}),
# @lint-ignore BUCKLINT link_whole
link_whole = True,
linker_flags = get_no_as_needed_linker_flag(),
@ -2023,6 +2025,9 @@ def define_buck_targets(
"ovr_config//os:android-x86_64": [
"-mssse3",
],
}) + select({
"DEFAULT": [],
"ovr_config//os:android": c2_fbandroid_xplat_compiler_flags,
}),
exported_preprocessor_flags = get_aten_preprocessor_flags(),
exported_deps = [

View File

@ -1,4 +1,5 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/core/DeviceType.h>
#include <c10/util/env.h>
namespace c10::CachingAllocator {
@ -46,7 +47,7 @@ size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
const size_t interval_end =
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
TORCH_CHECK_VALUE(
TORCH_CHECK(
interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
"kRoundUpPowerOfTwoIntervals mismatch");
@ -65,7 +66,7 @@ size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_split_size_mb too small, must be >= ",
min_allowed_split_size_mb);
@ -84,7 +85,7 @@ size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
min_allowed_split_size_mb);
@ -99,7 +100,7 @@ size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
size_t i) {
tokenizer.checkToken(++i, ":");
double val_env = tokenizer.toDouble(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
val_env > 0 && val_env < 1.0,
"garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
garbage_collection_threshold_ = val_env;
@ -120,7 +121,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
size_t value_index = i;
tokenizer.checkToken(++i, ":");
size_t value = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
value == 0 || llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ");
@ -128,13 +129,12 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
std::fill(
std::next(
roundup_power2_divisions_.begin(),
static_cast<std::vector<size_t>::difference_type>(
last_index + 1)),
static_cast<std::vector<size_t>::difference_type>(last_index)),
roundup_power2_divisions_.end(),
value);
} else {
size_t boundary = tokenizer.toSizeT(value_index);
TORCH_CHECK_VALUE(
TORCH_CHECK(
llvm::isPowerOf2_64(boundary),
"For roundups, the intervals have to be power of 2 ");
@ -164,7 +164,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
} else { // Keep this for backwards compatibility
size_t value = tokenizer.toSizeT(i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 ");
std::fill(
@ -224,7 +224,7 @@ void AcceleratorAllocatorConfig::parseArgs(const std::string& env) {
// If a device-specific configuration parser hook is registered, it will
// check if the key is unrecognized.
if (device_config_parser_hook_) {
TORCH_CHECK_VALUE(
TORCH_CHECK(
getKeys().find(key) != getKeys().end(),
"Unrecognized key '",
key,

View File

@ -76,7 +76,7 @@ class ConfigTokenizer {
} else if (token == "False") {
return false;
} else {
TORCH_CHECK_VALUE(
TORCH_CHECK(
false,
"Expected 'True' or 'False' at index ",
i,
@ -253,7 +253,7 @@ class C10_API AcceleratorAllocatorConfig {
device_config_parser_hook_ = std::move(hook);
auto& mutable_keys = getMutableKeys();
for (auto& key : keys) {
TORCH_CHECK_VALUE(
TORCH_CHECK(
mutable_keys.insert(key).second,
"Duplicated key '",
key,

View File

@ -102,7 +102,7 @@ uint64_t getNonDeterministicRandom(bool is_cuda) {
} else {
std::random_device rd;
// limit to 53 bits to ensure unique representation in double
s = (((static_cast<uint64_t>(rd())) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
s = ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
}
return s;
}

View File

@ -20,8 +20,7 @@ void maybeApplyRefcountedDeleter(const c10::Storage& storage) {
std::lock_guard<std::mutex> guard(replace_data_ptr_mutex);
c10::DataPtr& data_ptr = storage.mutable_data_ptr();
if (reinterpret_cast<const void*>(data_ptr.get_deleter()) ==
reinterpret_cast<const void*>(&c10::refcounted_deleter)) {
if ((void*)data_ptr.get_deleter() == (void*)&c10::refcounted_deleter) {
// Data pointer is already shared
return;
}

View File

@ -52,19 +52,6 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CONSTANT)
#undef DEFINE_CONSTANT
inline const char* toString(ScalarType t) {
#define DEFINE_CASE(_, name) \
case ScalarType::name: \
return #name;
switch (t) {
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CASE)
default:
return "UNKNOWN_SCALAR";
}
#undef DEFINE_CASE
}
inline size_t elementSize(ScalarType t) {
#define CASE_ELEMENTSIZE_CASE(ctype, name) \
case ScalarType::name: \
@ -308,12 +295,6 @@ inline bool canCast(const ScalarType from, const ScalarType to) {
C10_API ScalarType promoteTypes(ScalarType a, ScalarType b);
inline std::ostream& operator<<(
std::ostream& stream,
at::ScalarType scalar_type) {
return stream << toString(scalar_type);
}
// Returns a pair of strings representing the names for each dtype.
// The returned pair is (name, legacy_name_if_applicable)
C10_API std::pair<std::string, std::string> getDtypeNames(

View File

@ -4,6 +4,7 @@
#include <c10/core/SymNodeImpl.h>
#include <c10/util/intrusive_ptr.h>
#include <c10/util/safe_numerics.h>
#include <functional>
namespace c10 {
@ -83,7 +84,7 @@ DEFINE_BINARY(max_slow_path, sym_max, SymInt)
SymInt::operator SymFloat() const {
if (auto ma = maybe_as_int()) {
return SymFloat(static_cast<double>(*ma));
return SymFloat(double(*ma));
} else {
return SymFloat(toSymNodeImplUnowned()->sym_float());
}

View File

@ -9,6 +9,7 @@
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/Logging.h>
#include <c10/util/accumulate.h>
#include <c10/util/irange.h>
#include <optional>
#include <utility>

View File

@ -1,5 +1,9 @@
#include <c10/core/TensorOptions.h>
#include <c10/core/Device.h>
#include <c10/core/Layout.h>
#include <c10/util/Optional.h>
#include <iostream>
namespace c10 {

View File

@ -2,6 +2,7 @@
#include <c10/core/Allocator.h>
#include <c10/core/StorageImpl.h>
#include <c10/core/alignment.h>
#include <c10/core/impl/COWDeleter.h>
#include <c10/util/Exception.h>
#include <c10/util/ParallelGuard.h>
@ -44,8 +45,7 @@ bool has_simple_data_ptr(const c10::StorageImpl& storage) {
}
bool is_cow_data_ptr(const c10::DataPtr& data_ptr) {
return reinterpret_cast<const void*>(data_ptr.get_deleter()) ==
reinterpret_cast<const void*>(&cow::cow_deleter);
return (void*)data_ptr.get_deleter() == (void*)&cow::cow_deleter;
}
c10::intrusive_ptr<StorageImpl> lazy_clone_storage(StorageImpl& storage) {

View File

@ -1,4 +1,5 @@
#include <c10/core/DispatchKey.h>
#include <c10/core/SafePyObject.h>
#include <c10/core/impl/LocalDispatchKeySet.h>
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/irange.h>

View File

@ -20,7 +20,7 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig(
tokenizer.checkToken(++i, ":");
i++; // Move to the value after the colon
#ifdef USE_ROCM
TORCH_CHECK_VALUE(
TORCH_CHECK(
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) ||
(tokenizer[i] == PYTORCH_TOKEN2)),
"Unknown allocator backend, "
@ -36,7 +36,7 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig(
" != ",
get()->name());
#else // USE_ROCM
TORCH_CHECK_VALUE(
TORCH_CHECK(
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1)),
"Unknown allocator backend, "
"options are native and " PYTORCH_TOKEN1);
@ -109,7 +109,7 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
} else {
const auto& keys =
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
TORCH_CHECK_VALUE(
TORCH_CHECK(
keys.find(key) != keys.end(),
"Unrecognized key '",
key,
@ -151,12 +151,12 @@ size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
size_t i) {
tokenizer.checkToken(++i, ":");
size_t val2 = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
llvm::isPowerOf2_64(val2),
"Number of register threads has to be power of 2, got ",
val2);
auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
TORCH_CHECK_VALUE(
TORCH_CHECK(
val2 <= maxThreads,
"Number of register threads should be less than or equal to ",
maxThreads,
@ -171,8 +171,7 @@ size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
size_t i) {
tokenizer.checkToken(++i, ":");
size_t val2 = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
val2 > 0, "Pinned reserve segment size has to be greater than 0");
TORCH_CHECK(val2 > 0, "Pinned reserve segment size has to be greater than 0");
m_pinned_reserve_segment_size_mb = val2;
return i;
}

View File

@ -3,7 +3,6 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
@ -18,14 +17,9 @@ enum class Expandable_Segments_Handle_Type : int {
// Environment config parser
class C10_CUDA_API CUDAAllocatorConfig {
public:
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.")
static size_t max_split_size() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.")
static double garbage_collection_threshold() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
garbage_collection_threshold();
@ -70,8 +64,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
return instance().m_pinned_num_register_threads;
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.")
static bool pinned_use_background_threads() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
pinned_use_background_threads();
@ -88,15 +80,11 @@ class C10_CUDA_API CUDAAllocatorConfig {
return 128;
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
static size_t roundup_power2_divisions(size_t size) {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions(size);
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
static std::vector<size_t> roundup_power2_divisions() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions();
@ -107,8 +95,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
max_non_split_rounding_size();
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.")
static std::string last_allocator_settings() {
return c10::CachingAllocator::getAllocatorSettings();
}

View File

@ -512,7 +512,7 @@ struct ExpandableSegment {
header.segment_size = segment_size_;
header.num_handles = end - begin;
buf.write(reinterpret_cast<const char*>(&header), sizeof(ShareHeader));
buf.write((const char*)&header, sizeof(ShareHeader));
for (auto i : c10::irange(begin, end)) {
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
auto& handle = handles_.at(i).value();
@ -528,9 +528,7 @@ struct ExpandableSegment {
TORCH_CHECK(
handle.shareable_handle != std::nullopt,
"shareable_handle is null");
buf.write(
reinterpret_cast<const char*>(&*handle.shareable_handle),
sizeof(int));
buf.write((const char*)&*handle.shareable_handle, sizeof(int));
} else {
if (!handle.shareable_handle) {
CUmemFabricHandle fabric_handle;
@ -543,8 +541,7 @@ struct ExpandableSegment {
handle.shareable_handle != std::nullopt,
"shareable_handle is null");
buf.write(
reinterpret_cast<const char*>(&*handle.shareable_handle),
sizeof(CUmemFabricHandle));
(const char*)&*handle.shareable_handle, sizeof(CUmemFabricHandle));
}
}
return rangeFromHandles(begin, end);
@ -555,7 +552,7 @@ struct ExpandableSegment {
std::vector<c10::DeviceIndex> peers,
std::istream& buf) {
ShareHeader header{};
buf.read(reinterpret_cast<char*>(&header), sizeof(ShareHeader));
buf.read((char*)&header, sizeof(ShareHeader));
auto segment = std::make_unique<ExpandableSegment>(
device, std::nullopt, header.segment_size, std::move(peers));
// older build setups (e.g. multiwheels) do not have this syscall, added 2020
@ -577,11 +574,11 @@ struct ExpandableSegment {
for (auto i : c10::irange(header.num_handles)) {
(void)i;
int fd = 0;
buf.read(reinterpret_cast<char*>(&fd), sizeof(int));
buf.read((char*)&fd, sizeof(int));
auto myfd = syscall(SYS_pidfd_getfd, pidfd, fd, 0);
if (myfd == -1) {
auto err = errno;
close(static_cast<int>(pidfd));
close((int)pidfd);
for (auto& h : segment->handles_) {
C10_CUDA_DRIVER_CHECK(
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
@ -601,16 +598,15 @@ struct ExpandableSegment {
(void*)(uintptr_t)myfd,
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
LOG(INFO) << "use posix fd to import expandable segments.";
close(static_cast<int>(myfd));
close((int)myfd);
segment->handles_.emplace_back(Handle{handle, std::nullopt});
}
close(static_cast<int>(pidfd));
close((int)pidfd);
} else {
for (auto i : c10::irange(header.num_handles)) {
(void)i;
CUmemFabricHandle fabric_handle;
buf.read(
reinterpret_cast<char*>(&fabric_handle), sizeof(CUmemFabricHandle));
buf.read((char*)&fabric_handle, sizeof(CUmemFabricHandle));
CUmemGenericAllocationHandle handle = 0;
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemImportFromShareableHandle_(
&handle,
@ -1063,7 +1059,7 @@ class RingBuffer {
void setMaxEntries(size_t size) {
std::lock_guard<std::mutex> lk(alloc_trace_lock);
alloc_trace_max_entries_ = std::max(static_cast<size_t>(1), size);
alloc_trace_max_entries_ = std::max(size_t(1), size);
}
void insertEntries(const T& entry) {
@ -1264,9 +1260,6 @@ class DeviceCachingAllocator {
// thread local compile context for each device
static thread_local std::stack<std::string> compile_context;
// thread local user metadata for annotating allocations
static thread_local std::string user_metadata;
public:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
explicit DeviceCachingAllocator(c10::DeviceIndex id)
@ -1274,7 +1267,7 @@ class DeviceCachingAllocator {
large_blocks(/*small=*/false),
small_blocks(/*small=*/true) {
stats.max_split_size =
static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
static_cast<int64_t>(CUDAAllocatorConfig::max_split_size());
context_recorder_.store(nullptr);
}
@ -1309,14 +1302,6 @@ class DeviceCachingAllocator {
}
}
void setUserMetadata(const std::string& metadata) {
user_metadata = metadata;
}
std::string getUserMetadata() {
return user_metadata;
}
bool checkPoolLiveAllocations(
MempoolId_t mempool_id,
const std::unordered_set<void*>& expected_live_allocations) const {
@ -1409,8 +1394,7 @@ class DeviceCachingAllocator {
// Do garbage collection if the flag is set.
if (C10_UNLIKELY(
set_fraction &&
AcceleratorAllocatorConfig::garbage_collection_threshold() >
0.0)) {
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
garbage_collect_cached_blocks(context);
}
// Attempt allocate
@ -1662,7 +1646,7 @@ class DeviceCachingAllocator {
stats.active_bytes[stat_type].increase(block->size);
stats.requested_bytes[stat_type].increase(block->requested_size);
});
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_allocations.increase(1);
auto allocated_bytes_gauge =
@ -1931,7 +1915,7 @@ class DeviceCachingAllocator {
block->pool->owner_MempoolId(),
context ? context : block->context_when_allocated);
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_allocations.decrease(1);
// If the block has been used on more than one stream, handle accordingly.
@ -1995,16 +1979,15 @@ class DeviceCachingAllocator {
while (base_block->prev) {
base_block = base_block->prev;
}
offset = static_cast<const char*>(block->ptr) -
static_cast<const char*>(base_block->ptr);
offset = (char*)block->ptr - (char*)base_block->ptr;
cudaIpcMemHandle_t handle;
C10_CUDA_CHECK(cudaIpcGetMemHandle(&handle, base_block->ptr));
ss.write(reinterpret_cast<const char*>(&handle), CUDA_IPC_HANDLE_SIZE);
ss.write((char*)&handle, CUDA_IPC_HANDLE_SIZE);
} else {
ss.put(SHAREABLE_CUDA_EXPANDABLE_SEGMENT);
auto full_range = block->expandable_segment_->share(
SegmentRange(block->ptr, block->size), ss);
offset = static_cast<const char*>(block->ptr) - full_range.ptr;
offset = (char*)block->ptr - full_range.ptr;
}
return ShareableHandle{offset, ss.str()};
}
@ -2505,8 +2488,7 @@ class DeviceCachingAllocator {
if (size < kMinBlockSize) {
return kMinBlockSize;
} else {
auto divisions =
AcceleratorAllocatorConfig::roundup_power2_divisions(size);
auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
if (divisions > 1 && size > (kMinBlockSize * divisions)) {
return roundup_power2_next_division(size, divisions);
} else {
@ -3000,7 +2982,7 @@ class DeviceCachingAllocator {
if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) {
return remaining >= kMinBlockSize;
} else {
return (size < AcceleratorAllocatorConfig::max_split_size()) &&
return (size < CUDAAllocatorConfig::max_split_size()) &&
(remaining > kSmallSize);
}
}
@ -3020,7 +3002,7 @@ class DeviceCachingAllocator {
if (C10_UNLIKELY(
set_fraction &&
AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
// Track block reuse interval only when garbage collection is enabled.
++pool.get_free_blocks_call_count;
}
@ -3062,13 +3044,13 @@ class DeviceCachingAllocator {
}
// Do not return an oversized block for a large request
if ((p.size() < AcceleratorAllocatorConfig::max_split_size()) &&
((*it)->size >= AcceleratorAllocatorConfig::max_split_size()))
if ((p.size() < CUDAAllocatorConfig::max_split_size()) &&
((*it)->size >= CUDAAllocatorConfig::max_split_size()))
return false;
// Allow oversized block size to be rounded up but within a limit
if ((p.size() >= AcceleratorAllocatorConfig::max_split_size()) &&
if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
((*it)->size >=
p.size() + AcceleratorAllocatorConfig::max_non_split_rounding_size()))
p.size() + CUDAAllocatorConfig::max_non_split_rounding_size()))
return false;
p.block = *it;
pool.blocks.erase(it);
@ -3091,7 +3073,7 @@ class DeviceCachingAllocator {
// therefore should be of less overheads.
size_t gc_threshold = static_cast<size_t>(
AcceleratorAllocatorConfig::garbage_collection_threshold() *
CUDAAllocatorConfig::garbage_collection_threshold() *
static_cast<double>(allowed_memory_maximum));
// No need to trigger GC yet
if (total_allocated_memory <= gc_threshold) {
@ -3234,13 +3216,12 @@ class DeviceCachingAllocator {
}
total_allocated_memory += size;
p.block = new Block(
p.device(), p.stream(), size, p.pool, static_cast<char*>(ptr));
p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr);
for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
stats.segment[stat_type].increase(1);
stats.reserved_bytes[stat_type].increase(size);
});
if (size >= AcceleratorAllocatorConfig::max_split_size())
if (size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_segments.increase(1);
auto reserved_bytes_gauge =
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
@ -3269,7 +3250,7 @@ class DeviceCachingAllocator {
bool release_available_cached_blocks(
const AllocParams& p,
const std::shared_ptr<GatheredContext>& context) {
if (AcceleratorAllocatorConfig::max_split_size() ==
if (CUDAAllocatorConfig::max_split_size() ==
std::numeric_limits<size_t>::max())
return false;
BlockPool& pool = *p.pool;
@ -3277,8 +3258,8 @@ class DeviceCachingAllocator {
// because of std::unique_ptr, block cannot be trivially copied
// Use constructor for search key.
Block key(p.search_key.device, p.search_key.stream, p.search_key.size);
key.size = (key.size < AcceleratorAllocatorConfig::max_split_size())
? AcceleratorAllocatorConfig::max_split_size()
key.size = (key.size < CUDAAllocatorConfig::max_split_size())
? CUDAAllocatorConfig::max_split_size()
: key.size;
auto it = pool.blocks.lower_bound(&key);
if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
@ -3291,7 +3272,7 @@ class DeviceCachingAllocator {
--it; // Back up one item. Now on the largest block for the correct
// stream
while ((totalReleased < key.size) &&
((*it)->size >= AcceleratorAllocatorConfig::max_split_size()) &&
((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
((*it)->stream == p.stream())) {
auto cur = it;
bool is_first = cur == pool.blocks.begin();
@ -3416,7 +3397,7 @@ class DeviceCachingAllocator {
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
.current);
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_segments.decrease(1);
pool->blocks.erase(block);
delete block;
@ -3701,8 +3682,7 @@ class DeviceCachingAllocator {
mempool_id,
getApproximateTime(),
record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr,
compile_string,
user_metadata);
compile_string);
// Callbacks should not include any Pytorch call
for (const auto& cb : trace_trackers_) {
@ -3757,7 +3737,6 @@ static void uncached_delete(void* ptr) {
static void local_raw_delete(void* ptr);
thread_local std::stack<std::string> DeviceCachingAllocator::compile_context;
thread_local std::string DeviceCachingAllocator::user_metadata;
#ifdef __cpp_lib_hardware_interference_size
using std::hardware_destructive_interference_size;
#else
@ -3783,7 +3762,7 @@ class NativeCachingAllocator : public CUDAAllocator {
allocated_blocks;
static size_t get_mutex_shard_id(void* ptr) {
return twang_mix64(reinterpret_cast<uintptr_t>(ptr)) % kNumMutexShard;
return twang_mix64((size_t)ptr) % kNumMutexShard;
}
void add_allocated_block(Block* block) {
@ -3820,8 +3799,8 @@ class NativeCachingAllocator : public CUDAAllocator {
if (size < device_count) {
device_allocator.resize(device_count);
for (const auto i : c10::irange(size, device_count)) {
device_allocator[i] = std::make_unique<DeviceCachingAllocator>(
static_cast<c10::DeviceIndex>(i));
device_allocator[i] =
std::make_unique<DeviceCachingAllocator>(c10::DeviceIndex(i));
}
}
}
@ -3955,18 +3934,6 @@ class NativeCachingAllocator : public CUDAAllocator {
device_allocator[device]->popCompileContext();
}
void setUserMetadata(const std::string& metadata) override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
device_allocator[device]->setUserMetadata(metadata);
}
std::string getUserMetadata() override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
return device_allocator[device]->getUserMetadata();
}
bool isHistoryEnabled() override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
@ -4067,8 +4034,8 @@ class NativeCachingAllocator : public CUDAAllocator {
auto& md = result.config_metadata;
md.garbage_collection_threshold =
AcceleratorAllocatorConfig::garbage_collection_threshold();
md.max_split_size = AcceleratorAllocatorConfig::max_split_size();
CUDAAllocatorConfig::garbage_collection_threshold();
md.max_split_size = CUDAAllocatorConfig::max_split_size();
md.pinned_num_register_threads =
CUDAAllocatorConfig::pinned_num_register_threads();
md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
@ -4076,12 +4043,11 @@ class NativeCachingAllocator : public CUDAAllocator {
CUDAAllocatorConfig::release_lock_on_cudamalloc();
md.pinned_use_host_register =
CUDAAllocatorConfig::pinned_use_cuda_host_register();
md.last_allocator_settings =
AcceleratorAllocatorConfig::last_allocator_settings();
md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
md.graph_capture_record_stream_reuse =
CUDAAllocatorConfig::graph_capture_record_stream_reuse();
md.roundup_power2_divisions =
AcceleratorAllocatorConfig::roundup_power2_divisions();
CUDAAllocatorConfig::roundup_power2_divisions();
return result;
}
@ -4350,7 +4316,7 @@ class NativeCachingAllocator : public CUDAAllocator {
// SHARABLE_CUDA_MALLOC
if (type == SHAREABLE_CUDA_MALLOC) {
cudaIpcMemHandle_t cuda_handle;
ss.read(reinterpret_cast<char*>(&cuda_handle), CUDA_IPC_HANDLE_SIZE);
ss.read((char*)&cuda_handle, CUDA_IPC_HANDLE_SIZE);
C10_CUDA_CHECK(cudaIpcOpenMemHandle(
&cuda_ipc_ptr_, cuda_handle, cudaIpcMemLazyEnablePeerAccess));
} else if (type == SHAREABLE_CUDA_EXPANDABLE_SEGMENT) {
@ -4459,12 +4425,11 @@ CUDAAllocator* allocator();
} // namespace CudaMallocAsync
struct BackendStaticInitializer {
// Parses the environment configuration for CUDA/ROCm allocator backend at
// load time. This duplicates some logic from CUDAAllocatorConfig to ensure
// lazy initialization without triggering global static constructors. The
// function looks for the key "backend" and returns the appropriate allocator
// instance based on its value. If no valid configuration is found, it falls
// back to the default Native allocator.
// Parses env for backend at load time, duplicating some logic from
// CUDAAllocatorConfig. CUDAAllocatorConfig double-checks it later (at
// runtime). Defers verbose exceptions and error checks, including Cuda
// version checks, to CUDAAllocatorConfig's runtime doublecheck. If this
// works, maybe we should move all of CUDAAllocatorConfig here?
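// Example (hypothetical): PYTORCH_CUDA_ALLOC_CONF="backend:cudaMallocAsync"
// makes this return CudaMallocAsync::allocator(); other keys in the string
// are skipped here and validated later at runtime.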
CUDAAllocator* parseEnvForBackend() {
auto val = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
#ifdef USE_ROCM
@ -4473,35 +4438,34 @@ struct BackendStaticInitializer {
val = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
}
#endif
if (!val.has_value()) {
val = c10::utils::get_env("PYTORCH_ALLOC_CONF");
}
if (val.has_value()) {
c10::CachingAllocator::ConfigTokenizer tokenizer(val.value());
for (size_t i = 0; i < tokenizer.size(); i++) {
const auto& key = tokenizer[i];
if (key == "backend") {
tokenizer.checkToken(++i, ":");
i++; // Move to the value after the colon
if (tokenizer[i] == "cudaMallocAsync"
const std::string& config = val.value();
std::regex exp("[\\s,]+");
std::sregex_token_iterator it(config.begin(), config.end(), exp, -1);
std::sregex_token_iterator end;
std::vector<std::string> options(it, end);
for (auto option : options) {
std::regex exp2("[:]+");
std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1);
std::sregex_token_iterator end2;
std::vector<std::string> kv(it2, end2);
if (kv.size() >= 2) {
if (kv[0] == "backend") {
#ifdef USE_ROCM
// convenience for ROCm users to allow either CUDA or HIP env var
|| tokenizer[i] == "hipMallocAsync"
// convenience for ROCm users to allow either CUDA or HIP env var
if (kv[1] == "cudaMallocAsync" || kv[1] == "hipMallocAsync")
#else
if (kv[1] == "cudaMallocAsync")
#endif
) {
return CudaMallocAsync::allocator();
return CudaMallocAsync::allocator();
if (kv[1] == "native")
return &Native::allocator;
}
break;
} else {
// Skip the key and its value
i = tokenizer.skipKey(i);
}
if (i + 1 < tokenizer.size()) {
tokenizer.checkToken(++i, ",");
}
}
}
// Default fallback allocator.
return &Native::allocator;
}

View File

@ -118,8 +118,7 @@ struct TraceEntry {
MempoolId_t mempool,
approx_time_t time,
std::shared_ptr<GatheredContext> context = nullptr,
std::string compile_context = "",
std::string user_metadata = "")
std::string compile_context = "")
: action_(action),
device_(device),
addr_(addr),
@ -127,8 +126,7 @@ struct TraceEntry {
stream_(stream),
size_(size),
mempool_(std::move(mempool)),
compile_context_(std::move(compile_context)),
user_metadata_(std::move(user_metadata)) {
compile_context_(std::move(compile_context)) {
time_.approx_t_ = time;
}
Action action_;
@ -140,7 +138,6 @@ struct TraceEntry {
MempoolId_t mempool_;
trace_time_ time_{};
std::string compile_context_;
std::string user_metadata_;
};
// Calls made by record_function will save annotations
@ -300,10 +297,6 @@ class CUDAAllocator : public DeviceAllocator {
const std::vector<std::pair<std::string, std::string>>& /*md*/) {}
virtual void pushCompileContext(std::string& md) {}
virtual void popCompileContext() {}
virtual void setUserMetadata(const std::string& metadata) {}
virtual std::string getUserMetadata() {
return "";
}
virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
// Attached AllocatorTraceTracker callbacks will be called while the
@ -543,14 +536,6 @@ inline void enablePeerAccess(
get()->enablePeerAccess(dev, dev_to_access);
}
inline void setUserMetadata(const std::string& metadata) {
get()->setUserMetadata(metadata);
}
inline std::string getUserMetadata() {
return get()->getUserMetadata();
}
} // namespace c10::cuda::CUDACachingAllocator
namespace c10::cuda {

View File

@ -1,6 +1,8 @@
#include <c10/cuda/CUDADeviceAssertionHost.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Backtrace.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
#include <c10/util/irange.h>
#include <cuda_runtime.h>

View File

@ -4,6 +4,7 @@
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/UniqueVoidPtr.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
#include <unordered_set>
#include <vector>
@ -46,7 +47,7 @@ bool operator==(const UsageStream& lhs, const UsageStream& rhs) {
struct UsageStreamHash {
size_t operator()(const UsageStream& us) const noexcept {
return std::hash<void*>{}(us.stream) + static_cast<size_t>(us.device);
return std::hash<void*>{}(us.stream) + size_t(us.device);
}
};

View File

@ -1,6 +1,7 @@
#include <c10/cuda/CUDAMiscFunctions.h>
#include <c10/util/env.h>
#include <cuda_runtime.h>
#include <cstring>
#include <string>
namespace c10::cuda {

View File

@ -128,7 +128,7 @@ std::ostream& operator<<(std::ostream& stream, StreamIdType s) {
} else if (s.isExt()) {
stream << "EXT";
} else {
stream << "PRIORITY " << static_cast<int>(s.getStreamType());
stream << "PRIORITY " << int(s.getStreamType());
}
return stream;
}

View File

@ -1,6 +1,7 @@
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/driver_api.h>
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/Logging.h>
#include <cuda_runtime.h>

View File

@ -328,21 +328,5 @@ struct pair {
T2 second;
};
#define INSTANTIATE_FOR_ALL_TYPES(MACRO) \
MACRO(float); \
MACRO(half); \
MACRO(bfloat); \
MACRO(float2); \
MACRO(long); \
MACRO(char); \
MACRO(uchar); \
MACRO(short); \
MACRO(int);
#define INSTANTIATE_FOR_FLOAT_TYPES(MACRO) \
MACRO(float); \
MACRO(half); \
MACRO(bfloat);
} // namespace metal
} // namespace c10

View File

@ -67,8 +67,8 @@ TEST(AllocatorConfigTest, allocator_config_test) {
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 2);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 2);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4);
// EXPECT_EQ(
// AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 1);
EXPECT_EQ(
@ -101,8 +101,8 @@ TEST(AllocatorConfigTest, allocator_config_test) {
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 1);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 0);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8);
// EXPECT_EQ(
// AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 2);

View File

@ -1,6 +1,7 @@
#include <c10/util/ApproximateClock.h>
#include <c10/util/Exception.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/irange.h>
#include <fmt/format.h>
namespace c10 {
@ -46,8 +47,7 @@ std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter::
for (const auto i : c10::irange(replicates)) {
auto delta_ns = end_times[i].t_ - start_times_[i].t_;
auto delta_approx = end_times[i].approx_t_ - start_times_[i].approx_t_;
scale_factors[i] =
static_cast<double>(delta_ns) / static_cast<double>(delta_approx);
scale_factors[i] = (double)delta_ns / (double)delta_approx;
}
std::sort(scale_factors.begin(), scale_factors.end());
long double scale_factor = scale_factors[replicates / 2 + 1];
@ -65,8 +65,7 @@ std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter::
for (const auto i : c10::irange(replicates)) {
auto dt = start_times_[i].t_ - t0;
auto dt_approx =
static_cast<double>(start_times_[i].approx_t_ - t0_approx) *
scale_factor;
(double)(start_times_[i].approx_t_ - t0_approx) * scale_factor;
t0_correction[i] = dt - (time_t)dt_approx; // NOLINT
}
t0 += t0_correction[t0_correction.size() / 2 + 1]; // NOLINT
@ -74,9 +73,7 @@ std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter::
return [=](approx_time_t t_approx) {
// See above for why this is more stable than `A * t_approx + B`.
return t_approx > t0_approx
? static_cast<time_t>(
static_cast<double>(t_approx - t0_approx) * scale_factor) +
t0
? (time_t)((double)(t_approx - t0_approx) * scale_factor) + t0
: 0;
};
}
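// In effect the returned closure computes (a summary, not new behavior):
//   t_unix(t_approx) ~= t0 + scale_factor * (t_approx - t0_approx)
// where scale_factor is the middle element of the sorted per-replicate
// delta_ns / delta_approx ratios gathered above.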

View File

@ -18,6 +18,7 @@
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/util/SmallVector.h>
#include <torch/headeronly/util/HeaderOnlyArrayRef.h>
#include <array>
#include <cstddef>
@ -40,200 +41,106 @@ namespace c10 {
///
/// This is intended to be trivially copyable, so it should be passed by
/// value.
///
/// NOTE: We have refactored out the headeronly parts of the ArrayRef struct
/// into HeaderOnlyArrayRef. As adding `virtual` would change the performance of
/// the underlying constexpr calls, we rely on apparent-type dispatch for
/// inheritance. This should be fine because their memory format is the same,
/// and it is never incorrect for ArrayRef to call HeaderOnlyArrayRef methods.
/// However, you should prefer to use ArrayRef when possible, because its use
/// of TORCH_CHECK will lead to better user-facing error messages.
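///
/// A minimal usage sketch (hypothetical, for illustration):
///   std::vector<int> v{1, 2, 3};
///   ArrayRef<int> a(v);           // vector ctor declared below (CTAD-friendly)
///   ArrayRef<int> b = a.slice(1); // bounds-checked with TORCH_CHECK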
template <typename T>
class ArrayRef final {
class ArrayRef final : public HeaderOnlyArrayRef<T> {
public:
using iterator = const T*;
using const_iterator = const T*;
using size_type = size_t;
using value_type = T;
using reverse_iterator = std::reverse_iterator<iterator>;
private:
/// The start of the array, in an external buffer.
const T* Data;
/// The number of elements.
size_type Length;
void debugCheckNullptrInvariant() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
Data != nullptr || Length == 0,
"created ArrayRef with nullptr and non-zero length! std::optional relies on this being illegal");
}
public:
/// @name Constructors
/// @name Constructors, all inherited from HeaderOnlyArrayRef except for
/// SmallVector.
/// @{
/// Construct an empty ArrayRef.
/* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {}
using HeaderOnlyArrayRef<T>::HeaderOnlyArrayRef;
/// Construct an ArrayRef from a single element.
// TODO Make this explicit
constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {}
/// Construct an ArrayRef from a pointer and length.
constexpr ArrayRef(const T* data, size_t length)
: Data(data), Length(length) {
debugCheckNullptrInvariant();
}
/// Construct an ArrayRef from a range.
constexpr ArrayRef(const T* begin, const T* end)
: Data(begin), Length(end - begin) {
debugCheckNullptrInvariant();
}
/// Construct an ArrayRef from a std::vector.
/// This constructor is identical to the one in HeaderOnlyArrayRef, but we
/// include it to help with Class Template Argument Deduction (CTAD).
/// Without it, CTAD can fail sometimes due to the indirect constructor
/// inheritance. So we explicitly include this constructor.
template <typename A>
/* implicit */ ArrayRef(const std::vector<T, A>& Vec)
: HeaderOnlyArrayRef<T>(Vec.data(), Vec.size()) {}
/// Construct an ArrayRef from a SmallVector. This is templated in order to
/// avoid instantiating SmallVectorTemplateCommon<T> whenever we
/// copy-construct an ArrayRef.
/// NOTE: this is the only constructor that is not inherited from
/// HeaderOnlyArrayRef.
template <typename U>
/* implicit */ ArrayRef(const SmallVectorTemplateCommon<T, U>& Vec)
: Data(Vec.data()), Length(Vec.size()) {
debugCheckNullptrInvariant();
}
template <
typename Container,
typename U = decltype(std::declval<Container>().data()),
typename = std::enable_if_t<
(std::is_same_v<U, T*> || std::is_same_v<U, T const*>)>>
/* implicit */ ArrayRef(const Container& container)
: Data(container.data()), Length(container.size()) {
debugCheckNullptrInvariant();
}
/// Construct an ArrayRef from a std::vector.
// The enable_if stuff here makes sure that this isn't used for
// std::vector<bool>, because ArrayRef can't work on a std::vector<bool>
// bitfield.
template <typename A>
/* implicit */ ArrayRef(const std::vector<T, A>& Vec)
: Data(Vec.data()), Length(Vec.size()) {
static_assert(
!std::is_same_v<T, bool>,
"ArrayRef<bool> cannot be constructed from a std::vector<bool> bitfield.");
}
/// Construct an ArrayRef from a std::array
template <size_t N>
/* implicit */ constexpr ArrayRef(const std::array<T, N>& Arr)
: Data(Arr.data()), Length(N) {}
/// Construct an ArrayRef from a C array.
template <size_t N>
// NOLINTNEXTLINE(*c-arrays*)
/* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {}
/// Construct an ArrayRef from a std::initializer_list.
/* implicit */ constexpr ArrayRef(const std::initializer_list<T>& Vec)
: Data(
std::begin(Vec) == std::end(Vec) ? static_cast<T*>(nullptr)
: std::begin(Vec)),
Length(Vec.size()) {}
: HeaderOnlyArrayRef<T>(Vec.data(), Vec.size()) {}
/// @}
/// @name Simple Operations
/// @name Simple Operations, mostly inherited from HeaderOnlyArrayRef
/// @{
constexpr iterator begin() const {
return Data;
}
constexpr iterator end() const {
return Data + Length;
}
// These are actually the same as iterator, since ArrayRef only
// gives you const iterators.
constexpr const_iterator cbegin() const {
return Data;
}
constexpr const_iterator cend() const {
return Data + Length;
}
constexpr reverse_iterator rbegin() const {
return reverse_iterator(end());
}
constexpr reverse_iterator rend() const {
return reverse_iterator(begin());
}
/// Check if all elements in the array satisfy the given expression
constexpr bool allMatch(const std::function<bool(const T&)>& pred) const {
return std::all_of(cbegin(), cend(), pred);
}
/// empty - Check if the array is empty.
constexpr bool empty() const {
return Length == 0;
}
constexpr const T* data() const {
return Data;
}
/// size - Get the array size.
constexpr size_t size() const {
return Length;
}
/// front - Get the first element.
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
/// STD_TORCH_CHECK
constexpr const T& front() const {
TORCH_CHECK(
!empty(), "ArrayRef: attempted to access front() of empty list");
return Data[0];
!this->empty(), "ArrayRef: attempted to access front() of empty list");
return this->Data[0];
}
/// back - Get the last element.
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
/// STD_TORCH_CHECK
constexpr const T& back() const {
TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list");
return Data[Length - 1];
}
/// equals - Check for element-wise equality.
constexpr bool equals(ArrayRef RHS) const {
return Length == RHS.Length && std::equal(begin(), end(), RHS.begin());
TORCH_CHECK(
!this->empty(), "ArrayRef: attempted to access back() of empty list");
return this->Data[this->Length - 1];
}
/// slice(n, m) - Take M elements of the array starting at element N
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
/// STD_TORCH_CHECK
constexpr ArrayRef<T> slice(size_t N, size_t M) const {
TORCH_CHECK(
N + M <= size(),
N + M <= this->size(),
"ArrayRef: invalid slice, N = ",
N,
"; M = ",
M,
"; size = ",
size());
return ArrayRef<T>(data() + N, M);
this->size());
return ArrayRef<T>(this->data() + N, M);
}
/// slice(n) - Chop off the first N elements of the array.
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
/// STD_TORCH_CHECK
constexpr ArrayRef<T> slice(size_t N) const {
TORCH_CHECK(
N <= size(), "ArrayRef: invalid slice, N = ", N, "; size = ", size());
return slice(N, size() - N);
N <= this->size(),
"ArrayRef: invalid slice, N = ",
N,
"; size = ",
this->size());
return slice(N, this->size() - N); // should this slice be this->slice?
}
/// @}
/// @name Operator Overloads
/// @{
constexpr const T& operator[](size_t Index) const {
return Data[Index];
}
/// Vector compatibility
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
/// STD_TORCH_CHECK
constexpr const T& at(size_t Index) const {
TORCH_CHECK(
Index < Length,
Index < this->Length,
"ArrayRef: invalid index Index = ",
Index,
"; Length = ",
Length);
return Data[Index];
this->Length);
return this->Data[Index];
}
/// Disallow accidental assignment from a temporary.
@ -253,13 +160,6 @@ class ArrayRef final {
std::enable_if_t<std::is_same_v<U, T>, ArrayRef<T>>& operator=(
std::initializer_list<U>) = delete;
/// @}
/// @name Expensive Operations
/// @{
std::vector<T> vec() const {
return std::vector<T>(Data, Data + Length);
}
/// @}
};

View File

@ -1,5 +1,7 @@
#include <c10/util/complex.h>
#include <cmath>
// Note [ Complex Square root in libc++]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// In libc++ complex square root is computed using polar form

View File

@ -132,15 +132,15 @@ std::ostream& operator<<(std::ostream& o, const uint128& b) {
int div_base_log = 0;
switch (flags & std::ios::basefield) {
case std::ios::hex:
div = static_cast<uint64_t>(0x1000000000000000u); // 16^15
div = (uint64_t)0x1000000000000000u; // 16^15
div_base_log = 15;
break;
case std::ios::oct:
div = static_cast<uint64_t>(01000000000000000000000u); // 8^21
div = (uint64_t)01000000000000000000000u; // 8^21
div_base_log = 21;
break;
default: // std::ios::dec
div = static_cast<uint64_t>(10000000000000000000u); // 10^19
div = (uint64_t)10000000000000000000u; // 10^19
div_base_log = 19;
break;
}

View File

@ -11,6 +11,7 @@
#include <unistd.h>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <cstdio>

View File

@ -74,7 +74,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
)
code.append(" " + OutType + "* op = &out[rangeIndex * block_size];")
for i in range(uf):
for i in range(0, uf):
j = 8 * i
code.append(" __m256 vop" + str(j) + " = _mm256_setzero_ps();")
@ -158,7 +158,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
"&input[idx_pref_T0 * fused_block_size];"
)
for i in range(uf):
for i in range(0, uf):
j = 8 * i
cachelinesize = 64
byteoffset = sizeof[InType] * j
@ -170,7 +170,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
code.append(" if (!normalize_by_lengths || length == 0) {")
else:
code.append(" if (!normalize_by_lengths || lengths[rangeIndex] == 0) {")
- for i in range(uf):
+ for i in range(0, uf):
j = 8 * i
code.append(" _mm256_storeu_ps(&op[" + str(j) + "], vop" + str(j) + ");")
code.append(" } else {")
@@ -181,7 +181,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
code.append(
" __m256 vlen_inv = _mm256_set1_ps(1.0f / lengths[rangeIndex]);"
)
- for i in range(uf):
+ for i in range(0, uf):
j = 8 * i
code.append(
" _mm256_storeu_ps(&op["

View File

@@ -224,12 +224,6 @@ AMD/ROCm/HIP
- Jithun Nair (`jithunnair-amd <https://github.com/jithunnair-amd>`__)
- (emeritus) Junjie Bai (`bddppq <https://github.com/bddppq>`__)
- XPU/Intel GPU
- ~~~~~~~~~~~~~
- - Eikan Wang (`EikanWang <https://github.com/EikanWang>`__)
- - Guangye Yu (`guangyey <https://github.com/guangyey>`__)
Build + CI
~~~~~~~~~~

View File

@@ -159,6 +159,8 @@ ignore = [
"EXE001",
"F405",
"FURB122", # writelines
+ # these ignores are from flake8-logging-format; please fix!
+ "G101",
# these ignores are from ruff NPY; please fix!
"NPY002",
# these ignores are from ruff PERF; please fix!
@@ -202,10 +204,14 @@ select = [
"NPY",
"PERF",
"PGH004",
"PIE",
"PIE790",
"PIE794",
"PIE800",
"PIE804",
"PIE807",
"PIE810",
"PLC0131", # type bivariance
"PLC0132", # type param mismatch
"PLC1802", # len({expression}) used as condition without comparison
"PLC0205", # string as __slots__
"PLC3002", # unnecessary-direct-lambda-call
"PLE",
@@ -213,7 +219,6 @@ select = [
"PLR0206", # property with params
"PLR1722", # use sys exit
"PLR1736", # unnecessary list index
"PLW0127", # Self-assignment of variable
"PLW0129", # assert on string literal
"PLW0131", # named expr without context
"PLW0133", # useless exception statement

View File

@@ -5,7 +5,6 @@ python-version = "3.12"
project-includes = [
"torch",
"caffe2",
"tools",
"test/test_bundled_images.py",
"test/test_bundled_inputs.py",
"test/test_complex.py",
@@ -23,15 +22,10 @@ project-includes = [
project-excludes = [
# ==== below will be enabled directory by directory ====
# ==== to test Pyrefly on a specific directory, simply comment it out ====
"torch/_inductor/runtime",
"torch/_inductor/codegen/triton.py",
"tools/linter/adapters/test_device_bias_linter.py",
"tools/code_analyzer/gen_operators_yaml.py",
"torch/_inductor/runtime/triton_heuristics.py",
"torch/_inductor/runtime/triton_helpers.py",
"torch/_inductor/runtime/halide_helpers.py",
# formatting issues, will turn on after adjusting where suppressions can be
# in import statements
"tools/flight_recorder/components/types.py",
"torch/linalg/__init__.py",
"torch/package/importer.py",
"torch/package/_package_pickler.py",
@@ -46,6 +40,17 @@ project-excludes = [
"torch/distributed/elastic/metrics/__init__.py",
"torch/_inductor/fx_passes/bucketing.py",
# ====
"benchmarks/instruction_counts/main.py",
"benchmarks/instruction_counts/definitions/setup.py",
"benchmarks/instruction_counts/applications/ci.py",
"benchmarks/instruction_counts/core/api.py",
"benchmarks/instruction_counts/core/expand.py",
"benchmarks/instruction_counts/core/types.py",
"benchmarks/instruction_counts/core/utils.py",
"benchmarks/instruction_counts/definitions/standard.py",
"benchmarks/instruction_counts/definitions/setup.py",
"benchmarks/instruction_counts/execution/runner.py",
"benchmarks/instruction_counts/execution/work.py",
"torch/include/**",
"torch/csrc/**",
"torch/distributed/elastic/agent/server/api.py",
@@ -132,4 +137,3 @@ errors.bad-param-name-override = false
errors.implicit-import = false
permissive-ignores = true
replace-imports-with-any = ["!sympy.printing.*", "sympy.*", "onnxscript.onnx_opset.*"]
- search-path = ["tools/experimental"]

View File

@@ -190,7 +190,7 @@ class TestActivationSparsifier(TestCase):
if features is None:
assert torch.all(mask * input_data == output)
else:
- for feature_idx in range(len(features)):
+ for feature_idx in range(0, len(features)):
feature = torch.Tensor(
[features[feature_idx]], device=input_data.device
).long()
@@ -378,7 +378,7 @@ class TestActivationSparsifier(TestCase):
# some dummy data
data_list = []
num_data_points = 5
- for _ in range(num_data_points):
+ for _ in range(0, num_data_points):
rand_data = torch.randn(16, 1, 28, 28)
activation_sparsifier.model(rand_data)
data_list.append(rand_data)

View File

@@ -143,7 +143,7 @@ class TestBaseDataScheduler(TestCase):
# checking step count
step_cnt = 5
- for _ in range(step_cnt):
+ for _ in range(0, step_cnt):
sparsifier.step()
scheduler.step()

View File

@@ -123,7 +123,7 @@ class _BaseDataSparsiferTestCase(TestCase):
step_count = 3
- for _ in range(step_count):
+ for _ in range(0, step_count):
sparsifier.step()
for some_data in all_data:
name, data, _ = self._get_name_data_config(some_data)

View File

@@ -472,8 +472,8 @@ class TestNearlyDiagonalSparsifier(TestCase):
else:
height, width = mask.shape
dist_to_diagonal = nearliness // 2
- for row in range(height):
- for col in range(width):
+ for row in range(0, height):
+ for col in range(0, width):
if abs(row - col) <= dist_to_diagonal:
assert mask[row, col] == 1
else:

View File

@@ -7,6 +7,7 @@ set(AOTI_ABI_CHECK_TEST_SRCS
${AOTI_ABI_CHECK_TEST_ROOT}/test_devicetype.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_dtype.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_exception.cpp
+ ${AOTI_ABI_CHECK_TEST_ROOT}/test_headeronlyarrayref.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_macros.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_math.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_rand.cpp

View File

@@ -0,0 +1,52 @@
#include <gtest/gtest.h>
#include <torch/headeronly/util/HeaderOnlyArrayRef.h>
#include <vector>
using torch::headeronly::HeaderOnlyArrayRef;
TEST(TestHeaderOnlyArrayRef, TestEmpty) {
HeaderOnlyArrayRef<float> arr;
ASSERT_TRUE(arr.empty());
}
TEST(TestHeaderOnlyArrayRef, TestSingleton) {
float val = 5.0f;
HeaderOnlyArrayRef<float> arr(val);
ASSERT_FALSE(arr.empty());
EXPECT_EQ(arr.size(), 1);
EXPECT_EQ(arr[0], val);
}
TEST(TestHeaderOnlyArrayRef, TestAPIs) {
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
HeaderOnlyArrayRef<int> arr(vec);
ASSERT_FALSE(arr.empty());
EXPECT_EQ(arr.size(), 7);
for (size_t i = 0; i < arr.size(); i++) {
EXPECT_EQ(arr[i], i + 1);
EXPECT_EQ(arr.at(i), i + 1);
}
EXPECT_EQ(arr.front(), 1);
EXPECT_EQ(arr.back(), 7);
ASSERT_TRUE(arr.slice(3, 4).equals(arr.slice(3)));
}
TEST(TestHeaderOnlyArrayRef, TestFromInitializerList) {
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
HeaderOnlyArrayRef<int> arr({1, 2, 3, 4, 5, 6, 7});
auto res_vec = arr.vec();
for (size_t i = 0; i < vec.size(); i++) {
EXPECT_EQ(vec[i], res_vec[i]);
}
}
TEST(TestHeaderOnlyArrayRef, TestFromRange) {
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
HeaderOnlyArrayRef<int> arr(vec.data() + 3, vec.data() + 7);
auto res_vec = arr.vec();
for (size_t i = 0; i < res_vec.size(); i++) {
EXPECT_EQ(vec[i + 3], res_vec[i]);
}
}

View File

@@ -53,3 +53,24 @@ TEST_FORALL(AT_FORALL_COMPLEX_TYPES, 2)
#undef DEFINE_CHECK
#undef TEST_FORALL
TEST(TestScalarType, toString) {
using torch::headeronly::ScalarType;
#define DEFINE_CHECK(_, name) EXPECT_EQ(toString(ScalarType::name), #name);
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CHECK);
#undef DEFINE_CHECK
}
TEST(TestScalarType, operator_left_shift) {
using torch::headeronly::ScalarType;
#define DEFINE_CHECK(_, name) \
{ \
std::stringstream ss; \
ss << ScalarType::name; \
EXPECT_EQ(ss.str(), #name); \
}
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CHECK);
#undef DEFINE_CHECK
}
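Both tests lean on the X-macro pattern: `AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS` applies `DEFINE_CHECK` to every (ctype, name) pair, and `#name` stringizes the enumerator. A self-contained sketch of the same pattern with a toy type list (not the real AT_FORALL macro):

```cpp
#include <cassert>
#include <cstring>

// Toy X-macro list: applies _(ctype, Name) once per supported type.
#define MY_FORALL_TYPES(_) \
  _(float, Float)          \
  _(double, Double)        \
  _(int, Int)

enum class Tag {
#define DEFINE_ENUM(_, name) name,
  MY_FORALL_TYPES(DEFINE_ENUM)
#undef DEFINE_ENUM
};

const char* toString(Tag t) {
  switch (t) {
#define DEFINE_CASE(_, name) \
  case Tag::name:            \
    return #name;  // stringized enumerator
    MY_FORALL_TYPES(DEFINE_CASE)
#undef DEFINE_CASE
  }
  return "Unknown";
}

int main() {
  // Same shape as the EXPECT_EQ(toString(...), #name) checks above.
#define DEFINE_CHECK(_, name) \
  assert(std::strcmp(toString(Tag::name), #name) == 0);
  MY_FORALL_TYPES(DEFINE_CHECK)
#undef DEFINE_CHECK
  return 0;
}
```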

View File

@@ -311,10 +311,9 @@ void boxed_fill_infinity(
}
Tensor my_pad(Tensor t) {
- std::vector<int64_t> padding = {1, 2, 2, 1};
std::string mode = "constant";
double value = 0.0;
- return pad(t, padding, mode, value);
+ return pad(t, {1, 2, 2, 1}, mode, value);
}
void boxed_my_pad(
@@ -342,6 +341,9 @@ void boxed_my_narrow(
}
Tensor my_new_empty_dtype_variant(Tensor t) {
+ // Still using a std::vector below even though people can just pass in an
+ // initializer list (which will be implicitly converted to an HeaderOnlyArrayRef)
+ // directly.
std::vector<int64_t> sizes = {2, 5};
auto dtype = std::make_optional(torch::headeronly::ScalarType::BFloat16);
return new_empty(t, sizes, dtype);
@@ -353,9 +355,8 @@ void boxed_my_new_empty_dtype_variant(StableIValue* stack, uint64_t num_args, ui
}
Tensor my_new_zeros_dtype_variant(Tensor t) {
- std::vector<int64_t> sizes = {2, 5};
auto dtype = std::make_optional(at::ScalarType::Float);
- return new_zeros(t, sizes, dtype);
+ return new_zeros(t, {2, 5}, dtype);
}
void boxed_my_new_zeros_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
@@ -429,8 +430,7 @@ void boxed_my_amax(StableIValue* stack, uint64_t num_args, uint64_t num_outputs)
}
Tensor my_amax_vec(Tensor t) {
- std::vector<int64_t> v = {0,1};
- return amax(t, v, false);
+ return amax(t, {0,1}, false);
}
void boxed_my_amax_vec(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
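The recurring simplification in this file works because a braced list binds directly to the ArrayRef-style parameter, so no named `std::vector` is needed at call sites. A toy sketch of how such a conversion works (illustrative `MiniArrayRef`, not the actual `torch::headeronly::HeaderOnlyArrayRef`):

```cpp
#include <cstddef>
#include <cstdio>
#include <initializer_list>

// Non-owning view; the initializer_list constructor is what lets
// callers write f({1, 2, 2, 1}) without a named vector.
template <typename T>
class MiniArrayRef {
 public:
  /*implicit*/ MiniArrayRef(std::initializer_list<T> il)
      : data_(il.begin()), len_(il.size()) {}
  const T* begin() const { return data_; }
  const T* end() const { return data_ + len_; }

 private:
  const T* data_;
  size_t len_;
};

long long sum(MiniArrayRef<long long> xs) {
  long long total = 0;
  for (long long x : xs) total += x;  // the list outlives the full call
  return total;
}

int main() {
  std::printf("%lld\n", sum({1, 2, 2, 1}));  // prints 6
}
```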

View File

@@ -1166,7 +1166,7 @@ class TestFullyShardPrefetch(FSDPTest):
loss = model(inp)
events.clear()
loss.sum().backward()
- expected_backward_events = [
+ expected_backward_events = expected_backward_events = [
("unshard", "norm, output", TrainingState.PRE_BACKWARD),
# root explicit prefetch layers.2
("unshard", "layers.2", TrainingState.PRE_BACKWARD),

View File

@@ -79,7 +79,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="gloo"
)
- group = list(range(self.world_size))
+ group = list(range(0, self.world_size))
group_id = dist.group.WORLD
self._test_all_gather(
group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.FP16
@@ -94,7 +94,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="gloo"
)
- group = list(range(self.world_size))
+ group = list(range(0, self.world_size))
group_id = dist.group.WORLD
self._test_all_gather(
group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.BFP16
@@ -111,7 +111,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="nccl"
)
- group = list(range(self.world_size))
+ group = list(range(0, self.world_size))
group_id = dist.new_group(range(self.world_size))
rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND)
self._test_all_to_all(
@@ -135,7 +135,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="nccl"
)
- group = list(range(self.world_size))
+ group = list(range(0, self.world_size))
group_id = dist.new_group(range(self.world_size))
rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND)
self._test_all_to_all(
@@ -158,7 +158,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="nccl"
)
- group = list(range(self.world_size))
+ group = list(range(0, self.world_size))
group_id = dist.new_group(range(self.world_size))
rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND)
self._test_all_to_all_single(
@@ -181,7 +181,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="nccl"
)
- group = list(range(self.world_size))
+ group = list(range(0, self.world_size))
group_id = dist.new_group(range(self.world_size))
rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND)
self._test_all_to_all_single(

View File

@@ -66,7 +66,7 @@ if TEST_WITH_DEV_DBG_ASAN:
def create_sharded_tensor(rank, world_size, shards_per_rank, shard_size=8):
shards_metadata = []
local_shards = []
- for idx in range(world_size * shards_per_rank):
+ for idx in range(0, world_size * shards_per_rank):
shard_rank = idx // shards_per_rank
shard_md = ShardMetadata(
shard_offsets=[idx * shard_size],

View File

@@ -45,7 +45,7 @@ if TEST_WITH_DEV_DBG_ASAN:
def create_sharded_tensor(rank, world_size, shards_per_rank):
shards_metadata = []
local_shards = []
- for idx in range(world_size * shards_per_rank):
+ for idx in range(0, world_size * shards_per_rank):
shard_rank = idx // shards_per_rank
shard_md = ShardMetadata(
shard_offsets=[idx * 8], shard_sizes=[8], placement=f"rank:{shard_rank}/cpu"

View File

@@ -633,7 +633,7 @@ class SimpleElasticAgentTest(unittest.TestCase):
worker_group = agent.get_worker_group()
num_restarts = 3
- for _ in range(num_restarts):
+ for _ in range(0, num_restarts):
agent._restart_workers(worker_group)
self.assertEqual(WorkerState.HEALTHY, worker_group.state)

View File

@@ -146,7 +146,7 @@ def echo_large(size: int) -> dict[int, str]:
returns a large output ({0: test0", 1: "test1", ..., (size-1):f"test{size-1}"})
"""
out = {}
- for idx in range(size):
+ for idx in range(0, size):
out[idx] = f"test{idx}"
return out

View File

@@ -191,7 +191,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64):
"""
client = timer.FileTimerClient(file_path)
sem.release()
- for _ in range(n):
+ for _ in range(0, n):
client.acquire("test_scope", 0)
time.sleep(interval)

View File

@@ -102,7 +102,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64):
world_size = 8
processes = []
- for i in range(world_size):
+ for i in range(0, world_size):
if i % 2 == 0:
p = spawn_ctx.Process(target=_stuck_function, args=(i, mp_queue))
else:
@@ -110,7 +110,7 @@
p.start()
processes.append(p)
- for i in range(world_size):
+ for i in range(0, world_size):
p = processes[i]
p.join()
if i % 2 == 0:

View File

@@ -127,7 +127,7 @@ if not INVALID_PLATFORMS:
interval seconds. Releases the given semaphore once before going to work.
"""
sem.release()
- for i in range(n):
+ for i in range(0, n):
mp_queue.put(TimerRequest(i, "test_scope", 0))
time.sleep(interval)

View File

@@ -15,7 +15,7 @@ class CyclingIteratorTest(unittest.TestCase):
def generator(self, epoch, stride, max_epochs):
# generate an continuously incrementing list each epoch
# e.g. [0,1,2] [3,4,5] [6,7,8] ...
- return iter([stride * epoch + i for i in range(stride)])
+ return iter([stride * epoch + i for i in range(0, stride)])
def test_cycling_iterator(self):
stride = 3
@@ -25,7 +25,7 @@
return self.generator(epoch, stride, max_epochs)
it = CyclingIterator(n=max_epochs, generator_fn=generator_fn)
- for i in range(stride * max_epochs):
+ for i in range(0, stride * max_epochs):
self.assertEqual(i, next(it))
with self.assertRaises(StopIteration):

Some files were not shown because too many files have changed in this diff.