Revert "[1/N] Remove unused loop variables (#166258)"

This reverts commit 76b2c37045e52540ec51e967aa7b6436a6b9b174. Reverted https://github.com/pytorch/pytorch/pull/166258 on behalf of https://github.com/atalman due to breaks test/distributed/test_serialization.py::TestSerialization::test_weights_only [GH job link](https://github.com/pytorch/pytorch/actions/runs/18894311802/job/53929321703) [HUD commit link](76b2c37045) ([comment](https://github.com/pytorch/pytorch/pull/166258#issuecomment-3460964612))
[pytree] add treespec_{leaf,tuple,dict} functions for args_spec modification (#160843)
2025-10-29 19:24:55 +08:00 · 2025-10-29 11:10:37 +00:00 · 2025-10-29 09:16:24 +00:00 · 2025-10-29 08:30:35 +00:00 · 2025-10-29 07:32:05 +00:00 · 2025-10-29 06:25:36 +00:00
491 changed files with 15997 additions and 5520 deletions
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -10,7 +10,7 @@ else
  arch_path='sbsa'
 fi

-NVSHMEM_VERSION=3.3.24
+NVSHMEM_VERSION=3.4.5

 function install_cuda {
  version=$1
@ -150,7 +150,7 @@ function install_130 {
  CUDNN_VERSION=9.13.0.50
  echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 13.0 in the same container
-  install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
+  install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  install_cudnn 13 $CUDNN_VERSION
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -100,6 +100,8 @@ COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
 COPY ci_commit_pins/timm.txt timm.txt
 COPY ci_commit_pins/torchbench.txt torchbench.txt
+# Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True
+ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS}
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -460,28 +460,18 @@ test_inductor_shard() {
    --verbose
 }

-test_inductor_aoti() {
-  # docker build uses bdist_wheel which does not work with test_aot_inductor
-  # TODO: need a faster way to build
+test_inductor_aoti_cpp() {
  if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
    # We need to hipify before building again
    python3 tools/amd_build/build_amd.py
  fi
  if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
-    BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
    # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
    TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
  else
-    BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
    TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
  fi

-  # aoti cmake custom command requires `torch` to be installed
-  # initialize the cmake build cache and install torch
-  /usr/bin/env "${BUILD_COMMAND[@]}"
-  # rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
-  /usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
-
  /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
 }

@ -1659,7 +1649,7 @@ test_operator_microbenchmark() {

  cd "${TEST_DIR}"/benchmarks/operator_benchmark

-  for OP_BENCHMARK_TESTS in matmul mm addmm bmm conv; do
+  for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
    $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
      --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
      --benchmark-name "PyTorch operator microbenchmark" --use-compile
@ -1776,7 +1766,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
  install_torchvision
  PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
  if [[ "$SHARD_NUMBER" -eq "1" ]]; then
-    test_inductor_aoti
+    test_inductor_aoti_cpp
  fi
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  install_torchvision
--- a/.ci/pytorch/windows/internal/install_python.bat
+++ b/.ci/pytorch/windows/internal/install_python.bat
@ -7,12 +7,9 @@ if "%DESIRED_PYTHON%" == "3.13t" (
    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
    set ADDITIONAL_OPTIONS="Include_freethreaded=1"
    set PYTHON_EXEC="python3.13t"
-) else if "%DESIRED_PYTHON%"=="3.14" (
-    echo Python version is set to 3.14 or 3.14t
-    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
 ) else if "%DESIRED_PYTHON%"=="3.14t" (
    echo Python version is set to 3.14 or 3.14t
-    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
+    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe"
    set ADDITIONAL_OPTIONS="Include_freethreaded=1"
    set PYTHON_EXEC="python3.14t"
 ) else (
--- a/.claude/skills/pytorch-docstring.md
+++ b/.claude/skills/pytorch-docstring.md
@ -1,3 +1,8 @@
+---
+name: docstring
+description: Write docstrings for PyTorch functions and methods following PyTorch conventions. Use when writing or updating docstrings in PyTorch code.
+---
+
 # PyTorch Docstring Writing Guide

 This skill describes how to write docstrings for functions and methods in the PyTorch project, following the conventions in `torch/_tensor_docs.py` and `torch/nn/functional.py`.
--- a/.claude/skills/skill-writer/SKILL.md
+++ b/.claude/skills/skill-writer/SKILL.md
@ -0,0 +1,385 @@
+---
+name: skill-writer
+description: Guide users through creating Agent Skills for Claude Code. Use when the user wants to create, write, author, or design a new Skill, or needs help with SKILL.md files, frontmatter, or skill structure.
+---
+
+# Skill Writer
+
+This Skill helps you create well-structured Agent Skills for Claude Code that follow best practices and validation requirements.
+
+## When to use this Skill
+
+Use this Skill when:
+- Creating a new Agent Skill
+- Writing or updating SKILL.md files
+- Designing skill structure and frontmatter
+- Troubleshooting skill discovery issues
+- Converting existing prompts or workflows into Skills
+
+## Instructions
+
+### Step 1: Determine Skill scope
+
+First, understand what the Skill should do:
+
+1. **Ask clarifying questions**:
+   - What specific capability should this Skill provide?
+   - When should Claude use this Skill?
+   - What tools or resources does it need?
+   - Is this for personal use or team sharing?
+
+2. **Keep it focused**: One Skill = one capability
+   - Good: "PDF form filling", "Excel data analysis"
+   - Too broad: "Document processing", "Data tools"
+
+### Step 2: Choose Skill location
+
+Determine where to create the Skill:
+
+**Personal Skills** (`~/.claude/skills/`):
+- Individual workflows and preferences
+- Experimental Skills
+- Personal productivity tools
+
+**Project Skills** (`.claude/skills/`):
+- Team workflows and conventions
+- Project-specific expertise
+- Shared utilities (committed to git)
+
+### Step 3: Create Skill structure
+
+Create the directory and files:
+
+```bash
+# Personal
+mkdir -p ~/.claude/skills/skill-name
+
+# Project
+mkdir -p .claude/skills/skill-name
+```
+
+For multi-file Skills:
+```
+skill-name/
+├── SKILL.md (required)
+├── reference.md (optional)
+├── examples.md (optional)
+├── scripts/
+│   └── helper.py (optional)
+└── templates/
+    └── template.txt (optional)
+```
+
+### Step 4: Write SKILL.md frontmatter
+
+Create YAML frontmatter with required fields:
+
+```yaml
+---
+name: skill-name
+description: Brief description of what this does and when to use it
+---
+```
+
+**Field requirements**:
+
+- **name**:
+  - Lowercase letters, numbers, hyphens only
+  - Max 64 characters
+  - Must match directory name
+  - Good: `pdf-processor`, `git-commit-helper`
+  - Bad: `PDF_Processor`, `Git Commits!`
+
+- **description**:
+  - Max 1024 characters
+  - Include BOTH what it does AND when to use it
+  - Use specific trigger words users would say
+  - Mention file types, operations, and context
+
+**Optional frontmatter fields**:
+
+- **allowed-tools**: Restrict tool access (comma-separated list)
+  ```yaml
+  allowed-tools: Read, Grep, Glob
+  ```
+  Use for:
+  - Read-only Skills
+  - Security-sensitive workflows
+  - Limited-scope operations
+
+### Step 5: Write effective descriptions
+
+The description is critical for Claude to discover your Skill.
+
+**Formula**: `[What it does] + [When to use it] + [Key triggers]`
+
+**Examples**:
+
+✅ **Good**:
+```yaml
+description: Extract text and tables from PDF files, fill forms, merge documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction.
+```
+
+✅ **Good**:
+```yaml
+description: Analyze Excel spreadsheets, create pivot tables, and generate charts. Use when working with Excel files, spreadsheets, or analyzing tabular data in .xlsx format.
+```
+
+❌ **Too vague**:
+```yaml
+description: Helps with documents
+description: For data analysis
+```
+
+**Tips**:
+- Include specific file extensions (.pdf, .xlsx, .json)
+- Mention common user phrases ("analyze", "extract", "generate")
+- List concrete operations (not generic verbs)
+- Add context clues ("Use when...", "For...")
+
+### Step 6: Structure the Skill content
+
+Use clear Markdown sections:
+
+```markdown
+# Skill Name
+
+Brief overview of what this Skill does.
+
+## Quick start
+
+Provide a simple example to get started immediately.
+
+## Instructions
+
+Step-by-step guidance for Claude:
+1. First step with clear action
+2. Second step with expected outcome
+3. Handle edge cases
+
+## Examples
+
+Show concrete usage examples with code or commands.
+
+## Best practices
+
+- Key conventions to follow
+- Common pitfalls to avoid
+- When to use vs. not use
+
+## Requirements
+
+List any dependencies or prerequisites:
+```bash
+pip install package-name
+```
+
+## Advanced usage
+
+For complex scenarios, see [reference.md](reference.md).
+```
+
+### Step 7: Add supporting files (optional)
+
+Create additional files for progressive disclosure:
+
+- **reference.md**: Detailed API docs, advanced options
+- **examples.md**: Extended examples and use cases
+- **scripts/**: Helper scripts and utilities
+- **templates/**: File templates or boilerplate
+
+Reference them from SKILL.md:
+```markdown
+For advanced usage, see [reference.md](reference.md).
+
+Run the helper script:
+\`\`\`bash
+python scripts/helper.py input.txt
+\`\`\`
+```
+
+### Step 8: Validate the Skill
+
+Check these requirements:
+
+✅ **File structure**:
+- [ ] SKILL.md exists in correct location
+- [ ] Directory name matches frontmatter `name`
+
+✅ **YAML frontmatter**:
+- [ ] Opening `---` on line 1
+- [ ] Closing `---` before content
+- [ ] Valid YAML (no tabs, correct indentation)
+- [ ] `name` follows naming rules
+- [ ] `description` is specific and < 1024 chars
+
+✅ **Content quality**:
+- [ ] Clear instructions for Claude
+- [ ] Concrete examples provided
+- [ ] Edge cases handled
+- [ ] Dependencies listed (if any)
+
+✅ **Testing**:
+- [ ] Description matches user questions
+- [ ] Skill activates on relevant queries
+- [ ] Instructions are clear and actionable
+
+### Step 9: Test the Skill
+
+1. **Restart Claude Code** (if running) to load the Skill
+
+2. **Ask relevant questions** that match the description:
+   ```
+   Can you help me extract text from this PDF?
+   ```
+
+3. **Verify activation**: Claude should use the Skill automatically
+
+4. **Check behavior**: Confirm Claude follows the instructions correctly
+
+### Step 10: Debug if needed
+
+If Claude doesn't use the Skill:
+
+1. **Make description more specific**:
+   - Add trigger words
+   - Include file types
+   - Mention common user phrases
+
+2. **Check file location**:
+   ```bash
+   ls ~/.claude/skills/skill-name/SKILL.md
+   ls .claude/skills/skill-name/SKILL.md
+   ```
+
+3. **Validate YAML**:
+   ```bash
+   cat SKILL.md | head -n 10
+   ```
+
+4. **Run debug mode**:
+   ```bash
+   claude --debug
+   ```
+
+## Common patterns
+
+### Read-only Skill
+
+```yaml
+---
+name: code-reader
+description: Read and analyze code without making changes. Use for code review, understanding codebases, or documentation.
+allowed-tools: Read, Grep, Glob
+---
+```
+
+### Script-based Skill
+
+```yaml
+---
+name: data-processor
+description: Process CSV and JSON data files with Python scripts. Use when analyzing data files or transforming datasets.
+---
+
+# Data Processor
+
+## Instructions
+
+1. Use the processing script:
+\`\`\`bash
+python scripts/process.py input.csv --output results.json
+\`\`\`
+
+2. Validate output with:
+\`\`\`bash
+python scripts/validate.py results.json
+\`\`\`
+```
+
+### Multi-file Skill with progressive disclosure
+
+```yaml
+---
+name: api-designer
+description: Design REST APIs following best practices. Use when creating API endpoints, designing routes, or planning API architecture.
+---
+
+# API Designer
+
+Quick start: See [examples.md](examples.md)
+
+Detailed reference: See [reference.md](reference.md)
+
+## Instructions
+
+1. Gather requirements
+2. Design endpoints (see examples.md)
+3. Document with OpenAPI spec
+4. Review against best practices (see reference.md)
+```
+
+## Best practices for Skill authors
+
+1. **One Skill, one purpose**: Don't create mega-Skills
+2. **Specific descriptions**: Include trigger words users will say
+3. **Clear instructions**: Write for Claude, not humans
+4. **Concrete examples**: Show real code, not pseudocode
+5. **List dependencies**: Mention required packages in description
+6. **Test with teammates**: Verify activation and clarity
+7. **Version your Skills**: Document changes in content
+8. **Use progressive disclosure**: Put advanced details in separate files
+
+## Validation checklist
+
+Before finalizing a Skill, verify:
+
+- [ ] Name uses only lowercase letters, numbers, and hyphens (max 64 chars)
+- [ ] Description is specific and < 1024 chars
+- [ ] Description includes "what" and "when"
+- [ ] YAML frontmatter is valid
+- [ ] Instructions are step-by-step
+- [ ] Examples are concrete and realistic
+- [ ] Dependencies are documented
+- [ ] File paths use forward slashes
+- [ ] Skill activates on relevant queries
+- [ ] Claude follows instructions correctly
+
+## Troubleshooting
+
+**Skill doesn't activate**:
+- Make description more specific with trigger words
+- Include file types and operations in description
+- Add "Use when..." clause with user phrases
+
+**Multiple Skills conflict**:
+- Make descriptions more distinct
+- Use different trigger words
+- Narrow the scope of each Skill
+
+**Skill has errors**:
+- Check YAML syntax (no tabs, proper indentation)
+- Verify file paths (use forward slashes)
+- Ensure scripts have execute permissions
+- List all dependencies
+
+## Examples
+
+See the documentation for complete examples:
+- Simple single-file Skill (commit-helper)
+- Skill with tool permissions (code-reviewer)
+- Multi-file Skill (pdf-processing)
+
+## Output format
+
+When creating a Skill, I will:
+
+1. Ask clarifying questions about scope and requirements
+2. Suggest a Skill name and location
+3. Create the SKILL.md file with proper frontmatter
+4. Include clear instructions and examples
+5. Add supporting files if needed
+6. Provide testing instructions
+7. Validate against all requirements
+
+The result will be a complete, working Skill that follows all best practices and validation rules.
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@ -1 +1 @@
-1752fe6809b74921644866275ab80244b96e80bc
+218d2ab791d437309f91e0486eb9fa7f00badc17
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -540,6 +540,26 @@
  - Lint
  - pull

+- name: PrivateUse1
+  patterns:
+  - torch/accelerator/**
+  - torch/utils/backend_registration.py
+  - torch/csrc/acc/**
+  - torch/csrc/DeviceAccelerator.*
+  - torch/csrc/profiler/standalone/privateuse1_observer.*
+  - aten/src/ATen/DeviceAccelerator.*
+  - aten/src/ATen/core/GeneratorForPrivateuseone.*
+  - aten/src/ATen/detail/PrivateUse1HooksInterface.*
+  - docs/source/accelerator/**
+  - test/cpp_extensions/open_registration_extension/torch_openreg/**
+  approved_by:
+  - albanD
+  - fffrog
+  mandatory_checks_name:
+  - EasyCLA
+  - Lint
+  - pull
+
 - name: superuser
  patterns:
  - '*'
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -22,7 +22,7 @@ CUDA_ARCHES_FULL_VERSION = {
    "12.6": "12.6.3",
    "12.8": "12.8.1",
    "12.9": "12.9.1",
-    "13.0": "13.0.0",
+    "13.0": "13.0.2",
 }
 CUDA_ARCHES_CUDNN_VERSION = {
    "12.6": "9",
@ -56,7 +56,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
@ -73,7 +73,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
@ -90,27 +90,27 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
        "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
        "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
    ),
    "13.0": (
-        "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
-        "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
-        "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
+        "nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | "
+        "nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | "
+        "nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | "
        "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
-        "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
-        "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
+        "nvidia-cublas==13.1.0.3; platform_system == 'Linux' | "
+        "nvidia-cufft==12.0.0.61; platform_system == 'Linux' | "
        "nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
-        "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
-        "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
+        "nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | "
+        "nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
-        "nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
-        "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
-        "nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
+        "nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | "
+        "nvidia-nvtx==13.0.85; platform_system == 'Linux' | "
+        "nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | "
+        "nvidia-cufile==1.15.1.6; platform_system == 'Linux'"
    ),
    "xpu": (
        "intel-cmplr-lib-rt==2025.2.1 | "
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -132,7 +132,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -178,7 +178,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -224,7 +224,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -270,7 +270,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -381,7 +381,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -427,7 +427,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -473,7 +473,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -519,7 +519,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -630,7 +630,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -676,7 +676,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -722,7 +722,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -768,7 +768,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -879,7 +879,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -925,7 +925,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -971,7 +971,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1017,7 +1017,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1128,7 +1128,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1174,7 +1174,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1220,7 +1220,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1266,7 +1266,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1377,7 +1377,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1423,7 +1423,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1469,7 +1469,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1515,7 +1515,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1626,7 +1626,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1672,7 +1672,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1718,7 +1718,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1764,7 +1764,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@ -127,7 +127,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_6-test:  # Testing
@ -193,7 +193,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_8-test:  # Testing
@ -259,7 +259,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_9-test:  # Testing
@ -325,7 +325,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda13_0-test:  # Testing
@ -793,7 +793,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_6-test:  # Testing
@ -859,7 +859,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_8-test:  # Testing
@ -925,7 +925,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_9-test:  # Testing
@ -991,7 +991,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda13_0-test:  # Testing
@ -1459,7 +1459,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_6-test:  # Testing
@ -1525,7 +1525,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_8-test:  # Testing
@ -1591,7 +1591,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_9-test:  # Testing
@ -1657,7 +1657,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda13_0-test:  # Testing
@ -2125,7 +2125,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_6-test:  # Testing
@ -2191,7 +2191,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_8-test:  # Testing
@ -2257,7 +2257,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_9-test:  # Testing
@ -2323,7 +2323,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda13_0-test:  # Testing
@ -2791,7 +2791,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_6-test:  # Testing
@ -2857,7 +2857,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_8-test:  # Testing
@ -2923,7 +2923,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_9-test:  # Testing
@ -2989,7 +2989,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda13_0-test:  # Testing
@ -3457,7 +3457,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda12_6-test:  # Testing
@ -3523,7 +3523,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda12_8-test:  # Testing
@ -3589,7 +3589,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda12_9-test:  # Testing
@ -3655,7 +3655,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda13_0-test:  # Testing
@ -4123,7 +4123,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda12_6-test:  # Testing
@ -4189,7 +4189,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda12_8-test:  # Testing
@ -4255,7 +4255,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda12_9-test:  # Testing
@ -4321,7 +4321,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda13_0-test:  # Testing
--- a/.github/workflows/xpu.yml
+++ b/.github/workflows/xpu.yml
@ -59,14 +59,18 @@ jobs:
      runner: linux.c7i.12xlarge
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
-          { config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
-          { config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
-          { config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
-          { config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
-          { config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
-          { config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
-          { config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
+          { config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
+          { config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" },
+          { config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" },
+          { config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" },
+          { config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" },
+          { config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" },
+          { config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" },
+          { config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" },
+          { config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
+          { config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
+          { config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
+          { config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
        ]}
    secrets: inherit

--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-![PyTorch Logo](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/pytorch-logo-dark.png)
+![PyTorch Logo](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/pytorch-logo-dark.png)

 --------------------------------------------------------------------------------

@ -72,7 +72,7 @@ Elaborating Further:

 If you use NumPy, then you have used Tensors (a.k.a. ndarray).

-![Tensor illustration](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/tensor_illustration.png)
+![Tensor illustration](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/tensor_illustration.png)

 PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the
 computation by a huge amount.
@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc
 While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
 You get the best of speed and flexibility for your crazy research.

-![Dynamic graph](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/dynamic_graph.gif)
+![Dynamic graph](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif)

 ### Python First

--- a/SECURITY.md
+++ b/SECURITY.md
@ -31,9 +31,9 @@ Be careful when running untrusted models. This classification includes models cr

 **Prefer to execute untrusted models within a secure, isolated environment such as a sandbox** (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. You can find further details and instructions in [this page](https://developers.google.com/code-sandboxing).

-**Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) with `weights_only=True` is also secure to our knowledge even though it offers significantly larger surface of attack. Loading un-trusted checkpoint with `weights_only=False` MUST never be done.
-
+**Be mindful of risky model formats**. Prefer sharing and loading weights in the format appropriate for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger attack surface but is more flexible in what it can serialize. See the `torch.load` documentation for more details.

+Even with more secure serialization formats, unexpected inputs to the downstream system can cause a variety of security issues (e.g., denial of service, out-of-bounds reads/writes), so we recommend extensive validation of any untrusted input.

 Important Note: The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.

--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -260,7 +260,7 @@ IF(USE_FBGEMM_GENAI)
  if(USE_CUDA)
    # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
    # If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
-    set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*")
+    set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped).*")
    file(GLOB_RECURSE fbgemm_genai_native_cuda_cu
      "${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu"
      "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
@ -291,6 +291,7 @@ IF(USE_FBGEMM_GENAI)

    set(fbgemm_genai_cuh
      "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
+      "${FBGEMM_GENAI_SRCS}/cutlass_extensions/f4f4bf16_grouped/"
      "${FBGEMM_GENAI_SRCS}/"
    )

--- a/aten/src/ATen/core/CachingHostAllocator.h
+++ b/aten/src/ATen/core/CachingHostAllocator.h
@ -677,8 +677,8 @@ struct CachingHostAllocatorImpl {
  // size. This allows us to quickly find a free block of the right size.
  // We use deque to store per size free list and guard the list with its own
  // mutex.
-  alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ =
-      std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
+  alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>>
+      free_list_{MAX_SIZE_INDEX};

  alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
  std::deque<std::pair<E, B*>> events_; // event queue paired with block
--- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
@ -21,12 +21,46 @@ inline void convertImpl(
  }
 }

+// Expands an array of bool into an array of `to_type`.
+// The source is read bytewise through a uint8_t pointer; any nonzero byte is
+// written as static_cast<to_type>(1) and a zero byte as static_cast<to_type>(0).
+//   src - input bools (n elements)
+//   dst - output buffer (n elements)
+//   n   - element count. NOTE(review): n is cast to uint64_t for the loop
+//         bound, so a negative value would wrap to a huge trip count --
+//         presumably callers never pass one; confirm.
+template <typename to_type>
+inline void convertFromBool(
+    const bool* __restrict src,
+    to_type* __restrict dst,
+    int64_t n) {
+  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dst[i] = srcPtr[i] != 0 ? static_cast<to_type>(1) : static_cast<to_type>(0);
+  }
+}
+
+// Collapses an array of `from_type` into an array of bool.
+// Each element is compared against static_cast<from_type>(0); the result is
+// stored as the byte 1 (element != 0) or 0 through a uint8_t view of `dst`.
+//   src - input values (n elements)
+//   dst - output bools (n elements), written bytewise
+//   n   - element count. NOTE(review): n is cast to uint64_t for the loop
+//         bound, so a negative value would wrap to a huge trip count --
+//         presumably callers never pass one; confirm.
+template <typename from_type>
+inline void convertToBool(
+    const from_type* __restrict src,
+    bool* __restrict dst,
+    int64_t n) {
+  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dstPtr[i] = src[i] != static_cast<from_type>(0) ? 1 : 0;
+  }
+}
+
 #define CONVERT_TEMPLATE(from_type, to_type)                           \
  template <>                                                          \
  inline void convert(const from_type* src, to_type* dst, int64_t n) { \
    return convertImpl<from_type, to_type>(src, dst, n);               \
  }

+// Defines convert(const bool*, to_type*, n) by forwarding to
+// convertFromBool<to_type>. Unlike CONVERT_TEMPLATE above, the generated
+// function has no `template <>` prefix, so it is a plain overload of convert
+// rather than an explicit specialization.
+#define CONVERT_FROM_BOOL_TEMPLATE(to_type)                       \
+  inline void convert(const bool* src, to_type* dst, int64_t n) { \
+    return convertFromBool<to_type>(src, dst, n);                 \
+  }
+
+// Defines convert(const from_type*, bool*, n) by forwarding to
+// convertToBool<from_type>. Also emitted as a plain overload (no
+// `template <>` prefix).
+#define CONVERT_TO_BOOL_TEMPLATE(from_type)                         \
+  inline void convert(const from_type* src, bool* dst, int64_t n) { \
+    return convertToBool<from_type>(src, dst, n);                   \
+  }
+
 CONVERT_TEMPLATE(uint8_t, uint8_t)
 CONVERT_TEMPLATE(uint8_t, int8_t)
 CONVERT_TEMPLATE(uint8_t, int16_t)
@ -34,6 +68,7 @@ CONVERT_TEMPLATE(uint8_t, int32_t)
 CONVERT_TEMPLATE(uint8_t, int64_t)
 CONVERT_TEMPLATE(uint8_t, float)
 CONVERT_TEMPLATE(uint8_t, double)
+CONVERT_TO_BOOL_TEMPLATE(uint8_t)
 CONVERT_TEMPLATE(int8_t, uint8_t)
 CONVERT_TEMPLATE(int8_t, int8_t)
 CONVERT_TEMPLATE(int8_t, int16_t)
@ -41,6 +76,7 @@ CONVERT_TEMPLATE(int8_t, int32_t)
 CONVERT_TEMPLATE(int8_t, int64_t)
 CONVERT_TEMPLATE(int8_t, float)
 CONVERT_TEMPLATE(int8_t, double)
+CONVERT_TO_BOOL_TEMPLATE(int8_t)
 CONVERT_TEMPLATE(int16_t, uint8_t)
 CONVERT_TEMPLATE(int16_t, int8_t)
 CONVERT_TEMPLATE(int16_t, int16_t)
@ -48,6 +84,7 @@ CONVERT_TEMPLATE(int16_t, int32_t)
 CONVERT_TEMPLATE(int16_t, int64_t)
 CONVERT_TEMPLATE(int16_t, float)
 CONVERT_TEMPLATE(int16_t, double)
+CONVERT_TO_BOOL_TEMPLATE(int16_t)
 CONVERT_TEMPLATE(int32_t, uint8_t)
 CONVERT_TEMPLATE(int32_t, int8_t)
 CONVERT_TEMPLATE(int32_t, int16_t)
@ -55,6 +92,7 @@ CONVERT_TEMPLATE(int32_t, int32_t)
 CONVERT_TEMPLATE(int32_t, int64_t)
 CONVERT_TEMPLATE(int32_t, float)
 CONVERT_TEMPLATE(int32_t, double)
+CONVERT_TO_BOOL_TEMPLATE(int32_t)
 CONVERT_TEMPLATE(int64_t, uint8_t)
 CONVERT_TEMPLATE(int64_t, int8_t)
 CONVERT_TEMPLATE(int64_t, int16_t)
@ -62,6 +100,7 @@ CONVERT_TEMPLATE(int64_t, int32_t)
 CONVERT_TEMPLATE(int64_t, int64_t)
 CONVERT_TEMPLATE(int64_t, float)
 CONVERT_TEMPLATE(int64_t, double)
+CONVERT_TO_BOOL_TEMPLATE(int64_t)
 CONVERT_TEMPLATE(float, uint8_t)
 CONVERT_TEMPLATE(float, int8_t)
 CONVERT_TEMPLATE(float, int16_t)
@ -69,6 +108,7 @@ CONVERT_TEMPLATE(float, int32_t)
 CONVERT_TEMPLATE(float, int64_t)
 CONVERT_TEMPLATE(float, float)
 CONVERT_TEMPLATE(float, double)
+CONVERT_TO_BOOL_TEMPLATE(float)
 CONVERT_TEMPLATE(double, uint8_t)
 CONVERT_TEMPLATE(double, int8_t)
 CONVERT_TEMPLATE(double, int16_t)
@ -76,22 +116,80 @@ CONVERT_TEMPLATE(double, int32_t)
 CONVERT_TEMPLATE(double, int64_t)
 CONVERT_TEMPLATE(double, float)
 CONVERT_TEMPLATE(double, double)
+CONVERT_TO_BOOL_TEMPLATE(double)
+CONVERT_FROM_BOOL_TEMPLATE(uint8_t)
+CONVERT_FROM_BOOL_TEMPLATE(int8_t)
+CONVERT_FROM_BOOL_TEMPLATE(int16_t)
+CONVERT_FROM_BOOL_TEMPLATE(int32_t)
+CONVERT_FROM_BOOL_TEMPLATE(int64_t)
+CONVERT_FROM_BOOL_TEMPLATE(float)
+CONVERT_FROM_BOOL_TEMPLATE(double)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-CONVERT_TEMPLATE(float16_t, uint8_t)
-CONVERT_TEMPLATE(float16_t, int8_t)
-CONVERT_TEMPLATE(float16_t, int16_t)
-CONVERT_TEMPLATE(float16_t, int32_t)
-CONVERT_TEMPLATE(float16_t, int64_t)
-CONVERT_TEMPLATE(float16_t, float16_t)
-CONVERT_TEMPLATE(float16_t, float)
-CONVERT_TEMPLATE(float16_t, double)
-CONVERT_TEMPLATE(uint8_t, float16_t)
-CONVERT_TEMPLATE(int8_t, float16_t)
-CONVERT_TEMPLATE(int16_t, float16_t)
-CONVERT_TEMPLATE(int32_t, float16_t)
-CONVERT_TEMPLATE(int64_t, float16_t)
-CONVERT_TEMPLATE(float, float16_t)
-CONVERT_TEMPLATE(double, float16_t)
+
+// Generates the convert() specialization taking at::Half input and producing
+// `to_type`. The at::Half buffer is reinterpreted as float16_t and the work is
+// delegated to convertImpl<float16_t, to_type>.
+// NOTE(review): the reinterpret_cast assumes at::Half and float16_t share the
+// same size and bit layout -- confirm.
+#define CONVERT_FROM_FP16_TEMPLATE(to_type)                            \
+  template <>                                                          \
+  inline void convert(const at::Half* src, to_type* dst, int64_t n) {  \
+    const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src); \
+    return convertImpl<float16_t, to_type>(srcPtr, dst, n);            \
+  }
+
+// Generates the convert() specialization taking `from_type` input and
+// producing at::Half. The at::Half output buffer is reinterpreted as float16_t
+// and the work is delegated to convertImpl<from_type, float16_t>.
+// NOTE(review): same layout assumption as CONVERT_FROM_FP16_TEMPLATE.
+#define CONVERT_TO_FP16_TEMPLATE(from_type)                             \
+  template <>                                                           \
+  inline void convert(const from_type* src, at::Half* dst, int64_t n) { \
+    float16_t* dstPtr = reinterpret_cast<float16_t*>(dst);              \
+    return convertImpl<from_type, float16_t>(src, dstPtr, n);           \
+  }
+
+CONVERT_FROM_FP16_TEMPLATE(uint8_t)
+CONVERT_FROM_FP16_TEMPLATE(int8_t)
+CONVERT_FROM_FP16_TEMPLATE(int16_t)
+CONVERT_FROM_FP16_TEMPLATE(int32_t)
+CONVERT_FROM_FP16_TEMPLATE(int64_t)
+CONVERT_FROM_FP16_TEMPLATE(float16_t)
+CONVERT_FROM_FP16_TEMPLATE(float)
+CONVERT_FROM_FP16_TEMPLATE(double)
+CONVERT_TO_FP16_TEMPLATE(uint8_t)
+CONVERT_TO_FP16_TEMPLATE(int8_t)
+CONVERT_TO_FP16_TEMPLATE(int16_t)
+CONVERT_TO_FP16_TEMPLATE(int32_t)
+CONVERT_TO_FP16_TEMPLATE(int64_t)
+CONVERT_TO_FP16_TEMPLATE(float)
+CONVERT_TO_FP16_TEMPLATE(double)
+
+// Converts n bools to half precision: the source is read bytewise and every
+// nonzero byte becomes 1.0, every zero byte becomes 0, written through a
+// float16_t view of the at::Half destination.
+// NOTE(review): assumes at::Half is layout-compatible with float16_t, and that
+// n is non-negative (it is cast to uint64_t for the loop bound) -- confirm.
+inline void convertBoolToFp16Impl(
+    const bool* __restrict src,
+    at::Half* __restrict dst,
+    int64_t n) {
+  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
+  float16_t* dstPtr = reinterpret_cast<float16_t*>(dst);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dstPtr[i] = srcPtr[i] != 0 ? 1.0 : 0;
+  }
+}
+
+// convert() specialization for bool -> at::Half; forwards to
+// convertBoolToFp16Impl.
+template <>
+inline void convert(const bool* src, at::Half* dst, int64_t n) {
+  return convertBoolToFp16Impl(src, dst, n);
+}
+
+// Converts n half-precision values to bool: each element is read through a
+// float16_t view of the at::Half source and compared against 0.0; the result
+// is stored as the byte 1 (nonzero) or 0 through a uint8_t view of `dst`.
+// NOTE(review): assumes at::Half is layout-compatible with float16_t, and that
+// n is non-negative (it is cast to uint64_t for the loop bound) -- confirm.
+inline void convertFp16ToBoolImpl(
+    const at::Half* __restrict src,
+    bool* __restrict dst,
+    int64_t n) {
+  const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src);
+  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dstPtr[i] = srcPtr[i] != 0.0 ? 1 : 0;
+  }
+}
+
+// convert() specialization for at::Half -> bool; forwards to
+// convertFp16ToBoolImpl.
+template <>
+inline void convert(const at::Half* src, bool* dst, int64_t n) {
+  return convertFp16ToBoolImpl(src, dst, n);
+}
+
 #endif
 #ifdef __ARM_FEATURE_BF16
 CONVERT_TEMPLATE(bfloat16_t, uint8_t)
@ -109,6 +207,44 @@ CONVERT_TEMPLATE(int32_t, bfloat16_t)
 CONVERT_TEMPLATE(int64_t, bfloat16_t)
 CONVERT_TEMPLATE(float, bfloat16_t)
 CONVERT_TEMPLATE(double, bfloat16_t)
+
+// Converts n bools to bfloat16 by writing raw 16-bit patterns: the source is
+// read bytewise and every nonzero byte stores kBf16One (0x3f80), every zero
+// byte stores 0, through a uint16_t view of the c10::BFloat16 destination.
+// NOTE(review): assumes c10::BFloat16 is a 16-bit type whose storage is the
+// raw bfloat16 bit pattern, and that n is non-negative (it is cast to
+// uint64_t for the loop bound) -- confirm.
+inline void convertBoolToBfloat16Impl(
+    const bool* __restrict src,
+    c10::BFloat16* __restrict dst,
+    int64_t n) {
+  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
+  uint16_t* dstPtr = reinterpret_cast<uint16_t*>(dst);
+  uint64_t len = static_cast<uint64_t>(n);
+  constexpr uint16_t kBf16One = 0x3f80; // 1.0 in bfloat16
+  for (uint64_t i = 0; i < len; i++) {
+    dstPtr[i] = srcPtr[i] != 0 ? kBf16One : 0;
+  }
+}
+
+// convert() specialization for bool -> c10::BFloat16; forwards to
+// convertBoolToBfloat16Impl.
+template <>
+inline void convert(const bool* src, c10::BFloat16* dst, int64_t n) {
+  return convertBoolToBfloat16Impl(src, dst, n);
+}
+
+// Converts n bfloat16 values to bool by inspecting raw 16-bit patterns read
+// through a uint16_t view of the source. The sign bit is masked off (0x7fff),
+// so both 0x0000 and 0x8000 map to false; every other bit pattern maps to
+// true. Results are stored as the byte 0 or 1 through a uint8_t view of `dst`.
+// NOTE(review): assumes c10::BFloat16 is a 16-bit type whose storage is the
+// raw bfloat16 bit pattern, and that n is non-negative (it is cast to
+// uint64_t for the loop bound) -- confirm.
+inline void convertBfloat16ToBoolImpl(
+    const c10::BFloat16* __restrict src,
+    bool* __restrict dst,
+    int64_t n) {
+  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
+  const uint16_t* srcPtr = reinterpret_cast<const uint16_t*>(src);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    // Check if all non-sign bits are 0
+    bool isBf16Zero = (srcPtr[i] & 0x7fff) == 0;
+    dstPtr[i] = isBf16Zero ? 0 : 1;
+  }
+}
+
+// convert() specialization for c10::BFloat16 -> bool; forwards to
+// convertBfloat16ToBoolImpl.
+template <>
+inline void convert(const c10::BFloat16* src, bool* dst, int64_t n) {
+  return convertBfloat16ToBoolImpl(src, dst, n);
+}
+
 #endif

 #endif
--- a/aten/src/ATen/cuda/CUDAEvent.h
+++ b/aten/src/ATen/cuda/CUDAEvent.h
@ -2,10 +2,10 @@

 #include <ATen/cuda/ATenCUDAGeneral.h>
 #include <ATen/cuda/CUDAContext.h>
-#include <c10/core/impl/GPUTrace.h>
-#include <c10/cuda/CUDAStream.h>
-#include <c10/cuda/CUDAGuard.h>
 #include <ATen/cuda/Exceptions.h>
+#include <c10/core/impl/GPUTrace.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAStream.h>
 #include <c10/util/Exception.h>

 #include <cuda_runtime_api.h>
@ -246,4 +246,79 @@ private:
  }
 };

+// EventPool - Thread-safe pool of CUDA events to avoid expensive cudaEventCreate
+// calls. cudaEventCreate when concurrently invoked from multiple threads can be
+// very expensive (especially on certain device/driver combinations).
+//
+// CUDAEventPtr is a unique_ptr with a type-erased deleter: depending on how the
+// event was obtained, the deleter either returns it to a per-device pool or
+// simply deletes it (see EventPool::get).
+using CUDAEventPtr =
+    std::unique_ptr<CUDAEvent, std::function<void(CUDAEvent*)>>;
+
+class EventPool {
+ public:
+  // Creates one (initially empty) pool per device; the number of pools is
+  // fixed at construction from at::cuda::device_count().
+  EventPool() : pools_(at::cuda::device_count()) {}
+
+  // Hands out an event for `device`. A pooled event is reused when one is
+  // available; otherwise a new CUDAEvent is created with
+  // cudaEventDisableTiming. When the returned pointer is destroyed, the event
+  // goes back into that device's pool instead of being freed.
+  //
+  // The pooling deleter captures the per-device pool by reference, so a
+  // returned CUDAEventPtr must not outlive this EventPool.
+  CUDAEventPtr get(const DeviceIndex device) {
+    // If the device is invalid, return a default event and no pooling
+    if (device < 0 || device >= (DeviceIndex)pools_.size()) {
+      auto deleter = [](CUDAEvent* event) {
+        delete event;
+      };
+      return CUDAEventPtr(
+        std::make_unique<CUDAEvent>(cudaEventDisableTiming).release(), deleter);
+    }
+
+    auto& pool = pools_[device];
+
+    // Create a destructor that returns the event to the appropriate device pool
+    auto destructor = [&pool](CUDAEvent* event) noexcept {
+      if (event != nullptr) {
+        std::lock_guard<std::mutex> lock(pool.mutex_);
+        pool.event_pool_.emplace_back(event);
+      }
+    };
+
+    // Try to reuse a pooled event; the lock is held only for the pop.
+    {
+      std::lock_guard<std::mutex> lock(pool.mutex_);
+      if (!pool.event_pool_.empty()) {
+        auto event = std::move(pool.event_pool_.back());
+        pool.event_pool_.pop_back();
+        return CUDAEventPtr(event.release(), destructor);
+      }
+    }
+
+    // Pool was empty: create a fresh event outside the lock.
+    return CUDAEventPtr(
+        std::make_unique<CUDAEvent>(cudaEventDisableTiming).release(),
+        destructor);
+  }
+
+  // Drops every pooled event on every device. Events currently handed out via
+  // get() are unaffected and will re-enter the pool when released.
+  void empty_cache() {
+    for (auto& pool : pools_) {
+      std::lock_guard<std::mutex> lock(pool.mutex_);
+      pool.event_pool_.clear();
+    }
+  }
+
+  // Pre-populates each device's pool: for every device, obtains `num_events`
+  // events via get(), records each one, and then lets them all return to the
+  // pool when the temporary vector goes out of scope.
+  void init_num_events(const size_t num_events) {
+    for (DeviceIndex device_idx = 0; device_idx < at::cuda::device_count(); ++device_idx) {
+        CUDAGuard device_guard(device_idx);
+        std::vector<CUDAEventPtr> temp_events;
+        temp_events.reserve(num_events);
+        for (size_t i = 0; i < num_events; ++i) {
+          auto event = get(device_idx);
+          // Record the event to ensure it's properly initialized
+          event->record();
+          temp_events.emplace_back(std::move(event));
+        }
+        // Events will be returned to pool when temp_events is destroyed
+    }
+  }
+
+ private:
+  // Per-device free list guarded by its own mutex.
+  // NOTE(review): alignas(64) presumably keeps each pool's mutex on its own
+  // cache line to avoid false sharing between devices -- confirm intent.
+  struct alignas(64) PerDevicePool {
+    alignas(64) std::mutex mutex_;
+    std::vector<std::unique_ptr<CUDAEvent>> event_pool_;
+  };
+
+  // Indexed by DeviceIndex; sized once in the constructor.
+  std::vector<PerDevicePool> pools_;
+};
+
 } // namespace at::cuda
--- a/aten/src/ATen/cuda/CUDAGreenContext.cpp
+++ b/aten/src/ATen/cuda/CUDAGreenContext.cpp
@ -1,78 +1,88 @@
 #include <ATen/cuda/CUDAGreenContext.h>

-namespace at::cuda {
-  GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
-#if CUDA_HAS_GREEN_CONTEXT
-    int driver_version;
-    C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
-    TORCH_CHECK(
-        driver_version >= 12080, "cuda driver too old to use green context!");
-    CUcontext pctx = nullptr;
-    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
-    if (C10_UNLIKELY(!pctx)) {
-      TORCH_WARN(
-          "Attempted to create a green context but"
-          " there was no primary context! Creating a primary context...");
-
-      cudaFree(0);
-    }
-
-    CUdevice device;
-    device_id_ = device_id;
-    C10_CUDA_DRIVER_CHECK(
-        c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
-
-    // Get device resources
-    CUdevResource device_resource;
-    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
-        device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
-
-    // Split resources
-    std::vector<CUdevResource> result(1);
-    auto result_data = result.data();
-    unsigned int nb_groups = 1;
-    CUdevResource remaining;
-
-    C10_CUDA_DRIVER_CHECK(
-        c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
-            result_data,
-            &nb_groups,
-            &device_resource,
-            &remaining,
-            0, // default flags
-            num_sms));
-
-    TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
-
-    // Generate resource descriptor
-    CUdevResourceDesc desc;
-    C10_CUDA_DRIVER_CHECK(
-        c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
-            &desc, result_data, 1));
-
-    // Create green context
-    // CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
-    // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
-    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
-        &green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
-
-    // Convert to regular context
-    C10_CUDA_DRIVER_CHECK(
-        c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
-    TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
+#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
+#include <c10/cuda/driver_api.h>
+#include <stdexcept>
+#include <vector>
+#define HAS_CUDA_GREEN_CONTEXT() 1
 #else
-    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
+#define HAS_CUDA_GREEN_CONTEXT() 0
+#endif
+
+namespace at::cuda {
+
+// Builds a green context on `device_id` that is restricted to a group of
+// `num_sms` SMs.
+//
+// Steps: verify the driver is at least 12.8, make sure a primary context
+// exists (creating one if necessary), query the device's SM resource, split
+// off a single group of `num_sms` SMs, generate a resource descriptor for it,
+// create the green context, and finally obtain the regular CUcontext view of
+// that green context.
+//
+// Fails via TORCH_CHECK / C10_CUDA_CHECK / C10_CUDA_DRIVER_CHECK when the
+// driver is too old, any CUDA call fails, or the split does not yield exactly
+// one group. In builds without green-context support it always fails.
+GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
+#if HAS_CUDA_GREEN_CONTEXT()
+  int driver_version = 0;
+  C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
+  TORCH_CHECK(
+      driver_version >= 12080, "cuda driver too old to use green context!");
+  CUcontext pctx = nullptr;
+  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
+  if (C10_UNLIKELY(!pctx)) {
+    TORCH_WARN(
+        "Attempted to create a green context but"
+        " there was no primary context! Creating a primary context...");
+
+    // A no-op free forces the runtime to create the primary context. Check
+    // the result so a failed initialization surfaces here rather than as a
+    // confusing driver error further down.
+    C10_CUDA_CHECK(cudaFree(nullptr));
+  }
+
+  CUdevice device;
+  device_id_ = device_id;
+  C10_CUDA_DRIVER_CHECK(
+      c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
+
+  // Get device resources
+  CUdevResource device_resource;
+  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
+      device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
+
+  // Split resources
+  std::vector<CUdevResource> result(1);
+  auto result_data = result.data();
+  unsigned int nb_groups = 1;
+  CUdevResource remaining;
+
+  C10_CUDA_DRIVER_CHECK(
+      c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
+          result_data,
+          &nb_groups,
+          &device_resource,
+          &remaining,
+          0, // default flags
+          num_sms));
+
+  TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
+
+  // Generate resource descriptor
+  CUdevResourceDesc desc;
+  C10_CUDA_DRIVER_CHECK(
+      c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
+          &desc, result_data, 1));
+
+  // Create green context
+  // CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
+  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
+      &green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
+
+  // Convert to regular context
+  C10_CUDA_DRIVER_CHECK(
+      c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
+  TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
+#else
+  TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
 #endif
  }

  std::unique_ptr<GreenContext> GreenContext::create(
      uint32_t num_sms,
      std::optional<uint32_t> device_id) {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    if (!device_id.has_value()) {
      device_id = at::cuda::current_device();
    }
-    return std::make_unique<GreenContext>(device_id.value(), num_sms);
+    return std::unique_ptr<GreenContext>(new GreenContext(device_id.value(), num_sms));
 #else
    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
 #endif
@ -80,7 +90,7 @@ namespace at::cuda {

  // Implement move operations
  GreenContext::GreenContext(GreenContext&& other) noexcept{
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    device_id_ = std::exchange(other.device_id_, -1);
    green_ctx_ = std::exchange(other.green_ctx_, nullptr);
    context_ = std::exchange(other.context_, nullptr);
@ -91,7 +101,7 @@ namespace at::cuda {
  }

  GreenContext& GreenContext::operator=(GreenContext&& other) noexcept{
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    if (this != &other) {
      // Clean up current resources
      if (green_ctx_) {
@ -120,7 +130,7 @@ namespace at::cuda {
  }

  GreenContext::~GreenContext() noexcept{
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    C10_CUDA_DRIVER_CHECK(
        c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));
 #else
@ -128,25 +138,9 @@ namespace at::cuda {
 #endif
  }

-  // Get the underlying CUDA context
-  CUcontext GreenContext::getContext() const {
-#if CUDA_HAS_GREEN_CONTEXT
-    return context_;
-#else
-    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
-#endif
-  }
-
-  // Get the underlying green context
-#if CUDA_HAS_GREEN_CONTEXT
-  CUgreenCtx GreenContext::getGreenContext() const {
-    return green_ctx_;
-  }
-#endif
-
  // Make this context current
  void GreenContext::setContext() {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    auto current_stream = c10::cuda::getCurrentCUDAStream();
    parent_stream_ = current_stream.stream();

@ -175,7 +169,7 @@ namespace at::cuda {
  }

  void GreenContext::popContext() {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    // see above note about stream being hardcoded to the default stream
    at::cuda::CUDAEvent ev;
    ev.record(c10::cuda::getCurrentCUDAStream());
--- a/aten/src/ATen/cuda/CUDAGreenContext.h
+++ b/aten/src/ATen/cuda/CUDAGreenContext.h
@ -1,53 +1,38 @@
 #pragma once
 #include <ATen/cuda/CUDAEvent.h>
-
-#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
-#include <c10/cuda/driver_api.h>
 #include <cuda.h>
-#include <memory>
-#include <stdexcept>
-#include <vector>
-#define CUDA_HAS_GREEN_CONTEXT 1
-#else
-#define CUDA_HAS_GREEN_CONTEXT 0
-#endif
+
+// Forward declare green context as opaque ptr
+typedef struct CUgreenCtx_st* CUgreenCtx;

 namespace at::cuda {

 class TORCH_CUDA_CPP_API GreenContext {
 public:
-  GreenContext(uint32_t device_id, uint32_t num_sms);
-
-  static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id);
+  // Green context creation
+  static std::unique_ptr<GreenContext> create(
+      uint32_t num_sms,
+      std::optional<uint32_t> device_id);
+  ~GreenContext() noexcept;

  // Delete copy constructor and assignment
  GreenContext(const GreenContext&) = delete;
  GreenContext& operator=(const GreenContext&) = delete;

-  // Implement move operations
-  GreenContext(GreenContext&& other) noexcept;
-  GreenContext& operator=(GreenContext&& other) noexcept;
-  ~GreenContext() noexcept;
-
-  // Get the underlying CUDA context
-  CUcontext getContext() const;
-
-  // Get the underlying green context
-#if CUDA_HAS_GREEN_CONTEXT
-  CUgreenCtx getGreenContext() const;
-#endif
-
  // Make this context current
  void setContext();

  void popContext();

 private:
-#if CUDA_HAS_GREEN_CONTEXT
+  GreenContext(uint32_t device_id, uint32_t num_sms);
+  // Implement move operations
+  GreenContext(GreenContext&& other) noexcept;
+  GreenContext& operator=(GreenContext&& other) noexcept;
+
  int32_t device_id_ = -1;
  CUgreenCtx green_ctx_ = nullptr;
  CUcontext context_ = nullptr;
  cudaStream_t parent_stream_ = nullptr;
-#endif
 };
 } // namespace at::cuda
--- a/aten/src/ATen/cuda/tunable/Tunable.cpp
+++ b/aten/src/ATen/cuda/tunable/Tunable.cpp
@ -580,7 +580,7 @@ std::ofstream& TuningContext::GetUntunedFile(){
      filename.append(device);
    }

-    untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::trunc);
+    untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::app);
  }
  return untuned_file_;
 }
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -689,6 +689,10 @@ static void check_shape_forward(const at::Tensor& input,
             ", but got bias of size ", at::symint::sizes<T>(bias), " instead");

    for (const auto i : c10::irange(2, k)) {
+      // T could be int64_t or SymInt; std::numeric_limits<SymInt> is specialized in c10/core/SymInt.h
+      TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
+                  "Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
+                  (std::numeric_limits<T>::max() / 2));
      input_shape.push_back(at::symint::size<T>(input, i) + 2 * padding[i-2]);
      // log new kernel size considering dilation
      kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1);
@ -715,6 +719,11 @@ static void check_shape_forward(const at::Tensor& input,
               "Kernel size: (", kernel_ss.str(), "). Kernel size can't be greater than actual input size");
    }
  } else { // transposed
+    for (const auto i : c10::irange(2, k)) {
+      TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
+                  "Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
+                  (std::numeric_limits<T>::max() / 2));
+    }
    TORCH_CHECK(at::symint::size<T>(input, 1) == weight_sizes[0],
             "Given transposed=", transposed, ", weight of size ", weight_sizes,
             ", expected input", at::symint::sizes<T>(input), " to have ", weight_sizes[0],
--- a/aten/src/ATen/native/ConvolutionTBC.cpp
+++ b/aten/src/ATen/native/ConvolutionTBC.cpp
@ -52,8 +52,7 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in
  for (const auto k : c10::irange(kw)) {
    int iShift = std::max(0, static_cast<int>(k - real_pad));
    int oShift = std::max(0, static_cast<int>(real_pad - k));
-    // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-    int t = std::min(ilen + real_pad - k, olen) - oShift;
+    long t = std::min(ilen + real_pad - k, olen) - oShift;
    // Note: gemm assumes column-major matrices
    // input    is l*m (row-major)
    // weight   is m*r (row-major)
--- a/aten/src/ATen/native/IndexingUtils.cpp
+++ b/aten/src/ATen/native/IndexingUtils.cpp
@ -16,8 +16,7 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
  auto linearId = elements - 1;

  // NOTE: Assumes all strides are positive, which is true for now
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  for (int i = t.dim() - 1; i >= 0; --i) {
+  for (auto i = t.dim() - 1; i >= 0; --i) {
    auto curDimIndex = linearId % t.sym_size(i);
    auto curDimOffset = curDimIndex * t.sym_stride(i);
    offset += curDimOffset;
--- a/aten/src/ATen/native/QuantizedLinear.cpp
+++ b/aten/src/ATen/native/QuantizedLinear.cpp
@ -68,7 +68,6 @@ Tensor fbgemm_linear_int8_weight_fp32_activation(
  const float* input_ptr = input_contig.const_data_ptr<float>();

  TORCH_CHECK(input.dim() >= 2);
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
  const int64_t K = input.size(input.dim() - 1);
  TORCH_CHECK(weight.dim() == 2);
--- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp
@ -160,10 +160,9 @@ struct Dist {
    // value of k.
    parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) {
      const Vec pvec(p);
-      double n2 = n - .5;
+      double n2 = static_cast<double>(n) - .5;
      // The -1 accounts for floating point truncation issues
-      // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-      int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
+      int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2.0 * static_cast<double>(k) - 1.0)));
      int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;

      const scalar_t * self_i = self_start + i * m;
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
--- a/aten/src/ATen/native/cuda/Copy.cu
+++ b/aten/src/ATen/native/cuda/Copy.cu
@ -1,11 +1,11 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
-#include <ATen/core/Tensor.h>
 #include <ATen/Context.h>
 #include <ATen/Dispatch.h>
 #include <ATen/Dispatch_v2.h>
-#include <ATen/cuda/CachingHostAllocator.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAEvent.h>
+#include <ATen/cuda/CachingHostAllocator.h>
 #include <ATen/cuda/PeerToPeerAccess.h>
 #include <ATen/native/Copy.h>
 #include <ATen/native/TensorIterator.h>
@ -27,6 +27,24 @@

 namespace at::native {

+namespace {
+
+// Initial pool size for CUDA events per device.
+constexpr size_t kInitialEventPoolSize = 8;
+
+// Returns an event for `device_idx` obtained from a shared at::cuda::EventPool.
+//
+// The pool is a function-local static created on the first call and
+// pre-populated through init_num_events(kInitialEventPoolSize). It is
+// allocated with `new` and never deleted in this function.
+// NOTE(review): the leak looks deliberate (avoids destroying CUDA events
+// during static teardown) - confirm.
+at::cuda::CUDAEventPtr getEventFromPool(const at::DeviceIndex device_idx) {
+  static auto* event_pool = []() {
+    auto* pool = new at::cuda::EventPool();
+    // Pre-populate the pool with events to avoid stalls in creating events
+    pool->init_num_events(kInitialEventPoolSize);
+    return pool;
+  }();
+
+  return event_pool->get(device_idx);
+}
+
+} // namespace
+
 void neg_kernel_cuda(TensorIteratorBase &iter);
 void conj_kernel_cuda(TensorIteratorBase &iter);

@ -263,12 +281,14 @@ void copy_device_to_device(TensorIterator& iter,
    // write-after-read dependencies on the destination side are handled, so
    // that no one is operating on the dst memory when we perform the copy.
    // src waits on dst barrier (src already waits on src)
-    CUDAEvent dst_ready;
+
+    // Use event pool for better performance instead of creating new events
+    auto dst_ready = getEventFromPool(dst_device.index());
    device_guard.set_device(dst_device);
-    dst_ready.record(getCurrentCUDAStream(dst_device.index()));
+    dst_ready->record(getCurrentCUDAStream(dst_device.index()));

    device_guard.set_device(src_device);
-    dst_ready.block(copy_stream);
+    dst_ready->block(copy_stream);
  }

  if (memcpy_eligible) {
@ -307,11 +327,11 @@ void copy_device_to_device(TensorIterator& iter,
    // operate on dst's copy until the copy is complete.

    // Still on src_device, record stream event
-    CUDAEvent src_ready;
-    src_ready.record(copy_stream);
+    auto src_ready = getEventFromPool(src_device.index());
+    src_ready->record(copy_stream);

    device_guard.set_device(dst_device);
-    src_ready.block(getCurrentCUDAStream(dst_device.index()));
+    src_ready->block(getCurrentCUDAStream(dst_device.index()));
  }

  AT_CUDA_CHECK(cudaGetLastError());
--- a/aten/src/ATen/native/cuda/GroupedBlas.cpp
+++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp
@ -22,6 +22,9 @@
 #include <ATen/native/cuda/RowwiseScaledMM.h>
 #include <ATen/native/cuda/ScaledGroupMM.h>
 #include <ATen/native/cuda/GroupMM.h>
+#ifdef USE_ROCM
+#include <ATen/native/hip/ck_group_gemm.h>
+#endif
 #include <ATen/ceil_div.h>

 #ifdef USE_FBGEMM_GENAI
@ -208,6 +211,48 @@ _f8_f8_bf16_rowwise_grouped_mm(
 #endif
 }

+// NVFP4 x NVFP4 -> BF16 grouped GEMM backed by FBGEMM GenAI.
+//
+// Validates the operand and scale dtypes, then forwards to
+// fbgemm_gpu::f4f4bf16_grouped_mm, passing the product of the two global
+// scales as one combined global scale. The kernel path is only compiled for
+// non-ROCm builds with USE_FBGEMM_GENAI; every other build fails through
+// TORCH_CHECK_NOT_IMPLEMENTED.
+//
+// Args:
+//   mat_a, mat_b:                   must be Float4_e2m1fn_x2.
+//   scale_a, scale_b:               block scales, must be Float8_e4m3fn.
+//   global_scale_a, global_scale_b: must be Float.
+//   offs:                           group offsets; required.
+//   bias:                           accepted for signature parity but not
+//                                   forwarded to the kernel.
+//   out:                            output tensor handed to the kernel.
+// Returns: `out`.
+Tensor&
+_f4_f4_bf16_grouped_mm_fbgemm(
+      const Tensor& mat_a,
+      const Tensor& mat_b,
+      const Tensor& scale_a,
+      const Tensor& global_scale_a,
+      const Tensor& scale_b,
+      const Tensor& global_scale_b,
+      const std::optional<Tensor>& offs,
+      const std::optional<Tensor>& bias,
+      Tensor& out) {
+#if !defined(USE_ROCM) && defined(USE_FBGEMM_GENAI)
+  // Typing checks
+  TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2,
+      "mat_a must be Float4_e2m1fn_x2, got: ", mat_a.scalar_type());
+  TORCH_CHECK_VALUE(mat_b.scalar_type() == at::kFloat4_e2m1fn_x2,
+      "mat_b must be Float4_e2m1fn_x2, got: ", mat_b.scalar_type());
+  TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,
+      "scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());
+  TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,
+      "scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());
+  TORCH_CHECK_VALUE(global_scale_a.scalar_type() == at::kFloat,
+      "global_scale_a must be Float, got: ", global_scale_a.scalar_type());
+  TORCH_CHECK_VALUE(global_scale_b.scalar_type() == at::kFloat,
+      "global_scale_b must be Float, got: ", global_scale_b.scalar_type());
+  // Fail with a descriptive error instead of std::bad_optional_access below.
+  TORCH_CHECK_VALUE(offs.has_value(),
+      "offs must be provided for nvfp4 grouped gemm");
+
+  auto o = fbgemm_gpu::f4f4bf16_grouped_mm(
+      mat_a,
+      mat_b,
+      scale_a,
+      scale_b,
+      offs.value(),
+      out,
+      // The kernel takes one global scale: the product of both operands' scales.
+      global_scale_a.mul(global_scale_b)
+  );
+#else
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "nvfp4 grouped gemm is not supported without USE_FBGEMM_GENAI, and only for CUDA")
+#endif
+
+  return out;
+}
+
 void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
  // Checks scales for 2d or 3d target tensors (`mat`).
  if (mat.dim() == 2) {
@ -245,7 +290,15 @@ void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int
  }
 }

-void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
+void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
+  // if {mx,nv}fp4, will need to modify K later
+  bool is_fp4 = (mat.scalar_type() == kFloat4_e2m1fn_x2);
+  int blocksize = 32;
+  // check for nvfp4 vs. mxfp4 to fix blocksize
+  if (is_fp4 && scale.scalar_type() == kFloat8_e4m3fn) {
+    blocksize = 16;
+  }
+
  // Checks scales for 2d or 3d target tensors (`mat`).
  if (mat.dim() == 2) {
    // For MXFP8, 2d tensors have variable size groups represented as subtensors,
@ -253,17 +306,19 @@ void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim,
    // so we can't check the scale sizes without doing a d2h sync to get the group sizes here.
    TORCH_CHECK(
      scale.dim() == mat.dim(),
-      "for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx);
+      "for block-scaled, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(),
+      " and scale.dim() = ", scale.dim(), " for arg ", arg_idx
+    );

-    // LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4))
-    // RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4))
+    // LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/blocksize, 4))
+    // RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/blocksize, 4))
    //   * weight is transposed prior to the call, scale stays non-transposed.
    bool LHS = arg_idx == 0;
    int scale_dim_to_check = 0;
    int mat_dim_to_check = LHS ? 0 : 1;
    TORCH_CHECK(
        scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check),
-        "for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
+        "for block-scaled, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
        "must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")");
  } else {
    // For MXFP8, 3d tensors have static group sizes (stack of 2d tensors),
@ -273,32 +328,40 @@ void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim,
    };

    // TODO: this is for 3d tensor in 2d-3d case specifically.
-    // We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them.
+    // We'll need to support 3d-3d and 3d-2d cases once mxfp8/nvfp4 grouped gemm supports them.
    int64_t G = mat.size(0);
    int64_t K = mat.size(1);
+    if (is_fp4) {
+      // FP4 packs 2 values into a single 8b word - the "real" K is 2x the
+      // reported K. Reverse that adjustment.
+      const int fp4_elems_per_byte = 2;
+      K *= fp4_elems_per_byte;
+    }
    int64_t N = mat.size(2);
-    int64_t blocked_scale_K = round_up(K/32, 4);
+    int64_t blocked_scale_K = round_up(K/blocksize, 4);
    int64_t blocked_scale_N = round_up(N, 128);

    // fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N).
    TORCH_CHECK(
      scale.dim() == mat.dim() - 1,
-      "for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx
+      "for block-scaled 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N),",
+      "but scale is ", scale.dim(), "D for arg ", arg_idx
    );
    TORCH_CHECK(
      scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N,
-      "for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx
+      "for block-scaled grouped GEMM, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ")",
+      " for arg ", arg_idx, ", got: ", scale.size(0), ", ", scale.size(1)
    );
  }
 }

 void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
  bool using_fp8_rowwise = scale.scalar_type() == kFloat;
-  bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu;
+  bool using_mx = scale.scalar_type() == at::kFloat8_e8m0fnu;
  if (using_fp8_rowwise) {
    _check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier);
-  } else if (using_mxfp8) {
-    _check_scales_mxfp8(mat, scale, dim, arg_idx);
+  } else if (using_mx) {
+    _check_scales_blocked(mat, scale, dim, arg_idx);
  } else {
    TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype());
  }
@ -411,9 +474,10 @@ namespace {

 using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;

-std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2> scale_grouped_kernel_dispatch = {{
+std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 3> scale_grouped_kernel_dispatch = {{
  { "rowwise_rowwise", scaled_blas::check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},
-  { "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}};
+  { "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8},
+  { "nvfp4_nvfp4", scaled_blas::check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4}}};

 } // anonymous namespace

@ -525,8 +589,9 @@ _scaled_grouped_mm_cuda_v2(
          out);
    }
    case ScaledGemmImplementation::MXFP8_MXFP8: {
-      _check_scales_mxfp8(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
-      _check_scales_mxfp8(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
+      // scale shape checks
+      _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
+      _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
      return _mx8_mx8_bf16_grouped_mm_fbgemm(
          mat_a,
          mat_b,
@ -537,6 +602,21 @@ _scaled_grouped_mm_cuda_v2(
          offs.value(),
          out);
    }
+    case ScaledGemmImplementation::NVFP4_NVFP4: {
+      // scale shape checks
+      _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
+      _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
+      return _f4_f4_bf16_grouped_mm_fbgemm(
+          mat_a,
+          mat_b,
+          scale_a[0], /* block-scale A */
+          scale_a[1], /* global-scale A */
+          scale_b[0], /* block-scale B */
+          scale_b[1], /* global-scale B */
+          offs.value(),
+          std::nullopt, /* bias */
+          out);
+    }
    default:
      TORCH_CHECK_NOT_IMPLEMENTED(false,
          "_scaled_grouped_mm_cuda_v2 is in an inconsistent state - should never reach here");
@ -559,12 +639,19 @@ std::optional<c10::ScalarType> out_dtype) {
  // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used.
  // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm
  bool use_fast_path = false;
+  if (at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950"})) {
+    use_fast_path = true;
+  }
 #endif
  const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
  Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
  if (use_fast_path) {
    // fast path, no d2h sync needed
+#ifndef USE_ROCM
    at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out);
+#else
+    at::hip::detail::group_gemm_ck(mat_a, mat_b, offs, bias, out);
+#endif
  } else {
    _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
  }
--- a/aten/src/ATen/native/cuda/ScaledBlas.cpp
+++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp
--- a/aten/src/ATen/native/cuda/UnaryGeometricTanKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricTanKernel.cu
@ -12,14 +12,15 @@

 namespace at::native {

-#if AT_USE_JITERATOR()
+#if 0 && AT_USE_JITERATOR()
 constexpr char tan_name[] = "tan_impl";
 #endif

 void tan_kernel_cuda(TensorIteratorBase& iter) {
  auto common_dtype = iter.common_dtype();
  if (at::isComplexType(common_dtype)) {
-#if AT_USE_JITERATOR()
+    // Disabled due to accuracy issues
+#if 0 && AT_USE_JITERATOR()
    static const auto tan_string = jiterator_stringify(
        template <typename T> T tan_impl(T a) { return std::tan(a); });
    AT_DISPATCH_COMPLEX_TYPES_AND(
--- a/aten/src/ATen/native/cuda/UnaryGeometricTanhKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricTanhKernel.cu
@ -12,14 +12,15 @@

 namespace at::native {

-#if AT_USE_JITERATOR()
+#if 0 && AT_USE_JITERATOR()
 constexpr char tanh_name[] = "tanh_impl";
 #endif

 void tanh_kernel_cuda(TensorIteratorBase& iter) {
  auto common_dtype = iter.common_dtype();
  if (at::isComplexType(common_dtype)) {
-#if AT_USE_JITERATOR()
+    // Disabled due to accuracy issues
+#if 0 && AT_USE_JITERATOR()
    static const auto tanh_string = jiterator_stringify(
        template <typename T> T tanh_impl(T a) { return std::tanh(a); });
    AT_DISPATCH_COMPLEX_TYPES_AND(
--- a/aten/src/ATen/native/cuda/cuBlasCommonArgs.h
+++ b/aten/src/ATen/native/cuda/cuBlasCommonArgs.h
@ -0,0 +1,171 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+
+namespace at::native {
+
+using at::blas::ScalingType;
+using at::blas::SwizzleType;
+
+namespace {
+
+// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492
+// Returns `tensor` with its conjugation materialized when requested.
+//
+// If `resolve_conj` is true and the tensor is flagged as conjugated, an owned
+// result of tensor.resolve_conj() is returned; otherwise the input is
+// borrowed unchanged. A borrowed result refers to `tensor`, so the caller
+// must keep `tensor` alive while using it.
+c10::MaybeOwned<Tensor> inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) {
+  if (resolve_conj && tensor.is_conj()) {
+    return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj());
+  } else {
+    return c10::MaybeOwned<Tensor>::borrowed(tensor);
+  }
+}
+
+// Picks a layout for an input operand and reports it via `transpose_tensor`.
+//
+// `transpose_tensor` is an out-parameter:
+//   - dense, non-overlapping tensor: set to tensor.is_contiguous();
+//   - strides[0] == 1 with a large enough strides[1]: set to false;
+//   - strides[1] == 1 with a large enough strides[0]: set to true;
+//   - anything else: the tensor is cloned to contiguous memory and the flag
+//     is set to true.
+// `transpose_result` selects in which of those cases a pending conjugation
+// is resolved (see the resolve_conj_if_indicated calls below).
+// NOTE(review): strides/sizes are indexed at [0] and [1], so a 2-D tensor is
+// assumed - confirm at call sites.
+c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) {
+  if (tensor.is_non_overlapping_and_dense()) { // common case
+      transpose_tensor = tensor.is_contiguous();
+      return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor);
+  }
+  IntArrayRef tensor_strides = tensor.strides();
+  IntArrayRef tensor_sizes = tensor.sizes();
+  if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
+    transpose_tensor = false;
+    return resolve_conj_if_indicated(tensor, !transpose_result);
+  } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
+    transpose_tensor = true;
+    return resolve_conj_if_indicated(tensor, transpose_result);
+  } else {
+    // No usable unit-stride dimension: fall back to a contiguous copy.
+    transpose_tensor = true;
+    return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
+  }
+}
+
+// Two-argument overload, used by cublasCommonArgs for the result matrix.
+//
+// Applies the same stride classification as the three-argument overload and
+// writes the outcome to `transpose_tensor`, but always asks for conjugation
+// to be resolved (resolve_conj == true) whenever the tensor is not cloned.
+// NOTE(review): strides/sizes are indexed at [0] and [1], so a 2-D tensor is
+// assumed - confirm at call sites.
+c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) {
+  if (tensor.is_non_overlapping_and_dense()) { // common case
+      transpose_tensor = tensor.is_contiguous();
+      return resolve_conj_if_indicated(tensor, true);
+  }
+
+  IntArrayRef tensor_strides = tensor.strides();
+  IntArrayRef tensor_sizes = tensor.sizes();
+  if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
+    transpose_tensor = false;
+    return resolve_conj_if_indicated(tensor, true);
+  } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
+    transpose_tensor = true;
+    return resolve_conj_if_indicated(tensor, true);
+  } else {
+    // No usable unit-stride dimension: fall back to a contiguous copy.
+    transpose_tensor = true;
+    return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
+  }
+}
+
+} // namespace
+
+/**
+ * @brief Prepares matrices for a cuBLAS operation
+ *
+ * This constructor prepares tensors for cuBLAS.
+ * The main difference is that PyTorch uses row-major as the default and
+ * cuBLAS expects column-major.
+ *
+ * @details
+ * To enable row-major output while using cuBLAS,
+ * we use the mathematical identity that (A × B)^T = B^T × A^T.
+ *
+ * Transpose in this context refers to cuBLAS's (Fortran) definition of transpose (row-major)
+ * T = row-major, N = col-major
+ *
+ * Example:
+ * For matrices A (M×K)(row-major) and B (K×N)(row-major):
+ *   - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major)
+ *   - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N)
+ *   - However, since the output from cuBLAS is column-major, this is
+ *     equivalent to an output of size M×N row-major, as expected
+ *
+ * The transpose flags are derived from the layouts of the passed in tensors
+ *
+ * If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted
+ * to their unpacked values to match what cuBLAS expects.
+ *
+ * Scale pointers, dtypes and scaling types for the operands are recorded only
+ * when BOTH `scale_a` and `scale_b` are provided; they are swapped together
+ * with the operands when the result is computed in transposed form.
+ *
+ * @param mat1 First input matrix
+ * @param mat2 Second input matrix
+ * @param c Output matrix (result)
+ * @param scale_a Optional scaling factor for first matrix
+ * @param scale_b Optional scaling factor for second matrix
+ * @param scale_result Optional scaling factor for result
+ * @param scaling_choice_a Optional scaling type associated with `scale_a`
+ * @param scaling_choice_b Optional scaling type associated with `scale_b`
+ */
+struct cublasCommonArgs {
+  cublasCommonArgs(
+      const Tensor& mat1,
+      const Tensor& mat2,
+      Tensor& c,
+      const std::optional<Tensor>& scale_a = std::nullopt,
+      const std::optional<Tensor>& scale_b = std::nullopt,
+      const std::optional<Tensor>& scale_result = std::nullopt,
+      const std::optional<ScalingType>& scaling_choice_a = std::nullopt,
+      const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
+    bool transpose_result = false, transpose_a = false, transpose_b = false;
+    // The result layout is decided first; it determines whether the operands
+    // are swapped (mat2 becomes `mata`, mat1 becomes `matb`).
+    result = prepare_matrix_for_cublas(c, transpose_result);
+    mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
+    matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
+
+    // Handle scale tensors if provided
+    if (scale_a && scale_b) {
+      // By default since we return in row-major we run the gemm
+      // as B.T @ A.T, check transpose_result to determine if we flip the scales
+      scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
+      scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
+      scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
+      scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
+      scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
+      scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
+    }
+
+    if (scale_result) {
+      scale_result_ptr = scale_result->data_ptr();
+      scale_result_dtype = scale_result->scalar_type();
+    }
+
+    // Update transpose flags
+    if (transpose_result) {
+      transpose_a = !transpose_a;
+      transpose_b = !transpose_b;
+    }
+
+    auto sizes_a = mata->sizes();
+    auto sizes_b = matb->sizes();
+
+    // Problem sizes and leading dimensions, read from the (possibly swapped) operands.
+    m = sizes_a[transpose_result ? 1 : 0];
+    k = sizes_a[transpose_result ? 0 : 1];
+    n = sizes_b[transpose_result ? 0 : 1];
+    lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
+    ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
+    result_ld = result->stride(transpose_result ? 0 : 1);
+    // 'c' = conjugate transpose, 't' = transpose, 'n' = no transpose.
+    transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
+    transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
+
+    // cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
+    // if the gemm operands are in packed float4
+    if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) {
+      k = k * 2;
+      lda = lda * 2;
+      ldb = ldb * 2;
+    }
+  }
+
+  // Matrix members
+  char transa, transb;
+  int64_t m, n, k;
+  int64_t lda, ldb, result_ld;
+  c10::MaybeOwned<Tensor> mata, matb, result;
+
+  // Scale members
+  // The operand scale fields stay null / empty unless both scale_a and
+  // scale_b were passed to the constructor.
+  void* scale_mata_ptr = nullptr;
+  void* scale_matb_ptr = nullptr;
+  void* scale_result_ptr = nullptr;
+  std::optional<c10::ScalarType> scale_mata_dtype;
+  std::optional<ScalingType> scaling_mata_type;
+  std::optional<c10::ScalarType> scale_matb_dtype;
+  std::optional<ScalingType> scaling_matb_type;
+  std::optional<c10::ScalarType> scale_result_dtype;
+};
+
+} // namespace at::native
--- a/aten/src/ATen/native/hip/ck_group_gemm.h
+++ b/aten/src/ATen/native/hip/ck_group_gemm.h
@ -0,0 +1,19 @@
+#pragma once
+
+#include <ATen/Tensor.h>
+#include <c10/core/ScalarType.h>
+#include <optional>
+
+namespace at {
+namespace hip {
+namespace detail {
+// Grouped GEMM entry point for ROCm builds; the definition lives in
+// ck_group_gemm.hip, which is built on Composable Kernel (ck/) headers.
+// Writes its result into `out`.
+// NOTE(review): `offs` presumably carries the group offsets and `bias` an
+// optional bias term, mirroring the CUDA grouped-mm path - confirm against
+// the implementation.
+void group_gemm_ck(
+    const at::Tensor& mat_a,
+    const at::Tensor& mat_b,
+    const std::optional<at::Tensor>& offs,
+    const std::optional<at::Tensor>& bias,
+    at::Tensor& out);
+
+} // namespace detail
+} // namespace hip
+} // namespace at
--- a/aten/src/ATen/native/hip/ck_group_gemm.hip
+++ b/aten/src/ATen/native/hip/ck_group_gemm.hip
@ -0,0 +1,458 @@
+#undef __HIP_NO_HALF_CONVERSIONS__
+#include <ATen/hip/HIPContext.h>
+#include <ATen/Tensor.h>
+#include <ATen/TensorAccessor.h>
+#include <c10/hip/HIPStream.h>
+#include <iostream>
+#include <vector>
+#include <optional>
+#include <type_traits>
+
+#include <ck/ck.hpp>
+#include <ck/tensor_operation/gpu/device/tensor_layout.hpp>
+#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp>
+#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
+#include <ck/utility/tuple.hpp>
+
+// Shorthand for ck::Sequence, used to spell the compile-time integer lists in
+// the kernel configuration below.
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+namespace at {
+namespace hip {
+namespace detail {
+
+// Short aliases for the CK element types and the pass-through element-wise
+// operator referenced by the grouped GEMM kernel configuration and by the
+// dtype dispatch in group_gemm_ck.
+namespace CkTypes {
+    using BF16 = ck::bhalf_t;
+    using F16 = ck::half_t;
+    using F32 = float;
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+}
+
+// CK device operation instantiated for every layout/dtype combination in this
+// file. Visible from the argument list: the A and B layouts are chosen by the
+// caller, the remaining layout arguments are an empty tuple and RowMajor, F32
+// is passed as one of the data types while all others are DataType, every
+// element-wise operator is PassThrough, and the GEMM specialization is
+// MNKPadding.
+// NOTE(review): the positional meaning of the numeric tuning parameters and of
+// the S<...> sequences is defined by the CK template, which is not visible
+// here - consult the CK header before changing any of them.
+template <typename ALayout, typename BLayout, typename DataType>
+using GroupedGemmKernel = ck::tensor_operation::device::DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage<
+    ALayout, BLayout, ck::Tuple<>, ck::tensor_layout::gemm::RowMajor,
+    DataType, DataType, CkTypes::F32, DataType, ck::Tuple<>, DataType,
+    CkTypes::PassThrough, CkTypes::PassThrough, CkTypes::PassThrough,
+    ck::tensor_operation::device::GemmSpecialization::MNKPadding,
+    1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2,
+    S<1,4,64,1>,  S<0,2,1,3>, S<0,2,1,3>,
+    3, 8, 8, 1,
+    S<1,4,64,1>,  S<0,2,1,3>, S<0,2,1,3>,
+    3, 8, 8, 1,
+    1, 1,
+    S<1,32,1,8>, 4
+>;
+
+// Builds one CK GEMM descriptor (m, n, k, lda, ldb, ldc plus A/B/E base
+// pointers) per group from the shapes and strides of the operands, then runs
+// all groups as a single grouped GEMM on the current HIP stream.
+//
+// Template parameters:
+//   ALayout / BLayout - CK RowMajor or ColumnMajor tag describing how mat_a /
+//                       mat_b are laid out in memory.
+//   DataType          - CK element type of A, B and the output.
+// Arguments:
+//   mat_a, mat_b - operands; supported rank combinations are 2D x 2D,
+//                  2D x 3D, 3D x 3D and 3D x 2D.
+//   offs         - 1-D int tensor of per-group end offsets; required for every
+//                  combination except 3D x 3D.
+//   out          - destination tensor, written in place.
+// Raises (via TORCH_CHECK / TORCH_INTERNAL_ASSERT) when the rank combination is
+// unsupported, `offs` is missing where required, no non-empty group exists, CK
+// rejects the argument, or a device allocation fails.
+template <typename ALayout, typename BLayout, typename DataType>
+void launch_grouped_bgemm_ck_impl_dispatch(
+    const at::Tensor& mat_a,
+    const at::Tensor& mat_b,
+    const std::optional<at::Tensor>& offs,
+    at::Tensor& out)
+{
+    using DeviceOp = GroupedGemmKernel<ALayout, BLayout, DataType>;
+    using PassThrough = CkTypes::PassThrough;
+
+    // Layout tags are compile-time properties; evaluate them once.
+    constexpr bool a_is_row_major =
+        std::is_same<ALayout, ck::tensor_layout::gemm::RowMajor>::value;
+    constexpr bool b_is_row_major =
+        std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value;
+
+    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
+    std::vector<const void*> p_a_ptrs, p_b_ptrs;
+    std::vector<void*> p_e_ptrs;
+    // Note: d_ptrs is sized after the other vectors have been populated
+
+    const int mat_a_dim = mat_a.dim();
+    const int mat_b_dim = mat_b.dim();
+
+    // Byte-addressed base pointers so per-group offsets can be applied in bytes.
+    const char* a_ptr_base = reinterpret_cast<const char*>(mat_a.data_ptr());
+    const char* b_ptr_base = reinterpret_cast<const char*>(mat_b.data_ptr());
+    char* out_ptr_base = reinterpret_cast<char*>(out.data_ptr());
+    const size_t a_element_size = mat_a.element_size();
+    const size_t b_element_size = mat_b.element_size();
+    const size_t out_element_size = out.element_size();
+
+    // NOTE(review): the offsets below are read on the host through a
+    // TensorAccessor; this presumes `offs` memory is host-readable - confirm
+    // against the callers.
+
+    // for each group, calculate m,n,k,lda,ldb,ldc and A,B,out pointer base addresses.
+    if (mat_a_dim == 2 && mat_b_dim == 2) {
+        // 2D*2D case requires offset tensor
+        TORCH_CHECK(offs.has_value(), "CK Group GEMM: offs tensor is required for 2D x 2D inputs");
+        auto offs_accessor = offs->accessor<int, 1>();
+        int num_groups = offs_accessor.size(0);
+        const int M = mat_a.size(0); // number of rows in A
+        const int N = mat_b.size(1); // number of columns in B
+        // for 2d*2d input, output is 3d.
+        // for each group, A columns (K) are sliced. M and N dimensions are not sliced.
+        for (int i = 0; i < num_groups; ++i) {
+            int start_k = (i == 0) ? 0 : offs_accessor[i-1];
+            int end_k = offs_accessor[i];
+            int k = end_k - start_k;
+
+            //K dimension are sliced, hence select stride(1) always.
+            //K dimension is always dimension 1, regardless of memory layout (row/column major)
+            const void* group_a_ptr = a_ptr_base + start_k * mat_a.stride(1) * a_element_size;
+            const void* group_b_ptr;
+            int ldb;
+
+            if (b_is_row_major) {
+                // Row-major B: the K offset is applied with stride(1)
+                // NOTE(review): K is dimension 0 of a [K,N] tensor, so stride(0)
+                // would be the expected multiplier here - confirm this branch.
+                group_b_ptr = b_ptr_base + start_k * mat_b.stride(1) * b_element_size;
+                // Leading dimension = distance between rows = stride(0)
+                ldb = mat_b.stride(0);
+            } else {
+                // Column-major B [K,N]: K values are vertically adjacent, use stride(0) for K offset
+                group_b_ptr = b_ptr_base + start_k * mat_b.stride(0) * b_element_size;
+                // Leading dimension = distance between columns = stride(1)
+                ldb = mat_b.stride(1);
+            }
+
+            // Calculate output pointer for group i in 3D tensor [num_groups, M, N]
+            // skip i*stride(0) elements to reach group i
+            void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size;
+            int lda, ldc;
+            if (a_is_row_major) {
+                // Row-major A [M,K]: leading dimension = distance between rows = stride(0)
+                lda = mat_a.stride(0);
+            } else {
+                // Column-major A [M,K]: leading dimension = distance between columns = stride(1)
+                lda = mat_a.stride(1);
+            }
+            // Output is always row-major in 3D tensor [num_groups, M, N]
+            // Leading dimension for each group's [M,N] slice = stride(1)
+            ldc = out.stride(1);
+
+            gemm_descs.push_back({
+                static_cast<ck::index_t>(M),
+                static_cast<ck::index_t>(N),
+                static_cast<ck::index_t>(k),
+                static_cast<ck::index_t>(lda),
+                static_cast<ck::index_t>(ldb),
+                static_cast<ck::index_t>(ldc)
+            });
+            p_a_ptrs.push_back(group_a_ptr);
+            p_b_ptrs.push_back(group_b_ptr);
+            p_e_ptrs.push_back(group_e_ptr);
+        }
+    } else if (mat_a_dim == 2 && mat_b_dim == 3) {
+        // 2D*3D case requires offset tensor
+        TORCH_CHECK(offs.has_value(), "CK Group GEMM: offs tensor is required for 2D x 3D inputs");
+        auto offs_accessor = offs->accessor<int, 1>();
+        int num_groups = offs_accessor.size(0);
+
+        // 2d*3d input, output is 2d.
+        // A: [m * n_groups, k], B: [n_groups, n, k] or [n_groups, k, n], Output: [m * n_groups, n]
+        // Offset divides M dimension (rows of A), each group gets different rows of A and different batch of B
+        const int K = mat_a.size(1); // columns in A
+        // For 2D-3D case: The output determines N (result width)
+        const int N = out.size(1); // N is the width of the output tensor
+
+        for (int i = 0; i < num_groups; ++i) {
+            int start_m = (i == 0) ? 0 : offs_accessor[i - 1];
+            int end_m = offs_accessor[i];
+            int m = end_m - start_m;
+
+            // Skip zero-sized groups but continue processing subsequent groups
+            if (m <= 0) {
+                continue;
+            }
+
+            // Select A rows for group i: skip start_m rows
+            const void* group_a_ptr;
+            int lda;
+            if (a_is_row_major) {
+                // Row-major A [total_m, K]: skip start_m rows, each row is stride(0) elements apart
+                group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size;
+                lda = mat_a.stride(0); // distance between rows
+            } else {
+                // Column-major A [total_m, K]: skip start_m elements in the first dimension (stride(0) is between rows)
+                group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size;
+
+                // Detect stride pattern for A tensor to determine appropriate lda calculation
+                bool a_is_strided_tensor = (mat_a.stride(0) > mat_a.size(0));
+
+                if (a_is_strided_tensor) {
+                    // For strided A tensors: stride(0) gives the actual leading dimension
+                    lda = mat_a.stride(0);
+                } else {
+                    // For non-strided A tensors: use the M dimension (total rows)
+                    lda = mat_a.size(0); // Total M dimension for column-major layout
+                }
+            }
+
+            // Select B batch for group i: B[i, :, :]
+            const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size;
+            int ldb;
+
+            if (b_is_row_major) {
+                // Row-major GEMM: expecting B as [K, N] but we have [N, K], so transpose needed
+                ldb = mat_b.stride(2); // Leading dimension for accessing as [K, N]
+            } else {
+                // Detect stride pattern to determine appropriate ldb calculation
+                bool is_strided_tensor = (mat_b.stride(2) > mat_b.size(2));
+
+                if (is_strided_tensor) {
+                    // For strided tensors: stride(2) gives the actual leading dimension
+                    ldb = mat_b.stride(2);
+                } else {
+                    // For non-strided tensors: use the N dimension
+                    ldb = mat_b.size(1);
+                }
+            }
+
+            // Output for this group: rows [start_m:end_m, :] in 2D output [total_m, N]
+            void* group_e_ptr = out_ptr_base + start_m * out.stride(0) * out_element_size;
+            int ldc = out.stride(0); // distance between rows in output
+
+            gemm_descs.push_back({
+                static_cast<ck::index_t>(m),
+                static_cast<ck::index_t>(N),
+                static_cast<ck::index_t>(K),
+                static_cast<ck::index_t>(lda),
+                static_cast<ck::index_t>(ldb),
+                static_cast<ck::index_t>(ldc)
+            });
+            p_a_ptrs.push_back(group_a_ptr);
+            p_b_ptrs.push_back(group_b_ptr);
+            p_e_ptrs.push_back(group_e_ptr);
+        }
+    } else if (mat_a_dim == 3 && mat_b_dim == 3) {
+        // 3d*3d input, output is 3d - batched matrix multiplication
+        // A: [batch, m, k], B: [batch, k, n] or [batch, n, k] (depending on transpose), Output: [batch, m, n]
+        // Each batch is processed as a separate GEMM operation
+        const int batch_size = mat_a.size(0);
+        const int M = mat_a.size(1); // rows in each A matrix
+        const int K = mat_a.size(2); // columns in A == rows in B (or columns if B is transposed)
+
+        // Determine N from B tensor - it could be B.size(1) or B.size(2) depending on layout
+        int N;
+        if (mat_b.size(1) == K) {
+            // B is [batch, k, n] - normal layout
+            N = mat_b.size(2);
+        } else if (mat_b.size(2) == K) {
+            // B is [batch, n, k] - transposed layout
+            N = mat_b.size(1);
+        } else {
+            TORCH_CHECK(false, "CK Group GEMM 3D-3D: B tensor dimensions incompatible with A. A=[",
+                       batch_size, ",", M, ",", K, "], B=[", mat_b.size(0), ",", mat_b.size(1), ",", mat_b.size(2), "]");
+        }
+
+        for (int i = 0; i < batch_size; ++i) {
+            // Select A batch for group i: A[i, :, :]
+            const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size;
+
+            // Select B batch for group i: B[i, :, :]
+            const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size;
+
+            // Select output batch for group i: Output[i, :, :]
+            void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size;
+
+            int lda, ldb, ldc;
+
+            if (a_is_row_major) {
+                // Row-major A: leading dimension = distance between rows = stride(1)
+                lda = mat_a.stride(1);
+            } else {
+                // Column-major A: leading dimension = distance between columns = stride(2)
+                lda = mat_a.stride(2);
+            }
+
+            if (b_is_row_major) {
+                // Row-major B: leading dimension = distance between rows
+                if (mat_b.size(1) == K) {
+                    // B is [batch, k, n] - normal layout
+                    ldb = mat_b.stride(1); // stride between K rows
+                } else {
+                    // B is [batch, n, k] - transposed layout, treat as [k, n] for GEMM
+                    ldb = mat_b.stride(2); // stride between N rows (since we're accessing as [k,n])
+                }
+            } else {
+                // Column-major B: leading dimension = distance between columns
+                if (mat_b.size(1) == K) {
+                    // B is [batch, k, n] - normal layout
+                    ldb = mat_b.stride(2); // stride between N columns
+                } else {
+                    // B is [batch, n, k] - transposed layout
+                    ldb = mat_b.stride(1); // stride between K columns (since we're accessing as [n,k]→[k,n])
+                }
+            }
+
+            // Output is typically row-major: leading dimension = distance between rows = stride(1)
+            ldc = out.stride(1);
+
+            gemm_descs.push_back({
+                static_cast<ck::index_t>(M),
+                static_cast<ck::index_t>(N),
+                static_cast<ck::index_t>(K),
+                static_cast<ck::index_t>(lda),
+                static_cast<ck::index_t>(ldb),
+                static_cast<ck::index_t>(ldc)
+            });
+            p_a_ptrs.push_back(group_a_ptr);
+            p_b_ptrs.push_back(group_b_ptr);
+            p_e_ptrs.push_back(group_e_ptr);
+        }
+    } else if (mat_a_dim == 3 && mat_b_dim == 2) {
+        // 3D*2D case requires offset tensor
+        TORCH_CHECK(offs.has_value(), "CK Group GEMM: offs tensor is required for 3D x 2D inputs");
+        auto offs_accessor = offs->accessor<int, 1>();
+        int num_groups = offs_accessor.size(0);
+        // 3d*2d input; the output is addressed below as a 2D matrix whose rows
+        // are out.stride(0) apart and whose columns are sliced per group.
+        // A: [n_groups, m, k], B: [k, total_n]
+        // Offset divides N dimension of B, each group gets different slice of B and different batch of A
+        const int M = mat_a.size(1); // rows in each A matrix
+        const int K = mat_a.size(2); // columns in A
+
+        for (int i = 0; i < num_groups; ++i) {
+            int start_n = (i == 0) ? 0 : offs_accessor[i - 1];
+            int end_n = offs_accessor[i];
+            int n = end_n - start_n;
+
+            // Skip zero-sized groups but continue processing subsequent groups
+            if (n <= 0) {
+                continue;
+            }
+
+            // Select A batch for group i: A[i, :, :]
+            const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size;
+
+            // Select B slice for group i: B[:, start_n:end_n] (B[K, total_N]).
+            // The column offset is the same for both layouts; only the
+            // leading dimension differs.
+            const void* group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size;
+            int ldb;
+            if (b_is_row_major) {
+                ldb = mat_b.stride(0); // distance between rows
+            } else {
+                ldb = mat_b.stride(1); // distance between columns
+            }
+
+            // Select output slice for group i: Output[:, start_n:end_n]
+            void* group_e_ptr = out_ptr_base + start_n * out.stride(1) * out_element_size;
+
+            // A is addressed as row-major here regardless of ALayout:
+            // leading dimension = distance between rows = stride(1)
+            int lda = mat_a.stride(1);
+            // Output leading dimension = distance between rows = stride(0)
+            int ldc = out.stride(0);
+
+            gemm_descs.push_back({
+                static_cast<ck::index_t>(M),
+                static_cast<ck::index_t>(n),
+                static_cast<ck::index_t>(K),
+                static_cast<ck::index_t>(lda),
+                static_cast<ck::index_t>(ldb),
+                static_cast<ck::index_t>(ldc)
+            });
+            p_a_ptrs.push_back(group_a_ptr);
+            p_b_ptrs.push_back(group_b_ptr);
+            p_e_ptrs.push_back(group_e_ptr);
+        }
+    } else {
+        TORCH_CHECK(false, "CK Group GEMM: Unsupported dimensions, mat A dim is ", mat_a_dim, ", mat B dim is ", mat_b_dim);
+    }
+
+    TORCH_INTERNAL_ASSERT(p_a_ptrs.size() > 0, "CK Group GEMM: No valid groups");
+
+    // No D tensors are used; CK still expects one (empty) entry per group.
+    std::vector<std::array<const void*, 0>> d_ptrs(p_a_ptrs.size());
+
+    static DeviceOp gemm_instance;
+    auto argument = gemm_instance.MakeArgument(
+        p_a_ptrs, p_b_ptrs, d_ptrs, p_e_ptrs,
+        gemm_descs, PassThrough{}, PassThrough{}, PassThrough{}
+    );
+    TORCH_INTERNAL_ASSERT(gemm_instance.IsSupportedArgument(argument),
+        "CK Group GEMM: argument unsupported (shape/strides/type config)");
+    size_t arg_buf_size = gemm_instance.GetDeviceKernelArgSize(&argument);
+    size_t ws_size = gemm_instance.GetWorkSpaceSize(&argument);
+
+    // Owns a raw device allocation and releases it on every exit path, so an
+    // exception thrown after allocation no longer leaks device memory.
+    struct DeviceBuffer {
+        void* ptr = nullptr;
+        DeviceBuffer() = default;
+        DeviceBuffer(const DeviceBuffer&) = delete;
+        DeviceBuffer& operator=(const DeviceBuffer&) = delete;
+        ~DeviceBuffer() {
+            if (ptr != nullptr) {
+                // Nothing useful can be done with a failure inside a destructor.
+                (void)hipFree(ptr);
+            }
+        }
+    };
+    DeviceBuffer gemm_arg_buf;
+    DeviceBuffer ws_buf;
+
+    hipError_t alloc_status = hipMalloc(&gemm_arg_buf.ptr, arg_buf_size);
+    TORCH_CHECK(alloc_status == hipSuccess,
+        "CK Group GEMM: failed to allocate ", arg_buf_size,
+        " bytes for kernel arguments: ", hipGetErrorString(alloc_status));
+    alloc_status = hipMalloc(&ws_buf.ptr, ws_size);
+    TORCH_CHECK(alloc_status == hipSuccess,
+        "CK Group GEMM: failed to allocate ", ws_size,
+        " bytes of workspace: ", hipGetErrorString(alloc_status));
+
+    gemm_instance.SetDeviceKernelArgs(&argument, gemm_arg_buf.ptr);
+    gemm_instance.SetWorkSpacePointer(&argument, ws_buf.ptr);
+
+    auto invoker = gemm_instance.MakeInvoker();
+    hipStream_t stream = c10::hip::getCurrentHIPStream();
+    invoker.Run(argument, {stream});
+    // gemm_arg_buf and ws_buf are freed by their destructors on return, in the
+    // same position relative to the launch as the explicit hipFree calls were.
+}
+
+// Picks the launcher instantiation that matches the detected memory layouts of
+// A and B for element type DataType.
+template <typename DataType>
+static void launch_grouped_gemm_for_layouts(
+    bool a_is_row_major,
+    bool b_is_row_major,
+    const at::Tensor& mat_a,
+    const at::Tensor& mat_b,
+    const std::optional<at::Tensor>& offs,
+    at::Tensor& out)
+{
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    if (a_is_row_major) {
+        if (b_is_row_major) {
+            launch_grouped_bgemm_ck_impl_dispatch<Row, Row, DataType>(mat_a, mat_b, offs, out);
+        } else {
+            launch_grouped_bgemm_ck_impl_dispatch<Row, Col, DataType>(mat_a, mat_b, offs, out);
+        }
+    } else {
+        if (b_is_row_major) {
+            launch_grouped_bgemm_ck_impl_dispatch<Col, Row, DataType>(mat_a, mat_b, offs, out);
+        } else {
+            launch_grouped_bgemm_ck_impl_dispatch<Col, Col, DataType>(mat_a, mat_b, offs, out);
+        }
+    }
+}
+
+// Entry point for the CK grouped GEMM: normalizes the operand layouts, then
+// dispatches on mat_a's dtype (bfloat16, half, float). The bias argument is
+// accepted but not used.
+void group_gemm_ck(
+    const at::Tensor& input_a,
+    const at::Tensor& input_b_colmajor,
+    const std::optional<at::Tensor>& offs,
+    const std::optional<at::Tensor>& /*bias*/,
+    at::Tensor& out)
+{
+    // A matrix counts as row-major when its innermost dimension has unit
+    // stride, and as column-major when the second-to-last dimension does.
+    const auto innermost_unit_stride = [](const at::Tensor& t) {
+        return t.stride(t.dim() == 3 ? 2 : 1) == 1;
+    };
+    const auto second_innermost_unit_stride = [](const at::Tensor& t) {
+        return t.stride(t.dim() == 3 ? 1 : 0) == 1;
+    };
+
+    // A: materialize a contiguous copy unless it is already row-major.
+    at::Tensor mat_a = innermost_unit_stride(input_a) ? input_a : input_a.contiguous();
+
+    // B: force column-major through the transpose / contiguous / transpose
+    // round trip unless it already is.
+    at::Tensor mat_b = second_innermost_unit_stride(input_b_colmajor)
+        ? input_b_colmajor
+        : input_b_colmajor.transpose(-2, -1).contiguous().transpose(-2, -1);
+
+    // Re-detect the layouts on the tensors that are actually handed to CK.
+    const bool a_row_major = innermost_unit_stride(mat_a);
+    const bool b_row_major = innermost_unit_stride(mat_b);
+
+    switch (mat_a.scalar_type()) {
+        case at::kBFloat16:
+            launch_grouped_gemm_for_layouts<CkTypes::BF16>(a_row_major, b_row_major, mat_a, mat_b, offs, out);
+            break;
+        case at::kHalf:
+            launch_grouped_gemm_for_layouts<CkTypes::F16>(a_row_major, b_row_major, mat_a, mat_b, offs, out);
+            break;
+        case at::kFloat:
+            launch_grouped_gemm_for_layouts<CkTypes::F32>(a_row_major, b_row_major, mat_a, mat_b, offs, out);
+            break;
+        default:
+            TORCH_CHECK(false, "CK Group GEMM: Unsupported mat_a dtype");
+    }
+}
+
+} // namespace detail
+} // namespace hip
+} // namespace at
--- a/aten/src/ATen/native/mps/kernels/LinearAlgebra.h
+++ b/aten/src/ATen/native/mps/kernels/LinearAlgebra.h
@ -1,4 +1,4 @@
-#pragma onces
+#pragma once
 #include <c10/metal/common.h>

 template <unsigned N = c10::metal::max_ndim>
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@ -57,6 +57,7 @@ Tensor& random_mps_impl(Tensor& self,
  if (self.numel() == 0) {
    return self;
  }
+  at::assert_no_internal_overlap(self);
  // MPS random is broken for 5D+ tensors, see https://github.com/pytorch/pytorch/issues/147624
  const auto need_reshape = self.ndimension() > 4;
  auto mps_gen = get_generator_or_default<MPSGeneratorImpl>(gen, at::mps::detail::getDefaultMPSGenerator());
@ -153,8 +154,16 @@ Tensor& random_mps_impl(Tensor& self,
      feeds[meanPlaceholder.getMPSGraphTensor()] = meanPlaceholder.getMPSGraphTensorData();
    }

-    Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self);
+    // Handle non-contiguous output tensors by creating a contiguous temporary
+    const auto needs_gather = needsGather(self);
+    Tensor self_ = needs_gather ? at::empty_like(self, MemoryFormat::Contiguous) : self;
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self_);
    runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
+
+    // Copy results back to original non-contiguous output
+    if (needs_gather) {
+      self.copy_(self_);
+    }
  }

  return self;
--- a/aten/src/ATen/native/mps/operations/FastFourierTransform.mm
+++ b/aten/src/ATen/native/mps/operations/FastFourierTransform.mm
@ -1,3 +1,5 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/native/Resize.h>
 #include <ATen/native/SpectralOpsUtils.h>
 #include <ATen/native/mps/OperationUtils.h>

@ -37,25 +39,12 @@ NSArray<NSNumber*>* IntArrayToNSArray(IntArrayRef arr) {
 } // anonymous namespace

 Tensor _fft_c2r_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
-  TORCH_CHECK(self.is_complex());
-  auto in_sizes = self.sizes();
-  DimVector out_sizes(in_sizes.begin(), in_sizes.end());
-  out_sizes[dim.back()] = last_dim_size;
-  auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type())));
+  auto out = at::empty({}, self.options().dtype(c10::toRealValueType(self.scalar_type())));
  return _fft_c2r_mps_out(self, dim, normalization, last_dim_size, out);
 }

 Tensor _fft_r2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) {
-  TORCH_CHECK(self.is_floating_point());
-  auto input_sizes = self.sizes();
-  DimVector out_sizes(input_sizes.begin(), input_sizes.end());
-  auto last_dim = dim.back();
-  auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1;
-  if (onesided) {
-    out_sizes[last_dim] = last_dim_halfsize;
-  }
-
-  auto out = at::empty(out_sizes, self.options().dtype(c10::toComplexType(self.scalar_type())));
+  auto out = at::empty({}, self.options().dtype(c10::toComplexType(self.scalar_type())));
  return _fft_r2c_mps_out(self, dim, normalization, onesided, out);
 }

@ -72,6 +61,17 @@ using namespace mps;

 // TODO: Investigate numerical discrepancies see https://github.com/pytorch/pytorch/issues/120237
 Tensor& _fft_r2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided, Tensor& out) {
+  TORCH_CHECK(self.scalar_type() == kFloat || self.scalar_type() == kHalf, "Only float and half dtypes are supported");
+  TORCH_CHECK(out.scalar_type() == c10::toComplexType(self.scalar_type()));
+  const auto input_sizes = self.sym_sizes();
+  SymDimVector out_sizes(input_sizes.begin(), input_sizes.end());
+  auto last_dim = dim.back();
+  auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1;
+  if (onesided) {
+    out_sizes[last_dim] = last_dim_halfsize;
+  }
+  at::native::resize_output_symint(out, out_sizes);
+
  auto key = __func__ + getTensorsStringKey({self, out}) + ":" + getArrayRefString(dim) + ":" +
      std::to_string(normalization) + ":" + std::to_string(onesided);
  @autoreleasepool {
@ -112,6 +112,12 @@ Tensor& _fft_c2r_mps_out(const Tensor& self,
                         int64_t normalization,
                         int64_t last_dim_size,
                         Tensor& out) {
+  TORCH_CHECK(self.is_complex(), "Input must be complex");
+  TORCH_CHECK(out.scalar_type() == c10::toRealValueType(self.scalar_type()), "Unexpected output type");
+  const auto in_sizes = self.sym_sizes();
+  SymDimVector out_sizes(in_sizes.begin(), in_sizes.end());
+  out_sizes[dim.back()] = last_dim_size;
+  at::native::resize_output_symint(out, out_sizes);
  auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" +
      std::to_string(normalization) + ":" + std::to_string(last_dim_size);
  @autoreleasepool {
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@ -617,6 +617,9 @@ Tensor& index_select_out_mps(const Tensor& self, int64_t dim, const Tensor& inde
  TORCH_CHECK(self.scalar_type() == output.scalar_type(),
              "index_select(): self and output must have the same scalar type");
  TORCH_CHECK(dim == 0 || dim < self.dim(), "index_select(): Indexing dim ", dim, " is out of bounds of tensor");
+  at::assert_no_internal_overlap(output);
+  at::assert_no_overlap(output, self);
+  at::assert_no_overlap(output, index);
  auto output_size = self.sizes().vec();
  if (self.dim() > 0) {
    output_size[dim] = num_indices;
--- a/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp
+++ b/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp
@ -73,8 +73,7 @@ void upsample_bilinear2d_out_frame(
  const auto rwidth = area_pixel_compute_scale<float>(
      input_width, output_width, align_corners, scales_w);

-  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
-  float output_scale = output.q_scale() / input.q_scale();
+  float output_scale = static_cast<float>(output.q_scale() / input.q_scale());

  const int64_t input_q_zero_point = input.q_zero_point();
  const int64_t output_q_zero_point = output.q_zero_point();
--- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
+++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
@ -148,7 +148,7 @@ Tensor qcat_nhwc_kernel(
          // Vectorized loop
          if (c + VLEN <= curr_C) {
            auto curr_scale_vec = Vectorized<float>(curr_scale);
-            auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
+            auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
            auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
            for (; c + VLEN <= curr_C; c += VLEN) {
              auto inp_vec = Vec::loadu(iptr + c);
@ -174,7 +174,7 @@ Tensor qcat_nhwc_kernel(
          int64_t elem_size = curr_C - c;
          if ((VLEN == 4 * kVLEN) && elem_size >= kVLEN) {
            auto curr_scale_vec = Vectorized<float>(curr_scale);
-            auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
+            auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
            auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
            int64_t vec_num = elem_size / kVLEN;
            std::array<typename scalar_t::underlying, VLEN> buf_in{};
@ -611,12 +611,10 @@ void qrelu_kernel(const Tensor& qx, Tensor& qy) {
 void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
                                   const Scalar& negval_) {
  int64_t i_zp = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float i_scale = qx.q_scale();
+  float i_scale = static_cast<float>(qx.q_scale());

  int64_t o_zp = out.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float o_scale = out.q_scale();
+  float o_scale = static_cast<float>(out.q_scale());
  float o_inv_scale = 1.0f / o_scale;

  float negval = negval_.to<float>();
@ -627,8 +625,8 @@ void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
    Vec zero_vec = Vec(0.0f);
    Vec one_vec = Vec(1.0f);

-    Vec i_scale_vec = Vec((float)i_scale);
-    Vec i_zp_vec = Vec((float)i_zp);
+    Vec i_scale_vec = Vec(i_scale);
+    Vec i_zp_vec = Vec(i_zp);
    Vec i_scale_zp_neg_premul_vec = i_scale_vec * i_zp_vec.neg();

    Vec negval_vec = Vec(negval);
@ -738,10 +736,9 @@ void qprelu_out_kernel(Tensor& out,

 void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
  int64_t zero_point = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float scale = qx.q_scale();
+  float scale = static_cast<float>(qx.q_scale());
  auto scale_vec = Vectorized<float>(scale);
-  auto zero_point_vec = Vectorized<float>((float)zero_point);
+  auto zero_point_vec = Vectorized<float>(zero_point);
  auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
  int64_t output_zero_point = zero_point;
  float output_scale = scale;
@ -828,10 +825,9 @@ void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
 void qsigmoid_kernel(
    const Tensor& qx, Tensor& qy, double output_scale, int64_t output_zero_point ) {
  int64_t zero_point = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float scale = qx.q_scale();
+  float scale = static_cast<float>(qx.q_scale());
  auto scale_vec = Vectorized<float>(scale);
-  auto zero_point_vec = Vectorized<float>((float)zero_point);
+  auto zero_point_vec = Vectorized<float>(zero_point);

  AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qsigmoid", [&]() {
    float inv_output_scale = 1.0 / output_scale;
@ -870,10 +866,9 @@ void qsigmoid_kernel(

 void qhardsigmoid_kernel(const Tensor& qx, Tensor& qy) {
  int64_t zero_point = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float scale = qx.q_scale();
+  float scale = static_cast<float>(qx.q_scale());
  auto scale_vec = Vectorized<float>(scale);
-  auto zero_point_vec = Vectorized<float>((float)zero_point);
+  auto zero_point_vec = Vectorized<float>(zero_point);
  auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();

  AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qhardsigmoid", [&]() {
@ -1029,13 +1024,10 @@ void qthreshold_kernel(

  // defines input and output scales and zero_points
  int64_t input_zero_point = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float input_scale = qx.q_scale();
+  float input_scale = static_cast<float>(qx.q_scale());
  int64_t output_zero_point = qy.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float output_scale = qy.q_scale();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float inv_output_scale = 1.0 / output_scale;
+  float output_scale = static_cast<float>(qy.q_scale());
+  float inv_output_scale = static_cast<float>(1.0 / output_scale);

  AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qthreshold", [&]() {
    qy = at::_empty_affine_quantized(
@ -1096,8 +1088,7 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {

  const auto o_scale = qy.q_scale();
  const auto o_zero_point = qy.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  const float o_inv_scale = 1.0 / o_scale;
+  const float o_inv_scale = static_cast<float>(1.0 / o_scale);

  using fVec = Vectorized<float>;
  fVec i_scale_vec(i_scale);
@ -1135,10 +1126,9 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {

 void qtanh_kernel(const Tensor& qx, Tensor& qy) {
  int64_t zero_point = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float scale = qx.q_scale();
+  float scale = static_cast<float>(qx.q_scale());
  auto scale_vec = Vectorized<float>(scale);
-  auto zero_point_vec = Vectorized<float>((float)zero_point);
+  auto zero_point_vec = Vectorized<float>(zero_point);
  auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();

  AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qtanh", [&]() {
@ -1198,16 +1188,13 @@ void qelu_kernel(
  // they are NOT related to the quantization scale term

  int64_t i_zp = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float i_scale = qx.q_scale();
+  float i_scale = static_cast<float>(qx.q_scale());

  // In a future PR, we can improve on output scale and zero_point
  // selection.
  int64_t o_zp = qy.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float o_scale = qy.q_scale();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float inv_o_scale = 1.0 / o_scale;
+  float o_scale = static_cast<float>(qy.q_scale());
+  float inv_o_scale = static_cast<float>(1.0 / o_scale);

  float alpha_float = alpha.to<float>();
  float scale_coef = scale.to<float>();
@ -1227,7 +1214,7 @@ void qelu_kernel(
    Vec scale_coef_vec = Vec(scale_coef);
    Vec input_scale_coef_vec = Vec(input_scale_coef);
    Vec i_scale_vec = Vec(i_scale);
-    Vec i_zero_point_vec = Vec((float)i_zp);
+    Vec i_zero_point_vec = Vec(i_zp);
    Vec i_scale_neg_zp_premul_vec = i_scale_vec * i_zero_point_vec.neg();

    cpu_kernel_vec(
@ -1326,23 +1313,20 @@ void qadd_scalar_kernel(Tensor& out, const Tensor& self, const Scalar& other) {
 template <bool ReLUFused = false>
 void qadd_kernel(Tensor& out, const Tensor& self, const Tensor& other) {
  int64_t zero_point = out.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float scale = out.q_scale();
+  float scale = static_cast<float>(out.q_scale());
  float inv_scale = 1.0f / scale;
  int64_t self_zero_point = self.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float self_scale = self.q_scale();
+  float self_scale = static_cast<float>(self.q_scale());
  int64_t other_zero_point = other.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float other_scale = other.q_scale();
+  float other_scale = static_cast<float>(other.q_scale());

  // Broadcast out the parameters here to amortize out that cost across
  // loop iterations.
  // TODO: we can optimize dequantization by doing a premultiplication
  // of the zero point by scale and doing FMA on scale*x_q - (scale*zero_point)
-  auto self_zero_point_vec = Vectorized<float>((float)self_zero_point);
+  auto self_zero_point_vec = Vectorized<float>(self_zero_point);
  auto self_scale_vec = Vectorized<float>(self_scale);
-  auto other_zero_point_vec = Vectorized<float>((float)other_zero_point);
+  auto other_zero_point_vec = Vectorized<float>(other_zero_point);
  auto other_scale_vec = Vectorized<float>(other_scale);

  auto self_scale_neg_zp_premul_vec = self_scale_vec * self_zero_point_vec.neg();
@ -2965,7 +2949,7 @@ void quantized_normalize_kernel(
    const bool beta_null = beta_data == nullptr;
    int64_t x_zp = X.q_zero_point();
    float x_scale = X.q_scale();
-    fVec x_zp_vec((float)x_zp);
+    fVec x_zp_vec(x_zp);
    fVec one_vec(1.0f);
    fVec zero_vec(0.0f);
    float x_fake_scale = 1.0f;
@ -3253,7 +3237,7 @@ void quantized_groupnorm_nhwc_kernel(
    const bool beta_null = beta_data == nullptr;
    int64_t x_zp = X.q_zero_point();
    float x_scale = X.q_scale();
-    fVec x_zp_vec((float)x_zp);
+    fVec x_zp_vec(x_zp);
    fVec one_vec(1.0f);
    fVec zero_vec(0.0f);
    float x_fake_scale = 1.0f;
--- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
@ -414,7 +414,6 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl(
  TORCH_CHECK(input.size(input.dim() - 1) == packed_weight_fp16.numRows())
  TORCH_CHECK(input.dim() >= 2);

-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
  const int64_t N = packed_weight_fp16.numCols();
  std::vector<int64_t> output_sizes = input.sizes().vec();
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp
@ -22,6 +22,7 @@
 #else
 #include <ATen/ops/empty.h>
 #include <ATen/ops/empty_like.h>
+#include <ATen/ops/zeros_like.h>
 #include <ATen/ops/reshape.h>
 #include <ATen/ops/scalar_tensor.h>
 #include <ATen/ops/sum.h>
@ -42,7 +43,6 @@ C10_DIAGNOSTIC_POP()
 #include <static_switch.h>
 #include <ATen/native/transformers/cuda/flash_attn/flash_api.h>

-
 #include <c10/util/Exception.h>

 namespace FLASH_NAMESPACE {
@ -417,6 +417,26 @@ mha_fwd(const at::Tensor &q,         // batch_size x seqlen_q x num_heads x head
    const int head_size_og = sizes[3];
    const int seqlen_k = k.size(1);
    const int num_heads_k = k.size(2);
+
+    if (batch_size == 0) {
+        auto opts = q.options();
+        at::Tensor out = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
+        at::Tensor q_padded = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
+        at::Tensor k_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
+        at::Tensor v_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
+        at::Tensor softmax_lse = at::empty({0, num_heads, seqlen_q}, opts.dtype(at::kFloat));
+        at::Tensor rng_state = at::empty({2}, at::dtype(c10::kUInt64).device(at::kCUDA));
+        at::Tensor _unused = at::empty({}, at::dtype(c10::kUInt64).device(at::kCUDA));
+        at::Tensor p = at::empty({0}, opts);
+        if (return_softmax) {
+            auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+            const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
+            const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
+            p = at::empty({0, num_heads, seqlen_q_rounded, seqlen_k_rounded}, opts);
+        }
+        return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), _unused, std::move(p)};
+    }
+
    TORCH_CHECK(batch_size > 0, "batch size must be positive");
    TORCH_CHECK(head_size_og % 8 == 0, "head_size must be a multiple of 8, this is ensured by padding!");
    TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
@ -547,7 +567,7 @@ mha_fwd(const at::Tensor &q,         // batch_size x seqlen_q x num_heads x head
        q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og});
        softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
    }
-    return {out, q_padded, k_padded, v_padded, softmax_lse, rng_state, _unused, p};
+    return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), std::move(_unused), std::move(p)};
 }

 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
@ -852,7 +872,6 @@ mha_bwd(const at::Tensor &dout,  // batch_size x seqlen_q x num_heads, x head_si
    TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
    TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
    TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension");
-    TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");

    const auto sizes = q.sizes();

@ -863,6 +882,20 @@ mha_bwd(const at::Tensor &dout,  // batch_size x seqlen_q x num_heads, x head_si
    const int head_size = sizes[3];
    const int seqlen_k = k.size(1);
    const int num_heads_k = k.size(2);
+
+    if (batch_size == 0) {
+        auto opts = q.options();
+        at::Tensor dq = at::empty_like(q);
+        at::Tensor dk = at::empty_like(k);
+        at::Tensor dv = at::empty_like(v);
+        auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+        const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
+        at::Tensor softmax_d = at::empty({0, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat));
+        return {dq, dk, dv, softmax_d};
+    }
+
+    TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
+
    TORCH_CHECK(batch_size > 0, "batch size must be positive");
    TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
    TORCH_CHECK(head_size_og % 8 == 0, "head_size_og should be a multiple of 8, this is ensured by padding!");
--- a/benchmarks/operator_benchmark/pt/configs.py
+++ b/benchmarks/operator_benchmark/pt/configs.py
@ -11,11 +11,6 @@ def remove_cuda(config_list):
    return [config for config in config_list if cuda_config not in config]


-def remove_cpu(config_list):
-    cpu_config = {"device": "cpu"}
-    return [config for config in config_list if cpu_config not in config]
-
-
 # Configs for conv-1d ops
 conv_1d_configs_short = op_bench.config_list(
    attr_names=["IC", "OC", "kernel", "stride", "N", "L"],
@ -132,20 +127,6 @@ conv_3d_configs_short = op_bench.config_list(
    },
    tags=["short"],
 )
-conv_3d_configs_long = op_bench.cross_product_configs(
-    IC=[16, 32],
-    OC=[32, 64],
-    kernel=[3, 5],
-    stride=[1, 2],
-    N=[1],
-    D=[128],
-    H=[128],
-    W=[128],
-    G=[1],
-    pad=[0],
-    device=["cpu", "cuda"],
-    tags=["long"],
-)

 linear_configs_short = op_bench.config_list(
    attr_names=["N", "IN", "OUT"],
--- a/benchmarks/operator_benchmark/pt/conv_test.py
+++ b/benchmarks/operator_benchmark/pt/conv_test.py
@ -38,10 +38,6 @@ class ConvTranspose1dBenchmark(op_bench.TorchBenchmarkBase):
 op_bench.generate_pt_test(
    configs.conv_1d_configs_short + configs.conv_1d_configs_long, Conv1dBenchmark
 )
-op_bench.generate_pt_gradient_test(
-    configs.remove_cpu(configs.conv_1d_configs_short + configs.conv_1d_configs_long),
-    Conv1dBenchmark,
-)


 if not torch.backends.mkldnn.is_acl_available():
@ -107,20 +103,6 @@ op_bench.generate_pt_test(
    configs.conv_2d_pw_configs_short + configs.conv_2d_pw_configs_long,
    Conv2dPointwiseBenchmark,
 )
-op_bench.generate_pt_gradient_test(
-    configs.remove_cpu(configs.conv_2d_configs_short + configs.conv_2d_configs_long),
-    Conv2dBenchmark,
-)
-op_bench.generate_pt_gradient_test(
-    configs.remove_cpu(configs.conv_2d_configs_short + configs.conv_2d_configs_long),
-    ConvTranspose2dBenchmark,
-)
-op_bench.generate_pt_gradient_test(
-    configs.remove_cpu(
-        configs.conv_2d_pw_configs_short + configs.conv_2d_pw_configs_long
-    ),
-    Conv2dPointwiseBenchmark,
-)


 """
@ -152,12 +134,6 @@ class ConvTranspose3dBenchmark(op_bench.TorchBenchmarkBase):

 op_bench.generate_pt_test(configs.conv_3d_configs_short, Conv3dBenchmark)
 op_bench.generate_pt_test(configs.conv_3d_configs_short, ConvTranspose3dBenchmark)
-op_bench.generate_pt_gradient_test(
-    configs.remove_cpu(configs.conv_3d_configs_long), Conv3dBenchmark
-)
-op_bench.generate_pt_gradient_test(
-    configs.remove_cpu(configs.conv_3d_configs_long), ConvTranspose3dBenchmark
-)


 if __name__ == "__main__":
--- a/benchmarks/transformer/config_utils.py
+++ b/benchmarks/transformer/config_utils.py
@ -0,0 +1,157 @@
+"""Configuration utilities for parsing JSON and YAML config files."""
+
+import json
+import re
+
+
+def heads_input_type(s: str) -> tuple[int, int]:
+    """Parse a ``"Hq,Hkv"`` string into an ``(Hq, Hkv)`` tuple of ints.
+
+    Hq is the number of query heads and Hkv the number of key/value heads.
+    Any failure while splitting or converting (wrong number of fields,
+    non-integer text, non-string input) is re-raised as ``ValueError``.
+    """
+    try:
+        query_heads, kv_heads = (int(field) for field in s.split(","))
+    except Exception as err:
+        raise ValueError("Heads must be Hq,Hkv") from err
+    return query_heads, kv_heads
+
+
+# Default benchmark settings template; print_default_config() emits this dict
+# as JSON or YAML so it can be used as a starting point for a config file.
+default_config = {
+    "dynamic": False,
+    "calculate_bwd": False,
+    "dtype": "bfloat16",
+    "b": [2, 8, 16],  # batch sizes
+    "nh": ["16,16", "16,2"],  # "Hq,Hkv" strings: query heads, key/value heads
+    "s": [512, 1024, 4096],  # sequence lengths
+    "d": [64, 128],  # head dimensions
+    "mods": ["noop", "causal", "alibi", "sliding_window"],  # attention types
+    "backend": ["efficient"],
+    "max_autotune": False,
+    "decoding": False,
+    "kv_size": None,
+    "throughput": True,
+    "save_path": None,
+    "output_json_for_dashboard": None,
+    "benchmark_name": "PyTorch operator microbenchmark",
+}
+
+
+def load_config_file(config_path: str) -> dict:
+    """Load configuration from a JSON or YAML file.
+
+    The file is parsed as JSON first; if that fails, it is handed to the
+    simple built-in YAML parser. String entries of the 'nh' list are
+    converted from "Hq,Hkv" to (Hq, Hkv) tuples.
+
+    Args:
+        config_path: Path to the configuration file
+
+    Returns:
+        Dictionary containing the configuration
+
+    Raises:
+        FileNotFoundError: If config file doesn't exist
+        ValueError: If the top-level value is not a key/value mapping, or an
+            'nh' string is not of the form "Hq,Hkv"
+    """
+    with open(config_path) as f:
+        config_str = f.read()
+
+    # Try to load as JSON first
+    try:
+        config = json.loads(config_str)
+    except json.JSONDecodeError:
+        # Fall back to YAML parsing
+        config = _parse_simple_yaml(config_str)
+
+    # Valid JSON whose top level is a list/scalar is not a usable config;
+    # reject it here instead of failing later with an obscure TypeError.
+    if not isinstance(config, dict):
+        raise ValueError(
+            f"Config file {config_path!r} must contain a top-level mapping, "
+            f"got {type(config).__name__}"
+        )
+
+    # Apply automatic conversions for 'nh' field
+    if isinstance(config.get("nh"), list):
+        config["nh"] = [
+            heads_input_type(h) if isinstance(h, str) else h for h in config["nh"]
+        ]
+
+    return config
+
+
+def _strip_yaml_comment(line: str) -> str:
+    """Remove a trailing ``#`` comment, ignoring ``#`` inside quoted strings."""
+    open_quote = None
+    for idx, ch in enumerate(line):
+        if open_quote is not None:
+            if ch == open_quote:
+                open_quote = None
+        elif ch in ('"', "'"):
+            open_quote = ch
+        elif ch == "#":
+            return line[:idx]
+    if open_quote is not None:
+        # Unbalanced quote (e.g. an apostrophe in a bare value): the quote
+        # tracking is unreliable, so cut at the first '#' instead.
+        return line.split("#")[0]
+    return line
+
+
+def _parse_simple_yaml(yaml_str: str) -> dict:
+    """Simple YAML parser for basic configs (without external dependencies).
+
+    Supports:
+    - key: value pairs
+    - booleans (true/false)
+    - null values
+    - integers and floats
+    - strings (quoted and unquoted)
+    - lists in JSON format [item1, item2, ...]; items that look like
+      integers become ints, everything else stays a string
+    - comments (lines starting with # or after #); a # inside a quoted
+      string is kept as part of the value
+
+    Lines without a ':' are ignored.
+
+    Args:
+        yaml_str: YAML content as string
+
+    Returns:
+        Dictionary containing parsed YAML content
+    """
+    config = {}
+
+    for line in yaml_str.split("\n"):
+        # Remove comments (quote-aware so values like "run#1.csv" survive)
+        line = _strip_yaml_comment(line).strip()
+
+        if not line or ":" not in line:
+            continue
+
+        key, value = line.split(":", 1)
+        key = key.strip()
+        value = value.strip()
+
+        # Parse value based on type
+        if value.lower() == "true":
+            config[key] = True
+        elif value.lower() == "false":
+            config[key] = False
+        elif value.lower() in ("null", "none", ""):
+            config[key] = None
+        elif value.startswith("[") and value.endswith("]"):
+            # Parse list - handle quoted strings properly
+            pattern = r'"([^"]+)"|\'([^\']+)\'|([^,\[\]\s]+)'
+            matches = re.findall(pattern, value[1:-1])  # Remove [ ]
+            parsed_items = []
+            for match in matches:
+                # match is a tuple of (double_quoted, single_quoted, unquoted)
+                item = match[0] or match[1] or match[2]
+                item = item.strip()
+                if item:
+                    try:
+                        parsed_items.append(int(item))
+                    except ValueError:
+                        parsed_items.append(item)
+            config[key] = parsed_items
+        elif value.startswith(('"', "'")):
+            config[key] = value.strip("\"'")
+        else:
+            # Try to parse as number
+            try:
+                config[key] = int(value)
+            except ValueError:
+                try:
+                    config[key] = float(value)
+                except ValueError:
+                    config[key] = value
+
+    return config
+
+
+def print_default_config(output_format: str) -> None:
+    """Write the default configuration template to stdout.
+
+    Args:
+        output_format: "json" prints the template as indented JSON; any
+            other value prints one ``key: value`` line per entry in YAML style.
+    """
+    if output_format == "json":
+        print(json.dumps(default_config, indent=2))
+        return
+
+    # YAML: render each value according to its Python type.
+    for name, setting in default_config.items():
+        if setting is None:
+            rendered = "null"
+        elif isinstance(setting, bool):
+            rendered = str(setting).lower()
+        elif isinstance(setting, str):
+            rendered = f'"{setting}"'
+        elif isinstance(setting, list):
+            rendered = json.dumps(setting)
+        else:
+            rendered = setting
+        print(f"{name}: {rendered}")
--- a/benchmarks/transformer/configs/config_basic.yaml
+++ b/benchmarks/transformer/configs/config_basic.yaml
@ -0,0 +1,29 @@
+# Basic benchmark configuration for PyTorch transformer benchmarks
+# Usage: python score_mod.py --config config_basic.yaml
+
+# Core parameters
+dynamic: false
+calculate_bwd: true
+dtype: "bfloat16"
+
+# Shape parameters - larger sweep
+b: [1, 2, 4, 8, 16]  # batch sizes
+nh: ["16,16", "16,2", "32,32", "32,4"]  # [query_heads,key_value_heads]
+s: [512, 1024, 2048, 4096, 8192]  # sequence lengths
+d: [64, 128]  # head dimensions (limited to 128 for Flash Attention/cuDNN compatibility)
+
+# All attention types
+mods: ["noop", "causal", "rel", "head_bias", "alibi", "sliding_window", "prefix_lm", "softcap"]
+
+# Multiple backends for comparison (SDPA + Flash Attention) - flex is always included internally
+backend: ["efficient", "math", "cudnn", "fav2"]
+max_autotune: true  # Enable torch.compile with max-autotune for optimal performance
+
+# Decoding and cache settings
+decoding: false
+kv_size: null
+
+# Metrics and output
+throughput: true  # Calculate memory bandwidth & TFLOPS
+save_path: "comprehensive_results.csv"  # Save to CSV
+output_json_for_dashboard: "attn_bench_basic.json"
--- a/benchmarks/transformer/score_mod.py
+++ b/benchmarks/transformer/score_mod.py
@ -1,15 +1,19 @@
 import argparse
 import csv
+import gc
 import itertools
+import json
 import random
+import sys
 from collections import defaultdict
 from collections.abc import Callable
 from contextlib import nullcontext
 from dataclasses import asdict, dataclass
-from functools import partial
-from typing import Optional, Union
+from functools import partial, wraps
+from typing import Literal, Optional, Union

 import numpy as np
+from config_utils import heads_input_type, load_config_file, print_default_config
 from tabulate import tabulate
 from tqdm import tqdm

@ -33,6 +37,96 @@ torch._dynamo.config.recompile_limit = 1000
 from torch._inductor.runtime.benchmarking import benchmarker


+def cleanup_memory():
+    """Aggressively free GPU memory.
+
+    The Python garbage collector runs first so that tensors kept alive only
+    by reference cycles are destroyed before the CUDA caching allocator is
+    asked to release its unused blocks; the device is then synchronized.
+    The CUDA calls are skipped when CUDA is not available.
+    """
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
+def safe_backend(backend_name=None, return_dict=False):
+    """Decorator that wraps backend functions with error handling
+
+    The wrapped function must take the experiment config as its first
+    positional argument. If it raises, a "[SKIP] ..." line is printed and a
+    NaN-filled result is returned instead of propagating the exception; GPU
+    memory is cleaned up after out-of-memory style failures.
+
+    Args:
+        backend_name: Name of the backend for error messages (defaults to the
+                      wrapped function's ``__name__``)
+        return_dict: If True, returns dict of results for all backends (for run_single_experiment)
+                     If False, returns single ExperimentResults (for individual backend functions)
+    """
+
+    def decorator(func):
+        label = backend_name or func.__name__
+
+        @wraps(func)
+        def wrapper(config, *args, **kwargs):
+            try:
+                return func(config, *args, **kwargs)
+            except torch.OutOfMemoryError:
+                # Must precede RuntimeError: it is handled separately from
+                # the message-based checks below.
+                print(f"[SKIP] OOM for {label} with shape {config.shape}")
+                cleanup_memory()
+            except RuntimeError as e:
+                error_msg = str(e)
+                if "out of resource" in error_msg or "OutOfMemoryError" in error_msg:
+                    print(f"[SKIP] Triton OOM for {label} with shape {config.shape}")
+                    cleanup_memory()
+                elif "No valid triton configs" in error_msg:
+                    print(
+                        f"[SKIP] No valid Triton config for {label} with shape {config.shape}"
+                    )
+                else:
+                    print(
+                        f"[SKIP] Runtime error for {label} with shape {config.shape}: {error_msg[:100]}"
+                    )
+            except Exception as e:
+                print(
+                    f"[SKIP] Error for {label} with shape {config.shape}: {str(e)[:100]}"
+                )
+
+            def nan_result(**extra):
+                # Fresh object per call so result entries never alias each other.
+                return ExperimentResults(
+                    fwd_time=float("nan"),
+                    bwd_time=float("nan") if config.calculate_bwd_time else None,
+                    **extra,
+                )
+
+            # Return appropriate NaN result based on function type
+            if return_dict:
+                # For run_single_experiment: return dict with NaN for all backends
+                results = {backend: nan_result() for backend in config.backends}
+                results["flex"] = nan_result(sparsity=None)
+                return results
+            # For individual backend functions: return single ExperimentResults
+            return nan_result()
+
+        return wrapper
+
+    return decorator
+
+
+# Type definitions
+# Literal aliases enumerating the string options used by this benchmark.
+
+# Backend names; the same set is listed as valid in get_backend_context().
+Backend = Literal["math", "efficient", "cudnn", "fav2", "fav3", "fakv", "og-eager"]
+# Attention variants (the "mods" entries of a benchmark config).
+AttentionType = Literal[
+    "noop",
+    "causal",
+    "rel",
+    "head_bias",
+    "alibi",
+    "sliding_window",
+    "document_mask",
+    "prefix_lm",
+    "softcap",
+]
+# Dtype names as written in configs -- NOTE(review): presumably mapped to
+# torch dtypes elsewhere in this file; confirm.
+DtypeString = Literal["bfloat16", "float16", "float32"]
+# Direction selector passed as `type=` to the speedup helpers.
+SpeedupType = Literal["fwd", "bwd"]
+
+
 def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float:
    # warmup
    for _ in range(5):
@ -48,6 +142,7 @@ class ExperimentConfig:
    calculate_bwd_time: bool
    cal_bandwidth: bool
    backends: list[str]
+    max_autotune: bool

    def __post_init__(self):
        assert len(self.shape) == 6, (
@ -62,6 +157,7 @@ class ExperimentConfig:
        d.pop("cal_bandwidth", None)
        d["shape(B,Hq,M,Hkv,N,D)"] = d.pop("shape")
        d.pop("backends", None)
+        d.pop("max_autotune", False)
        return d


@ -209,6 +305,7 @@ def query_key_value_clones(
    return query_ref, key_ref, value_ref


+@safe_backend("SDPA")
 def run_single_backend_sdpa(
    config: ExperimentConfig,
    query: torch.Tensor,
@ -223,6 +320,7 @@ def run_single_backend_sdpa(
    backend_context = get_backend_context(backend)
    with backend_context:
        _device = torch.device("cuda")
+
        eager_sdpa = generate_eager_sdpa(
            config.attn_type, config.shape, config.dtype, block_mask, score_mod
        )
@ -290,6 +388,7 @@ def run_single_backend_sdpa(
            )


+@safe_backend("FlashAttention")
 def run_single_backend_FA(
    config: ExperimentConfig,
    query: torch.Tensor,
@ -301,9 +400,9 @@ def run_single_backend_FA(
    mask_kwargs,
    backend: str,
 ) -> ExperimentResults:
-    assert backend in ["fav2", "fav3", "fakv"]
+    assert backend in ["fav3", "fakv"]
    # Generate callable for specific backend.
-    if backend in ["fav2", "fav3"]:
+    if backend in ["fav3"]:
        FA = generate_FA_callable(
            config.attn_type, config.shape, config.dtype, backend, **mask_kwargs
        )
@ -354,10 +453,10 @@ def run_single_backend_FA(
    )


+@safe_backend("flex_attention", return_dict=True)
 def run_single_experiment(
    config: ExperimentConfig,
    dynamic=False,
-    max_autotune=False,
 ) -> dict[str, ExperimentResults]:
    device = torch.device("cuda")
    batch_size, q_heads, q_seq_len, kv_heads, kv_seq_len, head_dim = config.shape
@ -377,7 +476,7 @@ def run_single_experiment(
    block_mask, mask_kwargs = generate_block_mask(config.attn_type, config.shape)
    kernel_options = get_kernel_options(config.attn_type, config.shape)

-    if max_autotune:
+    if config.max_autotune:
        compiled_sdpa = torch.compile(
            flex_attention, dynamic=dynamic, mode="max-autotune-no-cudagraphs"
        )
@ -407,7 +506,7 @@ def run_single_experiment(

    results = {}
    for backend in config.backends:
-        if backend in ["fav2", "fav3", "fakv"]:
+        if backend in ["fav3", "fakv"]:
            results[backend] = run_single_backend_FA(
                config,
                query,
@ -419,7 +518,7 @@ def run_single_experiment(
                mask_kwargs,
                backend,
            )
-        else:  # sdpa
+        else:  # sdpa (also supports fav2)
            results[backend] = run_single_backend_sdpa(
                config,
                query,
@ -440,7 +539,7 @@ def run_single_experiment(
    sparsity = block_mask.sparsity() / 100.0 if block_mask is not None else 0.0
    sparsity = sparsity if config.attn_type != "document_mask" else 0.5

-    results["compiled"] = ExperimentResults(
+    results["flex"] = ExperimentResults(
        fwd_time=forward_compiled_time,
        bwd_time=backward_compile_time if config.calculate_bwd_time else None,
        sparsity=sparsity,
@ -501,15 +600,15 @@ def calculate_tflops(config: ExperimentConfig, results: ExperimentResults) -> fl
    softmax_flops = M * N * 2  # Not counting online softmax overhead
    o_flops = M * D * N * 2
    # Not counting split k overhead
-    total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - results.sparsity)
+    sparsity = results.sparsity if results.sparsity is not None else 0.0
+    total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - sparsity)
    return total_flops / results.fwd_time / 1e6  # in TFLOPs/


 def get_average_speedups(results: list[Experiment], type: str, backend: str):
    # Calculate speedups
    speedups = [
-        calculate_speedup(r.results["compiled"], r.results[backend], type)
-        for r in results
+        calculate_speedup(r.results["flex"], r.results[backend], type) for r in results
    ]

    # Find indices of max and min speedups
@ -537,7 +636,7 @@ def get_average_speedups(results: list[Experiment], type: str, backend: str):
 def print_results(results: list[Experiment], save_path: Optional[str] = None):
    table_data = defaultdict(list)
    for experiment in results:
-        backends = experiment.config.backends + ["compiled"]
+        backends = experiment.config.backends + ["flex"]
        for key, value in experiment.asdict().items():
            if key in backends:
                if value.fwd_time:
@ -550,45 +649,43 @@ def print_results(results: list[Experiment], save_path: Optional[str] = None):
    # Calculate speedups
    for backend in results[0].config.backends:
        fwd_speedups = [
-            calculate_speedup(r.results["compiled"], r.results[backend], type="fwd")
+            calculate_speedup(r.results["flex"], r.results[backend], type="fwd")
            for r in results
        ]
-        table_data[f"fwd_{backend}_speedup"] = fwd_speedups
+        table_data[f"fwd_speedup_flex_over_{backend}"] = fwd_speedups

    if results[0].config.calculate_bwd_time:
        for backend in results[0].config.backends:
            bwd_speedups = [
-                calculate_speedup(r.results["compiled"], r.results[backend], type="bwd")
+                calculate_speedup(r.results["flex"], r.results[backend], type="bwd")
                for r in results
            ]
-            table_data[f"bwd_{backend}_speedup"] = bwd_speedups
+            table_data[f"bwd_speedup_flex_over_{backend}"] = bwd_speedups

    # Calculate mem + computational throughput
    if results[0].config.cal_bandwidth:
        fwd_bandwidth = [
-            calculate_bandwidth(r.config, r.results["compiled"], type="fwd")
+            calculate_bandwidth(r.config, r.results["flex"], type="fwd")
            for r in results
        ]
        table_data["fwd_mem_bw (TB/s)"] = fwd_bandwidth
-        fwd_tflops = [
-            calculate_tflops(r.config, r.results["compiled"]) for r in results
-        ]
+        fwd_tflops = [calculate_tflops(r.config, r.results["flex"]) for r in results]
        table_data["TFlops/s"] = fwd_tflops

    print(tabulate(table_data, headers="keys", tablefmt="github", floatfmt=".3f"))

    for backend in results[0].config.backends:
-        if np.isnan(table_data[f"fwd_{backend}_speedup"]).all():
+        if np.isnan(table_data[f"fwd_speedup_flex_over_{backend}"]).all():
            continue
        print("\n")
-        print(f"FWD Speedups vs. {backend}".center(125, "="))
+        print(f"FWD Speedup of Flex over {backend}".center(125, "="))
        print("\n")
        average_data = get_average_speedups(results, type="fwd", backend=backend)
        print(tabulate(average_data, headers="keys", tablefmt="github", floatfmt=".3f"))

        if results[0].config.calculate_bwd_time:
            print("\n")
-            print(f"BWD Speedups vs. {backend}".center(125, "="))
+            print(f"BWD Speedup of Flex over {backend}".center(125, "="))
            print("\n")
            average_data = get_average_speedups(results, type="bwd", backend=backend)
            print(
@ -791,14 +888,14 @@ def get_backend_context(backend: str):
    Returns a context manager for the specified backend.
    Args:
        backend (str): The name of the backend to use.
-                       Valid options are 'fav2', 'cudnn', 'math', 'efficient', 'fav3', 'fakv', 'og-eager'.
+                       Valid options are 'math', 'efficient', 'cudnn', 'fav2', 'fav3', 'fakv', 'og-eager'.
    Returns:
        A context manager for the specified backend.
    Raises:
        ValueError: If an invalid backend is specified.
    """
    backends = {
-        "fav2": nullcontext(),
+        "fav2": sdpa_kernel(SDPBackend.FLASH_ATTENTION),
        "cudnn": sdpa_kernel(SDPBackend.CUDNN_ATTENTION),
        "math": sdpa_kernel(SDPBackend.MATH),
        "efficient": sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION),
@ -820,15 +917,7 @@ def generate_FA_callable(
 ) -> Callable | None:
    if dtype not in [torch.float16, torch.bfloat16]:
        return None
-    if backend == "fav2":
-        try:
-            from flash_attn import flash_attn_func, flash_attn_varlen_func
-        except ImportError:
-            print(
-                "Flash attention 2 is not installed. Please install it to run fav2 backend. "
-            )
-            raise
-    elif backend == "fav3":
+    if backend == "fav3":
        try:
            from flash_attn.flash_attn_interface import (
                flash_attn_func,
@ -1034,6 +1123,7 @@ def generate_experiment_configs(
    kv_cache_size: list[int],
    cal_bandwidth: bool,
    backends: list[str],
+    max_autotune: bool,
 ) -> list[ExperimentConfig]:
    assert not (calculate_bwd and decoding), "Decoding does not support backward"

@ -1077,52 +1167,333 @@ def generate_experiment_configs(
                calculate_bwd_time=calculate_bwd,
                cal_bandwidth=cal_bandwidth,
                backends=backends,
+                max_autotune=max_autotune,
            )
        )

    return all_configs


-def main(args):
+def _output_json_for_dashboard(
+    experiments,
+    output_file,
+    benchmark_name="PyTorch operator microbenchmark",
+):
+    """
+    Write the result into JSON format for PyTorch OSS dashboard.
+    The JSON format is defined at
+    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+
+    One record is emitted per metric for every backend whose forward time is
+    not NaN: forward latency always, memory bandwidth and TFLOPS when
+    ``config.cal_bandwidth`` is set, and backward latency when
+    ``config.calculate_bwd_time`` is set and a non-NaN backward time exists.
+    Nothing is written when ``experiments`` is empty.
+
+    Args:
+        experiments: List of experiment results
+        output_file: Path to output JSON file
+        benchmark_name: Name of the benchmark
+    """
+    if not experiments:
+        return
+
+    import math
+    from dataclasses import asdict, dataclass
+    from typing import Any, Optional
+
+    # Record schema. Defined once up front instead of once per backend result.
+    @dataclass
+    class BenchmarkInfo:
+        name: str
+        mode: Optional[str]
+        dtype: str
+        extra_info: dict[str, Any]
+
+    @dataclass
+    class ModelInfo:
+        name: str
+        type: str
+        origins: list[str]
+        extra_info: dict[str, Any]
+
+    @dataclass
+    class MetricInfo:
+        name: str
+        unit: str
+        benchmark_values: list[float]
+        target_value: Optional[float]
+
+    @dataclass
+    class BenchmarkRecord:
+        benchmark: BenchmarkInfo
+        model: ModelInfo
+        metric: MetricInfo
+
+    def make_record(benchmark, model_name, model_extra_info, metric_name, unit, value):
+        # Build one dashboard record as a plain dict; asdict() copies the
+        # nested dataclasses, so `benchmark` can be shared between records.
+        return asdict(
+            BenchmarkRecord(
+                benchmark=benchmark,
+                model=ModelInfo(
+                    name=model_name,
+                    type="attention-benchmark",
+                    origins=["pytorch"],
+                    extra_info=model_extra_info,
+                ),
+                metric=MetricInfo(
+                    name=metric_name,
+                    unit=unit,
+                    benchmark_values=[value],
+                    target_value=None,
+                ),
+            )
+        )
+
+    # The device is fixed to CUDA here, so the architecture is the CUDA device
+    # name. It is looked up lazily so that nothing is queried when every
+    # backend result is skipped.
+    device = "cuda"
+    device_arch = None
+
+    records = []
+    for experiment in experiments:
+        config = experiment.config
+
+        # experiment.results is a dict: backend -> ExperimentResults
+        for backend, results in experiment.results.items():
+            # Skip backends that were not run (NaN results)
+            if math.isnan(results.fwd_time):
+                continue
+
+            if device_arch is None:
+                device_arch = torch.cuda.get_device_name(0)
+
+            # "torch.bfloat16" -> "bfloat16"; anything without a dot is kept as is
+            dtype_str = str(config.dtype)
+            dtype = dtype_str.split(".")[1] if "." in dtype_str else dtype_str
+
+            model_name = f"{backend}_{config.attn_type}_" + str(config.shape)
+
+            benchmark = BenchmarkInfo(
+                name=benchmark_name,
+                # Mode follows whether the backward pass is being timed
+                mode="training" if config.calculate_bwd_time else "inference",
+                dtype=dtype,
+                extra_info={
+                    "input_config": f"shape: {config.shape}, dtype: {config.dtype}",
+                    "device": device,
+                    "arch": device_arch,
+                    "operator_name": backend,
+                    "attn_type": config.attn_type,
+                    "shape": str(config.shape),
+                    "max_autotune": config.max_autotune,
+                },
+            )
+
+            # Forward latency. This is the only record whose model extra_info
+            # carries attn_type; kept that way so the output is unchanged.
+            records.append(
+                make_record(
+                    benchmark,
+                    model_name,
+                    {"operator_name": backend, "attn_type": config.attn_type},
+                    "forward latency",
+                    "us",
+                    results.fwd_time,
+                )
+            )
+
+            # Forward memory bandwidth and TFLOPS (if requested)
+            if config.cal_bandwidth:
+                records.append(
+                    make_record(
+                        benchmark,
+                        model_name,
+                        {"operator_name": backend},
+                        "memory bandwidth",
+                        "TB/s",
+                        calculate_bandwidth(config, results, "fwd"),
+                    )
+                )
+                records.append(
+                    make_record(
+                        benchmark,
+                        model_name,
+                        {"operator_name": backend},
+                        "tflops",
+                        "TFLOPS/s",
+                        calculate_tflops(config, results),
+                    )
+                )
+
+            # Backward latency (if available and not NaN)
+            if (
+                config.calculate_bwd_time
+                and results.bwd_time is not None
+                and not math.isnan(results.bwd_time)
+            ):
+                records.append(
+                    make_record(
+                        benchmark,
+                        model_name,
+                        {"operator_name": backend},
+                        "backward latency",
+                        "us",
+                        results.bwd_time,
+                    )
+                )
+
+    # Write all records to the output file
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(records, f, indent=2)
+
+
+def main(
+    dynamic: bool = False,
+    calculate_bwd: bool = False,
+    dtype: DtypeString = "bfloat16",
+    b: list[int] | None = None,
+    nh: list[str] | None = None,
+    s: list[int] | None = None,
+    d: list[int] | None = None,
+    mods: list[AttentionType] | None = None,
+    backend: list[Backend] | None = None,
+    max_autotune: bool = False,
+    decoding: bool = False,
+    kv_size: Optional[list[int]] = None,
+    throughput: bool = True,
+    save_path: Optional[str] = None,
+    output_json_for_dashboard: Optional[str] = None,
+    benchmark_name: str = "PyTorch operator microbenchmark",
+) -> None:
+    """Run sweep over sizes and score mods for flex attention.
+
+    Usage Examples:
+        # Use a yml config file
+        python score_mod.py --config basic_config.yaml
+
+        # Use a json config file
+        python score_mod.py --config my_config.json
+
+        # Generate a config template
+        python score_mod.py --print-config json > my_config.json # For a json config
+        python score_mod.py --print-config yaml > my_config.yaml # For a yaml config
+
+        # Override config with CLI args
+        python score_mod.py --config my_config.json -dtype float16 --max-autotune
+
+        # Pure CLI usage
+        python score_mod.py -b 4 8 -s 1024 2048 -mods causal alibi --backend efficient
+
+    Args:
+        dynamic: Runs a dynamic shapes version of compiled flex attention
+        calculate_bwd: Calculate backward pass times
+        dtype: Data type for tensors (bfloat16, float16, float32)
+        b: Batch sizes to benchmark
+        nh: Number of query and key/value heads in format "Hq,Hkv"
+        s: Sequence lengths to benchmark
+        d: Head dimensions to benchmark
+        mods: Score modifications: noop, causal, rel, head_bias, alibi, sliding_window, document_mask, prefix_lm, softcap
+        backend: Backends for attention computation: math, efficient, cudnn, fav2, fav3, fakv, og-eager
+        max_autotune: Turn on max-autotune optimization
+        decoding: Benchmark decoding mode (query sequence length = 1)
+        kv_size: Key/value cache size in MiB (ignores batch size if specified)
+        throughput: Calculate kernel memory bandwidth & computational throughput (always True)
+        save_path: Path to save the results CSV file
+        output_json_for_dashboard: Path to save results in JSON format for PyTorch OSS dashboard
+        benchmark_name: Name of the benchmark for dashboard output
+    """
+    # Convert dtype string to torch dtype (if not already converted)
+    import torch
+
+    if isinstance(dtype, str):
+        dtype = getattr(torch, dtype)
+
+    # Always calculate throughput
+    throughput = True
+    print("Backend: ", backend)
    seed = 123
    np.random.seed(seed)
    torch.manual_seed(seed)
    results = []
-    for config in tqdm(
-        generate_experiment_configs(
-            args.calculate_bwd,
-            args.dtype,
-            args.b,
-            args.nh,
-            args.s,
-            args.d,
-            args.mods,
-            args.decoding,
-            args.kv_size,
-            args.throughput,
-            args.backend,
-        )
+    for experiment_count, config in enumerate(
+        tqdm(
+            generate_experiment_configs(
+                calculate_bwd,
+                dtype,
+                b,
+                nh,
+                s,
+                d,
+                mods,
+                decoding,
+                kv_size,
+                throughput,
+                backend,
+                max_autotune,
+            )
+        ),
+        start=1,
    ):
        results.append(
            Experiment(
                config,
                run_single_experiment(
                    config,
-                    dynamic=args.dynamic,
-                    max_autotune=args.max_autotune,
+                    dynamic=dynamic,
                ),
            )
        )

-    print_results(results, args.save_path)
+        # Periodic memory cleanup every 50 experiments
+        if experiment_count % 50 == 0:
+            cleanup_memory()

+    print_results(results, save_path)

-def heads_input_type(s):
-    try:
-        hq, hkv = map(int, s.split(","))
-        return hq, hkv
-    except Exception as e:
-        raise argparse.ArgumentTypeError("Heads must be Hq,Hkv") from e
+    # Output JSON for dashboard if requested
+    if output_json_for_dashboard:
+        _output_json_for_dashboard(results, output_json_for_dashboard, benchmark_name)


 if __name__ == "__main__":
@ -1130,6 +1501,12 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Run sweep over sizes and score mods for flex attention"
    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        help="Path to JSON config file. CLI args override config file values.",
+        default=None,
+    )
    parser.add_argument(
        "--dynamic",
        action="store_true",
@ -1199,8 +1576,49 @@ Ignores -b batch size and calculate batch size from kv size instead when specifi
        default=["efficient"],
        help="Backend to use for attention computation",
    )
+    parser.add_argument(
+        "--output-json-for-dashboard",
+        type=str,
+        help="Path to save results in JSON format for PyTorch OSS dashboard",
+        default=None,
+    )
+    parser.add_argument(
+        "--benchmark-name",
+        type=str,
+        help="Name of the benchmark for dashboard output",
+        default="PyTorch operator microbenchmark",
+    )
+    parser.add_argument(
+        "--print-config",
+        type=str,
+        choices=["json", "yaml"],
+        help="Print a default config template in JSON or YAML format and exit",
+        default=None,
+    )
    # Parse arguments
    args = parser.parse_args()
-    args.dtype = getattr(torch, args.dtype)

-    main(args)
+    # Handle --print-config
+    if args.print_config:
+        print_default_config(args.print_config)
+        sys.exit(0)
+
+    # Load and merge config if provided
+    if args.config:
+        config = load_config_file(args.config)
+
+        # Merge config with CLI args (CLI args take precedence)
+        json_args = argparse.Namespace()
+        json_args.__dict__ = config
+        args = parser.parse_args(namespace=json_args)
+
+    # Convert dtype string to torch dtype (only if it's still a string)
+    if isinstance(args.dtype, str):
+        args.dtype = getattr(torch, args.dtype)
+
+    # Remove config and print_config from args before passing to main
+    args_dict = vars(args)
+    args_dict.pop("config", None)
+    args_dict.pop("print_config", None)
+
+    main(**args_dict)
--- a/build_variables.bzl
+++ b/build_variables.bzl
@ -916,6 +916,7 @@ libtorch_python_core_sources = [
    "torch/csrc/autograd/python_torch_functions_manual.cpp",
    "torch/csrc/autograd/python_variable.cpp",
    "torch/csrc/autograd/python_variable_indexing.cpp",
+    "torch/csrc/distributed/python_placement.cpp",
    "torch/csrc/dynamo/python_compiled_autograd.cpp",
    "torch/csrc/dynamo/cache_entry.cpp",
    "torch/csrc/dynamo/cpp_shim.cpp",
@ -1073,6 +1074,7 @@ aten_cpu_non_globed_sources = [
    "aten/src/ATen/detail/MPSHooksInterface.cpp",
    "aten/src/ATen/detail/MAIAHooksInterface.cpp",
    "aten/src/ATen/detail/PrivateUse1HooksInterface.cpp",
+    "aten/src/ATen/detail/XLAHooksInterface.cpp",
    "aten/src/ATen/detail/XPUHooksInterface.cpp",
    "aten/src/ATen/detail/MTIAHooksInterface.cpp",
    "aten/src/ATen/detail/IPUHooksInterface.cpp",
@ -1091,6 +1093,7 @@ aten_cpu_non_globed_headers = [
    "aten/src/ATen/detail/HPUHooksInterface.h",
    "aten/src/ATen/detail/MAIAHooksInterface.h",
    "aten/src/ATen/detail/PrivateUse1HooksInterface.h",
+    "aten/src/ATen/detail/XLAHooksInterface.h",
    "aten/src/ATen/detail/XPUHooksInterface.h",
    "aten/src/ATen/detail/MTIAHooksInterface.h",
    "aten/src/ATen/detail/IPUHooksInterface.h",
--- a/c10/core/SymInt.h
+++ b/c10/core/SymInt.h
@ -556,3 +556,26 @@ inline SymBool sym_ge(const SymInt& a, const SymInt& b) {
 }

 } // namespace c10
+
+#include <limits>
+
+namespace std {
+
+// Specialization of std::numeric_limits for c10::SymInt.
+// max() and min() report the bounds of int64_t and return int64_t values
+// (not SymInt). Only the members declared below are provided.
+template <>
+class numeric_limits<c10::SymInt> {
+ public:
+  static constexpr bool is_specialized = true;
+
+  static constexpr int64_t max() noexcept {
+    return std::numeric_limits<int64_t>::max();
+  }
+
+  static constexpr int64_t min() noexcept {
+    return std::numeric_limits<int64_t>::min();
+  }
+
+  static constexpr bool is_signed = true;
+  static constexpr bool is_integer = true;
+};
+
+} // namespace std
--- a/c10/metal/utils.h
+++ b/c10/metal/utils.h
@ -329,17 +329,17 @@ struct pair {
 };

 template <typename T>
-static T conj(T a) {
+inline T conj(T a) {
  return a;
 }

 template <>
-half2 conj(half2 a) {
+inline half2 conj(half2 a) {
  return half2(a.x, -a.y);
 }

 template <>
-float2 conj(float2 a) {
+inline float2 conj(float2 a) {
  return float2(a.x, -a.y);
 }

--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@ -123,6 +123,8 @@ class DeviceCachingAllocator {
  ska::flat_hash_map<xpu::XPUStream, std::deque<std::pair<sycl::event, Block*>>>
      xpu_events;
  DeviceIndex device_index;
+  size_t allowed_memory_maximum = 0;
+  bool set_fraction = false;

  size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
    if (!src || src->allocated || src->event_count > 0 ||
@ -245,6 +247,12 @@ class DeviceCachingAllocator {
    if (isRetry) {
      stats.num_alloc_retries += 1;
    }
+    if (set_fraction &&
+        stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current +
+                size >
+            allowed_memory_maximum) {
+      return false;
+    }
    void* ptr = sycl::aligned_alloc_device(
        kDeviceAlignment,
        size,
@ -435,6 +443,11 @@ class DeviceCachingAllocator {
        device_free =
            raw_device.get_info<sycl::ext::intel::info::device::free_memory>();
      }
+      std::string allowed_info;
+      if (set_fraction) {
+        allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
+      }
+
      auto allocated_bytes =
          stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
              .current;
@ -459,7 +472,9 @@ class DeviceCachingAllocator {
          format_size(device_total),
          " of which ",
          format_size(device_free),
-          " is free. Of the allocated memory ",
+          " is free. ",
+          allowed_info,
+          "Of the allocated memory ",
          format_size(allocated_bytes),
          " is allocated by PyTorch, and ",
          format_size(reserved_bytes - allocated_bytes),
@ -538,6 +553,14 @@ class DeviceCachingAllocator {
      stats.requested_bytes[statType].reset_peak();
    }
  }
+
+  // Sets allowed_memory_maximum to `fraction` of the global memory size that
+  // get_device_properties reports for device_index, and turns on set_fraction.
+  // `fraction` is not range-checked in this method.
+  void setMemoryFraction(double fraction) {
+    c10::xpu::DeviceProp device_prop;
+    c10::xpu::get_device_properties(&device_prop, device_index);
+    auto device_total = device_prop.global_mem_size;
+    allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
+    set_fraction = true;
+  }
 };

 static void local_raw_delete(void* ptr);
@ -700,6 +723,16 @@ class XPUAllocator : public DeviceAllocator {
    assertValidDevice(device);
    device_allocators[device]->resetAccumulatedStats();
  }
+
+  // Checks that `device` is valid and that `fraction` lies in (0, 1]
+  // (TORCH_CHECK_VALUE fails otherwise), then forwards the fraction to the
+  // caching allocator of that device.
+  void setMemoryFraction(double fraction, DeviceIndex device) {
+    assertValidDevice(device);
+    TORCH_CHECK_VALUE(
+        0 < fraction && fraction <= 1,
+        "invalid fraction:",
+        fraction,
+        ". Please set within (0, 1].");
+    device_allocators[device]->setMemoryFraction(fraction);
+  }
 };

 static XPUAllocator allocator;
@ -744,6 +777,10 @@ void recordStream(const DataPtr& dataPtr, XPUStream stream) {
  return allocator.recordStream(dataPtr, stream);
 }

+// Namespace-level entry point; forwards to the file-level `allocator` instance.
+void setMemoryFraction(double fraction, DeviceIndex device) {
+  return allocator.setMemoryFraction(fraction, device);
+}
+
 REGISTER_ALLOCATOR(kXPU, &allocator)

 } // namespace c10::xpu::XPUCachingAllocator
--- a/c10/xpu/XPUCachingAllocator.h
+++ b/c10/xpu/XPUCachingAllocator.h
@ -25,4 +25,6 @@ C10_XPU_API void raw_delete(void* ptr);

 C10_XPU_API void recordStream(const DataPtr& dataPtr, XPUStream stream);

+C10_XPU_API void setMemoryFraction(double fraction, DeviceIndex device);
+
 } // namespace c10::xpu::XPUCachingAllocator
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@ -1358,9 +1358,15 @@ if(BUILD_TEST)
    )
  else()
    add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
+    add_subdirectory(${TORCH_ROOT}/test/cpp/lazy ${CMAKE_BINARY_DIR}/test_lazy)
    # NativeRT is disabled
    # add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
    add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
+    add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_abi_check ${CMAKE_BINARY_DIR}/test_aoti_abi_check)
+    if(BUILD_AOT_INDUCTOR_TEST)
+      add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_inference ${CMAKE_BINARY_DIR}/test_aoti_inference)
+    endif()
+
    if(USE_DISTRIBUTED)
      add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
      if(NOT WIN32)
@ -1378,16 +1384,6 @@ if(BUILD_TEST)
        ${CMAKE_BINARY_DIR}/test_mobile_nnc
      )
    endif()
-    add_subdirectory(${TORCH_ROOT}/test/cpp/lazy
-                     ${CMAKE_BINARY_DIR}/test_lazy)
-  endif()
-  if(BUILD_AOT_INDUCTOR_TEST)
-    add_subdirectory(
-      ${TORCH_ROOT}/test/cpp/aoti_abi_check
-      ${CMAKE_BINARY_DIR}/test_aoti_abi_check)
-    add_subdirectory(
-      ${TORCH_ROOT}/test/cpp/aoti_inference
-      ${CMAKE_BINARY_DIR}/test_aoti_inference)
  endif()
 endif()

--- a/cmake/Modules/FindOpenBLAS.cmake
+++ b/cmake/Modules/FindOpenBLAS.cmake
@ -29,10 +29,15 @@ SET(Open_BLAS_LIB_SEARCH_PATHS
        $ENV{OpenBLAS}/lib
        $ENV{OpenBLAS_HOME}
        $ENV{OpenBLAS_HOME}/lib
- )
+)
+
+SET(Open_BLAS_LIB_NAME openblas)
+IF(DEFINED ENV{OpenBLAS_LIB_NAME})
+  SET(Open_BLAS_LIB_NAME $ENV{OpenBLAS_LIB_NAME})
+ENDIF()

 FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS})
-FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS})
+FIND_LIBRARY(OpenBLAS_LIB NAMES ${Open_BLAS_LIB_NAME} PATHS ${Open_BLAS_LIB_SEARCH_PATHS})

 SET(OpenBLAS_FOUND ON)

--- a/cmake/public/utils.cmake
+++ b/cmake/public/utils.cmake
@ -383,7 +383,7 @@ function(torch_compile_options libname)
      -Wno-strict-aliasing
      )
    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-      list(APPEND private_compile_options -Wredundant-move)
+      list(APPEND private_compile_options -Wredundant-move -Wno-interference-size)
    endif()
    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
      list(APPEND private_compile_options -Wextra-semi -Wmove)
--- a/docs/cpp/source/index.rst
+++ b/docs/cpp/source/index.rst
@ -14,7 +14,7 @@ Combining, these building blocks form a research and
 production ready C++ library for tensor computation and dynamic neural
 networks with strong emphasis on GPU acceleration as well as fast CPU
 performance. It is currently in use at Facebook in research and
-production; we are looking forward to welcome more users of the PyTorch C++ API.
+production; we are looking forward to welcoming more users of the PyTorch C++ API.

 .. warning::

--- a/docs/cpp/source/notes/inference_mode.rst
+++ b/docs/cpp/source/notes/inference_mode.rst
@ -64,7 +64,7 @@ users should pay additional attention to:

  - Both guards affects tensor execution process to skip work not related to inference, but ``InferenceMode``
    also affects tensor creation while ``AutoNonVariableTypeMode`` doesn't. In other words, tensors created
-    inside ``InferenceMode`` are marked as inference tensors so that certain limitation can be applied after
+    inside ``InferenceMode`` are marked as inference tensors so that certain limitations can be applied after
    exiting ``InferenceMode``.
  - Enabled/disabled ``InferenceMode`` states can be nested while ``AutoNonVariableTypeMode`` only allows enabled state.

--- a/docs/source/checkpoint.md
+++ b/docs/source/checkpoint.md
@ -17,7 +17,7 @@ restoring the RNG state during each checkpoint.
 The stashing logic saves and restores the RNG state for CPU and another
 device type (infer the device type from Tensor arguments excluding CPU
 tensors by `_infer_device_type`) to the `run_fn`. If there are multiple
-device, device state will only be saved for devices of a single device type,
+devices, device state will only be saved for devices of a single device type,
 and the remaining devices will be ignored. Consequently, if any checkpointed
 functions involve randomness, this may result in incorrect gradients. (Note
 that if CUDA devices are among the devices detected, it will be prioritized;
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -1066,6 +1066,8 @@ coverage_ignore_functions = [
    "set_current_meta",
    "set_grad_fn_seq_nr",
    "set_stack_trace",
+    "set_current_replay_node",
+    "get_current_replay_node",
    # torch.jit.annotations
    "ann_to_type",
    "check_fn",
--- a/docs/source/distributed.md
+++ b/docs/source/distributed.md
@ -59,14 +59,14 @@ MPI supports CUDA only if the implementation used to build PyTorch supports it.

 ### Backends that come with PyTorch

-PyTorch distributed package supports Linux (stable), MacOS (stable), and Windows (prototype).
+PyTorch distributed package supports Linux (stable), macOS (stable), and Windows (prototype).
 By default for Linux, the Gloo and NCCL backends are built and included in PyTorch
 distributed (NCCL only when building with CUDA). MPI is an optional backend that can only be
 included if you build PyTorch from source. (e.g. building PyTorch on a host that has MPI
 installed.)

 :::{note}
-As of PyTorch v1.8, Windows supports all collective communications backend but NCCL,
+As of PyTorch v1.8, Windows supports all collective communications backends but NCCL.
 If the `init_method` argument of {func}`init_process_group` points to a file it must adhere
 to the following schema:

--- a/docs/source/distributed.tensor.md
+++ b/docs/source/distributed.tensor.md
@ -99,6 +99,12 @@ DTensor supports the following types of {class}`Placement` on each {class}`Devic
  :undoc-members:
 ```

+```{eval-rst}
+.. autoclass:: MaskPartial
+  :members:
+  :undoc-members:
+```
+
 ```{eval-rst}
 .. autoclass:: Placement
  :members:
--- a/docs/source/mtia.md
+++ b/docs/source/mtia.md
@ -1,6 +1,6 @@
 # torch.mtia

-The MTIA backend is implemented out of the tree, only interfaces are be defined here.
+The MTIA backend is implemented out of tree; only the interfaces are defined here.

 ```{eval-rst}
 .. automodule:: torch.mtia
--- a/docs/source/mtia.memory.md
+++ b/docs/source/mtia.memory.md
@ -1,6 +1,6 @@
 # torch.mtia.memory

-The MTIA backend is implemented out of the tree, only interfaces are be defined here.
+The MTIA backend is implemented out of tree; only the interfaces are defined here.

 ```{eval-rst}
 .. automodule:: torch.mtia.memory
--- a/docs/source/notes/serialization.rst
+++ b/docs/source/notes/serialization.rst
@ -263,12 +263,31 @@ offers a comprehensive example of using these features to manipulate a checkpoin
 Starting in version 2.6, ``torch.load`` will use ``weights_only=True`` if the ``pickle_module``
 argument is not passed.

+.. _weights-only-security:
+
+weights_only security
+^^^^^^^^^^^^^^^^^^^^^
+
 As discussed in the documentation for :func:`torch.load`, ``weights_only=True`` restricts
 the unpickler used in ``torch.load`` to only executing functions/building classes required for
 ``state_dicts`` of plain ``torch.Tensors`` as well as some other primitive types. Further,
 unlike the default ``Unpickler`` provided by the ``pickle`` module, the ``weights_only`` Unpickler
 is not allowed to dynamically import anything during unpickling.

+``weights_only=True`` narrows the surface of remote code execution attacks but has the following limitations:
+
+1. ``weights_only=True`` does not guard against denial of service attacks.
+2. We try to prevent memory corruptions during ``torch.load(weights_only=True)`` but they might still be possible.
+
+Note that even if memory corruption does not occur during ``torch.load`` itself, loading can still create
+unexpected objects for downstream code, which can in turn lead to memory corruption (e.g. a Tensor of
+indices and values that user code turns into a sparse Tensor might read/write out of bounds).
+
+.. _weights-only-allowlist:
+
+weights_only allowlist
+^^^^^^^^^^^^^^^^^^^^^^
+
 As mentioned above, saving a module's ``state_dict`` is a best practice when using ``torch.save``. If loading an old
 checkpoint that contains an ``nn.Module``, we recommend ``weights_only=False``. When loading a checkpoint that contains
 tensor subclasses, there will likely be functions/classes that need to be allowlisted, see below for further details.
--- a/docs/source/xpu.md
+++ b/docs/source/xpu.md
@ -85,6 +85,7 @@
     memory_stats_as_nested_dict
     reset_accumulated_memory_stats
     reset_peak_memory_stats
+     set_per_process_memory_fraction
 ```

 ```{eval-rst}
--- a/test/conftest.py
+++ b/test/conftest.py
@ -238,7 +238,7 @@ def pytest_pycollect_makemodule(module_path, path, parent) -> Module:

@pytest.hookimpl(hookwrapper=True)
 def pytest_report_teststatus(report, config):
-    # Add the test time to the verbose output, unforunately I don't think this
+    # Add the test time to the verbose output, unfortunately I don't think this
    # includes setup or teardown
    pluggy_result = yield
    if not isinstance(report, pytest.TestReport):
--- a/test/cpp/aoti_abi_check/CMakeLists.txt
+++ b/test/cpp/aoti_abi_check/CMakeLists.txt
@ -1,3 +1,8 @@
+# Skip on windows
+if(WIN32)
+  return()
+endif()
+
 set(AOTI_ABI_CHECK_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_abi_check)

 # Build the cpp gtest binary containing the cpp-only tests.
@ -30,8 +35,15 @@ target_compile_definitions(test_aoti_abi_check PRIVATE USE_GTEST)

 # WARNING: DO NOT LINK torch!!!
 # The purpose is to check if the used aten/c10 headers are written in a header-only way
-target_link_libraries(test_aoti_abi_check PRIVATE gtest_main)
+target_link_libraries(test_aoti_abi_check PRIVATE gtest_main sleef)
 target_include_directories(test_aoti_abi_check PRIVATE ${ATen_CPU_INCLUDE})
+if(NOT USE_SYSTEM_SLEEF)
+  target_include_directories(test_aoti_abi_check PRIVATE ${CMAKE_BINARY_DIR}/include)
+endif()
+
+# Disable unused-variable warnings for variables that are only used to test compilation
+target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-variable)
+target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-but-set-variable)

 foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
  foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
@ -41,12 +53,17 @@ foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
    separate_arguments(FLAGS UNIX_COMMAND "${FLAGS}")
    add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")

-    target_link_libraries(${test_name}_${CPU_CAPABILITY} PRIVATE gtest_main)
+    target_link_libraries(${test_name}_${CPU_CAPABILITY} PRIVATE gtest_main sleef)
    target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE ${ATen_CPU_INCLUDE})
+    if(NOT USE_SYSTEM_SLEEF)
+      target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE ${CMAKE_BINARY_DIR}/include)
+    endif()

    # Define CPU_CAPABILITY and CPU_CAPABILITY_XXX macros for conditional compilation
    target_compile_definitions(${test_name}_${CPU_CAPABILITY} PRIVATE CPU_CAPABILITY=${CPU_CAPABILITY} CPU_CAPABILITY_${CPU_CAPABILITY})
    target_compile_options(${test_name}_${CPU_CAPABILITY} PRIVATE ${FLAGS})
+    target_compile_options_if_supported(${test_name}_${CPU_CAPABILITY} -Wno-unused-variable)
+    target_compile_options_if_supported(${test_name}_${CPU_CAPABILITY} -Wno-unused-but-set-variable)
  endforeach()
 endforeach()

--- a/test/cpp/aoti_abi_check/test_vec.cpp
+++ b/test/cpp/aoti_abi_check/test_vec.cpp
@ -2,10 +2,27 @@

 #include <ATen/cpu/vec/vec.h>

-#include <iostream>
 namespace torch {
 namespace aot_inductor {

+// Asserts lane-by-lane equality of two Vectorized<T> values.
+// Both operands are first spilled into std::vector buffers with store(),
+// because at::vec::Vectorized doesn't support operator[] on aarch64.
+template <typename T>
+void ExpectVecEqual(
+    const at::vec::Vectorized<T>& expected,
+    const at::vec::Vectorized<T>& actual) {
+  using Vec = at::vec::Vectorized<T>;
+  const auto lane_count = Vec::size();
+
+  std::vector<T> expected_lanes(lane_count);
+  expected.store(expected_lanes.data());
+
+  std::vector<T> actual_lanes(lane_count);
+  actual.store(actual_lanes.data());
+
+  for (int lane = 0; lane < lane_count; ++lane) {
+    EXPECT_EQ(expected_lanes[lane], actual_lanes[lane]);
+  }
+}
+
 TEST(TestVec, TestAdd) {
  using Vec = at::vec::Vectorized<int>;
  std::vector<int> a(1024, 1);
@ -16,9 +33,7 @@ TEST(TestVec, TestAdd) {
  std::vector<int> expected(1024, 3);
  Vec expected_vec = Vec::loadu(expected.data());

-  for (int i = 0; i < Vec::size(); i++) {
-    EXPECT_EQ(expected_vec[i], actual_vec[i]);
-  }
+  ExpectVecEqual(expected_vec, actual_vec);
 }

 TEST(TestVec, TestMax) {
@ -30,9 +45,7 @@ TEST(TestVec, TestMax) {
  Vec actual_vec = at::vec::maximum(a_vec, b_vec);
  Vec expected_vec = b_vec;

-  for (int i = 0; i < Vec::size(); i++) {
-    EXPECT_EQ(expected_vec[i], actual_vec[i]);
-  }
+  ExpectVecEqual(expected_vec, actual_vec);
 }

 TEST(TestVec, TestMin) {
@ -44,9 +57,7 @@ TEST(TestVec, TestMin) {
  Vec actual_vec = at::vec::minimum(a_vec, b_vec);
  Vec expected_vec = a_vec;

-  for (int i = 0; i < Vec::size(); i++) {
-    EXPECT_EQ(expected_vec[i], actual_vec[i]);
-  }
+  ExpectVecEqual(expected_vec, actual_vec);
 }

 TEST(TestVec, TestConvert) {
@ -58,9 +69,7 @@ TEST(TestVec, TestConvert) {
  auto actual_vec = at::vec::convert<float>(a_vec);
  auto expected_vec = b_vec;

-  for (int i = 0; i < at::vec::Vectorized<int>::size(); i++) {
-    EXPECT_EQ(expected_vec[i], actual_vec[i]);
-  }
+  ExpectVecEqual(expected_vec, actual_vec);
 }

 TEST(TestVec, TestClampMin) {
@ -72,9 +81,7 @@ TEST(TestVec, TestClampMin) {
  Vec actual_vec = at::vec::clamp_min(a_vec, min_vec);
  Vec expected_vec = min_vec;

-  for (int i = 0; i < Vec::size(); i++) {
-    EXPECT_EQ(expected_vec[i], actual_vec[i]);
-  }
+  ExpectVecEqual(expected_vec, actual_vec);
 }

 } // namespace aot_inductor
--- a/test/cpp/aoti_inference/CMakeLists.txt
+++ b/test/cpp/aoti_inference/CMakeLists.txt
@ -1,4 +1,3 @@
-
 set(AOT_INDUCTOR_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_inference)

 # Build custom TorchScript op for AOTInductor
@ -8,27 +7,12 @@ set_target_properties(aoti_custom_class PROPERTIES
 if(USE_CUDA)
  target_compile_definitions(aoti_custom_class PRIVATE USE_CUDA)
 elseif(USE_ROCM)
-    target_compile_definitions(aoti_custom_class PRIVATE USE_ROCM)
+  target_compile_definitions(aoti_custom_class PRIVATE USE_ROCM)
 endif()
+
 # Link against LibTorch
 target_link_libraries(aoti_custom_class torch)

-# the custom command that generates the TorchScript module
-add_custom_command(
-    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/script_data.pt
-           ${CMAKE_CURRENT_BINARY_DIR}/script_model_cpu.pt
-           ${CMAKE_CURRENT_BINARY_DIR}/script_model_cuda.pt
-    # This script requires the torch package to be installed.
-    COMMAND python ${AOT_INDUCTOR_TEST_ROOT}/compile_model.py
-    DEPENDS torch torch_python aoti_custom_class ${AOT_INDUCTOR_TEST_ROOT}/compile_model.py
-)
-add_custom_target(aoti_script_model ALL
-    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_data.pt
-    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_model_cpu.pt
-    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_model_cuda.pt
-)
-add_dependencies(aoti_script_model aoti_custom_class)
-
 # Build the cpp gtest binary containing the cpp-only tests.
 set(INDUCTOR_TEST_SRCS
  ${AOT_INDUCTOR_TEST_ROOT}/test.cpp
@ -37,23 +21,12 @@ set(INDUCTOR_TEST_SRCS
 add_executable(test_aoti_inference
  ${TORCH_ROOT}/test/cpp/common/main.cpp
  ${INDUCTOR_TEST_SRCS}
-  data.pt
-  script_data.pt
-  script_model_cpu.pt
-  script_model_cuda.pt
 )
-add_dependencies(test_aoti_inference aoti_custom_class aoti_script_model)
+add_dependencies(test_aoti_inference aoti_custom_class)

 # TODO temporary until we can delete the old gtest polyfills.
 target_compile_definitions(test_aoti_inference PRIVATE USE_GTEST)

-# Define a custom command to generate the library
-add_custom_command(
-        OUTPUT data.pt
-        COMMAND python ${AOT_INDUCTOR_TEST_ROOT}/test.py
-        DEPENDS ${AOT_INDUCTOR_TEST_ROOT}/test.py
-)
-
 target_link_libraries(test_aoti_inference PRIVATE
  torch
  gtest_main
@ -71,6 +44,10 @@ target_compile_definitions(test_aoti_inference PRIVATE
    CMAKE_CURRENT_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}
 )

+target_compile_options_if_supported(test_aoti_inference -Wno-unused-variable)
+target_compile_options_if_supported(test_aoti_inference -Wno-unused-but-set-variable)
+target_compile_options_if_supported(test_aoti_inference -Wno-unused-function)
+
 if(INSTALL_TEST)
  install(TARGETS test_aoti_inference DESTINATION bin)
  # Install PDB files for MSVC builds
--- a/test/cpp/aoti_inference/test.cpp
+++ b/test/cpp/aoti_inference/test.cpp
@ -2,7 +2,9 @@
 #include <gtest/gtest.h>
 #include <atomic>
 #include <condition_variable>
+#include <cstdlib>
 #include <filesystem>
+#include <fstream>
 #include <functional>
 #include <mutex>
 #include <queue>
@ -28,6 +30,64 @@

 namespace {

+// Returns true only when every expected test artifact (data.pt and the
+// script_*.pt files) can be opened for reading under
+// CMAKE_CURRENT_BINARY_DIR. File contents are not inspected.
+bool testDataFilesExist() {
+  const std::string artifact_dir = STRINGIZE(CMAKE_CURRENT_BINARY_DIR);
+  const std::array<std::string, 4> artifacts = {
+      "data.pt",
+      "script_data.pt",
+      "script_model_cpu.pt",
+      "script_model_cuda.pt"};
+
+  bool all_present = true;
+  for (const auto& artifact : artifacts) {
+    std::ifstream probe(artifact_dir + "/" + artifact);
+    if (!probe.good()) {
+      // One unreadable artifact is enough to report failure.
+      all_present = false;
+      break;
+    }
+  }
+  return all_present;
+}
+
+// Generates the .pt test artifacts at runtime when any of them is missing.
+//
+// The work is wrapped in std::call_once, so the generation scripts are
+// launched at most once per process no matter how often this is called.
+// A failing script is only reported as a warning on stderr; nothing is
+// thrown and no status is returned to the caller.
+void ensureTestDataGenerated() {
+  static std::once_flag generated_flag;
+  std::call_once(generated_flag, []() {
+    // Nothing to do when every artifact can already be opened.
+    if (testDataFilesExist()) {
+      return;
+    }
+
+    const std::string bindir = STRINGIZE(CMAKE_CURRENT_BINARY_DIR);
+
+    // Walk two levels up from the binary dir to reach the source root:
+    // build/test_aoti_inference -> build -> pytorch
+    const std::string pytorch_root =
+        std::filesystem::path(bindir).parent_path().parent_path().string();
+    const std::string source_dir = pytorch_root + "/test/cpp/aoti_inference";
+
+    // Runs `python <script>` from inside bindir so the outputs land there.
+    // Both paths are double-quoted so directories containing spaces survive
+    // the shell's word splitting.
+    const auto run_script = [&bindir](
+                                const std::string& script,
+                                const char* progress_label,
+                                const char* failure_label) {
+      const std::string cmd =
+          "cd \"" + bindir + "\" && python \"" + script + "\"";
+      std::cout << progress_label << cmd << std::endl;
+      const int status = std::system(cmd.c_str());
+      if (status != 0) {
+        std::cerr << failure_label << status << std::endl;
+      }
+    };
+
+    // Generate test data files (data.pt, etc.) by running test.py directly
+    run_script(
+        source_dir + "/test.py",
+        "Generating test data: ",
+        "Warning: Test data generation failed with code ");
+
+    // Generate model files (script_*.pt) by running compile_model.py directly.
+    // This still runs if the previous step failed (best effort).
+    run_script(
+        source_dir + "/compile_model.py",
+        "Generating model files: ",
+        "Warning: Model generation failed with code ");
+  });
+}
+
 const std::unordered_map<std::string, at::Tensor> derefTensorConstantMap(
    torch::inductor::TensorConstantMap tensor_constant_map) {
  std::unordered_map<std::string, at::Tensor> ret;
@ -855,7 +915,6 @@ void test_aoti_free_buffer(bool use_runtime_constant_folding) {
  }
 }

-#if defined(USE_CUDA) || defined(USE_ROCM)
 void test_cuda_alloc_test() {
  torch::NoGradGuard no_grad;

@ -895,8 +954,8 @@ void test_cuda_alloc_test() {
      runner->run(data_loader.attr(inputs_attr.c_str()).toTensorList().vec());
  ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0]));
 }
-#endif

+#ifdef USE_CUDA
 class ThreadPool {
 private:
  struct Task {
@ -1037,86 +1096,96 @@ void test_multi_cuda_streams(const std::string& device) {
    ASSERT_TRUE(torch::allclose(ref_output_tensors[0], all_outputs[i][0]));
  }
 }
-#endif
+#endif // USE_CUDA
+#endif // USE_CUDA || USE_ROCM
 } // namespace

 namespace torch::aot_inductor {

-TEST(AotInductorTest, BasicTestCpu) {
+// gtest fixture shared by every AotInductorTest case. Its suite-level setup
+// hook calls ensureTestDataGenerated() so the test data is generated once
+// for all tests.
+struct AotInductorTest : public ::testing::Test {
+  // Suite-level hook: runs once before all tests in this test suite.
+  static void SetUpTestSuite() {
+    ensureTestDataGenerated();
+  }
+};
+
+TEST_F(AotInductorTest, BasicTestCpu) {
  test_aoti("cpu", false);
 }

-TEST(AotInductorTest, BasicScriptTestCpu) {
+TEST_F(AotInductorTest, BasicScriptTestCpu) {
  test_aoti_script("cpu");
 }

-TEST(AotInductorTest, BasicPackageLoaderTestCpu) {
+TEST_F(AotInductorTest, BasicPackageLoaderTestCpu) {
  test_aoti_package_loader("cpu", false);
 }

-TEST(AotInductorTest, ExtractConstantsMapCpu) {
+TEST_F(AotInductorTest, ExtractConstantsMapCpu) {
  test_aoti_extract_constants_map("cpu");
 }

 #ifdef USE_CUDA
-TEST(AotInductorTest, BasicTestCuda) {
+TEST_F(AotInductorTest, BasicTestCuda) {
  test_aoti("cuda", true);
  test_aoti("cuda", false);
 }

-TEST(AotInductorTest, BasicScriptTestCuda) {
+TEST_F(AotInductorTest, BasicScriptTestCuda) {
  test_aoti_script("cuda");
 }

-TEST(AotInductorTest, BasicPackageLoaderTestCuda) {
+TEST_F(AotInductorTest, BasicPackageLoaderTestCuda) {
  test_aoti_package_loader("cuda", false);
 }

-TEST(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) {
+TEST_F(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) {
  test_aoti_package_loader_multi_gpu("cuda", false);
 }

-TEST(AotInductorTest, UpdateUserManagedConstantsCuda) {
+TEST_F(AotInductorTest, UpdateUserManagedConstantsCuda) {
  test_aoti_user_managed_buffer();
 }

-TEST(AotInductorTest, RuntimeUpdateConstantsCuda) {
+TEST_F(AotInductorTest, RuntimeUpdateConstantsCuda) {
  test_aoti_constants_update("cuda", true);
 }

-TEST(AotInductorTest, UpdateConstantsCuda) {
+TEST_F(AotInductorTest, UpdateConstantsCuda) {
  test_aoti_constants_update("cuda", false);
 }

-TEST(AotInductorTest, ExtractConstantsMapCuda) {
+TEST_F(AotInductorTest, ExtractConstantsMapCuda) {
  test_aoti_extract_constants_map("cuda");
 }

-TEST(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) {
+TEST_F(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) {
  test_aoti_double_buffering("cuda", true);
 }

-TEST(AotInductorTest, UpdateInactiveConstantsCuda) {
+TEST_F(AotInductorTest, UpdateInactiveConstantsCuda) {
  test_aoti_double_buffering("cuda", false);
 }

-TEST(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
+TEST_F(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
  test_aoti_double_buffering_with_tensor_constants();
 }

-TEST(AotInductorTest, FreeInactiveConstantBufferCuda) {
+TEST_F(AotInductorTest, FreeInactiveConstantBufferCuda) {
  test_aoti_free_buffer(false);
 }

-TEST(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) {
+TEST_F(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) {
  test_aoti_free_buffer(true);
 }

-TEST(AotInductorTest, MultiStreamTestCuda) {
+TEST_F(AotInductorTest, MultiStreamTestCuda) {
  test_multi_cuda_streams("cuda");
 }

-TEST(AotInductorTest, CudaAllocTestCuda) {
+TEST_F(AotInductorTest, CudaAllocTestCuda) {
  test_cuda_alloc_test();
 }
 #endif
--- a/test/cpp/api/autograd.cpp
+++ b/test/cpp/api/autograd.cpp
@ -584,7 +584,7 @@ TEST(CustomAutogradTest, MarkDirty) {
    }
  };

-  // Clone here because modifying leafs inplace is not allowed
+  // Clone here because modifying leaves inplace is not allowed
  auto x = torch::randn({5, 5}, torch::requires_grad()).clone();
  auto version_before = x._version();
  auto out = MyFunction::apply(x);
--- a/test/cpp/api/parallel.cpp
+++ b/test/cpp/api/parallel.cpp
@ -264,7 +264,7 @@ TEST_F(ParallelTest, DataParallelNumericalEquivalence_MultiCUDA) {
    input += i;
    input_dp += i;

-    // non-prallel training
+    // non-parallel training
    torch::optim::SGD optim(model->parameters(), torch::optim::SGDOptions(0.1));
    auto output = model->forward(input);
    auto loss = torch::mse_loss(output, torch::zeros_like(output));
--- a/test/cpp_extensions/open_registration_extension/torch_openreg/README.md
+++ b/test/cpp_extensions/open_registration_extension/torch_openreg/README.md
@ -149,8 +149,8 @@ When `import torch`, installed accelerators (such as `torch_openreg`) will be au
 ### Installation

 ```python
-pip3 install --no-build-isolation -e . # for develop
-pip3 install --no-build-isolation . # for install
+python -m pip install --no-build-isolation -e . # for develop
+python -m pip install --no-build-isolation . # for install
 ```

 ### Usage Example
@ -188,7 +188,7 @@ Please refer to [this](https://docs.pytorch.org/docs/main/accelerator/index.html
  - Device-agnostic APIs
  - Memory Management
  - Generator
-  - Distrubuted
+  - Distributed
  - Custom Tensor&Storage
  - ...
 - **Improve Tests**: Add more test cases related to the integration mechanism.
--- a/test/cpp_extensions/open_registration_extension/torch_openreg/tests/test_autocast.py
+++ b/test/cpp_extensions/open_registration_extension/torch_openreg/tests/test_autocast.py
@ -8,7 +8,8 @@ class TestAutocast(TestCase):
    def test_autocast_with_unsupported_type(self):
        with self.assertWarnsRegex(
            UserWarning,
-            "In openreg autocast, but the target dtype torch.float32 is not supported.",
+            "In openreg autocast, but the target dtype is not supported. Disabling autocast.\n"
+            "openreg Autocast only supports dtypes of torch.float16, torch.bfloat16 currently.",
        ):
            with torch.autocast(device_type="openreg", dtype=torch.float32):
                _ = torch.ones(10)
--- a/test/cpp_extensions/open_registration_extension/torch_openreg/tests/test_streams.py
+++ b/test/cpp_extensions/open_registration_extension/torch_openreg/tests/test_streams.py
@ -5,6 +5,7 @@ from torch.testing._internal.common_utils import run_tests, skipIfTorchDynamo, T


 class TestStream(TestCase):
+    @skipIfTorchDynamo()
    def test_stream_create(self):
        stream = torch.Stream(device="openreg")
        self.assertEqual(stream.device_index, torch.openreg.current_device())
@ -24,6 +25,7 @@ class TestStream(TestCase):
        )
        self.assertEqual(stream, stream1)

+    @skipIfTorchDynamo()
    def test_stream_context(self):
        with torch.Stream(device="openreg:1") as stream:
            self.assertEqual(torch.accelerator.current_stream(), stream)
@ -40,6 +42,7 @@ class TestStream(TestCase):
        current_stream = torch.accelerator.current_stream()
        self.assertEqual(current_stream, stream2)

+    @skipIfTorchDynamo()
    def test_stream_synchronize(self):
        stream = torch.Stream(device="openreg:1")
        self.assertEqual(True, stream.query())
@ -49,12 +52,14 @@ class TestStream(TestCase):
        stream.synchronize()
        self.assertEqual(True, stream.query())

+    @skipIfTorchDynamo()
    def test_stream_repr(self):
        stream = torch.Stream(device="openreg:1")
        self.assertTrue(
            "torch.Stream device_type=openreg, device_index=1" in repr(stream)
        )

+    @skipIfTorchDynamo()
    def test_stream_wait_stream(self):
        stream_1 = torch.Stream(device="openreg:0")
        stream_2 = torch.Stream(device="openreg:1")
--- a/test/distributed/_composable/test_composability/test_2d_composability.py
+++ b/test/distributed/_composable/test_composability/test_2d_composability.py
@ -218,7 +218,7 @@ class TestFullyShard2DTraining(FSDPTest):

        torch.manual_seed(42 + global_mesh.get_local_rank("dp"))
        inp = torch.randint(0, model_args.vocab_size, (2, 16), device=device_type)
-        for iter_idx in range(5):
+        for _ in range(5):
            ref_loss = ref_model(inp).sum()
            loss = model(inp).sum()
            self.assertEqual(ref_loss, loss)
@ -238,9 +238,7 @@ class TestFullyShard2DTraining(FSDPTest):
            # runs its reduce-scatter
            self.assertIsInstance(model.pos_embeddings.weight.placements[1], Shard)
            self.assertIsInstance(model.pos_embeddings.weight.grad.placements[1], Shard)
-            for ref_param, (param_name, param) in zip(
-                ref_model.parameters(), model.named_parameters()
-            ):
+            for ref_param, param in zip(ref_model.parameters(), model.parameters()):
                full_grad = param.grad.full_tensor()
                self.assertEqual(ref_param.grad, full_grad)

--- a/test/distributed/_composable/test_composability/test_pp_composability.py
+++ b/test/distributed/_composable/test_composability/test_pp_composability.py
@ -101,14 +101,14 @@ class ComposabilityTest(MultiProcessTestCase):

    @property
    def world_size(self):
-        return 4
+        return 8

    @property
    def device(self):
        return self.rank

    @requires_accelerator_dist_backend(["nccl", "xccl"])
-    @skip_if_lt_x_gpu(4)
+    @skip_if_lt_x_gpu(8)
    @skip_but_pass_in_sandcastle_if(
        not TEST_MULTIGPU and not TEST_XPU, "Test requires 4+ GPUs"
    )
@ -169,8 +169,8 @@ class ComposabilityTest(MultiProcessTestCase):
            {f"{i}": MLPModule(dim) for i in range(total_layers)}
        )
        # Calculate start and end indices based on rank
-        start_index = self.rank * 2
-        end_index = start_index + 2
+        start_index = self.rank
+        end_index = start_index + 1
        pp_model = PPModelChunk(full_model, start_index, end_index)

        pp_model.to(self.device)
@ -224,7 +224,6 @@ class ComposabilityTest(MultiProcessTestCase):
        ],
    )
    def test_3d_with_tp_dp_pp(self, ScheduleClass, MixedPrecisionParam):
-        _device_raii = torch.device(device_type, self.device)
        torch.accelerator.set_device_index(self.device)
        store = torch.distributed.FileStore(self.file_name, self.world_size)
        torch.distributed.init_process_group(
@ -286,56 +285,44 @@ class ComposabilityTest(MultiProcessTestCase):
                parallelize_module(layer, tp_mesh, parallelize_plan)
            return model

-        # Attach to a schedule
        if issubclass(ScheduleClass, PipelineScheduleSingle):
-            stage_idx = pp_group.rank()
-            partial_model = nn.Sequential(
-                *full_model[stage_idx * 2 : stage_idx * 2 + 2]
-            )
-            partial_model.to(self.device)
+            n_virtual = 1
+        else:
+            n_virtual = 2

+        num_stages = pp_group.size() * n_virtual
+        layers_per_stage = total_layers // num_stages
+        stages = []
+        for i in range(n_virtual):
+            stage_idx = pp_group.rank() + pp_group.size() * i
+            start_layer = stage_idx * layers_per_stage
+            end_layer = start_layer + layers_per_stage
+            # divide the model layers by the number of stages
+            partial_model = nn.Sequential(*full_model[start_layer:end_layer])
+            partial_model.to(self.device)
            tp_model = apply_tp(partial_model, tp_mesh)
            dp_model = apply_fsdp(tp_model)
-            pipeline_stage = PipelineStage(
+
+            stage = PipelineStage(
                dp_model,
                stage_idx,
-                pp_group.size(),
+                num_stages,
                self.device,
                group=pp_group,
            )
-            partial_models = [pipeline_stage.submod]
-            pipeline_schedule = ScheduleClass(
-                pipeline_stage,
-                n_microbatches=num_microbatches,
-                loss_fn=loss_fn,
-            )
-        else:
-            n_virtual = 2
-            num_stages = pp_group.size() * n_virtual
-            stages = []
-            for i in range(n_virtual):
-                stage_idx = pp_group.rank() + n_virtual * i
-                # divide the model layers by the number of stages
-                partial_model = nn.Sequential(*full_model[stage_idx : stage_idx + 1])
-                partial_model.to(self.device)

-                tp_model = apply_tp(partial_model, tp_mesh)
-                dp_model = apply_fsdp(tp_model)
-                stage = PipelineStage(
-                    dp_model,
-                    stage_idx,
-                    num_stages,
-                    self.device,
-                    group=pp_group,
-                )
+            stages.append(stage)
+            partial_models = [pipeline_stage.submod for pipeline_stage in stages]

-                stages.append(stage)
-                partial_models = [pipeline_stage.submod for pipeline_stage in stages]
-            pipeline_schedule = ScheduleClass(
-                stages,
-                n_microbatches=num_microbatches,
-                loss_fn=loss_fn,
-            )
+        if issubclass(ScheduleClass, PipelineScheduleSingle):
+            stages = stages[0]
+
+        pipeline_schedule = ScheduleClass(
+            stages,
+            n_microbatches=num_microbatches,
+            loss_fn=loss_fn,
+            scale_grads=False,
+        )

        optimizer_kwargs = {
            "lr": 0.01,
@ -349,7 +336,7 @@ class ComposabilityTest(MultiProcessTestCase):
            for model in partial_models
        ]

-        for train_step in range(5):
+        for _train_step in range(5):
            for optimizer in optimizers:
                optimizer.zero_grad()
            inputs = torch.rand((num_microbatches, dim), device=self.device)
@ -369,7 +356,7 @@ class ComposabilityTest(MultiProcessTestCase):
        torch.distributed.destroy_process_group()

    @requires_accelerator_dist_backend(["nccl", "xccl"])
-    @skip_if_lt_x_gpu(4)
+    @skip_if_lt_x_gpu(8)
    @skip_but_pass_in_sandcastle_if(
        not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
    )
@ -447,109 +434,71 @@ class ComposabilityTest(MultiProcessTestCase):
                partial_model = partial_model.to(dtype=MixedPrecisionParam)
            return partial_model

-        # Attach to a schedule
        if issubclass(ScheduleClass, PipelineScheduleSingle):
-            stage_idx = pp_group.rank()
-            partial_model = nn.Sequential(
-                *full_model[stage_idx * 2 : stage_idx * 2 + 2]
-            )
-            partial_model.to(self.device)
-
-            dp_model = apply_replicate(partial_model)
-            pipeline_stage = PipelineStage(
-                dp_model,
-                stage_idx,
-                pp_group.size(),
-                self.device,
-                group=pp_group,
-            )
-            partial_models = [pipeline_stage.submod]
-            pipeline_schedule = ScheduleClass(
-                pipeline_stage,
-                n_microbatches=num_microbatches,
-                loss_fn=loss_fn,
-                scale_grads=False,
-            )
-
-            ref_partial_model = nn.Sequential(
-                *ref_full_model[stage_idx * 2 : stage_idx * 2 + 2]
-            )
-            ref_partial_model.to(self.device)
-            ref_partial_model = apply_same_precision(
-                ref_partial_model
-            )  # Apply same precision
-
-            ref_pipeline_stage = PipelineStage(
-                ref_partial_model,
-                stage_idx,
-                pp_group.size(),
-                self.device,
-                group=pp_group,
-            )
-            ref_partial_models = [ref_pipeline_stage.submod]
-            ref_pipeline_schedule = ScheduleClass(
-                ref_pipeline_stage,
-                n_microbatches=num_microbatches,
-                loss_fn=loss_fn,
-                scale_grads=False,
-            )
+            n_virtual = 1
        else:
            n_virtual = 2
-            num_stages = pp_group.size() * n_virtual
-            stages = []
-            ref_stages = []
-            for i in range(n_virtual):
-                stage_idx = pp_group.rank() + n_virtual * i
-                # divide the model layers by the number of stages
-                partial_model = nn.Sequential(*full_model[stage_idx : stage_idx + 1])
-                partial_model.to(self.device)

-                dp_model = apply_replicate(partial_model)
-                stage = PipelineStage(
-                    dp_model,
-                    stage_idx,
-                    num_stages,
-                    self.device,
-                    group=pp_group,
-                )
+        num_stages = pp_group.size() * n_virtual
+        layers_per_stage = total_layers // num_stages
+        stages = []
+        ref_stages = []
+        for i in range(n_virtual):
+            stage_idx = pp_group.rank() + pp_group.size() * i
+            start_layer = stage_idx * layers_per_stage
+            end_layer = start_layer + layers_per_stage
+            # divide the model layers by the number of stages
+            partial_model = nn.Sequential(*full_model[start_layer:end_layer])
+            partial_model.to(self.device)

-                stages.append(stage)
-                partial_models = [pipeline_stage.submod for pipeline_stage in stages]
+            ref_partial_model = nn.Sequential(*ref_full_model[start_layer:end_layer])
+            ref_partial_model.to(self.device)

-                ref_partial_model = nn.Sequential(
-                    *ref_full_model[stage_idx : stage_idx + 1]
-                )
-                ref_partial_model.to(self.device)
-                ref_partial_model = apply_same_precision(
-                    ref_partial_model
-                )  # Apply same precision
+            dp_model = apply_replicate(partial_model)
+            ref_dp_model = apply_same_precision(ref_partial_model)

-                ref_stage = PipelineStage(
-                    ref_partial_model,
-                    stage_idx,
-                    num_stages,
-                    self.device,
-                    group=pp_group,
-                )
-
-                ref_stages.append(ref_stage)
-                ref_partial_models = [
-                    pipeline_stage.submod for pipeline_stage in ref_stages
-                ]
-            pipeline_schedule = ScheduleClass(
-                stages,
-                n_microbatches=num_microbatches,
-                loss_fn=loss_fn,
-                scale_grads=False,
+            stage = PipelineStage(
+                dp_model,
+                stage_idx,
+                num_stages,
+                self.device,
+                group=pp_group,
            )

-            ref_pipeline_schedule = ScheduleClass(
-                ref_stages,
-                n_microbatches=num_microbatches,
-                loss_fn=loss_fn,
-                scale_grads=False,
+            ref_stage = PipelineStage(
+                ref_dp_model,
+                stage_idx,
+                num_stages,
+                self.device,
+                group=pp_group,
            )

+            stages.append(stage)
+            ref_stages.append(ref_stage)
+
+            partial_models = [pipeline_stage.submod for pipeline_stage in stages]
+            ref_partial_models = [
+                pipeline_stage.submod for pipeline_stage in ref_stages
+            ]
+
+        if issubclass(ScheduleClass, PipelineScheduleSingle):
+            stages = stages[0]
+            ref_stages = ref_stages[0]
+
+        pipeline_schedule = ScheduleClass(
+            stages,
+            n_microbatches=num_microbatches,
+            loss_fn=loss_fn,
+            scale_grads=False,
+        )
+
+        ref_pipeline_schedule = ScheduleClass(
+            ref_stages,
+            n_microbatches=num_microbatches,
+            loss_fn=loss_fn,
+            scale_grads=False,
+        )
+
        optimizer_kwargs = {
            "lr": 0.01,
            "betas": (0.9, 0.95),
@ -568,7 +517,7 @@ class ComposabilityTest(MultiProcessTestCase):
            for model in ref_partial_models
        ]

-        for train_step in range(5):
+        for _train_step in range(5):
            for optimizer in optimizers:
                optimizer.zero_grad()
            for ref_optimizer in ref_optimizers:
@ -604,7 +553,7 @@ class ComposabilityTest(MultiProcessTestCase):
        torch.distributed.destroy_process_group()

    @requires_accelerator_dist_backend(["nccl", "xccl"])
-    @skip_if_lt_x_gpu(4)
+    @skip_if_lt_x_gpu(8)
    @skip_but_pass_in_sandcastle_if(
        not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
    )
@ -736,67 +685,44 @@ class ComposabilityTest(MultiProcessTestCase):

        pipeline_model_parameter_dict = {}

-        # Attach to a schedule
        if issubclass(ScheduleClass, PipelineScheduleSingle):
-            stage_idx = pp_group.rank()
-            # Calculate layers per stage correctly
-            layers_per_stage = total_layers // pp_group.size()  # 8 // 2 = 4
+            n_virtual = 1
+        else:
+            n_virtual = 2
+
+        num_stages = pp_group.size() * n_virtual
+        layers_per_stage = total_layers // num_stages
+        stages = []
+        for i in range(n_virtual):
+            stage_idx = pp_group.rank() + pp_group.size() * i
            start_layer = stage_idx * layers_per_stage
            end_layer = start_layer + layers_per_stage
-
+            # divide the model layers by the number of stages
            partial_model = nn.Sequential(*full_model[start_layer:end_layer])
            partial_model.to(self.device)

            dp_model = apply_replicate(partial_model)
            pipelined_models_parameters(start_layer, dp_model)
-
-            pipeline_stage = PipelineStage(
+            stage = PipelineStage(
                dp_model,
                stage_idx,
-                pp_group.size(),
+                num_stages,
                self.device,
                group=pp_group,
            )
-            partial_models = [pipeline_stage.submod]
-            pipeline_schedule = ScheduleClass(
-                pipeline_stage,
-                n_microbatches=num_microbatches,
-                loss_fn=loss_fn,
-                scale_grads=False,
-            )

-        else:
-            n_virtual = 2
-            num_stages = pp_group.size() * n_virtual
-            layers_per_stage = total_layers // num_stages
-            stages = []
-            for i in range(n_virtual):
-                stage_idx = pp_group.rank() + pp_group.size() * i
-                start_layer = stage_idx * layers_per_stage
-                end_layer = start_layer + layers_per_stage
-                # divide the model layers by the number of stages
-                partial_model = nn.Sequential(*full_model[start_layer:end_layer])
-                partial_model.to(self.device)
+            stages.append(stage)
+            partial_models = [pipeline_stage.submod for pipeline_stage in stages]

-                dp_model = apply_replicate(partial_model)
-                pipelined_models_parameters(start_layer, dp_model)
-                stage = PipelineStage(
-                    dp_model,
-                    stage_idx,
-                    num_stages,
-                    self.device,
-                    group=pp_group,
-                )
+        if issubclass(ScheduleClass, PipelineScheduleSingle):
+            stages = stages[0]

-                stages.append(stage)
-                partial_models = [pipeline_stage.submod for pipeline_stage in stages]
-
-            pipeline_schedule = ScheduleClass(
-                stages,
-                n_microbatches=num_microbatches,
-                loss_fn=loss_fn,
-                scale_grads=False,
-            )
+        pipeline_schedule = ScheduleClass(
+            stages,
+            n_microbatches=num_microbatches,
+            loss_fn=loss_fn,
+            scale_grads=False,
+        )

        optimizer_kwargs = {
            "lr": 0.01,
--- a/test/distributed/checkpoint/test_planner.py
+++ b/test/distributed/checkpoint/test_planner.py
@ -216,7 +216,7 @@ class TestSavePlan(TestCase):
        # Number of plans should remain unchanged
        self.assertEqual(len(all_plans), len(deduped_plans))

-        # Numer of items in the deduped plans should be less than the original plans
+        # Number of items in the deduped plans should be less than the original plans
        for new_plan, old_plan in zip(deduped_plans, all_plans):
            self.assertFalse(_compare_save_plans(new_plan, old_plan))
            self.assertTrue(len(new_plan.items) < len(old_plan.items))
--- a/test/distributed/checkpoint/test_state_dict.py
+++ b/test/distributed/checkpoint/test_state_dict.py
@ -4,7 +4,7 @@ import copy
 import functools
 import sys
 from collections.abc import Callable
-from itertools import chain
+from itertools import chain, product
 from typing import Union

 import torch
@ -708,29 +708,43 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
    @with_comms
    @skip_if_lt_x_gpu(2)
    def test_flattened_osd(self) -> None:
-        device_mesh = init_device_mesh(device_type, (self.world_size,))
-        model = CompositeParamModel(device=torch.device(device_type))
-        fsdp_model = fully_shard(copy.deepcopy(model), mesh=device_mesh)
-        fsdp_optim = torch.optim.AdamW(fsdp_model.parameters())
-        batch = torch.rand(8, 100, device=device_type)
-        fsdp_model(batch).sum().backward()
-        fsdp_optim.step()
-        fsdp_optim.zero_grad()
-        osd1 = get_optimizer_state_dict(fsdp_model, fsdp_optim)
-        osd2 = get_optimizer_state_dict(
-            fsdp_model,
-            fsdp_optim,
-            options=StateDictOptions(flatten_optimizer_state_dict=True),
-        )
-        fsdp_optim2 = torch.optim.AdamW(fsdp_model.parameters())
-        set_optimizer_state_dict(
-            fsdp_model, optimizers=fsdp_optim2, optim_state_dict=osd2
-        )
-        self.assertEqual(fsdp_optim.state_dict(), fsdp_optim2.state_dict())
-        set_optimizer_state_dict(
-            fsdp_model, optimizers=fsdp_optim2, optim_state_dict=osd1
-        )
-        self.assertEqual(fsdp_optim.state_dict(), fsdp_optim2.state_dict())
+        """
+        Test flattened optimizer state dictionaries with different combinations of
+        flatten_optimizer_state_dict flag for saving and loading.
+
+        This test verifies that:
+        1. We can save optimizer state dict with/without flattening
+        2. We can load optimizer state dict with/without flattening
+        3. The resulting optimizer state is equivalent regardless of flattening options
+        """
+        for flatten_to_save, flatten_to_load in product([True, False], repeat=2):
+            device_mesh = init_device_mesh(device_type, (self.world_size,))
+            model = CompositeParamModel(device=torch.device(device_type))
+            fsdp_model = fully_shard(copy.deepcopy(model), mesh=device_mesh)
+            fsdp_optim = torch.optim.AdamW(fsdp_model.parameters())
+            batch = torch.rand(8, 100, device=device_type)
+            fsdp_model(batch).sum().backward()
+            fsdp_optim.step()
+            fsdp_optim.zero_grad()
+
+            # Get optimizer state dict with/without flattening option
+            osd = get_optimizer_state_dict(
+                fsdp_model,
+                fsdp_optim,
+                options=StateDictOptions(flatten_optimizer_state_dict=flatten_to_save),
+            )
+
+            # Create a new optimizer and load the state from osd
+            fsdp_optim2 = torch.optim.AdamW(fsdp_model.parameters())
+            set_optimizer_state_dict(
+                fsdp_model,
+                optimizers=fsdp_optim2,
+                optim_state_dict=osd,
+                options=StateDictOptions(flatten_optimizer_state_dict=flatten_to_load),
+            )
+
+            # Verify the loaded optimizer state matches the original
+            self.assertEqual(fsdp_optim.state_dict(), fsdp_optim2.state_dict())

    def _test_deprecate_partial(self) -> None:
        model = CompositeParamModel(device=torch.device(device_type))
--- a/test/distributed/tensor/test_api.py
+++ b/test/distributed/tensor/test_api.py
@ -18,7 +18,9 @@ from torch.distributed.tensor import (
 from torch.distributed.tensor.debug import CommDebugMode
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    create_local_tensor_test_class,
    DTensorTestBase,
+    map_local_tensor_for_rank,
    with_comms,
 )

@ -78,17 +80,21 @@ class DTensorAPITest(DTensorTestBase):
        self.assertEqual(dist_tensor.placements[0].dim, 1)

        placement_combs = [[Shard(0)], [Shard(1)], [Replicate()]]
-        # test src_data_rank == 1
-        # set seed differently for each rank
-        torch.manual_seed(self.rank)
-        for placement in placement_combs:
-            tensor_to_distribute = torch.randn(3 * self.world_size, 3 * self.world_size)
-            dtensor = distribute_tensor(
-                tensor_to_distribute, device_mesh, placement, src_data_rank=1
-            )
-            full_dtensor = dtensor.full_tensor()
-            if self.rank == 1:
-                self.assertEqual(full_dtensor, tensor_to_distribute)
+
+        if not self.is_local_tensor_enabled:
+            # test src_data_rank == 1
+            # set seed differently for each rank
+            self.init_manual_seed_for_rank()
+            for placement in placement_combs:
+                tensor_to_distribute = torch.randn(
+                    3 * self.world_size, 3 * self.world_size
+                )
+                dtensor = distribute_tensor(
+                    tensor_to_distribute, device_mesh, placement, src_data_rank=1
+                )
+                full_dtensor = dtensor.full_tensor()
+                if self.rank == 1:
+                    self.assertEqual(full_dtensor, tensor_to_distribute)

        # test src_data_rank = None, make sure it does not have communication
        with comm_mode:
@ -156,7 +162,12 @@ class DTensorAPITest(DTensorTestBase):
            dist_tensor = distribute_tensor(tensor_to_shard, device_mesh, shard_spec)
            self.assertEqual(dist_tensor.size(), torch.Size(input_size))
            local_tensor = dist_tensor.to_local()
-            self.assertEqual(local_tensor, splitted_tensor_list[self.rank])
+            self.assertEqual(
+                local_tensor,
+                map_local_tensor_for_rank(
+                    splitted_tensor_list, self.rank, lambda tl, r: tl[r]
+                ),
+            )

    @with_comms
    def test_distribute_module(self):
@ -388,5 +399,9 @@ class DTensorAPITest(DTensorTestBase):
            dcp.save({"fqn": dtensor}, checkpoint_id=tempfile.mkdtemp())


+DTensorAPITestWithLocalTensor = create_local_tensor_test_class(
+    DTensorAPITest, skipped_tests=["test_checkpoint_apis_check_partial_placement"]
+)
+
 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/tensor/test_attention.py
+++ b/test/distributed/tensor/test_attention.py
@ -158,7 +158,7 @@ class RingAttentionTest(DTensorTestBase):
            # parameters because when require_grad is True, resize_ is not
            # allowed. But requires_grad of cp_q, cp_k, and cp_v are False
            # now. So we can just use context_parallel() to shard q, k, v.
-            # In reality, context_paralle() should be used to shard the input.
+            # In reality, context_parallel() should be used to shard the input.
            # In reality, context_parallel() should only be used to shard
            # the model inputs (batch).

@ -701,7 +701,7 @@ class CPFlexAttentionTest(DTensorTestBase):
        )

        # TODO: change this for-loop to run_subtests
-        # Use a for-loop instead of run_subtests because we need to intialize the mask
+        # Use a for-loop instead of run_subtests because we need to initialize the mask
        # for each subtest. This can be baked into self._test_cp_flex_attention as
        # a str argument denoting mask type.
        for batch_size, max_seq_len, lb_type in itertools.product(
--- a/test/distributed/tensor/test_dtensor_compile.py
+++ b/test/distributed/tensor/test_dtensor_compile.py
@ -464,6 +464,25 @@ def forward(self, b_parametrizations_buffer_original0, x):
        run(g, 64, 8)
        self.assertEqual(cnt.frame_count, 2)

+    def test_dtensor_requires_grad_recompile(self):
+        cnt = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+
+        @torch.compile(backend=cnt, fullgraph=True)
+        def f(x):
+            y = x * x
+            return y.to_local()
+
+        full_x = torch.randn(8, 8, requires_grad=False)
+        x = distribute_tensor(full_x, mesh, [Shard(0)])
+        f(x)
+
+        full_x = torch.randn(8, 8, requires_grad=True)
+        x = distribute_tensor(full_x, mesh, [Shard(0)])
+        f(x)
+
+        self.assertEqual(cnt.frame_count, 2)
+
    def test_dtensor_attribute_access_on_intermediate(self):
        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))

--- a/test/distributed/tensor/test_dtensor_export.py
+++ b/test/distributed/tensor/test_dtensor_export.py
@ -22,6 +22,11 @@ from torch.distributed.tensor.parallel import (
    parallelize_module,
    RowwiseParallel,
 )
+from torch.nn.attention.flex_attention import (
+    BlockMask,
+    create_block_mask,
+    flex_attention,
+)
 from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
@ -31,6 +36,7 @@ from torch.testing._internal.common_utils import (
 )
 from torch.testing._internal.distributed._tensor.common_dtensor import MLPModule
 from torch.testing._internal.distributed.fake_pg import FakeStore
+from torch.utils._pytree import register_pytree_node


 class SimpleModel(torch.nn.Module):
@ -82,7 +88,46 @@ class SimpleModelAnnotated(torch.nn.Module):
        return self.mlp_1(x)


-def strict_export_and_aot_export_joint_with_descriptors(model, inputs):
+class FlexAttentionModel(torch.nn.Module):
+    def __init__(self, device):
+        super().__init__()
+        self.proj_q = torch.nn.Linear(16, 128, device=device)
+        self.proj_k = torch.nn.Linear(16, 128, device=device)
+        self.proj_v = torch.nn.Linear(16, 128, device=device)
+        self.proj_out = torch.nn.Linear(128, 16, device=device)
+        self.num_heads = 8
+        self.head_dim = 16
+
+    def forward(self, x, *, block_mask=None):
+        batch_size, seq_len, embed_dim = x.shape
+        # Project to Q, K, V
+        q = self.proj_q(x)
+        k = self.proj_k(x)
+        v = self.proj_v(x)
+        # After colwise parallel, q/k/v are sharded on the last dimension
+        # Get the actual size after sharding
+        hidden_size = q.shape[-1]
+        num_heads_local = hidden_size // self.head_dim
+        # Reshape to (batch, num_heads, seq_len, head_dim)
+        q = q.view(batch_size, seq_len, num_heads_local, self.head_dim).transpose(1, 2)
+        k = k.view(batch_size, seq_len, num_heads_local, self.head_dim).transpose(1, 2)
+        v = v.view(batch_size, seq_len, num_heads_local, self.head_dim).transpose(1, 2)
+        # Apply flex_attention
+        attn_output_raw = flex_attention(q, k, v, block_mask=block_mask)
+        # Reshape back to (batch, seq_len, hidden_size)
+        attn_output = (
+            attn_output_raw.transpose(1, 2)
+            .contiguous()
+            .view(batch_size, seq_len, hidden_size)
+        )
+        # Output projection
+        output = self.proj_out(attn_output)
+        return output
+
+
+def strict_export_and_aot_export_joint_with_descriptors(model, args, kwargs=None):
+    if kwargs is None:
+        kwargs = {}
    # needed for stric export
    torch.utils._pytree.register_constant(DTensorSpec)

@ -91,36 +136,43 @@ def strict_export_and_aot_export_joint_with_descriptors(model, inputs):
        install_free_tensors=True, inline_inbuilt_nn_modules=True
    ):
        with torch._export.utils._disable_aten_to_metadata_assertions():
-            ep = torch.export.export(model, (inputs,), strict=True)
+            ep = torch.export.export(model, args, kwargs, strict=True)

    # joint_gm produced here is missing the backward region, due to incompatiblility
    # between ep.module() and aot_export_joint_with_descriptors.
    # Keeping this here to show the issue.
-    return aot_export_joint_with_descriptors_alone(ep.module(), inputs)
+    return aot_export_joint_with_descriptors_alone(ep.module(), args, kwargs)


-def graph_capture_and_aot_export_joint_with_descriptors_v2(model, inputs):
-    gm = dynamo_graph_capture_for_export(model)(inputs)
+def graph_capture_and_aot_export_joint_with_descriptors_v2(model, args, kwargs=None):
+    if kwargs is None:
+        kwargs = {}
+    gm = dynamo_graph_capture_for_export(model)(*args, **kwargs)
    fake_mode = gm.meta.get("fake_mode", None)
    with tracing(TracingContext(fake_mode)):
-        return aot_export_joint_with_descriptors_alone(gm, inputs)
+        return aot_export_joint_with_descriptors_alone(gm, args, kwargs)


-def graph_capture_and_aot_export_joint_with_descriptors(model, inputs):
+def graph_capture_and_aot_export_joint_with_descriptors(model, args, kwargs=None):
+    if kwargs is None:
+        kwargs = {}
    with torch._dynamo.config.patch(install_free_tensors=True):
        # TODO: switch to use the official graph_capture API once it is ready
-        gm = _dynamo_graph_capture_for_export(model)(inputs)
+        gm = _dynamo_graph_capture_for_export(model)(*args, **kwargs)
        fake_mode = gm.meta.get("fake_mode", None)
    with tracing(TracingContext(fake_mode)):
-        return aot_export_joint_with_descriptors_alone(gm, inputs)
+        return aot_export_joint_with_descriptors_alone(gm, args, kwargs)


-def aot_export_joint_with_descriptors_alone(model, inputs):
+def aot_export_joint_with_descriptors_alone(model, args, kwargs=None):
+    if kwargs is None:
+        kwargs = {}
    with contextlib.ExitStack() as stack:
        joint_with_descriptors = aot_export_joint_with_descriptors(
            stack,
            model,
-            (inputs,),
+            args,
+            kwargs,
        )
        return joint_with_descriptors.graph_module

@ -129,6 +181,15 @@ def _count_op(gm, target):
    return sum(1 for node in gm.graph.nodes if node.target == target)


+register_pytree_node(
+    BlockMask,
+    BlockMask._flatten,
+    BlockMask._unflatten,
+    flatten_with_keys_fn=BlockMask._flatten_with_keys,
+    serialized_type_name="torch.nn.attention.flex_attention.BlockMask",
+)
+
+
@requires_cuda
 class DTensorExportTest(TestCase):
    def tearDown(self):
@ -168,8 +229,8 @@ class DTensorExportTest(TestCase):
        }
        tp_model = parallelize_module(model, mesh_2d["tp"], parallelize_plan)

-        inputs = torch.rand(20, 10, device=self.device_type)
-        inputs = distribute_tensor(inputs, mesh_2d["tp"], placements=[Replicate()])
+        inp = torch.rand(20, 10, device=self.device_type)
+        inputs = (distribute_tensor(inp, mesh_2d["tp"], placements=[Replicate()]),)

        joint_gm = export_fn(tp_model, inputs)
        fw_gm, bw_gm = min_cut_rematerialization_partition(
@ -352,9 +413,10 @@ class DTensorExportTest(TestCase):
        }
        tp_model = parallelize_module(model, mesh_2d["tp"], parallelize_plan)

-        inputs = torch.rand(20, 10, device=self.device_type)
-        inputs = distribute_tensor(inputs, mesh_2d["tp"], placements=[Replicate()])
-        torch._dynamo.mark_dynamic(inputs, 0, min=5, max=100)
+        inp = torch.rand(20, 10, device=self.device_type)
+        inp_dtensor = distribute_tensor(inp, mesh_2d["tp"], placements=[Replicate()])
+        torch._dynamo.mark_dynamic(inp_dtensor, 0, min=5, max=100)
+        inputs = (inp_dtensor,)

        joint_gm = export_fn(tp_model, inputs)

@ -390,15 +452,74 @@ class DTensorExportTest(TestCase):
        z = torch.randn(16, 16)
        y_dtensor = distribute_tensor(y, device_mesh, placements=[Replicate()])
        z_dtensor = DTensor.from_local(z, device_mesh, placements=[Partial()])
+        inputs = (x_dtensor, y_dtensor, z_dtensor)

        # Run model to verify it works
-        output = model(x_dtensor, y_dtensor, z_dtensor)
-        with torch._dynamo.config.patch(install_free_tensors=True):
+        output = model(*inputs)
+        with torch._dynamo.config.patch(
+            install_free_tensors=(export_fn is _dynamo_graph_capture_for_export)
+        ):
            # TODO: switch to use the official graph_capture API once it is ready
-            gm = export_fn(model)(x_dtensor, y_dtensor, z_dtensor)
-        output_gm = gm(x_dtensor, y_dtensor, z_dtensor)
+            gm = export_fn(model)(*inputs)
+        output_gm = gm(*inputs)
        self.assertEqual(output, output_gm)

+    @parametrize(
+        "export_fn",
+        [
+            graph_capture_and_aot_export_joint_with_descriptors_v2,
+            graph_capture_and_aot_export_joint_with_descriptors,
+        ],
+    )
+    def test_flex_attention_dtensor_export(self, export_fn):
+        device_mesh = init_device_mesh(self.device_type, mesh_shape=(self.world_size,))
+        model = FlexAttentionModel(self.device_type)
+
+        # Parallelize the model: shard on head dimension
+        # proj_q, proj_k, proj_v are colwise parallel (output is sharded on head dimension)
+        # proj_out is rowwise parallel (input is sharded, output needs reduction)
+        parallelize_plan = {
+            "proj_q": ColwiseParallel(),
+            "proj_k": ColwiseParallel(),
+            "proj_v": ColwiseParallel(),
+            "proj_out": RowwiseParallel(),
+        }
+        tp_model = parallelize_module(model, device_mesh, parallelize_plan)
+        batch_size = 4
+        seq_len = 64
+        embed_dim = 16
+        num_heads = 8
+
+        # Input tensor replicated across all devices
+        inp = torch.randn(batch_size, seq_len, embed_dim, device=self.device_type)
+        inputs = (distribute_tensor(inp, device_mesh, placements=[Replicate()]),)
+
+        def causal_mask(b, h, q_idx, kv_idx):
+            return q_idx >= kv_idx
+
+        block_mask = create_block_mask(
+            causal_mask,
+            batch_size,
+            num_heads,
+            seq_len,
+            seq_len,
+            device=self.device_type,
+        )
+
+        flex_kwargs = {"block_mask": block_mask}
+
+        joint_gm = export_fn(tp_model, inputs, flex_kwargs)
+
+        self.assertEqual(
+            _count_op(joint_gm, torch.ops.higher_order.flex_attention),
+            1,  # exact count; assertTrue would take this arg as the failure msg
+        )
+
+        self.assertEqual(
+            _count_op(joint_gm, torch.ops.higher_order.flex_attention_backward),
+            2,  # NOTE(review): count was never enforced before; confirm value
+        )
+

 instantiate_parametrized_tests(DTensorExportTest)

--- a/test/distributed/tensor/test_dynamic.py
+++ b/test/distributed/tensor/test_dynamic.py
@ -12,6 +12,7 @@ from torch.testing._internal.common_utils import (
    run_tests,
 )
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    create_local_tensor_test_class,
    DTensorTestBase,
    with_comms,
 )
@ -60,6 +61,9 @@ class TestDynamic(DTensorTestBase):

 instantiate_parametrized_tests(TestDynamic)

+TestDynamicWithLocalTensor = create_local_tensor_test_class(
+    TestDynamic,
+)

 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/tensor/test_embedding_ops.py
+++ b/test/distributed/tensor/test_embedding_ops.py
@ -13,6 +13,7 @@ from torch.distributed.tensor import (
 from torch.distributed.tensor.debug import CommDebugMode
 from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    create_local_tensor_test_class,
    DTensorTestBase,
    with_comms,
 )
@ -167,7 +168,7 @@ class TestEmbeddingOp(DTensorTestBase):
        self._run_embedding_op_test(mesh, 0, [6, 7, 6], 13, 22)
        self._run_embedding_op_test(mesh, 0, [34], 15, 14, padding_idx=10)

-        from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
+        from torch.distributed.tensor.placement_types import MaskPartial

        # test collectives
        embedding_mod = torch.nn.Embedding(10, 20, device=self.device_type)
@ -175,7 +176,7 @@ class TestEmbeddingOp(DTensorTestBase):
        inp = torch.randint(0, 10, (8, 8), device=self.device_type)
        replicated_inp = DTensor.from_local(inp, mesh, [Replicate()], run_check=False)
        output = sharded_embedding(replicated_inp)
-        self.assertIsInstance(output.placements[0], _MaskPartial)
+        self.assertIsInstance(output.placements[0], MaskPartial)

        comm_mode = CommDebugMode()

@ -191,9 +192,9 @@ class TestEmbeddingOp(DTensorTestBase):
        inp = torch.randint(0, 10, (4, 4), device=self.device_type)
        replicated_inp = DTensor.from_local(inp, mesh, [Replicate()], run_check=False)

-        from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
+        from torch.distributed.tensor.placement_types import MaskPartial

-        # case 1: two embeddings with the same shape, thus sharing the underlying _MaskPartial
+        # case 1: two embeddings with the same shape, thus sharing the underlying MaskPartial
        # and MaskBuffer, because of cache hit from sharding propagation

        emb1 = torch.nn.Embedding(10, 23, device=self.device_type)
@ -205,28 +206,32 @@ class TestEmbeddingOp(DTensorTestBase):
        output2 = sharded_emb2(replicated_inp)

        partial_placement1 = output1.placements[0]
-        self.assertIsInstance(partial_placement1, _MaskPartial)
+        self.assertIsInstance(partial_placement1, MaskPartial)
        output1.full_tensor()

        partial_placement2 = output2.placements[0]
-        self.assertIsInstance(partial_placement2, _MaskPartial)
+        self.assertIsInstance(partial_placement2, MaskPartial)
        output2.full_tensor()

        self.assertTrue(id(partial_placement1), id(partial_placement2))

        # case 2: two embeddings with the same logical_dim_size, but different logical_shape
-        # thus they will have different _MaskPartial placements (with no cache hit)
+        # thus they will have different MaskPartial placements (with no cache hit)

        emb3 = torch.nn.Embedding(10, 29, device=self.device_type)
        sharded_emb3 = self._apply_sharding(emb3, 0, mesh)
        output3 = sharded_emb3(replicated_inp)
        partial_placement3 = output3.placements[0]
-        self.assertIsInstance(partial_placement3, _MaskPartial)
+        self.assertIsInstance(partial_placement3, MaskPartial)
        output2.full_tensor()

        # not equal because of different logical_shape, despite of same logical_dim_size
        self.assertNotEqual(partial_placement1, partial_placement3)


+TestEmbeddingOpWithLocalTensor = create_local_tensor_test_class(
+    TestEmbeddingOp,
+)
+
 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/tensor/test_experimental_ops.py
+++ b/test/distributed/tensor/test_experimental_ops.py
@ -7,6 +7,7 @@ import torch.distributed as dist
 from torch.distributed.tensor import distribute_tensor, Replicate
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    create_local_tensor_test_class,
    DTensorTestBase,
    with_comms,
 )
@ -188,5 +189,11 @@ class DistOtherOpsTest(DTensorTestBase):
            )


+DistOtherOpsTestWithLocalTensor = create_local_tensor_test_class(
+    DistOtherOpsTest,
+    # Send / recv ops are not supported
+    skipped_tests=["test_bernoulli"],
+)
+
 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/tensor/test_init.py
+++ b/test/distributed/tensor/test_init.py
@ -2,9 +2,11 @@
 # Owner(s): ["oncall: distributed"]

 import torch
+from torch.distributed._local_tensor import maybe_run_for_local_tensor
 from torch.distributed.tensor import DeviceMesh, DTensor, Replicate, Shard, zeros
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    create_local_tensor_test_class,
    DTensorTestBase,
    with_comms,
 )
@ -77,8 +79,13 @@ class DTensorConstructorTest(DTensorTestBase):
                        dim=shard_dim,
                    )
                )
-                if self.rank < len(exp_tensor_list):
-                    eq_op(exp_tensor_list[self.rank], dist_tensor.to_local())
+
+                @maybe_run_for_local_tensor
+                def check_per_rank_chunk(rank, local_tensor):
+                    if rank < len(exp_tensor_list):
+                        eq_op(exp_tensor_list[rank], local_tensor)
+
+                check_per_rank_chunk(self.rank, dist_tensor.to_local())
            else:
                exp_tensor = init_op(tensor_size, *args, **kwargs)
                eq_op(exp_tensor, dist_tensor.to_local())
@ -150,12 +157,17 @@ class DTensorConstructorTest(DTensorTestBase):
        dist_tensor = zeros(size, device_mesh=mesh, placements=placements)
        self.assertEqual(dist_tensor.size(), torch.Size(size))
        local_tensor = dist_tensor.to_local()
-        if self.rank <= 2:
-            self.assertEqual(local_tensor.size(), torch.Size([8, 3]))
-            self.assertEqual(torch.zeros(8, 3), local_tensor)
-        else:
-            self.assertEqual(local_tensor.size(), torch.Size([7, 3]))
-            self.assertEqual(torch.zeros(7, 3), local_tensor)
+
+        @maybe_run_for_local_tensor
+        def check_per_rank_tensors(rank, local_tensor):
+            if rank <= 2:
+                self.assertEqual(local_tensor.size(), torch.Size([8, 3]))
+                self.assertEqual(torch.zeros(8, 3), local_tensor)
+            else:
+                self.assertEqual(local_tensor.size(), torch.Size([7, 3]))
+                self.assertEqual(torch.zeros(7, 3), local_tensor)
+
+        check_per_rank_tensors(self.rank, local_tensor)

        # construct a gpu device mesh with 2d: shard, replicate
        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size).reshape(2, 2))
@ -250,5 +262,13 @@ class DTensorConstructorTest(DTensorTestBase):
            self.assertEqual(local_tensor, torch.tensor([]))


+DTensorConstructorTestWithLocalTensor = create_local_tensor_test_class(
+    DTensorConstructorTest,
+    skipped_tests=[
+        # Non-contiguous sub-meshes are not supported
+        "test_zeros_submesh",
+    ],
+)
+
 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/tensor/test_math_ops.py
+++ b/test/distributed/tensor/test_math_ops.py
@ -7,6 +7,7 @@ from pprint import pformat
 from typing import NamedTuple

 import torch
+import torch.distributed as dist
 from torch.distributed.device_mesh import init_device_mesh
 from torch.distributed.tensor import (
    DeviceMesh,
@ -27,7 +28,9 @@ from torch.distributed.tensor.parallel import (
 )
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    create_local_tensor_test_class,
    DTensorTestBase,
+    map_local_for_rank,
    skip_unless_torch_gpu,
    with_comms,
 )
@ -471,11 +474,10 @@ class DistMathOpsTest(DTensorTestBase):
            out_req_grad: bool

        subtest_fails = {}
-        valid_filter = (  # noqa: E731
-            lambda cfg: (
-                not (cfg.ln_req_grad and not cfg.elementwise_affine) and any(cfg[3:])
-            )
-        )
+
+        def valid_filter(cfg):
+            return not (cfg.ln_req_grad and not cfg.elementwise_affine) and any(cfg[3:])
+
        subtest_cfgs = list(
            filter(
                valid_filter,
@ -670,10 +672,11 @@ class DistMathOpsTest(DTensorTestBase):
    def test_vector_norm_partial(self):
        device_mesh = self.build_device_mesh()

-        rank = device_mesh.get_local_rank()
        all_ranks = list(range(self.world_size))

-        local_grad = torch.tensor([rank, 1], dtype=torch.float32)
+        local_grad = map_local_for_rank(
+            self.rank, lambda rank: torch.tensor([rank, 1], dtype=torch.float32)
+        )
        full_grad = torch.tensor([sum(all_ranks), self.world_size], dtype=torch.float32)

        partial_grad = DTensor.from_local(local_grad, device_mesh, [Partial()])
@ -708,11 +711,14 @@ class DistMathOpsTest(DTensorTestBase):
    def test_foreach_norm_partial(self):
        device_mesh = self.build_device_mesh()

-        rank = device_mesh.get_local_rank()
        all_ranks = list(range(self.world_size))

-        local_grad0 = torch.tensor([rank, 1], dtype=torch.float32)
-        local_grad1 = torch.tensor([rank + 1, 2], dtype=torch.float32)
+        local_grad0 = map_local_for_rank(
+            self.rank, lambda rank: torch.tensor([rank, 1], dtype=torch.float32)
+        )
+        local_grad1 = map_local_for_rank(
+            self.rank, lambda rank: torch.tensor([rank + 1, 2], dtype=torch.float32)
+        )

        grad0 = torch.tensor([sum(all_ranks), self.world_size], dtype=torch.float32)
        grad1 = torch.tensor(
@ -971,6 +977,68 @@ class DistMathOpsTest(DTensorTestBase):
                    self.assertTrue(output_dtensor.placements[0].is_shard(shard_dim))
                self.assertEqual(output_dtensor.full_tensor(), output)

+    @with_comms
+    def test_partial_reduction_ops(self):
+        """Reductions on a ``Partial("sum")`` DTensor match the replicated path.
+
+        For each of ``torch.norm``, ``torch.max`` and ``torch.min`` a fresh
+        random local tensor is wrapped as ``Partial("sum")``. The op is applied
+        directly to the partial DTensor and again after an explicit
+        redistribute to ``Replicate()``; the two outputs are asserted equal.
+        """
+        device_mesh = self.build_device_mesh()
+
+        # Seed the RNG with this process's rank before drawing local values.
+        torch.manual_seed(dist.get_rank())
+
+        for reduce_op in (torch.norm, torch.max, torch.min):
+            shard = torch.rand(3, dtype=torch.float32, device=self.device_type)
+            partial_dt = DTensor.from_local(
+                shard, device_mesh=device_mesh, placements=[Partial("sum")]
+            )
+            direct_out = reduce_op(partial_dt)
+
+            replicated_dt = partial_dt.redistribute(
+                partial_dt.device_mesh, placements=[Replicate()]
+            )
+            redistributed_out = reduce_op(replicated_dt)
+
+            self.assertEqual(direct_out, redistributed_out)
+
+    @with_comms
+    def test_matching_partial_reduction_ops(self):
+        """Check ``torch.max`` on a ``Partial("max")`` DTensor.
+
+        The op is applied directly to the partial DTensor and again after an
+        explicit redistribute to ``Replicate()``. The direct output must report
+        a partial placement, the redistributed output a replicate placement,
+        and the two outputs are asserted equal.
+        """
+        device_mesh = self.build_device_mesh()
+
+        # Seed the RNG with this process's rank before drawing local values.
+        torch.manual_seed(dist.get_rank())
+
+        shard = torch.rand(3, dtype=torch.float32, device=self.device_type)
+        partial_dt = DTensor.from_local(
+            shard, device_mesh=device_mesh, placements=[Partial("max")]
+        )
+        direct_out = torch.max(partial_dt)
+
+        replicated_dt = partial_dt.redistribute(
+            partial_dt.device_mesh, placements=[Replicate()]
+        )
+        redistributed_out = torch.max(replicated_dt)
+
+        self.assertTrue(direct_out.placements[0].is_partial())
+        self.assertTrue(redistributed_out.placements[0].is_replicate())
+        self.assertEqual(direct_out, redistributed_out)
+
+
+DistMathOpsTestWithLocalTensor = create_local_tensor_test_class(
+    DistMathOpsTest,
+)

 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/tensor/test_matrix_ops.py
+++ b/test/distributed/tensor/test_matrix_ops.py
@ -26,6 +26,7 @@ from torch.testing._internal.common_utils import (
    TEST_WITH_ROCM,
 )
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    create_local_tensor_test_class,
    DTensorTestBase,
    skip_unless_torch_gpu,
    with_comms,
@ -614,5 +615,9 @@ class DistMatrixOpsTest(DTensorTestBase):

 instantiate_parametrized_tests(DistMatrixOpsTest)

+DistMatrixOpsTestWithLocalTensor = create_local_tensor_test_class(
+    DistMatrixOpsTest,
+)
+
 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/tensor/test_op_strategy.py
+++ b/test/distributed/tensor/test_op_strategy.py
@ -37,6 +37,7 @@ from torch.distributed.tensor._ops.utils import (
 from torch.distributed.tensor.debug import CommDebugMode
 from torch.testing._internal.common_utils import run_tests, TestCase
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    create_local_tensor_test_class,
    DTensorOpTestBase,
    DTensorTestBase,
    with_comms,
@ -644,5 +645,15 @@ class TestStrategyHashing(DTensorTestBase):
        self.assertEqual(out1.full_tensor(), out2.full_tensor())


+DistTensorReplicateStrategyRegistrationTestWithLocalTensor = (
+    create_local_tensor_test_class(
+        DistTensorReplicateStrategyRegistrationTest,
+    )
+)
+
+TestStrategyHashingWithLocalTensor = create_local_tensor_test_class(
+    TestStrategyHashing,
+)
+
 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/tensor/test_optimizers.py
+++ b/test/distributed/tensor/test_optimizers.py
@ -14,6 +14,7 @@ from torch.distributed.tensor import (
 )
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    create_local_tensor_test_class,
    DTensorTestBase,
    MLPModule,
    with_comms,
@ -716,5 +717,9 @@ class TestDTensorOptimizer(DTensorTestBase):
            self._assert_optimizer(None, mod, opt, mod_copy, dist_opt, inp)


+TestDTensorOptimizerWithLocalTensor = create_local_tensor_test_class(
+    TestDTensorOptimizer,
+)
+
 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/tensor/test_placement_types.py
+++ b/test/distributed/tensor/test_placement_types.py
@ -0,0 +1,88 @@
+# Owner(s): ["oncall: distributed"]
+import copy
+import itertools
+import sys
+import unittest
+
+from torch._dynamo.variables.distributed import PlacementClassVariable
+from torch.distributed.tensor.placement_types import (
+    _StridedShard,
+    Partial,
+    Replicate,
+    Shard,
+)
+from torch.testing._internal.common_utils import run_tests, TestCase
+
+
+# Basic functionality test for Placement types.
+class PlacementTypesTestCase(TestCase):
+    """Unit tests for the DTensor placement types.
+
+    Covers ``Shard``, ``_StridedShard``, ``Partial`` and ``Replicate``: the
+    ``is_shard``/``is_partial``/``is_replicate`` predicates, equality, the
+    keyword-only ``split_factor`` argument of ``_StridedShard``, and whether
+    Dynamo's ``PlacementClassVariable`` recognizes the placement classes.
+    """
+
+    def test_type_identification(self):
+        """Each placement answers the ``is_*`` predicates as expected.
+
+        Every check runs on the original object and on a ``copy.deepcopy``
+        of it.
+        """
+        shard = Shard(3)
+        strided_shard = _StridedShard(dim=3, split_factor=7)
+        partial_sum = Partial("sum")
+        partial_max = Partial("max")
+        replicate = Replicate()
+
+        # Rows are (placement, is_shard, is_partial, is_replicate).
+        ident_tests = (
+            (shard, True, False, False),
+            (strided_shard, True, False, False),
+            (partial_sum, False, True, False),
+            (partial_max, False, True, False),
+            (replicate, False, False, True),
+        )
+        for do_deepcopy in (False, True):
+            for placement, is_shard, is_partial, is_replicate in ident_tests:
+                if do_deepcopy:
+                    placement = copy.deepcopy(placement)
+                self.assertEqual(placement.is_shard(), is_shard)
+                self.assertEqual(placement.is_partial(), is_partial)
+                self.assertEqual(placement.is_replicate(), is_replicate)
+
+    def test_equality(self):
+        """Placements are equal within an equivalence class, unequal across.
+
+        The classes below treat ``Shard(d)`` and a ``_StridedShard`` on the
+        same ``dim`` as equal; two ``_StridedShard`` objects that differ only
+        in ``split_factor`` are checked separately to be unequal.
+        """
+        equivalence_classes = (
+            (Shard(3), _StridedShard(dim=3, split_factor=7)),
+            (Shard(4), _StridedShard(dim=4, split_factor=9)),
+            (Replicate(),),
+            (Partial("sum"),),
+            (Partial("max"),),
+        )
+        for eq_class in equivalence_classes:
+            # Each item in the equivalence class should be equal to every other item in
+            # its class.
+            for lhs, rhs in itertools.product(eq_class, eq_class):
+                self.assertEqual(lhs, rhs)
+
+            # Each item in the equivalence class should not be equal to any item in any
+            # other class.
+            for other_class in equivalence_classes:
+                if other_class is eq_class:
+                    continue
+                for lhs, rhs in itertools.product(eq_class, other_class):
+                    self.assertNotEqual(lhs, rhs)
+
+        # Testing this case doesn't seem to fit neatly into the above equivalence class
+        # framework.
+        self.assertNotEqual(
+            _StridedShard(dim=3, split_factor=1), _StridedShard(dim=3, split_factor=2)
+        )
+
+    @unittest.skipIf(
+        sys.version_info < (3, 10), "kw_only is only available in python >= 3.10"
+    )
+    def test_strided_shard_kwonly_argument(self):
+        """``split_factor`` must be passed by keyword, not positionally."""
+        with self.assertRaises(TypeError):
+            _StridedShard(3, 4)
+        _StridedShard(3, split_factor=4)
+
+    def test_strided_shard_isinstance_shard(self):
+        """A ``_StridedShard`` instance is also an instance of ``Shard``."""
+        # Use the unittest assertion rather than a bare ``assert``: it is not
+        # stripped under ``python -O`` and reports the offending object on failure.
+        self.assertIsInstance(_StridedShard(dim=3, split_factor=7), Shard)
+
+    def test_dynamo_can_identify_placement_classes(self):
+        """``PlacementClassVariable.is_placement_type`` accepts every placement class."""
+        for cls in (Replicate, Shard, _StridedShard, Partial):
+            self.assertTrue(
+                PlacementClassVariable.is_placement_type(cls), msg=f"failed on {cls}"
+            )
+
+
+if __name__ == "__main__":
+    run_tests()
--- a/test/distributed/tensor/test_tensor_ops.py
+++ b/test/distributed/tensor/test_tensor_ops.py
@ -511,7 +511,7 @@ class DistTensorOpsTest(DTensorTestBase):
        # case 2 input sharding: input sharded, index replicated, output mask partial
        # only works when index has size 1 on the gather dimension and
        # input is sharded on the gather dimension
-        from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
+        from torch.distributed.tensor.placement_types import MaskPartial

        gather_dim = 1
        global_input = torch.randn(12, 8, 16)
@ -522,7 +522,7 @@ class DistTensorOpsTest(DTensorTestBase):
        with comm_mode:
            output_dt = torch.gather(input_dt, gather_dim, index_dt)
            self.assertEqual(comm_mode.get_total_counts(), 0)
-        self.assertIsInstance(output_dt.placements[0], _MaskPartial)
+        self.assertIsInstance(output_dt.placements[0], MaskPartial)
        self.assertEqual(output_dt.full_tensor(), global_output)

        # case 3 index sharding: input replicated, index sharded, output sharded
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@ -4901,7 +4901,7 @@ class NCCLTraceTest(NCCLTraceTestBase):
            for p2p_op_idx, input_sizes in zip(
                range(first_op, coalesced_op, 1), op_sizes_per_coalesce
            ):
-                # the indivudal ops inside the coalescing group the individual op metadata,
+                # the individual ops inside the coalescing group have the individual op metadata,
                # but not the timing info coming from the actual coalesced kernel
                profiling_name = (
                    "nccl:recv 0<-1" if self.rank == 0 else "nccl:send 1->0"
--- a/test/distributed/test_nvshmem.py
+++ b/test/distributed/test_nvshmem.py
@ -106,6 +106,30 @@ class NVSHMEMSymmetricMemoryTest(MultiProcContinuousTest):
        torch.ops.symm_mem.nvshmem_broadcast(tensor, src_rank, group_name)
        self.assertEqual(tensor, torch.arange(numel, dtype=dtype, device=self.device))

+    @skipIfRocm
+    def test_mempool_tensor_w_collective(self) -> None:
+        """
+        Run a collective on a tensor created by a factory op under a MemPool.
+
+        A tensor of ones is allocated inside a `torch.cuda.MemPool` built from
+        the allocator returned by `symm_mem.get_mempool_allocator`, rendezvoused
+        on the world group, and all-reduced. Every element of the result must
+        equal `self.world_size`.
+        """
+        self._init_device()
+        world_group_name = dist.group.WORLD.group_name
+        symm_mem.enable_symm_mem_for_group(world_group_name)
+
+        elem_dtype = torch.float
+        num_elems = 1024
+
+        pool = torch.cuda.MemPool(symm_mem.get_mempool_allocator(self.device))
+
+        # Only the factory call runs inside the pool context.
+        with torch.cuda.use_mem_pool(pool):
+            buf = torch.ones(num_elems, dtype=elem_dtype, device=self.device)
+
+        symm_mem.rendezvous(buf, group=world_group_name)
+        dist.all_reduce(buf)
+
+        expected = (
+            torch.ones(num_elems, dtype=elem_dtype, device=self.device)
+            * self.world_size
+        )
+        self.assertEqual(buf, expected)
+
    @skipIfRocm
    def test_mempool_compute_ops(self) -> None:
        """
@ -374,7 +398,7 @@ class NVSHMEMAll2AllTest(MultiProcContinuousTest):
            nsplits, dtype=torch.int64, device=self.device
        ).copy_(inp_splits)
        # 2 rows: output splits, output offsets
-        # Initiallizing all values to -1 to check if they are updated
+        # Initializing all values to -1 to check if they are updated
        out_splits_offsets = symm_mem.empty(
            (2, nsplits), dtype=torch.int64, device=self.device
        ).fill_(-1)
@ -479,7 +503,7 @@ class NVSHMEMAll2AllTest(MultiProcContinuousTest):
            (2, nsplits), dtype=torch.int64, device=self.device
        )
        # 2 rows: output splits, output offsets
-        # Initiallizing all values to -1 to check if they are updated
+        # Initializing all values to -1 to check if they are updated
        out_splits_offsets = symm_mem.empty(
            (2, nsplits), dtype=torch.int64, device=self.device
        ).fill_(-1)
@ -593,7 +617,7 @@ def dispatch_then_combine(device, align: int, group) -> None:
        inp_splits
    )
    # 2 rows: output splits, output offsets
-    # Initiallizing all values to -1 to check if they are updated
+    # Initializing all values to -1 to check if they are updated
    out_splits_offsets = symm_mem.empty(
        (2, nsplits), dtype=torch.int64, device=device
    ).fill_(-1)
@ -601,7 +625,7 @@ def dispatch_then_combine(device, align: int, group) -> None:
    # Buffers for combine
    combine_out = symm_mem.empty(max_out_numel, dtype=dtype, device=device).fill_(-1)
    # 2 rows: output splits, output offsets
-    # Initiallizing all values to -1 to check if they are updated
+    # Initializing all values to -1 to check if they are updated
    combine_out_splits_offsets = symm_mem.empty(
        (2, nsplits), dtype=torch.int64, device=device
    ).fill_(-1)
--- a/test/distributed/test_symmetric_memory.py
+++ b/test/distributed/test_symmetric_memory.py
@ -274,11 +274,12 @@ class SymmetricMemoryTest(MultiProcContinuousTest):
            self.assertTrue(buf.eq(peer_rank + world.size() // 2).all())


-# We move AsyncTP tests to a seperate test suite because 1) Async TP ops are not
+# We move AsyncTP tests to a separate test suite because 1) Async TP ops are not
 # the core symmetric memory APIs, they are more like applications, 2)
 # MultiProcContinuousTest will skip all the following tests if a test fails (
 # we should fix this too). We still want to get the test signals for the core
 # symmetric memory APIs when Async TP ops fail.
+@skip_if_rocm_multiprocess  # AsyncTP is not yet supported on ROCm
 @instantiate_parametrized_tests
 @requires_cuda_p2p_access()
 class AsyncTPTest(MultiProcContinuousTest):
@ -620,7 +621,7 @@ class AsyncTPTest(MultiProcContinuousTest):

 # [READ ME FIRST]
 # The `SymmMemEmptySetDeviceTest` suite parameterizes whether user sets the
-# device before calling symm_mem.emtpy.  Either way should work.
+# device before calling symm_mem.empty.  Either way should work.
 # However, since `set_device` is persistent, we cannot use the
 # `MultiProcContinuousTest` template because the next function will be
 # "contaminated", leading to flaky tests (e.g. hang). Therefore, we use
--- a/test/dynamo/cpython/3_13/mathdata/ieee754.txt
+++ b/test/dynamo/cpython/3_13/mathdata/ieee754.txt
@ -51,7 +51,7 @@ nan
 >>> INF / INF
 nan

-However unambigous operations with inf return inf:
+However unambiguous operations with inf return inf:
 >>> INF * INF
 inf
 >>> 1.5 * INF
--- a/test/dynamo/cpython/3_13/test_itertools.py
+++ b/test/dynamo/cpython/3_13/test_itertools.py
@ -1711,7 +1711,7 @@ class TestBasicOps(__TestCase):
        t3 = tnew(t1)
        self.assertTrue(list(t1) == list(t2) == list(t3) == list('abc'))

-        # test that tee objects are weak referencable
+        # test that tee objects are weak referenceable
        a, b = tee(range(10))
        p = weakref.proxy(a)
        self.assertEqual(getattr(p, '__class__'), type(b))
@ -2243,7 +2243,7 @@ class TestPurePythonRoughEquivalents(__TestCase):
        t3 = tnew(t1)
        self.assertTrue(list(t1) == list(t2) == list(t3) == list('abc'))

-        # test that tee objects are weak referencable
+        # test that tee objects are weak referenceable
        a, b = tee(range(10))
        p = weakref.proxy(a)
        self.assertEqual(getattr(p, '__class__'), type(b))
--- a/test/dynamo/test_activation_checkpointing.py
+++ b/test/dynamo/test_activation_checkpointing.py
@ -153,7 +153,9 @@ def _get_custom_policy(no_recompute_list=None, must_recompute_list=None):
    return _custom_policy


-class ActivationCheckpointingViaTagsTests(torch._dynamo.test_case.TestCase):
+class ActivationCheckpointingViaTagsTests(
+    torch._dynamo.test_case.TestCaseWithNestedGraphBreaks
+):
    def _validate(
        self,
        fn,
--- a/Show More
+++ b/Show More