Compare commits


2 Commits

45715eb46e  debugging cudnn numerics  (2025-10-29 14:10:53 -07:00)
    ghstack-source-id: 460fd38569b797bdd607f6672aa16f35177aa5c8
    Pull Request resolved: https://github.com/pytorch/pytorch/pull/164950

22c7937326  bwd pass  (2025-10-29 14:10:52 -07:00)
    ghstack-source-id: 563ff6899659ecced546e3723410732f5fc2878f
    Pull Request resolved: https://github.com/pytorch/pytorch/pull/164504
38 changed files with 2063 additions and 2511 deletions

View File

@ -129,7 +129,7 @@ function install_129 {
}
function install_128 {
CUDNN_VERSION=9.8.0.87
CUDNN_VERSION=9.10.2.21
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 12.8.1 in the same container
install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux
@ -150,7 +150,7 @@ function install_130 {
CUDNN_VERSION=9.13.0.50
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 13.0 in the same container
install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
install_cudnn 13 $CUDNN_VERSION

View File

@ -100,8 +100,6 @@ COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
# Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True
ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS}
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

View File

@ -272,6 +272,18 @@ def smoke_test_cuda(
torch_cudnn_version = cudnn_to_version_str(torch.backends.cudnn.version())
print(f"Torch cuDNN version: {torch_cudnn_version}")
torch_cudnn_compile_version = torch._C._cudnn.getCompileVersion()
print(f"Torch cuDNN compile-time version: {torch_cudnn_compile_version}")
torch_cudnn_runtime_version = tuple(
[int(x) for x in torch_cudnn_version.split(".")]
)
if torch_cudnn_runtime_version != torch_cudnn_compile_version:
raise RuntimeError(
"cuDNN runtime version doesn't match comple version. "
f"Loaded: {torch_cudnn_runtime_version} "
f"Expected: {torch_cudnn_compile_version}"
)
if sys.platform in ["linux", "linux2"]:
torch_nccl_version = ".".join(str(v) for v in torch.cuda.nccl.version())
print(f"Torch nccl; version: {torch_nccl_version}")

View File

@ -460,18 +460,28 @@ test_inductor_shard() {
--verbose
}
test_inductor_aoti_cpp() {
test_inductor_aoti() {
# docker build uses bdist_wheel which does not work with test_aot_inductor
# TODO: need a faster way to build
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
# We need to hipify before building again
python3 tools/amd_build/build_amd.py
fi
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
else
BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
fi
# aoti cmake custom command requires `torch` to be installed
# initialize the cmake build cache and install torch
/usr/bin/env "${BUILD_COMMAND[@]}"
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
}
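For readers who want the sequence without the shell plumbing, the new function boils down to a two-phase build followed by the C++ test run. A rough, illustration-only Python rendering of that sequence (not the CI code itself; the command and environment variable names are taken from the diff above):

```python
# Rough sketch of what test_inductor_aoti does, expressed as subprocess calls.
# Illustrative only; the CI shell function above is the source of truth.
import os
import subprocess

build_cmd = ["python", "-m", "pip", "install", "--no-build-isolation", "-v", "-e", "."]

# Phase 1: plain editable install, so the AOTI cmake custom command can import torch.
subprocess.run(build_cmd, check=True)

# Phase 2: rebuild with a fresh CMake cache and the AOTI test targets enabled.
env = dict(os.environ, CMAKE_FRESH="1", BUILD_AOT_INDUCTOR_TEST="1")
subprocess.run(build_cmd, check=True, env=env)

# Phase 3: run the C++ AOTI tests through run_test.py.
subprocess.run(
    [
        "python", "test/run_test.py", "--cpp", "--verbose",
        "-i", "cpp/test_aoti_abi_check", "cpp/test_aoti_inference",
        "cpp/test_vec_half_AVX2", "-dist=loadfile",
    ],
    check=True,
)
```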
@ -1766,7 +1776,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti_cpp
test_inductor_aoti
fi
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
install_torchvision

View File

@ -7,9 +7,12 @@ if "%DESIRED_PYTHON%" == "3.13t" (
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
set PYTHON_EXEC="python3.13t"
) else if "%DESIRED_PYTHON%"=="3.14" (
echo Python version is set to 3.14 or 3.14t
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
) else if "%DESIRED_PYTHON%"=="3.14t" (
echo Python version is set to 3.14 or 3.14t
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe"
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
set PYTHON_EXEC="python3.14t"
) else (

View File

@ -1,8 +1,3 @@
---
name: docstring
description: Write docstrings for PyTorch functions and methods following PyTorch conventions. Use when writing or updating docstrings in PyTorch code.
---
# PyTorch Docstring Writing Guide
This skill describes how to write docstrings for functions and methods in the PyTorch project, following the conventions in `torch/_tensor_docs.py` and `torch/nn/functional.py`.

View File

@ -1,385 +0,0 @@
---
name: skill-writer
description: Guide users through creating Agent Skills for Claude Code. Use when the user wants to create, write, author, or design a new Skill, or needs help with SKILL.md files, frontmatter, or skill structure.
---
# Skill Writer
This Skill helps you create well-structured Agent Skills for Claude Code that follow best practices and validation requirements.
## When to use this Skill
Use this Skill when:
- Creating a new Agent Skill
- Writing or updating SKILL.md files
- Designing skill structure and frontmatter
- Troubleshooting skill discovery issues
- Converting existing prompts or workflows into Skills
## Instructions
### Step 1: Determine Skill scope
First, understand what the Skill should do:
1. **Ask clarifying questions**:
- What specific capability should this Skill provide?
- When should Claude use this Skill?
- What tools or resources does it need?
- Is this for personal use or team sharing?
2. **Keep it focused**: One Skill = one capability
- Good: "PDF form filling", "Excel data analysis"
- Too broad: "Document processing", "Data tools"
### Step 2: Choose Skill location
Determine where to create the Skill:
**Personal Skills** (`~/.claude/skills/`):
- Individual workflows and preferences
- Experimental Skills
- Personal productivity tools
**Project Skills** (`.claude/skills/`):
- Team workflows and conventions
- Project-specific expertise
- Shared utilities (committed to git)
### Step 3: Create Skill structure
Create the directory and files:
```bash
# Personal
mkdir -p ~/.claude/skills/skill-name
# Project
mkdir -p .claude/skills/skill-name
```
For multi-file Skills:
```
skill-name/
├── SKILL.md (required)
├── reference.md (optional)
├── examples.md (optional)
├── scripts/
│ └── helper.py (optional)
└── templates/
└── template.txt (optional)
```
### Step 4: Write SKILL.md frontmatter
Create YAML frontmatter with required fields:
```yaml
---
name: skill-name
description: Brief description of what this does and when to use it
---
```
**Field requirements**:
- **name**:
- Lowercase letters, numbers, hyphens only
- Max 64 characters
- Must match directory name
- Good: `pdf-processor`, `git-commit-helper`
- Bad: `PDF_Processor`, `Git Commits!`
- **description**:
- Max 1024 characters
- Include BOTH what it does AND when to use it
- Use specific trigger words users would say
- Mention file types, operations, and context
**Optional frontmatter fields**:
- **allowed-tools**: Restrict tool access (comma-separated list)
```yaml
allowed-tools: Read, Grep, Glob
```
Use for:
- Read-only Skills
- Security-sensitive workflows
- Limited-scope operations
### Step 5: Write effective descriptions
The description is critical for Claude to discover your Skill.
**Formula**: `[What it does] + [When to use it] + [Key triggers]`
**Examples**:
✅ **Good**:
```yaml
description: Extract text and tables from PDF files, fill forms, merge documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction.
```
✅ **Good**:
```yaml
description: Analyze Excel spreadsheets, create pivot tables, and generate charts. Use when working with Excel files, spreadsheets, or analyzing tabular data in .xlsx format.
```
❌ **Too vague**:
```yaml
description: Helps with documents
description: For data analysis
```
**Tips**:
- Include specific file extensions (.pdf, .xlsx, .json)
- Mention common user phrases ("analyze", "extract", "generate")
- List concrete operations (not generic verbs)
- Add context clues ("Use when...", "For...")
### Step 6: Structure the Skill content
Use clear Markdown sections:
```markdown
# Skill Name
Brief overview of what this Skill does.
## Quick start
Provide a simple example to get started immediately.
## Instructions
Step-by-step guidance for Claude:
1. First step with clear action
2. Second step with expected outcome
3. Handle edge cases
## Examples
Show concrete usage examples with code or commands.
## Best practices
- Key conventions to follow
- Common pitfalls to avoid
- When to use vs. not use
## Requirements
List any dependencies or prerequisites:
```bash
pip install package-name
```
## Advanced usage
For complex scenarios, see [reference.md](reference.md).
```
### Step 7: Add supporting files (optional)
Create additional files for progressive disclosure:
**reference.md**: Detailed API docs, advanced options
**examples.md**: Extended examples and use cases
**scripts/**: Helper scripts and utilities
**templates/**: File templates or boilerplate
Reference them from SKILL.md:
```markdown
For advanced usage, see [reference.md](reference.md).
Run the helper script:
\`\`\`bash
python scripts/helper.py input.txt
\`\`\`
```
### Step 8: Validate the Skill
Check these requirements:
✅ **File structure**:
- [ ] SKILL.md exists in correct location
- [ ] Directory name matches frontmatter `name`
✅ **YAML frontmatter**:
- [ ] Opening `---` on line 1
- [ ] Closing `---` before content
- [ ] Valid YAML (no tabs, correct indentation)
- [ ] `name` follows naming rules
- [ ] `description` is specific and < 1024 chars
✅ **Content quality**:
- [ ] Clear instructions for Claude
- [ ] Concrete examples provided
- [ ] Edge cases handled
- [ ] Dependencies listed (if any)
✅ **Testing**:
- [ ] Description matches user questions
- [ ] Skill activates on relevant queries
- [ ] Instructions are clear and actionable
### Step 9: Test the Skill
1. **Restart Claude Code** (if running) to load the Skill
2. **Ask relevant questions** that match the description:
```
Can you help me extract text from this PDF?
```
3. **Verify activation**: Claude should use the Skill automatically
4. **Check behavior**: Confirm Claude follows the instructions correctly
### Step 10: Debug if needed
If Claude doesn't use the Skill:
1. **Make description more specific**:
- Add trigger words
- Include file types
- Mention common user phrases
2. **Check file location**:
```bash
ls ~/.claude/skills/skill-name/SKILL.md
ls .claude/skills/skill-name/SKILL.md
```
3. **Validate YAML**:
```bash
cat SKILL.md | head -n 10
```
4. **Run debug mode**:
```bash
claude --debug
```
## Common patterns
### Read-only Skill
```yaml
---
name: code-reader
description: Read and analyze code without making changes. Use for code review, understanding codebases, or documentation.
allowed-tools: Read, Grep, Glob
---
```
### Script-based Skill
```yaml
---
name: data-processor
description: Process CSV and JSON data files with Python scripts. Use when analyzing data files or transforming datasets.
---
# Data Processor
## Instructions
1. Use the processing script:
\`\`\`bash
python scripts/process.py input.csv --output results.json
\`\`\`
2. Validate output with:
\`\`\`bash
python scripts/validate.py results.json
\`\`\`
```
### Multi-file Skill with progressive disclosure
```yaml
---
name: api-designer
description: Design REST APIs following best practices. Use when creating API endpoints, designing routes, or planning API architecture.
---
# API Designer
Quick start: See [examples.md](examples.md)
Detailed reference: See [reference.md](reference.md)
## Instructions
1. Gather requirements
2. Design endpoints (see examples.md)
3. Document with OpenAPI spec
4. Review against best practices (see reference.md)
```
## Best practices for Skill authors
1. **One Skill, one purpose**: Don't create mega-Skills
2. **Specific descriptions**: Include trigger words users will say
3. **Clear instructions**: Write for Claude, not humans
4. **Concrete examples**: Show real code, not pseudocode
5. **List dependencies**: Mention required packages in description
6. **Test with teammates**: Verify activation and clarity
7. **Version your Skills**: Document changes in content
8. **Use progressive disclosure**: Put advanced details in separate files
## Validation checklist
Before finalizing a Skill, verify:
- [ ] Name is lowercase, hyphens only, max 64 chars
- [ ] Description is specific and < 1024 chars
- [ ] Description includes "what" and "when"
- [ ] YAML frontmatter is valid
- [ ] Instructions are step-by-step
- [ ] Examples are concrete and realistic
- [ ] Dependencies are documented
- [ ] File paths use forward slashes
- [ ] Skill activates on relevant queries
- [ ] Claude follows instructions correctly
## Troubleshooting
**Skill doesn't activate**:
- Make description more specific with trigger words
- Include file types and operations in description
- Add "Use when..." clause with user phrases
**Multiple Skills conflict**:
- Make descriptions more distinct
- Use different trigger words
- Narrow the scope of each Skill
**Skill has errors**:
- Check YAML syntax (no tabs, proper indentation)
- Verify file paths (use forward slashes)
- Ensure scripts have execute permissions
- List all dependencies
## Examples
See the documentation for complete examples:
- Simple single-file Skill (commit-helper)
- Skill with tool permissions (code-reviewer)
- Multi-file Skill (pdf-processing)
## Output format
When creating a Skill, I will:
1. Ask clarifying questions about scope and requirements
2. Suggest a Skill name and location
3. Create the SKILL.md file with proper frontmatter
4. Include clear instructions and examples
5. Add supporting files if needed
6. Provide testing instructions
7. Validate against all requirements
The result will be a complete, working Skill that follows all best practices and validation rules.

View File

@ -22,7 +22,7 @@ CUDA_ARCHES_FULL_VERSION = {
"12.6": "12.6.3",
"12.8": "12.8.1",
"12.9": "12.9.1",
"13.0": "13.0.2",
"13.0": "13.0.0",
}
CUDA_ARCHES_CUDNN_VERSION = {
"12.6": "9",
@ -96,21 +96,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
),
"13.0": (
"nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | "
"nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | "
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
"nvidia-cublas==13.1.0.3; platform_system == 'Linux' | "
"nvidia-cufft==12.0.0.61; platform_system == 'Linux' | "
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
"nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
"nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | "
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx==13.0.85; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | "
"nvidia-cufile==1.15.1.6; platform_system == 'Linux'"
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
"nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
),
"xpu": (
"intel-cmplr-lib-rt==2025.2.1 | "

View File

@ -270,7 +270,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -519,7 +519,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -768,7 +768,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1017,7 +1017,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1266,7 +1266,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1515,7 +1515,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1764,7 +1764,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -325,7 +325,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda13_0-test: # Testing
@ -991,7 +991,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda13_0-test: # Testing
@ -1657,7 +1657,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing
@ -2323,7 +2323,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda13_0-test: # Testing
@ -2989,7 +2989,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda13_0-test: # Testing
@ -3655,7 +3655,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda13_0-test: # Testing
@ -4321,7 +4321,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda13_0-test: # Testing

View File

@ -1,4 +1,4 @@
![PyTorch Logo](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/pytorch-logo-dark.png)
![PyTorch Logo](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/pytorch-logo-dark.png)
--------------------------------------------------------------------------------
@ -72,7 +72,7 @@ Elaborating Further:
If you use NumPy, then you have used Tensors (a.k.a. ndarray).
![Tensor illustration](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/tensor_illustration.png)
![Tensor illustration](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/tensor_illustration.png)
PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the
computation by a huge amount.
@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc
While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
You get the best of speed and flexibility for your crazy research.
![Dynamic graph](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif)
![Dynamic graph](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/dynamic_graph.gif)
### Python First

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,171 +0,0 @@
#pragma once
#include <ATen/core/Tensor.h>
namespace at::native {
using at::blas::ScalingType;
using at::blas::SwizzleType;
namespace {
// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492
c10::MaybeOwned<Tensor> inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) {
if (resolve_conj && tensor.is_conj()) {
return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj());
} else {
return c10::MaybeOwned<Tensor>::borrowed(tensor);
}
}
c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) {
if (tensor.is_non_overlapping_and_dense()) { // common case
transpose_tensor = tensor.is_contiguous();
return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor);
}
IntArrayRef tensor_strides = tensor.strides();
IntArrayRef tensor_sizes = tensor.sizes();
if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
transpose_tensor = false;
return resolve_conj_if_indicated(tensor, !transpose_result);
} else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
transpose_tensor = true;
return resolve_conj_if_indicated(tensor, transpose_result);
} else {
transpose_tensor = true;
return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
}
}
c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) {
if (tensor.is_non_overlapping_and_dense()) { // common case
transpose_tensor = tensor.is_contiguous();
return resolve_conj_if_indicated(tensor, true);
}
IntArrayRef tensor_strides = tensor.strides();
IntArrayRef tensor_sizes = tensor.sizes();
if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
transpose_tensor = false;
return resolve_conj_if_indicated(tensor, true);
} else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
transpose_tensor = true;
return resolve_conj_if_indicated(tensor, true);
} else {
transpose_tensor = true;
return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
}
}
} // namespace
/**
* @brief Prepares matrices for CUBLAS operation
*
* This constructor prepares tensors for CUBLAS
* The main difference is that PyTorch uses row-major as the default and
* CUBLAS expects column-major.
*
* @details
* To enable row-major output while using CUBLAS,
* we use the mathematical identity that (A × B)^T = B^T × A^T.
*
* Transpose in this context refers to cuBLAS's (Fortran) definition of transpose (row-major)
* T = row-major, N = col-major
*
* Example:
* For matrices A (M×K)(row-major) and B (K×N)(row-major):
* - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major)
* - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N)
* - However, since the output from cuBLAS is column-major, this is
* - equivalent to an output of size MxN row-major as expected
*
* The transpose flags are derived from the layouts of the passed in tensors
*
* If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted
* to their unpacked values to match what cuBLAS expects.
*
* @param mat1 First input matrix
* @param mat2 Second input matrix
* @param c Output matrix (result)
* @param scale_a Optional scaling factor for first matrix
* @param scale_b Optional scaling factor for second matrix
* @param scale_result Optional scaling factor for result
*/
struct cublasCommonArgs {
cublasCommonArgs(
const Tensor& mat1,
const Tensor& mat2,
Tensor& c,
const std::optional<Tensor>& scale_a = std::nullopt,
const std::optional<Tensor>& scale_b = std::nullopt,
const std::optional<Tensor>& scale_result = std::nullopt,
const std::optional<ScalingType>& scaling_choice_a = std::nullopt,
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
bool transpose_result = false, transpose_a = false, transpose_b = false;
result = prepare_matrix_for_cublas(c, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
// Handle scale tensors if provided
if (scale_a && scale_b) {
// By default since we return in row-major we run the gemm
// as B.T @ A.T, check transpose_result to determine if we flip the scales
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
}
if (scale_result) {
scale_result_ptr = scale_result->data_ptr();
scale_result_dtype = scale_result->scalar_type();
}
// Update transpose flags
if (transpose_result) {
transpose_a = !transpose_a;
transpose_b = !transpose_b;
}
auto sizes_a = mata->sizes();
auto sizes_b = matb->sizes();
m = sizes_a[transpose_result ? 1 : 0];
k = sizes_a[transpose_result ? 0 : 1];
n = sizes_b[transpose_result ? 0 : 1];
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
result_ld = result->stride(transpose_result ? 0 : 1);
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
// if the gemm operands are in packed float4
if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) {
k = k * 2;
lda = lda * 2;
ldb = ldb * 2;
}
}
// Matrix members
char transa, transb;
int64_t m, n, k;
int64_t lda, ldb, result_ld;
c10::MaybeOwned<Tensor> mata, matb, result;
// Scale members
void* scale_mata_ptr = nullptr;
void* scale_matb_ptr = nullptr;
void* scale_result_ptr = nullptr;
std::optional<c10::ScalarType> scale_mata_dtype;
std::optional<ScalingType> scaling_mata_type;
std::optional<c10::ScalarType> scale_matb_dtype;
std::optional<ScalingType> scaling_matb_type;
std::optional<c10::ScalarType> scale_result_dtype;
};
} // namespace at::native
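The layout identity that the removed header's comment relies on can be demonstrated concretely. The following is an illustration-only sketch; NumPy is used here purely as a stand-in for a column-major GEMM and is an assumption of this example, not something the header uses:

```python
# Demonstration of the trick described in the comment above: computing
# B^T @ A^T with a column-major GEMM and reading the N x M column-major result
# buffer back as a row-major M x N matrix yields A @ B.
# NumPy is only a stand-in for cuBLAS; this is not PyTorch code.
import numpy as np

M, K, N = 3, 4, 5
A = np.arange(M * K, dtype=np.float32).reshape(M, K)  # row-major M x K
B = np.arange(K * N, dtype=np.float32).reshape(K, N)  # row-major K x N

reference = A @ B  # the row-major M x N result we want

# "Column-major GEMM" output: B^T @ A^T is N x M; serialize it column by column.
col_major_bytes = (B.T @ A.T).tobytes(order="F")

# Reinterpreting that buffer as a row-major M x N matrix gives exactly A @ B.
reinterpreted = np.frombuffer(col_major_bytes, dtype=np.float32).reshape(M, N)

assert np.allclose(reference, reinterpreted)
```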

View File

@ -57,7 +57,6 @@ Tensor& random_mps_impl(Tensor& self,
if (self.numel() == 0) {
return self;
}
at::assert_no_internal_overlap(self);
// MPS random is broken for 5D+ tensors, see https://github.com/pytorch/pytorch/issues/147624
const auto need_reshape = self.ndimension() > 4;
auto mps_gen = get_generator_or_default<MPSGeneratorImpl>(gen, at::mps::detail::getDefaultMPSGenerator());
@ -154,16 +153,8 @@ Tensor& random_mps_impl(Tensor& self,
feeds[meanPlaceholder.getMPSGraphTensor()] = meanPlaceholder.getMPSGraphTensorData();
}
// Handle non-contiguous output tensors by creating a contiguous temporary
const auto needs_gather = needsGather(self);
Tensor self_ = needs_gather ? at::empty_like(self, MemoryFormat::Contiguous) : self;
Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self_);
Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self);
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
// Copy results back to original non-contiguous output
if (needs_gather) {
self.copy_(self_);
}
}
return self;

View File

@ -1358,15 +1358,9 @@ if(BUILD_TEST)
)
else()
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(${TORCH_ROOT}/test/cpp/lazy ${CMAKE_BINARY_DIR}/test_lazy)
# NativeRT is disabled
# add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_abi_check ${CMAKE_BINARY_DIR}/test_aoti_abi_check)
if(BUILD_AOT_INDUCTOR_TEST)
add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_inference ${CMAKE_BINARY_DIR}/test_aoti_inference)
endif()
if(USE_DISTRIBUTED)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
@ -1384,6 +1378,16 @@ if(BUILD_TEST)
${CMAKE_BINARY_DIR}/test_mobile_nnc
)
endif()
add_subdirectory(${TORCH_ROOT}/test/cpp/lazy
${CMAKE_BINARY_DIR}/test_lazy)
endif()
if(BUILD_AOT_INDUCTOR_TEST)
add_subdirectory(
${TORCH_ROOT}/test/cpp/aoti_abi_check
${CMAKE_BINARY_DIR}/test_aoti_abi_check)
add_subdirectory(
${TORCH_ROOT}/test/cpp/aoti_inference
${CMAKE_BINARY_DIR}/test_aoti_inference)
endif()
endif()

View File

@ -1,8 +1,3 @@
# Skip on windows
if(WIN32)
return()
endif()
set(AOTI_ABI_CHECK_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_abi_check)
# Build the cpp gtest binary containing the cpp-only tests.
@ -35,15 +30,8 @@ target_compile_definitions(test_aoti_abi_check PRIVATE USE_GTEST)
# WARNING: DO NOT LINK torch!!!
# The purpose is to check if the used aten/c10 headers are written in a header-only way
target_link_libraries(test_aoti_abi_check PRIVATE gtest_main sleef)
target_link_libraries(test_aoti_abi_check PRIVATE gtest_main)
target_include_directories(test_aoti_abi_check PRIVATE ${ATen_CPU_INCLUDE})
if(NOT USE_SYSTEM_SLEEF)
target_include_directories(test_aoti_abi_check PRIVATE ${CMAKE_BINARY_DIR}/include)
endif()
# Disable unused-variable warnings for variables that are only used to test compilation
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-variable)
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-but-set-variable)
foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
@ -53,17 +41,12 @@ foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
separate_arguments(FLAGS UNIX_COMMAND "${FLAGS}")
add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")
target_link_libraries(${test_name}_${CPU_CAPABILITY} PRIVATE gtest_main sleef)
target_link_libraries(${test_name}_${CPU_CAPABILITY} PRIVATE gtest_main)
target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE ${ATen_CPU_INCLUDE})
if(NOT USE_SYSTEM_SLEEF)
target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE ${CMAKE_BINARY_DIR}/include)
endif()
# Define CPU_CAPABILITY and CPU_CAPABILITY_XXX macros for conditional compilation
target_compile_definitions(${test_name}_${CPU_CAPABILITY} PRIVATE CPU_CAPABILITY=${CPU_CAPABILITY} CPU_CAPABILITY_${CPU_CAPABILITY})
target_compile_options(${test_name}_${CPU_CAPABILITY} PRIVATE ${FLAGS})
target_compile_options_if_supported(${test_name}_${CPU_CAPABILITY} -Wno-unused-variable)
target_compile_options_if_supported(${test_name}_${CPU_CAPABILITY} -Wno-unused-but-set-variable)
endforeach()
endforeach()

View File

@ -2,27 +2,10 @@
#include <ATen/cpu/vec/vec.h>
#include <iostream>
namespace torch {
namespace aot_inductor {
template <typename T>
void ExpectVecEqual(
const at::vec::Vectorized<T>& expected,
const at::vec::Vectorized<T>& actual) {
using Vec = at::vec::Vectorized<T>;
// Have to use std::vector for comparison because at::vec::Vectorized doesn't
// support operator[] on aarch64
std::vector<T> expected_data(Vec::size());
std::vector<T> actual_data(Vec::size());
expected.store(expected_data.data());
actual.store(actual_data.data());
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_data[i], actual_data[i]);
}
}
TEST(TestVec, TestAdd) {
using Vec = at::vec::Vectorized<int>;
std::vector<int> a(1024, 1);
@ -33,7 +16,9 @@ TEST(TestVec, TestAdd) {
std::vector<int> expected(1024, 3);
Vec expected_vec = Vec::loadu(expected.data());
ExpectVecEqual(expected_vec, actual_vec);
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
}
TEST(TestVec, TestMax) {
@ -45,7 +30,9 @@ TEST(TestVec, TestMax) {
Vec actual_vec = at::vec::maximum(a_vec, b_vec);
Vec expected_vec = b_vec;
ExpectVecEqual(expected_vec, actual_vec);
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
}
TEST(TestVec, TestMin) {
@ -57,7 +44,9 @@ TEST(TestVec, TestMin) {
Vec actual_vec = at::vec::minimum(a_vec, b_vec);
Vec expected_vec = a_vec;
ExpectVecEqual(expected_vec, actual_vec);
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
}
TEST(TestVec, TestConvert) {
@ -69,7 +58,9 @@ TEST(TestVec, TestConvert) {
auto actual_vec = at::vec::convert<float>(a_vec);
auto expected_vec = b_vec;
ExpectVecEqual(expected_vec, actual_vec);
for (int i = 0; i < at::vec::Vectorized<int>::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
}
TEST(TestVec, TestClampMin) {
@ -81,7 +72,9 @@ TEST(TestVec, TestClampMin) {
Vec actual_vec = at::vec::clamp_min(a_vec, min_vec);
Vec expected_vec = min_vec;
ExpectVecEqual(expected_vec, actual_vec);
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
}
} // namespace aot_inductor

View File

@ -1,3 +1,4 @@
set(AOT_INDUCTOR_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_inference)
# Build custom TorchScript op for AOTInductor
@ -7,12 +8,27 @@ set_target_properties(aoti_custom_class PROPERTIES
if(USE_CUDA)
target_compile_definitions(aoti_custom_class PRIVATE USE_CUDA)
elseif(USE_ROCM)
target_compile_definitions(aoti_custom_class PRIVATE USE_ROCM)
target_compile_definitions(aoti_custom_class PRIVATE USE_ROCM)
endif()
# Link against LibTorch
target_link_libraries(aoti_custom_class torch)
# the custom command that generates the TorchScript module
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/script_data.pt
${CMAKE_CURRENT_BINARY_DIR}/script_model_cpu.pt
${CMAKE_CURRENT_BINARY_DIR}/script_model_cuda.pt
# This script requires the torch package to be installed.
COMMAND python ${AOT_INDUCTOR_TEST_ROOT}/compile_model.py
DEPENDS torch torch_python aoti_custom_class ${AOT_INDUCTOR_TEST_ROOT}/compile_model.py
)
add_custom_target(aoti_script_model ALL
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_data.pt
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_model_cpu.pt
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_model_cuda.pt
)
add_dependencies(aoti_script_model aoti_custom_class)
# Build the cpp gtest binary containing the cpp-only tests.
set(INDUCTOR_TEST_SRCS
${AOT_INDUCTOR_TEST_ROOT}/test.cpp
@ -21,12 +37,23 @@ set(INDUCTOR_TEST_SRCS
add_executable(test_aoti_inference
${TORCH_ROOT}/test/cpp/common/main.cpp
${INDUCTOR_TEST_SRCS}
data.pt
script_data.pt
script_model_cpu.pt
script_model_cuda.pt
)
add_dependencies(test_aoti_inference aoti_custom_class)
add_dependencies(test_aoti_inference aoti_custom_class aoti_script_model)
# TODO temporary until we can delete the old gtest polyfills.
target_compile_definitions(test_aoti_inference PRIVATE USE_GTEST)
# Define a custom command to generate the library
add_custom_command(
OUTPUT data.pt
COMMAND python ${AOT_INDUCTOR_TEST_ROOT}/test.py
DEPENDS ${AOT_INDUCTOR_TEST_ROOT}/test.py
)
target_link_libraries(test_aoti_inference PRIVATE
torch
gtest_main
@ -44,10 +71,6 @@ target_compile_definitions(test_aoti_inference PRIVATE
CMAKE_CURRENT_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}
)
target_compile_options_if_supported(test_aoti_inference -Wno-unused-variable)
target_compile_options_if_supported(test_aoti_inference -Wno-unused-but-set-variable)
target_compile_options_if_supported(test_aoti_inference -Wno-unused-function)
if(INSTALL_TEST)
install(TARGETS test_aoti_inference DESTINATION bin)
# Install PDB files for MSVC builds

View File

@ -2,9 +2,7 @@
#include <gtest/gtest.h>
#include <atomic>
#include <condition_variable>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <functional>
#include <mutex>
#include <queue>
@ -30,64 +28,6 @@
namespace {
// Function to check if test data files exist and are valid
bool testDataFilesExist() {
std::string bindir = STRINGIZE(CMAKE_CURRENT_BINARY_DIR);
std::array<std::string, 4> required_files = {
"data.pt",
"script_data.pt",
"script_model_cpu.pt",
"script_model_cuda.pt"};
for (const auto& filename : required_files) {
std::string filepath = bindir + "/" + filename;
std::ifstream file(filepath);
if (!file.good()) {
return false;
}
}
return true;
}
// Function to ensure test data files are generated at runtime
void ensureTestDataGenerated() {
static std::once_flag generated_flag;
std::call_once(generated_flag, []() {
// Only generate if files don't exist or are placeholders
if (testDataFilesExist()) {
return;
}
std::string bindir = STRINGIZE(CMAKE_CURRENT_BINARY_DIR);
// Calculate path to source directory: build/test_aoti_inference -> build ->
// pytorch
std::string pytorch_root = bindir.substr(0, bindir.find_last_of("/"));
pytorch_root = pytorch_root.substr(0, pytorch_root.find_last_of("/"));
std::string source_dir = pytorch_root + "/test/cpp/aoti_inference";
// Generate test data files (data.pt, etc.) by running test.py directly
std::string test_script = source_dir + "/test.py";
std::string test_data_cmd = "cd " + bindir + " && python " + test_script;
std::cout << "Generating test data: " << test_data_cmd << std::endl;
int result1 = std::system(test_data_cmd.c_str());
if (result1 != 0) {
std::cerr << "Warning: Test data generation failed with code " << result1
<< std::endl;
}
// Generate model files (script_*.pt) by running compile_model.py directly
std::string compile_script = source_dir + "/compile_model.py";
std::string models_cmd = "cd " + bindir + " && python " + compile_script;
std::cout << "Generating model files: " << models_cmd << std::endl;
int result2 = std::system(models_cmd.c_str());
if (result2 != 0) {
std::cerr << "Warning: Model generation failed with code " << result2
<< std::endl;
}
});
}
const std::unordered_map<std::string, at::Tensor> derefTensorConstantMap(
torch::inductor::TensorConstantMap tensor_constant_map) {
std::unordered_map<std::string, at::Tensor> ret;
@ -915,6 +855,7 @@ void test_aoti_free_buffer(bool use_runtime_constant_folding) {
}
}
#if defined(USE_CUDA) || defined(USE_ROCM)
void test_cuda_alloc_test() {
torch::NoGradGuard no_grad;
@ -954,8 +895,8 @@ void test_cuda_alloc_test() {
runner->run(data_loader.attr(inputs_attr.c_str()).toTensorList().vec());
ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0]));
}
#endif
#ifdef USE_CUDA
class ThreadPool {
private:
struct Task {
@ -1096,96 +1037,86 @@ void test_multi_cuda_streams(const std::string& device) {
ASSERT_TRUE(torch::allclose(ref_output_tensors[0], all_outputs[i][0]));
}
}
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
#endif
} // namespace
namespace torch::aot_inductor {
// Test fixture that ensures test data is generated once for all tests
class AotInductorTest : public ::testing::Test {
public:
// This runs once before all tests in this test suite
static void SetUpTestSuite() {
ensureTestDataGenerated();
}
};
TEST_F(AotInductorTest, BasicTestCpu) {
TEST(AotInductorTest, BasicTestCpu) {
test_aoti("cpu", false);
}
TEST_F(AotInductorTest, BasicScriptTestCpu) {
TEST(AotInductorTest, BasicScriptTestCpu) {
test_aoti_script("cpu");
}
TEST_F(AotInductorTest, BasicPackageLoaderTestCpu) {
TEST(AotInductorTest, BasicPackageLoaderTestCpu) {
test_aoti_package_loader("cpu", false);
}
TEST_F(AotInductorTest, ExtractConstantsMapCpu) {
TEST(AotInductorTest, ExtractConstantsMapCpu) {
test_aoti_extract_constants_map("cpu");
}
#ifdef USE_CUDA
TEST_F(AotInductorTest, BasicTestCuda) {
TEST(AotInductorTest, BasicTestCuda) {
test_aoti("cuda", true);
test_aoti("cuda", false);
}
TEST_F(AotInductorTest, BasicScriptTestCuda) {
TEST(AotInductorTest, BasicScriptTestCuda) {
test_aoti_script("cuda");
}
TEST_F(AotInductorTest, BasicPackageLoaderTestCuda) {
TEST(AotInductorTest, BasicPackageLoaderTestCuda) {
test_aoti_package_loader("cuda", false);
}
TEST_F(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) {
TEST(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) {
test_aoti_package_loader_multi_gpu("cuda", false);
}
TEST_F(AotInductorTest, UpdateUserManagedConstantsCuda) {
TEST(AotInductorTest, UpdateUserManagedConstantsCuda) {
test_aoti_user_managed_buffer();
}
TEST_F(AotInductorTest, RuntimeUpdateConstantsCuda) {
TEST(AotInductorTest, RuntimeUpdateConstantsCuda) {
test_aoti_constants_update("cuda", true);
}
TEST_F(AotInductorTest, UpdateConstantsCuda) {
TEST(AotInductorTest, UpdateConstantsCuda) {
test_aoti_constants_update("cuda", false);
}
TEST_F(AotInductorTest, ExtractConstantsMapCuda) {
TEST(AotInductorTest, ExtractConstantsMapCuda) {
test_aoti_extract_constants_map("cuda");
}
TEST_F(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) {
TEST(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) {
test_aoti_double_buffering("cuda", true);
}
TEST_F(AotInductorTest, UpdateInactiveConstantsCuda) {
TEST(AotInductorTest, UpdateInactiveConstantsCuda) {
test_aoti_double_buffering("cuda", false);
}
TEST_F(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
TEST(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
test_aoti_double_buffering_with_tensor_constants();
}
TEST_F(AotInductorTest, FreeInactiveConstantBufferCuda) {
TEST(AotInductorTest, FreeInactiveConstantBufferCuda) {
test_aoti_free_buffer(false);
}
TEST_F(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) {
TEST(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) {
test_aoti_free_buffer(true);
}
TEST_F(AotInductorTest, MultiStreamTestCuda) {
TEST(AotInductorTest, MultiStreamTestCuda) {
test_multi_cuda_streams("cuda");
}
TEST_F(AotInductorTest, CudaAllocTestCuda) {
TEST(AotInductorTest, CudaAllocTestCuda) {
test_cuda_alloc_test();
}
#endif

View File

@ -279,7 +279,6 @@ class SymmetricMemoryTest(MultiProcContinuousTest):
# MultiProcContinuousTest will skip all the following tests if a test fails (
# we should fix this too). We still want to get the test signals for the core
# symmetric memory APIs when Async TP ops fail.
@skip_if_rocm_multiprocess # AsyncTP is not yet supported on ROCm
@instantiate_parametrized_tests
@requires_cuda_p2p_access()
class AsyncTPTest(MultiProcContinuousTest):

View File

@ -892,16 +892,10 @@ fn(torch.randn(5))
os.remove(
file_path
) # Delete temp file manually, due to setup NamedTemporaryFile as delete=False.
orig_maxDiff = unittest.TestCase.maxDiff
unittest.TestCase.maxDiff = None
try:
self.assertEqual( # process wrap difference: /r/n on Windows, /n on posix.
empty_line_normalizer(lines),
empty_line_normalizer(stderr.decode("utf-8")),
)
except Exception:
unittest.TestCase.maxDiff = orig_maxDiff
raise
self.assertEqual( # process wrap difference: /r/n on Windows, /n on posix.
empty_line_normalizer(lines),
empty_line_normalizer(stderr.decode("utf-8")),
)
@make_settings_test("torch._dynamo.eval_frame")
def test_log_traced_frames(self, records):

View File

@ -1000,18 +1000,6 @@ class ReproTests(torch._dynamo.test_case.TestCase):
self.exit_stack.close()
super().tearDown()
def test_compiled_module_truthiness(self):
# Test with empty ModuleList
original_empty = nn.ModuleList()
compiled_empty = torch.compile(original_empty)
self.assertEqual(bool(original_empty), bool(compiled_empty))
self.assertFalse(bool(compiled_empty))
# Test with non-empty ModuleList
original_filled = nn.ModuleList([nn.Linear(10, 5)])
compiled_filled = torch.compile(original_filled)
self.assertEqual(bool(original_filled), bool(compiled_filled))
self.assertTrue(bool(compiled_filled))
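The removed test depended on OptimizedModule proxying __len__, which this PR also deletes (see the torch/_dynamo/eval_frame.py hunk later in this diff). A minimal sketch of the resulting behavior, assuming Python's default truthiness for objects without __len__ or __bool__:

import torch
import torch.nn as nn

print(bool(nn.ModuleList()))                 # False: ModuleList defines __len__
print(bool(torch.compile(nn.ModuleList())))  # True: no __len__ proxy anymore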
def guard_manager_clone_hook_fn(self, guard_manager_wrapper, f_locals, builder):
root = guard_manager_wrapper.root
cloned_root = root.clone_manager(lambda x: True)

View File

@ -2,11 +2,8 @@
# flake8: noqa: B950
import functools
import json
import os
import random
import string
import tempfile
import unittest
import warnings
from collections import namedtuple
@ -7048,120 +7045,6 @@ class TestLearnableBiases(InductorTestCase):
def test_flex_attention_with_dynamic_max_autotune_graph_partition(self, device):
self._test_flex_attention_with_dynamic_max_autotune(device)
@skip_on_cpu
def test_flex_attention_logging(self, device):
with tempfile.TemporaryDirectory() as tmpdir:
log_file = os.path.join(tmpdir, "flex_attention_configs")
with patch.dict(
os.environ, {"TORCHINDUCTOR_FLEX_ATTENTION_LOGGING_FILE": log_file}
):
query = torch.randn(
1,
2,
128,
64,
device=device,
dtype=torch.float16,
requires_grad=True,
)
key = torch.randn(
1,
2,
128,
64,
device=device,
dtype=torch.float16,
requires_grad=True,
)
value = torch.randn(
1,
2,
128,
64,
device=device,
dtype=torch.float16,
requires_grad=True,
)
def score_mod(score, b, h, q_idx, kv_idx):
return score * 2
def causal_mask(b, h, q_idx, kv_idx):
return q_idx >= kv_idx
block_mask = torch.compile(create_block_mask)(
causal_mask, 1, 1, 128, 128, device=device
)
compiled_flex = torch.compile(
flex_attention, mode="max-autotune-no-cudagraphs"
)
out = compiled_flex(
query=query,
key=key,
value=value,
score_mod=score_mod,
block_mask=block_mask,
)
out.sum().backward()
json_file = log_file + ".json"
self.assertTrue(
os.path.exists(json_file), f"Log file {json_file} was not created"
)
with open(json_file) as f:
log_data = json.load(f)
self.assertIsInstance(log_data, list)
self.assertEqual(len(log_data), 2)
keys_seen = [next(iter(entry.keys())) for entry in log_data]
expected_fwd_key = "('forward', 1, 2, 2, 128, 128, 64, 64)"
expected_bwd_key = "('backward', 1, 2, 2, 128, 128, 64, 64)"
self.assertIn(expected_fwd_key, keys_seen)
self.assertIn(expected_bwd_key, keys_seen)
for entry in log_data:
self.assertIsInstance(entry, dict)
self.assertEqual(len(entry), 1)
dims_key = next(iter(entry.keys()))
choices = entry[dims_key]
kernel_type = eval(dims_key)[0]
self.assertIsInstance(choices, list)
self.assertGreater(len(choices), 0)
for i, choice in enumerate(choices):
self.assertIn("type", choice)
self.assertIn("time", choice)
if choice["type"] == "triton":
self.assertIn("num_warps", choice)
self.assertIn("num_stages", choice)
if kernel_type == "forward":
self.assertIn("BLOCK_M", choice)
self.assertIn("BLOCK_N", choice)
self.assertNotIn("BLOCK_M1", choice)
elif kernel_type == "backward":
self.assertIn("BLOCK_M1", choice)
self.assertIn("BLOCK_N1", choice)
self.assertIn("BLOCK_M2", choice)
self.assertIn("BLOCK_N2", choice)
self.assertNotIn("BLOCK_M", choice)
self.assertNotIn("BLOCK_N", choice)
if i > 0:
self.assertLessEqual(choices[0]["time"], choice["time"])
@skip_on_cpu
def test_inspect_bug(self, device):
# https://github.com/pytorch/pytorch/issues/139374

View File

@ -12,6 +12,7 @@ from torch.testing._internal.common_device_type import (
dtypes,
dtypesIfMPS,
expectedFailureMPS,
expectedFailureMPSPre15,
expectedFailureXLA,
instantiate_device_type_tests,
)
@ -172,6 +173,7 @@ class TestDropoutNNDeviceType(NNTestCase):
else:
self.assertNotEqual(permuted_inp, out)
@expectedFailureMPSPre15
def test_Dropout(self, device):
input = torch.empty(1000)
self._test_dropout(nn.Dropout, device, input)

View File

@ -529,7 +529,7 @@ class TestProfiler(TestCase):
found_mm = True
if "gemm" in e.name.lower() or "Cijk" in e.name:
found_gemm = True
if "memcpy" in e.name.lower() or "__amd_rocclr_copyBuffer" in e.name:
if "memcpy" in e.name.lower():
found_memcpy = True
if use_cuda:
self.assertTrue(found_gemm)

View File

@ -27,7 +27,6 @@ import torch
import torch.distributed as dist
from torch.multiprocessing import current_process, get_context
from torch.testing._internal.common_utils import (
get_report_dir,
get_report_path,
IS_CI,
IS_MACOS,
@ -35,6 +34,7 @@ from torch.testing._internal.common_utils import (
set_cwd,
shell,
TEST_CUDA,
TEST_SAVE_XML,
TEST_WITH_ASAN,
TEST_WITH_ROCM,
TEST_WITH_SLOW_GRADCHECK,
@ -529,14 +529,6 @@ def run_test(
replacement = {"-f": "-x", "-dist=loadfile": "--dist=loadfile"}
unittest_args = [replacement.get(arg, arg) for arg in unittest_args]
xml_report_dir = get_report_dir(test_file, None, options.pytest)
if is_cpp_test:
unittest_args.append(
f"--junit-xml-reruns={get_report_path(xml_report_dir, test_file)}"
)
else:
unittest_args.append(f"--save-xml={xml_report_dir}")
if options.showlocals:
if options.pytest:
unittest_args.extend(["--showlocals", "--tb=long", "--color=yes"])
@ -1234,6 +1226,12 @@ def get_pytest_args(options, is_cpp_test=False, is_distributed_test=False):
# is much slower than running them directly
pytest_args.extend(["-n", str(NUM_PROCS)])
if TEST_SAVE_XML:
# Add the option to generate XML test report here as C++ tests
# won't go into common_utils
test_report_path = get_report_path(pytest=True)
pytest_args.extend(["--junit-xml-reruns", test_report_path])
if options.pytest_k_expr:
pytest_args.extend(["-k", options.pytest_k_expr])

View File

@ -7846,45 +7846,6 @@ class TestMPS(TestCaseMPS):
y = torch.normal(torch.zeros(shape, device="mps"), torch.ones(shape, device="mps"))
self.assertNotEqual(y[0], y[1])
def test_random_ops_noncontiguous(self):
"""Test random in-place operations on non-contiguous tensors.
All random in-place operations should work on non-contiguous tensors.
See issues #165257 and #124029.
"""
# Test each random in-place operation
ops = [
("normal_", lambda t: t.normal_(0, 1)),
("uniform_", lambda t: t.uniform_(0, 1)),
("exponential_", lambda t: t.exponential_(1.0)),
("bernoulli_", lambda t: t.bernoulli_(0.5)),
("random_", lambda t: t.random_()),
("random_with_to", lambda t: t.random_(10)),
("random_with_range", lambda t: t.random_(0, 10)),
]
for name, op_func in ops:
with self.subTest(operation=name):
# Create non-contiguous tensor via transpose
t_mps = torch.zeros(50, 50, device='mps').T.clone()
self.assertFalse(t_mps.is_contiguous(),
f"{name}: tensor should be non-contiguous")
# Apply operation
op_func(t_mps)
# Verify tensor was modified (not all zeros)
max_val = t_mps.max().item()
self.assertNotEqual(max_val, 0.0,
f"{name}: operation failed to modify non-contiguous tensor")
# Test rand_like specifically (issue #124029)
t = torch.ones((3, 2, 2), device='mps').permute(2, 0, 1)
self.assertFalse(t.is_contiguous(), "rand_like input should be non-contiguous")
result = torch.rand_like(t)
self.assertFalse(result.is_contiguous(), "rand_like result should be non-contiguous")
self.assertNotEqual(result.max().item(), 0.0, "rand_like should generate non-zero values")
# Test exponential
@unittest.skip("This does not test anything")
def test_exponential(self):

View File

@ -5,22 +5,29 @@ from collections import namedtuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.attention import varlen_attn
from torch.nn.attention.varlen import varlen_attn
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_nn import NNTestCase
from torch.testing._internal.common_utils import parametrize, run_tests
from torch.testing._internal.common_utils import parametrize, run_tests, skipIfRocm
from torch.utils._python_dispatch import TorchDispatchMode
VarlenShape = namedtuple(
"VarlenShape", ["batch_size", "max_seq_len", "embed_dim", "num_heads"]
)
default_tolerances = {
torch.float16: {"atol": 1e-1, "rtol": 1e-1},
torch.bfloat16: {"atol": 9e-2, "rtol": 5e-2},
torch.float32: {"atol": 1e-5, "rtol": 1.3e-6},
}
class OpLoggingMode(TorchDispatchMode):
"""Logging mode that captures all dispatched operations"""
def __init__(self):
self.called_ops = []
def __torch_dispatch__(self, func, types, args=(), kwargs=None):
op_name = str(func)
self.called_ops.append(op_name)
return func(*args, **(kwargs or {}))
class AttentionBlock(nn.Module):
@ -39,12 +46,9 @@ class AttentionBlock(nn.Module):
embed_dim, embed_dim, bias=False, device=device, dtype=dtype
)
def forward_varlen(
def get_varlen_qkv(
self,
x_packed: torch.Tensor,
cu_seq: torch.Tensor,
max_len: int,
is_causal: bool = False,
):
qkv = self.qkv_proj(x_packed)
q, k, v = qkv.chunk(3, dim=-1)
@ -53,24 +57,56 @@ class AttentionBlock(nn.Module):
k = k.view(-1, self.num_heads, self.head_dim)
v = v.view(-1, self.num_heads, self.head_dim)
attn_out = varlen_attn(
q, k, v, cu_seq, cu_seq, max_len, max_len, is_causal=is_causal
)
return q, k, v
def forward_varlen(
self,
x_packed: torch.Tensor,
cu_seq: torch.Tensor,
max_len: int,
is_causal: bool = False,
):
q, k, v = self.get_varlen_qkv(x_packed)
attn_out = varlen_attn(q, k, v, cu_seq, cu_seq, max_len, max_len, is_causal)
attn_out = attn_out.view(-1, self.embed_dim)
return self.out_proj(attn_out)
def forward_sdpa(self, x_padded: torch.Tensor, is_causal: bool = False):
def forward_sdpa(
self,
x_padded: torch.Tensor,
seq_lengths: torch.Tensor,
is_causal: bool = False,
):
batch_size, seq_len, _ = x_padded.shape
qkv = self.qkv_proj(x_padded)
q, k, v = qkv.chunk(3, dim=-1)
mask = (
torch.arange(seq_len, device=x_padded.device)[None, :]
< seq_lengths[:, None]
)
attn_mask = mask[:, None, None, :].expand(
batch_size, self.num_heads, seq_len, seq_len
)
q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
attn_out = F.scaled_dot_product_attention(q, k, v, is_causal=is_causal)
if is_causal:
causal_mask = torch.triu(
torch.ones(seq_len, seq_len, device=x_padded.device, dtype=torch.bool),
diagonal=1,
)
combined_mask = causal_mask[None, None, :, :] | ~attn_mask
attn_out = F.scaled_dot_product_attention(q, k, v, attn_mask=~combined_mask)
else:
attn_out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
attn_out = (
attn_out.transpose(1, 2)
.contiguous()
@ -91,7 +127,9 @@ def create_variable_length_batch(
seq_lengths = torch.tensor(seq_lengths, device=device)
total_tokens = seq_lengths.sum().item()
x_packed = torch.randn(total_tokens, shape.embed_dim, device=device, dtype=dtype)
x_packed = torch.randn(
total_tokens, shape.embed_dim, device=device, dtype=dtype, requires_grad=True
)
cu_seq = torch.zeros(shape.batch_size + 1, device=device, dtype=torch.int32)
cu_seq[1:] = seq_lengths.cumsum(0)
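For reference, this is the usual varlen convention: cu_seq[i] is the packed offset where sequence i starts, so the tokens of sequence i live at packed indices cu_seq[i]:cu_seq[i + 1]. A standalone sketch:

import torch

seq_lengths = torch.tensor([3, 5, 2])
cu_seq = torch.zeros(len(seq_lengths) + 1, dtype=torch.int32)
cu_seq[1:] = seq_lengths.cumsum(0)
print(cu_seq)  # tensor([ 0,  3,  8, 10], dtype=torch.int32)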
@ -106,6 +144,7 @@ def create_variable_length_batch(
end_idx = start_idx + seq_len
x_padded[i, :seq_len] = x_packed[start_idx:end_idx]
start_idx = end_idx
x_padded = x_padded.clone().detach().requires_grad_()
return {
"seq_lengths": seq_lengths,
@ -118,6 +157,7 @@ def create_variable_length_batch(
class TestVarlenAttention(NNTestCase):
@skipIfRocm(msg="ROCM does not support variable length attention")
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention not supported"
)
@ -133,7 +173,11 @@ class TestVarlenAttention(NNTestCase):
total_tokens = shape.batch_size * shape.max_seq_len
x_packed = torch.randn(
total_tokens, shape.embed_dim, device=device, dtype=dtype
total_tokens,
shape.embed_dim,
device=device,
dtype=dtype,
requires_grad=True,
)
cu_seq = torch.tensor(
[0, shape.max_seq_len, total_tokens], device=device, dtype=torch.int32
@ -147,6 +191,131 @@ class TestVarlenAttention(NNTestCase):
self.assertEqual(output.device, torch.device(device))
self.assertEqual(output.dtype, dtype)
varlen_grad_out = torch.ones_like(output)
varlen_grad = torch.autograd.grad(
outputs=output,
inputs=x_packed,
grad_outputs=varlen_grad_out,
retain_graph=True,
create_graph=False,
allow_unused=False,
)[0]
self.assertIsNotNone(varlen_grad)
self.assertEqual(varlen_grad.shape, x_packed.shape)
self.assertEqual(varlen_grad.dtype, x_packed.dtype)
@skipIfRocm(msg="ROCM does not support variable length attention")
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention not supported"
)
@parametrize("dtype", [torch.bfloat16, torch.float16])
def test_custom_op_compliance(self, device, dtype):
torch.manual_seed(42)
shape = VarlenShape(batch_size=2, max_seq_len=512, embed_dim=1024, num_heads=16)
attention_block = AttentionBlock(
shape.embed_dim, shape.num_heads, device, dtype
)
total_tokens = shape.batch_size * shape.max_seq_len
x_packed = torch.randn(
total_tokens,
shape.embed_dim,
device=device,
dtype=dtype,
)
cu_seq = torch.tensor(
[0, shape.max_seq_len, total_tokens], device=device, dtype=torch.int32
)
q, k, v = attention_block.get_varlen_qkv(x_packed)
torch.library.opcheck(
torch.ops.torch_attn._varlen_attn,
(q, k, v, cu_seq, cu_seq, shape.max_seq_len, shape.max_seq_len, False),
)
out, lse, rng_state = torch.ops.torch_attn._varlen_attn(
q, k, v, cu_seq, cu_seq, shape.max_seq_len, shape.max_seq_len, False
)
grad_out = torch.randn_like(out)
# we don't support double backward
# skipping test_autograd_registration, test_aot_dispatch_dynamic, test_aot_dispatch_static
torch.library.opcheck(
torch.ops.torch_attn._varlen_attn_backward,
(
grad_out,
q,
k,
v,
out,
lse,
cu_seq,
cu_seq,
shape.max_seq_len,
shape.max_seq_len,
False,
rng_state,
),
test_utils=["test_schema", "test_faketensor"],
)
@skipIfRocm(msg="ROCM does not support variable length attention")
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention not supported"
)
@parametrize("dtype", [torch.bfloat16, torch.float16])
def test_custom_op_registration(self, device, dtype):
torch.manual_seed(42)
shape = VarlenShape(batch_size=2, max_seq_len=512, embed_dim=1024, num_heads=16)
attention_block = AttentionBlock(
shape.embed_dim, shape.num_heads, device, dtype
)
total_tokens = shape.batch_size * shape.max_seq_len
x_packed = torch.randn(
total_tokens,
shape.embed_dim,
device=device,
dtype=dtype,
requires_grad=True,
)
cu_seq = torch.tensor(
[0, shape.max_seq_len, total_tokens], device=device, dtype=torch.int32
)
compiled_forward = torch.compile(
attention_block.forward_varlen, backend="eager", fullgraph=True
)
with OpLoggingMode() as mode:
output = compiled_forward(
x_packed, cu_seq, shape.max_seq_len, is_causal=False
)
varlen_grad_out = torch.ones_like(output)
_ = torch.autograd.grad(
outputs=output,
inputs=x_packed,
grad_outputs=varlen_grad_out,
retain_graph=True,
create_graph=False,
allow_unused=False,
)[0]
called_ops = mode.called_ops
custom_ops_called = any(
"torch_attn._varlen_attn" in op for op in called_ops
) and any("torch_attn._varlen_attn_backward" in op for op in called_ops)
assert custom_ops_called
@skipIfRocm(msg="ROCM does not support variable length attention")
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention not supported"
)
@ -156,14 +325,21 @@ class TestVarlenAttention(NNTestCase):
torch.manual_seed(42)
shape = VarlenShape(
batch_size=8, max_seq_len=2048, embed_dim=1024, num_heads=16
batch_size=2, max_seq_len=128, embed_dim=32, num_heads=4
)
attention_block = AttentionBlock(
shape.embed_dim, shape.num_heads, device, dtype
)
golden_attention_block = AttentionBlock(
shape.embed_dim, shape.num_heads, device, torch.float64
)
variable_length_batch_data = create_variable_length_batch(shape, device, dtype)
golden_variable_length_batch_data = create_variable_length_batch(
shape, device, torch.float64
)
varlen_output = attention_block.forward_varlen(
variable_length_batch_data["x_packed"],
@ -172,18 +348,89 @@ class TestVarlenAttention(NNTestCase):
is_causal=is_causal,
)
sdpa_output = attention_block.forward_sdpa(
variable_length_batch_data["x_padded"], is_causal=is_causal
variable_length_batch_data["x_padded"],
variable_length_batch_data["seq_lengths"],
is_causal=is_causal,
)
golden_sdpa_output = golden_attention_block.forward_sdpa(
golden_variable_length_batch_data["x_padded"],
golden_variable_length_batch_data["seq_lengths"],
is_causal=is_causal,
)
tolerances = default_tolerances[dtype]
start_idx = 0
for i, seq_len in enumerate(variable_length_batch_data["seq_lengths"]):
end_idx = start_idx + seq_len
varlen_seq = varlen_output[start_idx:end_idx]
sdpa_seq = sdpa_output[i, :seq_len]
golden_sdpa_seq = golden_sdpa_output[i, :seq_len]
fwd_atol = (
2 * (golden_sdpa_seq + 0.3 - 0.3 - golden_sdpa_seq).abs().max().item()
)
varlen_error = (varlen_seq - golden_sdpa_seq).abs().max().item()
sdpa_error = (sdpa_seq - golden_sdpa_seq).abs().max().item()
assert varlen_error <= sdpa_error + fwd_atol
start_idx = end_idx
varlen_grad_out = torch.ones_like(varlen_output)
sdpa_grad_out = torch.ones_like(sdpa_output)
golden_sdpa_grad_out = torch.ones_like(golden_sdpa_output)
start_idx = 0
for i, seq_len in enumerate(variable_length_batch_data["seq_lengths"]):
end_idx = start_idx + seq_len
sdpa_grad_out[i, :seq_len] = varlen_grad_out[start_idx:end_idx]
start_idx = end_idx
varlen_grad = torch.autograd.grad(
outputs=varlen_output,
inputs=variable_length_batch_data["x_packed"],
grad_outputs=varlen_grad_out,
retain_graph=True,
create_graph=False,
allow_unused=False,
)[0]
sdpa_grad = torch.autograd.grad(
outputs=sdpa_output,
inputs=variable_length_batch_data["x_padded"],
grad_outputs=sdpa_grad_out,
retain_graph=True,
create_graph=False,
allow_unused=False,
)[0]
golden_sdpa_grad = torch.autograd.grad(
outputs=golden_sdpa_output,
inputs=golden_variable_length_batch_data["x_padded"],
grad_outputs=golden_sdpa_grad_out,
retain_graph=True,
create_graph=False,
allow_unused=False,
)[0]
start_idx = 0
for i, seq_len in enumerate(variable_length_batch_data["seq_lengths"]):
end_idx = start_idx + seq_len
varlen_grad_seq = varlen_grad[start_idx:end_idx]
sdpa_grad_seq = sdpa_grad[i, :seq_len]
golden_sdpa_seq = golden_sdpa_grad[i, :seq_len]
fwd_atol = (
2 * (golden_sdpa_seq + 0.3 - 0.3 - golden_sdpa_seq).abs().max().item()
)
varlen_error = (varlen_grad_seq - golden_sdpa_seq).abs().max().item()
sdpa_error = (sdpa_grad_seq - golden_sdpa_seq).abs().max().item()
assert varlen_error <= sdpa_error + fwd_atol
torch.testing.assert_close(varlen_grad_seq, sdpa_grad_seq, **tolerances)
start_idx = end_idx

View File

@ -42,7 +42,7 @@ import weakref
from dataclasses import dataclass
from enum import Enum
from os.path import dirname, join
from typing import Any, NamedTuple, Optional, Sized, TYPE_CHECKING, Union
from typing import Any, NamedTuple, Optional, TYPE_CHECKING, Union
from unittest.mock import patch
import sympy
@ -395,13 +395,6 @@ class OptimizedModule(torch.nn.Module):
self._initialize()
self.training = self._orig_mod.training
def __len__(self) -> int:
# Proxy the len call to the original module
if isinstance(self._orig_mod, Sized):
return len(self._orig_mod)
# Mimic python's default behavior for objects without a length
raise TypeError(f"{type(self._orig_mod).__name__} does not support len()")
def _initialize(self) -> None:
# Do this stuff in constructor to lower overhead slightly
if isinstance(self.dynamo_ctx, DisableContext):

View File

@ -1793,6 +1793,14 @@ def _aot_stage2b_bw_compile(
# tensor which is wrong.
ph_size = ph_arg.size()
# pyrefly: ignore # bad-argument-type
if len(ph_size) == 0 and len(real_stride) > 0:
# Fix for 0-dimensional tensors: When a tensor becomes 0-d
# (e.g., via squeeze), its stride should be () not (1,).
# This mismatch can occur when dynamic shape operations produce
# tensors that are later squeezed to 0-d. The stride metadata
# may get preserved, causing a dimension mismatch (#164814)
real_stride = ()
# pyrefly: ignore # bad-argument-type
placeholder_list[i] = ph_arg.as_strided(ph_size, real_stride)
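A quick illustration of the invariant this guard restores: a 0-d tensor carries an empty stride tuple, so as_strided with a stale 1-d stride would fail.

import torch

y = torch.randn(1).squeeze()  # squeeze down to a 0-d tensor
print(y.dim(), y.stride())    # 0 ()
y.as_strided((), ())          # fine: size and stride lengths match
# y.as_strided((), (1,)) would raise: size has 0 dims but stride has 1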

View File

@ -409,10 +409,9 @@ class SchedulerDonatedBuffer(SchedulerBuffer):
class BaseSchedulerNode:
ancestors: OrderedSet[str]
debug_device_str: Callable[[BaseSchedulerNode], list[str]]
group: tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]
last_usage: OrderedSet[str]
read_writes: dependencies.ReadWrites
unmet_dependencies: OrderedSet[Dep]
# .min_order and .max_order are only relevant for "grouped" nodes such as FusedSchedulerNode.
# e.g. if the FusedSchedulerNode includes nodes (op_1, op_2, op_3), and op_X is X-th node
# in `self.scheduler.nodes`, then for this FusedSchedulerNode, .min_order is 1 and .max_order is 3.
@ -421,24 +420,22 @@ class BaseSchedulerNode:
min_order: int
max_order: int
mpi_node: MemoryPlanningInfoForNode
mutation_renames: dict[str, str]
node: Optional[ir.Operation]
outputs: list[SchedulerBuffer]
outputs_by_name: dict[str, SchedulerBuffer]
override_estimated_runtime: Optional[float] = None
read_writes: dependencies.ReadWrites
unmet_dependencies: OrderedSet[Dep]
def __init__(self, scheduler: Scheduler) -> None:
self.scheduler = scheduler
self.debug_device_str = lambda *args, **kwargs: []
self.scheduler: Scheduler = scheduler
self.debug_device_str: Callable[[BaseSchedulerNode], list[str]] = (
lambda *args, **kwargs: []
)
def _init_from_node(self, node: ir.Operation) -> None:
self.node = node
self.ancestors = OrderedSet()
self.last_usage = OrderedSet() # buffers that won't be used after this kernel
self.node: Optional[ir.Operation] = node
self.ancestors: OrderedSet[str] = OrderedSet()
self.last_usage = OrderedSet[
str
]() # buffers that won't be used after this kernel
self.written = False
self.outputs = [
self.outputs: list[SchedulerBuffer] = [
SchedulerBuffer(
scheduler=self.scheduler,
node=output,
@ -446,14 +443,16 @@ class BaseSchedulerNode:
)
for output in node.get_outputs()
]
self.outputs_by_name = {buf.get_name(): buf for buf in self.outputs}
self.outputs_by_name: dict[str, SchedulerBuffer] = {
buf.get_name(): buf for buf in self.outputs
}
# mutation_renames for the current node. Due to potential
# more mutations happening later, this can be different
# to Scheduler.mutation_renames. Also this dict should be small
# since only mutation information relevant to the deps for this
# node is stored here.
self.mutation_renames = {}
self.mutation_renames: dict[str, str] = {}
def __repr__(self) -> str:
return f"{type(self).__name__}(name={self.get_name()!r})"
@ -2436,34 +2435,6 @@ def pick_loop_order(
return order
def _replace_operation_buffer(
orig_node: ir.MultiTemplateBuffer, new_node: ir.OperationBuffer
) -> None:
replaced_buf_name = new_node.get_name()
orig_buf_name = orig_node.get_name()
assert isinstance(orig_buf_name, str) and isinstance(replaced_buf_name, str)
replaced_op_name = new_node.get_operation_name()
orig_op_name = orig_node.get_operation_name()
assert isinstance(orig_op_name, str) and isinstance(replaced_op_name, str)
del V.graph.name_to_buffer[replaced_buf_name]
new_node.name = orig_buf_name
del V.graph.name_to_op[replaced_op_name]
new_node.operation_name = orig_op_name
orig = V.graph.buffers.index(orig_node)
V.graph.buffers.remove(new_node)
V.graph.buffers[orig] = new_node
V.graph.name_to_buffer[orig_buf_name] = new_node
orig = V.graph.operations.index(orig_node)
V.graph.operations.remove(new_node)
V.graph.operations[orig] = new_node
V.graph.name_to_op[orig_op_name] = new_node
@dataclasses.dataclass
class NodeUser:
node: Union[BaseSchedulerNode, OutputNode]
@ -3365,6 +3336,33 @@ class Scheduler:
will force completion of compilation and benchmarking.
"""
def replace_operation_buffer(
orig_node: ir.MultiTemplateBuffer, new_node: ir.OperationBuffer
) -> None:
replaced_buf_name = new_node.get_name()
orig_buf_name = orig_node.get_name()
assert isinstance(orig_buf_name, str) and isinstance(replaced_buf_name, str)
replaced_op_name = new_node.get_operation_name()
orig_op_name = orig_node.get_operation_name()
assert isinstance(orig_op_name, str) and isinstance(replaced_op_name, str)
del V.graph.name_to_buffer[replaced_buf_name]
new_node.name = orig_buf_name
del V.graph.name_to_op[replaced_op_name]
new_node.operation_name = orig_op_name
orig = V.graph.buffers.index(orig_node)
V.graph.buffers.remove(new_node)
V.graph.buffers[orig] = new_node
V.graph.name_to_buffer[orig_buf_name] = new_node
orig = V.graph.operations.index(orig_node)
V.graph.operations.remove(new_node)
V.graph.operations[orig] = new_node
V.graph.name_to_op[orig_op_name] = new_node
for i, node in enumerate(self.nodes):
if isinstance(node, SchedulerNode) and isinstance(
node.node, ir.MultiTemplateBuffer
@ -3418,47 +3416,40 @@ class Scheduler:
assign_origin_node(out_tensorbox, multi_node.origin_node)
out_buffer.layout = multi_node.layout
self._replace_node(out_buffer, multi_node, i, node)
replace_operation_buffer(multi_node, out_buffer)
new_scheduler_node = self.create_scheduler_node(out_buffer)
def _replace_node(
self,
out_buffer: ir.OperationBuffer,
multi_node: ir.MultiTemplateBuffer,
i: int,
node: SchedulerNode,
) -> None:
_replace_operation_buffer(multi_node, out_buffer)
new_scheduler_node = self.create_scheduler_node(out_buffer)
self.nodes[i] = new_scheduler_node
self.name_to_node[node.get_name()] = new_scheduler_node
self.name_to_fused_node[node.get_name()] = new_scheduler_node
self.nodes[i] = new_scheduler_node
self.name_to_node[node.get_name()] = new_scheduler_node
self.name_to_fused_node[node.get_name()] = new_scheduler_node
# We need to reflect the mutation renames that were recorded in the original node
mutation_renames = {}
for dep in itertools.chain(
node.read_writes.reads, node.unmet_dependencies
):
if real_name := self.mutation_real_name.get(dep.name, None):
mutation_renames[real_name] = dep.name
# We need to reflect the mutation renames that were recorded in the original node
mutation_renames = {}
for dep in itertools.chain(node.read_writes.reads, node.unmet_dependencies):
if real_name := self.mutation_real_name.get(dep.name, None):
mutation_renames[real_name] = dep.name
def rename_deps(deps: OrderedSet[Dep]) -> OrderedSet[Dep]:
return OrderedSet(dep.rename(mutation_renames) for dep in deps)
def rename_deps(deps: OrderedSet[Dep]) -> OrderedSet[Dep]:
return OrderedSet(dep.rename(mutation_renames) for dep in deps)
new_scheduler_node.unmet_dependencies = rename_deps(
new_scheduler_node.unmet_dependencies
)
new_scheduler_node.read_writes.reads = rename_deps(
new_scheduler_node.read_writes.reads
)
new_scheduler_node.unmet_dependencies = rename_deps(
new_scheduler_node.unmet_dependencies
)
new_scheduler_node.read_writes.reads = rename_deps(
new_scheduler_node.read_writes.reads
)
for new_out, old_out in zip(
new_scheduler_node.get_outputs(), node.get_outputs()
):
self.name_to_buf[old_out.get_name()] = new_out
new_out.users = old_out.users
for new_out, old_out in zip(
new_scheduler_node.get_outputs(), node.get_outputs()
):
self.name_to_buf[old_out.get_name()] = new_out
new_out.users = old_out.users
new_scheduler_node.min_order = node.min_order
new_scheduler_node.max_order = node.max_order
new_scheduler_node.last_usage = node.last_usage
new_scheduler_node.min_order = node.min_order
new_scheduler_node.max_order = node.max_order
new_scheduler_node.last_usage = node.last_usage
def _any_atomic_add(self, node_list: Sequence[BaseSchedulerNode]) -> bool:
return any(

View File

@ -17,7 +17,6 @@ import time
from collections.abc import Sequence
from concurrent.futures import as_completed, ThreadPoolExecutor
from io import StringIO
from pathlib import Path
from types import ModuleType
from typing import Any, Callable, NamedTuple, Optional, TYPE_CHECKING, Union
from typing_extensions import Self
@ -2105,11 +2104,6 @@ class TritonTemplate(KernelTemplate):
"matrix_instr_nonkdim": kwargs.get("matrix_instr_nonkdim", 0),
"waves_per_eu": kwargs.get("waves_per_eu", 0),
"kpack": kwargs.get("kpack", 2),
**{
k: kwargs[k]
for k in AlgorithmSelectorCache.FLEX_ATTENTION_TUNABLE_KEYS
if k in kwargs
},
},
mutated_inputs=mutated_inputs,
workspace_arg=workspace_arg,
@ -2403,17 +2397,6 @@ def get_mm_log_filename() -> Optional[str]:
return mm_file_name
@functools.cache
def get_flex_attention_log_filename() -> Optional[str]:
flex_attention_file_name = os.environ.get(
"TORCHINDUCTOR_FLEX_ATTENTION_LOGGING_FILE", None
)
if not flex_attention_file_name:
return None
return str(Path(flex_attention_file_name).with_suffix(".json"))
def append_to_log(filename, data):
lock_file = filename.replace(".json", ".lock")
lock = FileLock(lock_file)
@ -2624,25 +2607,6 @@ class AlgorithmSelectorCache(PersistentCache):
doesn't depend on the output layout.
"""
FLEX_ATTENTION_TUNABLE_KEYS = tuple(
dict.fromkeys(
[
"num_warps",
"num_stages",
"BLOCK_M",
"BLOCK_N",
"BLOCK_M1",
"BLOCK_N1",
"BLOCK_M2",
"BLOCK_N2",
"USE_TMA",
"kpack",
"matrix_instr_nonkdim",
"waves_per_eu",
]
)
)
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
@ -3576,73 +3540,6 @@ class AlgorithmSelectorCache(PersistentCache):
)
return pruned_choices
@staticmethod
def get_flex_attention_choice_info(
choice: ChoiceCaller, timings: dict[ChoiceCaller, float]
) -> dict[str, Any]:
if isinstance(choice, torch._inductor.select_algorithm.ExternKernelCaller):
return {"type": "extern", "time": timings[choice]}
assert isinstance(choice, torch._inductor.select_algorithm.TritonTemplateCaller)
info = choice.info_dict()
result = {
"type": "triton",
"time": timings[choice],
}
for key in AlgorithmSelectorCache.FLEX_ATTENTION_TUNABLE_KEYS:
if key in info:
result[key] = info[key]
return result
@staticmethod
def maybe_log_flex_attention_results(
name: str, input_nodes: list[ir.IRNode], timings: dict[ChoiceCaller, float]
) -> None:
flex_attention_filename = get_flex_attention_log_filename()
if not flex_attention_filename or "flex_attention" not in name:
return
if len(input_nodes) < 3:
return
query_size = input_nodes[0].get_size()
key_size = input_nodes[1].get_size()
value_size = input_nodes[2].get_size()
B = query_size[0]
Hq = query_size[1]
seq_len_q = query_size[2]
qk_head_dim = query_size[3]
Hkv = key_size[1]
seq_len_kv = key_size[2]
v_head_dim = value_size[3]
kernel_type = "backward" if "backward" in name else "forward"
dims_key = str(
(
kernel_type,
B,
Hq,
Hkv,
seq_len_q,
seq_len_kv,
qk_head_dim,
v_head_dim,
)
)
sorted_choices = sorted(timings, key=timings.__getitem__)
out_dict = {
dims_key: [
AlgorithmSelectorCache.get_flex_attention_choice_info(choice, timings)
for choice in sorted_choices
]
}
append_to_log(flex_attention_filename, out_dict)
@staticmethod
def log_results(
name: str,
@ -3653,7 +3550,6 @@ class AlgorithmSelectorCache(PersistentCache):
prescreening_elapse: Optional[float] = None,
hint_override: Optional[int] = None,
):
"""Log the autotuning results, currently only handles mm and flex"""
V.debug.log_autotuning_results(
name, input_nodes, timings, elapse, precompile_elapse
)
@ -3722,10 +3618,6 @@ class AlgorithmSelectorCache(PersistentCache):
append_to_log(mm_filename, out_dict)
AlgorithmSelectorCache.maybe_log_flex_attention_results(
name, input_nodes, timings
)
best_time = timings[best]
sys.stderr.write(f"AUTOTUNE {name}({sizes})\n")
sys.stderr.write(f"strides: {strides}\n")

View File

@ -14,14 +14,11 @@ from torch.backends.cuda import (
SDPAParams,
)
from .varlen import varlen_attn
__all__: list[str] = [
"SDPBackend",
"sdpa_kernel",
"WARN_FOR_UNFUSED_KERNELS",
"varlen_attn",
]
# Note: [SDPA warnings]

View File

@ -7,7 +7,7 @@ that calls into the optimized Flash Attention kernels.
import logging
from functools import lru_cache
from typing import NamedTuple, Optional, Union
from typing import Any, NamedTuple, Optional, Union
import torch
@ -20,7 +20,7 @@ __all__ = ["varlen_attn", "AuxRequest"]
@lru_cache(maxsize=8)
def _should_use_cudnn(device_index: int) -> bool:
"""Cache device capability check to avoid repeated CUDA calls."""
return False
return True
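The gate is hard-coded for now while cuDNN numerics are being debugged; a capability-based variant (hypothetical, not what this PR ships) might look like:

import torch
from functools import lru_cache

@lru_cache(maxsize=8)
def _should_use_cudnn_by_capability(device_index: int) -> bool:
    # Hypothetical policy: prefer cuDNN only on SM90+ (Hopper) devices.
    major, _ = torch.cuda.get_device_capability(device_index)
    return major >= 9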
class AuxRequest(NamedTuple):
@ -33,8 +33,7 @@ class AuxRequest(NamedTuple):
lse: bool = False
# import failures when I try to register as custom op
# @torch.library.custom_op("torch_nn_attention::_varlen_attn", mutates_args={})
@torch.library.custom_op("torch_attn::_varlen_attn", mutates_args={})
def _varlen_attn(
query: torch.Tensor,
key: torch.Tensor,
@ -44,7 +43,7 @@ def _varlen_attn(
max_q: int,
max_k: int,
is_causal: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Private custom op for variable-length attention.
@ -52,9 +51,9 @@ def _varlen_attn(
"""
use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index)
if use_cudnn:
log.info("Using cuDNN backend for varlen_attn")
result = torch.ops.aten._cudnn_attention_forward(
query,
key,
@ -70,7 +69,7 @@ def _varlen_attn(
False, # return_debug_mask
)
# cuDNN returns: (output, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, philox_seed, philox_offset, debug_attn_mask)
output, softmax_lse = result[0], result[1]
output, softmax_lse, rng_state, philox_offset = result[0], result[1], result[6], result[7]
else:
log.info("Using Flash Attention backend for varlen_attn")
output, softmax_lse, rng_state, _, _ = torch.ops.aten._flash_attention_forward(
@ -85,11 +84,16 @@ def _varlen_attn(
is_causal,
return_debug_mask=False,
)
philox_offset = torch.zeros((), dtype=torch.int64, device=query.device)
return output, softmax_lse
rng_state_ = torch.zeros(
(2,), dtype=torch.uint64, device=query.device
) # hardcoded since dropout is hardcoded to 0
return output, softmax_lse, rng_state_, philox_offset
# @_varlen_attn.register_fake
@_varlen_attn.register_fake
def _varlen_attn_fake(
query: torch.Tensor,
key: torch.Tensor,
@ -99,7 +103,7 @@ def _varlen_attn_fake(
max_q: int,
max_k: int,
is_causal: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Fake implementation for meta tensor computation and tracing.
@ -110,14 +114,24 @@ def _varlen_attn_fake(
# Output has same shape as query
output = torch.empty_like(query)
# For varlen path: logsumexp shape is (num_heads, total_q)
# For varlen path with cuDNN: logsumexp shape is (total_q, num_heads, 1)
total_q = query.size(0)
num_heads = query.size(1)
logsumexp = torch.empty(
(num_heads, total_q), dtype=torch.float, device=query.device
)
return output, logsumexp
use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index)
if use_cudnn:
logsumexp = torch.empty(
(total_q, num_heads, 1), dtype=torch.float, device=query.device
)
else:
logsumexp = torch.empty(
(num_heads, total_q), dtype=torch.float, device=query.device
)
rng_state = torch.empty((2,), dtype=torch.uint64, device=query.device)
philox_offset = torch.zeros((), dtype=torch.int64, device=query.device)
return output, logsumexp, rng_state, philox_offset
def varlen_attn(
@ -191,9 +205,142 @@ def varlen_attn(
... query, key, value, cu_seq, cu_seq, max_len, max_len, is_causal=False
... )
"""
out, lse = _varlen_attn(
out, lse, _, _ = torch.ops.torch_attn._varlen_attn(
query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, is_causal
)
if return_aux is not None and return_aux.lse:
return out, lse
return out
def _setup_context(ctx: Any, inputs: tuple[Any, ...], output: Any) -> None:
query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, is_causal = inputs
out, lse, rng_state, philox_offset = output
ctx.save_for_backward(query, key, value, cu_seq_q, cu_seq_k, out, lse, rng_state, philox_offset)
ctx.max_q = max_q
ctx.max_k = max_k
ctx.is_causal = is_causal
@torch.library.custom_op("torch_attn::_varlen_attn_backward", mutates_args={})
def _varlen_attn_backward(
grad_out: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
out: torch.Tensor,
lse: torch.Tensor,
cu_seq_q: torch.Tensor,
cu_seq_k: torch.Tensor,
max_q: int,
max_k: int,
is_causal: bool,
rng_state: torch.Tensor,
philox_offset: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
unused = torch.empty(0, device=query.device)
use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index)
if use_cudnn:
log.info("Using cuDNN backend for varlen_attn")
head_dim = query.size(-1)
scale = 1.0 / (head_dim ** 0.5)
dq, dk, dv = torch.ops.aten._cudnn_attention_backward(
grad_out=grad_out,
query=query,
key=key,
value=value,
out=out,
logsumexp=lse,
philox_seed=rng_state,
philox_offset=philox_offset,
attn_bias=None,
cum_seq_q=cu_seq_q,
cum_seq_k=cu_seq_k,
max_q=max_q,
max_k=max_k,
dropout_p=0.0,
is_causal=is_causal,
# passing in scale doesn't change the value of the gradients
# scale=scale
)
else:
log.info("Using Flash Attention backend for varlen_attn")
dq, dk, dv = torch.ops.aten._flash_attention_backward(
grad_out,
query,
key,
value,
out,
lse,
cu_seq_q,
cu_seq_k,
max_q,
max_k,
0.0,
is_causal,
rng_state,
unused,
)
return dq, dk, dv
@_varlen_attn_backward.register_fake
def _varlen_attn_backward_fake(
grad_out: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
out: torch.Tensor,
lse: torch.Tensor,
cu_seq_q: torch.Tensor,
cu_seq_k: torch.Tensor,
max_q: int,
max_k: int,
is_causal: bool,
rng_state: torch.Tensor,
philox_offset: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Fake implementation for meta tensor computation and tracing.
"""
grad_query = torch.empty_like(query)
grad_key = torch.empty_like(key)
grad_value = torch.empty_like(value)
return grad_query, grad_key, grad_value
def _backward(
ctx: Any, grad_out: torch.Tensor, grad_lse: torch.Tensor, grad_rng: torch.Tensor, grad_philox_offset: torch.Tensor
) -> tuple[Optional[torch.Tensor], ...]:
query, key, value, cu_seq_q, cu_seq_k, out, lse, rng_state, philox_offset = ctx.saved_tensors
max_q = ctx.max_q
max_k = ctx.max_k
is_causal = ctx.is_causal
dq, dk, dv = torch.ops.torch_attn._varlen_attn_backward(
grad_out,
query,
key,
value,
out,
lse,
cu_seq_q,
cu_seq_k,
max_q,
max_k,
is_causal,
rng_state,
philox_offset
)
# one gradient per forward input: q, k, v get grads, the other five get None
return dq, dk, dv, None, None, None, None, None
_varlen_attn.register_autograd(_backward, setup_context=_setup_context)
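With the fake and autograd registrations above, the op differentiates through the normal autograd API. A minimal usage sketch (assumes a CUDA build with a varlen-capable Flash/cuDNN backend):

import torch
from torch.nn.attention.varlen import varlen_attn

# two sequences of 128 tokens packed into one (256, heads, head_dim) buffer
q = torch.randn(256, 8, 64, device="cuda", dtype=torch.float16, requires_grad=True)
k = torch.randn(256, 8, 64, device="cuda", dtype=torch.float16, requires_grad=True)
v = torch.randn(256, 8, 64, device="cuda", dtype=torch.float16, requires_grad=True)
cu_seq = torch.tensor([0, 128, 256], device="cuda", dtype=torch.int32)

out = varlen_attn(q, k, v, cu_seq, cu_seq, 128, 128, is_causal=True)
out.sum().backward()  # dispatches torch_attn._varlen_attn_backward
print(q.grad.shape)   # torch.Size([256, 8, 64])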

View File

@ -74,17 +74,6 @@ def export_compat(
if opset_version is None:
opset_version = onnx_constants.ONNX_DEFAULT_OPSET
if isinstance(model, torch.nn.Module):
if model.training:
warnings.warn(
"Exporting a model while it is in training mode. "
"Please ensure that this is intended, as it may lead to "
"different behavior during inference. "
"Calling model.eval() before export is recommended.",
UserWarning,
stacklevel=2,
)
if isinstance(model, torch.export.ExportedProgram):
# We know the model is already exported program, so the args, kwargs, and dynamic_shapes
# are not used

View File

@ -812,6 +812,7 @@ if torch.backends.mps.is_available():
"__rmod__",
"__rsub__",
"__rpow__",
"bernoulli",
"clamp_max",
"clamp_min",
"masked_scatter",

View File

@ -950,6 +950,13 @@ def prof_meth_call(*args, **kwargs):
torch._C.ScriptFunction.__call__ = prof_func_call # type: ignore[method-assign]
torch._C.ScriptMethod.__call__ = prof_meth_call # type: ignore[method-assign]
def _get_test_report_path():
# allow users to override the test file location. We need this
# because the distributed tests run the same test file multiple
# times with different configurations.
override = os.environ.get('TEST_REPORT_SOURCE_OVERRIDE')
test_source = override if override is not None else 'python-unittest'
return os.path.join('test-reports', test_source)
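For instance, distributed runs can redirect their reports via the override (the value below is illustrative), using the helper just defined:

import os

os.environ["TEST_REPORT_SOURCE_OVERRIDE"] = "dist-gloo"  # hypothetical value
print(_get_test_report_path())  # test-reports/dist-gloo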
def parse_cmd_line_args():
global CI_FUNCTORCH_ROOT
@ -980,7 +987,9 @@ def parse_cmd_line_args():
parser.add_argument('--repeat', type=int, default=1)
parser.add_argument('--test-bailouts', '--test_bailouts', action='store_true')
parser.add_argument('--use-pytest', action='store_true')
parser.add_argument('--save-xml', type=str)
parser.add_argument('--save-xml', nargs='?', type=str,
const=_get_test_report_path(),
default=_get_test_report_path() if IS_CI else None)
parser.add_argument('--discover-tests', action='store_true')
parser.add_argument('--log-suffix', type=str, default="")
parser.add_argument('--run-parallel', type=int, default=1)
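The nargs='?' form gives --save-xml three distinct behaviors; a standalone sketch of the semantics (the const path below is illustrative):

import argparse

p = argparse.ArgumentParser()
p.add_argument("--save-xml", nargs="?", type=str,
               const="test-reports/python-unittest",  # flag given with no value
               default=None)                          # flag absent

print(p.parse_args([]).save_xml)                     # None
print(p.parse_args(["--save-xml"]).save_xml)         # test-reports/python-unittest
print(p.parse_args(["--save-xml", "out"]).save_xml)  # out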
@ -1010,9 +1019,6 @@ def parse_cmd_line_args():
# infer flags based on the default settings
GRAPH_EXECUTOR = cppProfilingFlagsToProfilingMode()
if args.save_xml is None and IS_CI:
args.xml_dir = get_report_dir(sys.argv[0], args.log_suffix, args.use_pytest)
RERUN_DISABLED_TESTS = args.rerun_disabled_tests
SLOW_TESTS_FILE = args.import_slow_tests
@ -1185,37 +1191,19 @@ def lint_test_case_extension(suite):
return succeed
def get_report_dir(test_name: str, log_suffix: Optional[str], is_pytest: bool) -> str:
"""Generates a test report directory path. Test name does not need to be
sanitized."""
# total path = test-reports/test_source+log_suffix/test_filename
# Base path
test_source = "python-unittest"
if is_pytest:
test_source = "python-pytest"
# allow users to override the test file location. We need this
# because the distributed tests run the same test file multiple
# times with different configurations.
override = os.environ.get('TEST_REPORT_SOURCE_OVERRIDE')
if override is not None:
test_source = override
# Add log suffix to if provided
if log_suffix and log_suffix != "":
test_source = test_source + log_suffix
test_report_dir = os.path.join('test-reports', test_source)
# Add test file name to path
test_filename = sanitize_test_filename(test_name)
test_report_dir = os.path.join(test_report_dir, test_filename)
os.makedirs(test_report_dir, exist_ok=True)
return test_report_dir
def get_report_path(report_dir: str, test_filename: str) -> str:
return os.path.join(report_dir, f"{sanitize_test_filename(test_filename)}-{os.urandom(8).hex()}.xml")
def get_report_path(argv=None, pytest=False):
if argv is None:
argv = UNITTEST_ARGS
test_filename = sanitize_test_filename(argv[0])
test_report_path = TEST_SAVE_XML + LOG_SUFFIX
test_report_path = os.path.join(test_report_path, test_filename)
if pytest:
test_report_path = test_report_path.replace('python-unittest', 'python-pytest')
os.makedirs(test_report_path, exist_ok=True)
test_report_path = os.path.join(test_report_path, f"{test_filename}-{os.urandom(8).hex()}.xml")
return test_report_path
os.makedirs(test_report_path, exist_ok=True)
return test_report_path
def sanitize_pytest_xml(xml_file: str):
@ -1358,7 +1346,7 @@ def run_tests(argv=None):
pytest_args = argv + ["--use-main-module"]
test_report_path = ""
if TEST_SAVE_XML:
test_report_path = get_report_path(TEST_SAVE_XML, argv[0])
test_report_path = get_report_path(pytest=True)
print(f'Test results will be stored in {test_report_path}')
pytest_args.append(f'--junit-xml-reruns={test_report_path}')
if PYTEST_SINGLE_TEST:
@ -1402,7 +1390,7 @@ def run_tests(argv=None):
def printErrors(self) -> None:
super().printErrors()
self.printErrorList("XPASS", self.unexpectedSuccesses)
test_report_path = get_report_path(TEST_SAVE_XML, argv[0])
test_report_path = get_report_path()
verbose = '--verbose' in argv or '-v' in argv
if verbose:
print(f'Test results will be stored in {test_report_path}')