mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-31 12:15:03 +08:00)

Compare commits

2 Commits

csl/xml_st ... varlen-api
| Author | SHA1 | Date |
|---|---|---|
| | 45715eb46e | |
| | 22c7937326 | |
| @ -129,7 +129,7 @@ function install_129 { | ||||
| } | ||||
|  | ||||
| function install_128 { | ||||
|   CUDNN_VERSION=9.8.0.87 | ||||
|   CUDNN_VERSION=9.10.2.21 | ||||
|   echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" | ||||
|   # install CUDA 12.8.1 in the same container | ||||
|   install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux | ||||
| @ -150,7 +150,7 @@ function install_130 { | ||||
|   CUDNN_VERSION=9.13.0.50 | ||||
|   echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" | ||||
|   # install CUDA 13.0 in the same container | ||||
|   install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux | ||||
|   install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux | ||||
|  | ||||
|   # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement | ||||
|   install_cudnn 13 $CUDNN_VERSION | ||||
|  | ||||
| @ -100,8 +100,6 @@ COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt | ||||
| COPY ci_commit_pins/timm.txt timm.txt | ||||
| COPY ci_commit_pins/torchbench.txt torchbench.txt | ||||
| # Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True | ||||
| ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS} | ||||
| RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi | ||||
| RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt | ||||
|  | ||||
|  | ||||
| @ -272,6 +272,18 @@ def smoke_test_cuda( | ||||
|         torch_cudnn_version = cudnn_to_version_str(torch.backends.cudnn.version()) | ||||
|         print(f"Torch cuDNN version: {torch_cudnn_version}") | ||||
|  | ||||
|         torch_cudnn_compile_version = torch._C._cudnn.getCompileVersion() | ||||
|         print(f"Torch cuDNN compile-time version: {torch_cudnn_compile_version}") | ||||
|         torch_cudnn_runtime_version = tuple( | ||||
|             [int(x) for x in torch_cudnn_version.split(".")] | ||||
|         ) | ||||
|         if torch_cudnn_runtime_version != torch_cudnn_compile_version: | ||||
|             raise RuntimeError( | ||||
|             "cuDNN runtime version doesn't match compile version. " | ||||
|                 f"Loaded: {torch_cudnn_runtime_version} " | ||||
|                 f"Expected: {torch_cudnn_compile_version}" | ||||
|             ) | ||||
|  | ||||
|         if sys.platform in ["linux", "linux2"]: | ||||
|             torch_nccl_version = ".".join(str(v) for v in torch.cuda.nccl.version()) | ||||
|             print(f"Torch nccl; version: {torch_nccl_version}") | ||||
|  | ||||
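The hunk above makes the CUDA smoke test fail fast when the cuDNN library loaded at runtime differs from the one PyTorch was compiled against. A minimal standalone sketch of the same comparison follows; the cuDNN-9 integer encoding assumed in `cudnn_runtime_tuple` and the helper names are illustrative only, since the actual script derives the runtime string with its own `cudnn_to_version_str` helper.

```python
# Standalone sketch of the compile-vs-runtime cuDNN check added above.
# Assumes a CUDA build of PyTorch. The integer layout of
# torch.backends.cudnn.version() (major*10000 + minor*100 + patch for cuDNN 9)
# is an assumption for illustration; the smoke test uses cudnn_to_version_str().
import torch


def cudnn_runtime_tuple() -> tuple[int, int, int]:
    raw = torch.backends.cudnn.version()  # e.g. 91300 for cuDNN 9.13.0 (assumed encoding)
    return (raw // 10000, (raw // 100) % 100, raw % 100)


def check_cudnn_versions() -> None:
    compiled = tuple(torch._C._cudnn.getCompileVersion())  # (major, minor, patch)
    runtime = cudnn_runtime_tuple()
    if runtime != compiled:
        raise RuntimeError(
            "cuDNN runtime version doesn't match compile version. "
            f"Loaded: {runtime} Expected: {compiled}"
        )


if __name__ == "__main__":
    if torch.cuda.is_available():
        check_cudnn_versions()
        print("cuDNN compile-time and runtime versions match")
```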
| @ -460,18 +460,28 @@ test_inductor_shard() { | ||||
|     --verbose | ||||
| } | ||||
|  | ||||
| test_inductor_aoti_cpp() { | ||||
| test_inductor_aoti() { | ||||
|   # docker build uses bdist_wheel which does not work with test_aot_inductor | ||||
|   # TODO: need a faster way to build | ||||
|   if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then | ||||
|     # We need to hipify before building again | ||||
|     python3 tools/amd_build/build_amd.py | ||||
|   fi | ||||
|   if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then | ||||
|     BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .) | ||||
|     # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB | ||||
|     TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}") | ||||
|   else | ||||
|     BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .) | ||||
|     TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}") | ||||
|   fi | ||||
|  | ||||
|   # aoti cmake custom command requires `torch` to be installed | ||||
|   # initialize the cmake build cache and install torch | ||||
|   /usr/bin/env "${BUILD_COMMAND[@]}" | ||||
|   # rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled | ||||
|   /usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}" | ||||
|  | ||||
|   /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile | ||||
| } | ||||
|  | ||||
| @ -1766,7 +1776,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then | ||||
|   install_torchvision | ||||
|   PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER" | ||||
|   if [[ "$SHARD_NUMBER" -eq "1" ]]; then | ||||
|     test_inductor_aoti_cpp | ||||
|     test_inductor_aoti | ||||
|   fi | ||||
| elif [[ "${TEST_CONFIG}" == *inductor* ]]; then | ||||
|   install_torchvision | ||||
|  | ||||
| @ -7,9 +7,12 @@ if "%DESIRED_PYTHON%" == "3.13t" ( | ||||
|     set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe" | ||||
|     set ADDITIONAL_OPTIONS="Include_freethreaded=1" | ||||
|     set PYTHON_EXEC="python3.13t" | ||||
| ) else if "%DESIRED_PYTHON%"=="3.14" ( | ||||
|     echo Python version is set to 3.14 or 3.14t | ||||
|     set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe" | ||||
| ) else if "%DESIRED_PYTHON%"=="3.14t" ( | ||||
|     echo Python version is set to 3.14 or 3.14t | ||||
|     set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe" | ||||
|     set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe" | ||||
|     set ADDITIONAL_OPTIONS="Include_freethreaded=1" | ||||
|     set PYTHON_EXEC="python3.14t" | ||||
| ) else ( | ||||
|  | ||||
| @ -1,8 +1,3 @@ | ||||
| --- | ||||
| name: docstring | ||||
| description: Write docstrings for PyTorch functions and methods following PyTorch conventions. Use when writing or updating docstrings in PyTorch code. | ||||
| --- | ||||
|  | ||||
| # PyTorch Docstring Writing Guide | ||||
|  | ||||
| This skill describes how to write docstrings for functions and methods in the PyTorch project, following the conventions in `torch/_tensor_docs.py` and `torch/nn/functional.py`. | ||||
| @ -1,385 +0,0 @@ | ||||
| --- | ||||
| name: skill-writer | ||||
| description: Guide users through creating Agent Skills for Claude Code. Use when the user wants to create, write, author, or design a new Skill, or needs help with SKILL.md files, frontmatter, or skill structure. | ||||
| --- | ||||
|  | ||||
| # Skill Writer | ||||
|  | ||||
| This Skill helps you create well-structured Agent Skills for Claude Code that follow best practices and validation requirements. | ||||
|  | ||||
| ## When to use this Skill | ||||
|  | ||||
| Use this Skill when: | ||||
| - Creating a new Agent Skill | ||||
| - Writing or updating SKILL.md files | ||||
| - Designing skill structure and frontmatter | ||||
| - Troubleshooting skill discovery issues | ||||
| - Converting existing prompts or workflows into Skills | ||||
|  | ||||
| ## Instructions | ||||
|  | ||||
| ### Step 1: Determine Skill scope | ||||
|  | ||||
| First, understand what the Skill should do: | ||||
|  | ||||
| 1. **Ask clarifying questions**: | ||||
|    - What specific capability should this Skill provide? | ||||
|    - When should Claude use this Skill? | ||||
|    - What tools or resources does it need? | ||||
|    - Is this for personal use or team sharing? | ||||
|  | ||||
| 2. **Keep it focused**: One Skill = one capability | ||||
|    - Good: "PDF form filling", "Excel data analysis" | ||||
|    - Too broad: "Document processing", "Data tools" | ||||
|  | ||||
| ### Step 2: Choose Skill location | ||||
|  | ||||
| Determine where to create the Skill: | ||||
|  | ||||
| **Personal Skills** (`~/.claude/skills/`): | ||||
| - Individual workflows and preferences | ||||
| - Experimental Skills | ||||
| - Personal productivity tools | ||||
|  | ||||
| **Project Skills** (`.claude/skills/`): | ||||
| - Team workflows and conventions | ||||
| - Project-specific expertise | ||||
| - Shared utilities (committed to git) | ||||
|  | ||||
| ### Step 3: Create Skill structure | ||||
|  | ||||
| Create the directory and files: | ||||
|  | ||||
| ```bash | ||||
| # Personal | ||||
| mkdir -p ~/.claude/skills/skill-name | ||||
|  | ||||
| # Project | ||||
| mkdir -p .claude/skills/skill-name | ||||
| ``` | ||||
|  | ||||
| For multi-file Skills: | ||||
| ``` | ||||
| skill-name/ | ||||
| ├── SKILL.md (required) | ||||
| ├── reference.md (optional) | ||||
| ├── examples.md (optional) | ||||
| ├── scripts/ | ||||
| │   └── helper.py (optional) | ||||
| └── templates/ | ||||
|     └── template.txt (optional) | ||||
| ``` | ||||
|  | ||||
| ### Step 4: Write SKILL.md frontmatter | ||||
|  | ||||
| Create YAML frontmatter with required fields: | ||||
|  | ||||
| ```yaml | ||||
| --- | ||||
| name: skill-name | ||||
| description: Brief description of what this does and when to use it | ||||
| --- | ||||
| ``` | ||||
|  | ||||
| **Field requirements**: | ||||
|  | ||||
| - **name**: | ||||
|   - Lowercase letters, numbers, hyphens only | ||||
|   - Max 64 characters | ||||
|   - Must match directory name | ||||
|   - Good: `pdf-processor`, `git-commit-helper` | ||||
|   - Bad: `PDF_Processor`, `Git Commits!` | ||||
|  | ||||
| - **description**: | ||||
|   - Max 1024 characters | ||||
|   - Include BOTH what it does AND when to use it | ||||
|   - Use specific trigger words users would say | ||||
|   - Mention file types, operations, and context | ||||
|  | ||||
| **Optional frontmatter fields**: | ||||
|  | ||||
| - **allowed-tools**: Restrict tool access (comma-separated list) | ||||
|   ```yaml | ||||
|   allowed-tools: Read, Grep, Glob | ||||
|   ``` | ||||
|   Use for: | ||||
|   - Read-only Skills | ||||
|   - Security-sensitive workflows | ||||
|   - Limited-scope operations | ||||
|  | ||||
| ### Step 5: Write effective descriptions | ||||
|  | ||||
| The description is critical for Claude to discover your Skill. | ||||
|  | ||||
| **Formula**: `[What it does] + [When to use it] + [Key triggers]` | ||||
|  | ||||
| **Examples**: | ||||
|  | ||||
| ✅ **Good**: | ||||
| ```yaml | ||||
| description: Extract text and tables from PDF files, fill forms, merge documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction. | ||||
| ``` | ||||
|  | ||||
| ✅ **Good**: | ||||
| ```yaml | ||||
| description: Analyze Excel spreadsheets, create pivot tables, and generate charts. Use when working with Excel files, spreadsheets, or analyzing tabular data in .xlsx format. | ||||
| ``` | ||||
|  | ||||
| ❌ **Too vague**: | ||||
| ```yaml | ||||
| description: Helps with documents | ||||
| description: For data analysis | ||||
| ``` | ||||
|  | ||||
| **Tips**: | ||||
| - Include specific file extensions (.pdf, .xlsx, .json) | ||||
| - Mention common user phrases ("analyze", "extract", "generate") | ||||
| - List concrete operations (not generic verbs) | ||||
| - Add context clues ("Use when...", "For...") | ||||
|  | ||||
| ### Step 6: Structure the Skill content | ||||
|  | ||||
| Use clear Markdown sections: | ||||
|  | ||||
| ```markdown | ||||
| # Skill Name | ||||
|  | ||||
| Brief overview of what this Skill does. | ||||
|  | ||||
| ## Quick start | ||||
|  | ||||
| Provide a simple example to get started immediately. | ||||
|  | ||||
| ## Instructions | ||||
|  | ||||
| Step-by-step guidance for Claude: | ||||
| 1. First step with clear action | ||||
| 2. Second step with expected outcome | ||||
| 3. Handle edge cases | ||||
|  | ||||
| ## Examples | ||||
|  | ||||
| Show concrete usage examples with code or commands. | ||||
|  | ||||
| ## Best practices | ||||
|  | ||||
| - Key conventions to follow | ||||
| - Common pitfalls to avoid | ||||
| - When to use vs. not use | ||||
|  | ||||
| ## Requirements | ||||
|  | ||||
| List any dependencies or prerequisites: | ||||
| ```bash | ||||
| pip install package-name | ||||
| ``` | ||||
|  | ||||
| ## Advanced usage | ||||
|  | ||||
| For complex scenarios, see [reference.md](reference.md). | ||||
| ``` | ||||
|  | ||||
| ### Step 7: Add supporting files (optional) | ||||
|  | ||||
| Create additional files for progressive disclosure: | ||||
|  | ||||
| **reference.md**: Detailed API docs, advanced options | ||||
| **examples.md**: Extended examples and use cases | ||||
| **scripts/**: Helper scripts and utilities | ||||
| **templates/**: File templates or boilerplate | ||||
|  | ||||
| Reference them from SKILL.md: | ||||
| ```markdown | ||||
| For advanced usage, see [reference.md](reference.md). | ||||
|  | ||||
| Run the helper script: | ||||
| \`\`\`bash | ||||
| python scripts/helper.py input.txt | ||||
| \`\`\` | ||||
| ``` | ||||
|  | ||||
| ### Step 8: Validate the Skill | ||||
|  | ||||
| Check these requirements: | ||||
|  | ||||
| ✅ **File structure**: | ||||
| - [ ] SKILL.md exists in correct location | ||||
| - [ ] Directory name matches frontmatter `name` | ||||
|  | ||||
| ✅ **YAML frontmatter**: | ||||
| - [ ] Opening `---` on line 1 | ||||
| - [ ] Closing `---` before content | ||||
| - [ ] Valid YAML (no tabs, correct indentation) | ||||
| - [ ] `name` follows naming rules | ||||
| - [ ] `description` is specific and < 1024 chars | ||||
|  | ||||
| ✅ **Content quality**: | ||||
| - [ ] Clear instructions for Claude | ||||
| - [ ] Concrete examples provided | ||||
| - [ ] Edge cases handled | ||||
| - [ ] Dependencies listed (if any) | ||||
|  | ||||
| ✅ **Testing**: | ||||
| - [ ] Description matches user questions | ||||
| - [ ] Skill activates on relevant queries | ||||
| - [ ] Instructions are clear and actionable | ||||
|  | ||||
| ### Step 9: Test the Skill | ||||
|  | ||||
| 1. **Restart Claude Code** (if running) to load the Skill | ||||
|  | ||||
| 2. **Ask relevant questions** that match the description: | ||||
|    ``` | ||||
|    Can you help me extract text from this PDF? | ||||
|    ``` | ||||
|  | ||||
| 3. **Verify activation**: Claude should use the Skill automatically | ||||
|  | ||||
| 4. **Check behavior**: Confirm Claude follows the instructions correctly | ||||
|  | ||||
| ### Step 10: Debug if needed | ||||
|  | ||||
| If Claude doesn't use the Skill: | ||||
|  | ||||
| 1. **Make description more specific**: | ||||
|    - Add trigger words | ||||
|    - Include file types | ||||
|    - Mention common user phrases | ||||
|  | ||||
| 2. **Check file location**: | ||||
|    ```bash | ||||
|    ls ~/.claude/skills/skill-name/SKILL.md | ||||
|    ls .claude/skills/skill-name/SKILL.md | ||||
|    ``` | ||||
|  | ||||
| 3. **Validate YAML**: | ||||
|    ```bash | ||||
|    cat SKILL.md | head -n 10 | ||||
|    ``` | ||||
|  | ||||
| 4. **Run debug mode**: | ||||
|    ```bash | ||||
|    claude --debug | ||||
|    ``` | ||||
|  | ||||
| ## Common patterns | ||||
|  | ||||
| ### Read-only Skill | ||||
|  | ||||
| ```yaml | ||||
| --- | ||||
| name: code-reader | ||||
| description: Read and analyze code without making changes. Use for code review, understanding codebases, or documentation. | ||||
| allowed-tools: Read, Grep, Glob | ||||
| --- | ||||
| ``` | ||||
|  | ||||
| ### Script-based Skill | ||||
|  | ||||
| ```yaml | ||||
| --- | ||||
| name: data-processor | ||||
| description: Process CSV and JSON data files with Python scripts. Use when analyzing data files or transforming datasets. | ||||
| --- | ||||
|  | ||||
| # Data Processor | ||||
|  | ||||
| ## Instructions | ||||
|  | ||||
| 1. Use the processing script: | ||||
| \`\`\`bash | ||||
| python scripts/process.py input.csv --output results.json | ||||
| \`\`\` | ||||
|  | ||||
| 2. Validate output with: | ||||
| \`\`\`bash | ||||
| python scripts/validate.py results.json | ||||
| \`\`\` | ||||
| ``` | ||||
|  | ||||
| ### Multi-file Skill with progressive disclosure | ||||
|  | ||||
| ```yaml | ||||
| --- | ||||
| name: api-designer | ||||
| description: Design REST APIs following best practices. Use when creating API endpoints, designing routes, or planning API architecture. | ||||
| --- | ||||
|  | ||||
| # API Designer | ||||
|  | ||||
| Quick start: See [examples.md](examples.md) | ||||
|  | ||||
| Detailed reference: See [reference.md](reference.md) | ||||
|  | ||||
| ## Instructions | ||||
|  | ||||
| 1. Gather requirements | ||||
| 2. Design endpoints (see examples.md) | ||||
| 3. Document with OpenAPI spec | ||||
| 4. Review against best practices (see reference.md) | ||||
| ``` | ||||
|  | ||||
| ## Best practices for Skill authors | ||||
|  | ||||
| 1. **One Skill, one purpose**: Don't create mega-Skills | ||||
| 2. **Specific descriptions**: Include trigger words users will say | ||||
| 3. **Clear instructions**: Write for Claude, not humans | ||||
| 4. **Concrete examples**: Show real code, not pseudocode | ||||
| 5. **List dependencies**: Mention required packages in description | ||||
| 6. **Test with teammates**: Verify activation and clarity | ||||
| 7. **Version your Skills**: Document changes in content | ||||
| 8. **Use progressive disclosure**: Put advanced details in separate files | ||||
|  | ||||
| ## Validation checklist | ||||
|  | ||||
| Before finalizing a Skill, verify: | ||||
|  | ||||
| - [ ] Name is lowercase, hyphens only, max 64 chars | ||||
| - [ ] Description is specific and < 1024 chars | ||||
| - [ ] Description includes "what" and "when" | ||||
| - [ ] YAML frontmatter is valid | ||||
| - [ ] Instructions are step-by-step | ||||
| - [ ] Examples are concrete and realistic | ||||
| - [ ] Dependencies are documented | ||||
| - [ ] File paths use forward slashes | ||||
| - [ ] Skill activates on relevant queries | ||||
| - [ ] Claude follows instructions correctly | ||||
|  | ||||
| ## Troubleshooting | ||||
|  | ||||
| **Skill doesn't activate**: | ||||
| - Make description more specific with trigger words | ||||
| - Include file types and operations in description | ||||
| - Add "Use when..." clause with user phrases | ||||
|  | ||||
| **Multiple Skills conflict**: | ||||
| - Make descriptions more distinct | ||||
| - Use different trigger words | ||||
| - Narrow the scope of each Skill | ||||
|  | ||||
| **Skill has errors**: | ||||
| - Check YAML syntax (no tabs, proper indentation) | ||||
| - Verify file paths (use forward slashes) | ||||
| - Ensure scripts have execute permissions | ||||
| - List all dependencies | ||||
|  | ||||
| ## Examples | ||||
|  | ||||
| See the documentation for complete examples: | ||||
| - Simple single-file Skill (commit-helper) | ||||
| - Skill with tool permissions (code-reviewer) | ||||
| - Multi-file Skill (pdf-processing) | ||||
|  | ||||
| ## Output format | ||||
|  | ||||
| When creating a Skill, I will: | ||||
|  | ||||
| 1. Ask clarifying questions about scope and requirements | ||||
| 2. Suggest a Skill name and location | ||||
| 3. Create the SKILL.md file with proper frontmatter | ||||
| 4. Include clear instructions and examples | ||||
| 5. Add supporting files if needed | ||||
| 6. Provide testing instructions | ||||
| 7. Validate against all requirements | ||||
|  | ||||
| The result will be a complete, working Skill that follows all best practices and validation rules. | ||||
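Several of the rules above (Step 4 naming, Step 8 validation) are mechanical enough to check with a short script. Below is a minimal sketch of such a check; the `validate_skill` helper, its error messages, and the simple line-based frontmatter parsing are illustrative assumptions, not part of any shipped tooling.

```python
# Illustrative SKILL.md validator sketch; assumes the rules stated in Steps 4 and 8.
import re
from pathlib import Path


def validate_skill(skill_dir: str) -> list[str]:
    """Return a list of problems found in <skill_dir>/SKILL.md (empty = looks valid)."""
    problems = []
    directory = Path(skill_dir).expanduser()
    skill_path = directory / "SKILL.md"
    if not skill_path.is_file():
        return [f"missing {skill_path}"]

    lines = skill_path.read_text(encoding="utf-8").splitlines()
    if not lines or lines[0].strip() != "---":
        return ["frontmatter must open with '---' on line 1"]
    try:
        closing = lines[1:].index("---") + 1
    except ValueError:
        return ["frontmatter has no closing '---'"]

    # Naive "key: value" parsing is enough for the two required fields.
    fields = {}
    for line in lines[1:closing]:
        key, sep, value = line.partition(":")
        if sep:
            fields[key.strip()] = value.strip()

    name = fields.get("name", "")
    description = fields.get("description", "")
    if not re.fullmatch(r"[a-z0-9-]{1,64}", name):
        problems.append("name must use lowercase letters, numbers, hyphens (max 64 chars)")
    if name != directory.name:
        problems.append("name must match the directory name")
    if not description or len(description) > 1024:
        problems.append("description is required and must be under 1024 characters")
    return problems


print(validate_skill("~/.claude/skills/pdf-processor"))  # [] when everything checks out
```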
							
								
								
									
.github/scripts/generate_binary_build_matrix.py (22 changes, vendored)

| @ -22,7 +22,7 @@ CUDA_ARCHES_FULL_VERSION = { | ||||
|     "12.6": "12.6.3", | ||||
|     "12.8": "12.8.1", | ||||
|     "12.9": "12.9.1", | ||||
|     "13.0": "13.0.2", | ||||
|     "13.0": "13.0.0", | ||||
| } | ||||
| CUDA_ARCHES_CUDNN_VERSION = { | ||||
|     "12.6": "9", | ||||
| @ -96,21 +96,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { | ||||
|         "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'" | ||||
|     ), | ||||
|     "13.0": ( | ||||
|         "nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | " | ||||
|         "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | " | ||||
|         "nvidia-cublas==13.1.0.3; platform_system == 'Linux' | " | ||||
|         "nvidia-cufft==12.0.0.61; platform_system == 'Linux' | " | ||||
|         "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | " | ||||
|         "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | " | ||||
|         "nvidia-curand==10.4.0.35; platform_system == 'Linux' | " | ||||
|         "nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | " | ||||
|         "nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | " | ||||
|         "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | " | ||||
|         "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | " | ||||
|         "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | " | ||||
|         "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | " | ||||
|         "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | " | ||||
|         "nvidia-nvtx==13.0.85; platform_system == 'Linux' | " | ||||
|         "nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | " | ||||
|         "nvidia-cufile==1.15.1.6; platform_system == 'Linux'" | ||||
|         "nvidia-nvtx==13.0.39; platform_system == 'Linux' | " | ||||
|         "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | " | ||||
|         "nvidia-cufile==1.15.0.42; platform_system == 'Linux'" | ||||
|     ), | ||||
|     "xpu": ( | ||||
|         "intel-cmplr-lib-rt==2025.2.1 | " | ||||
|  | ||||
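The long `PYTORCH_EXTRA_INSTALL_REQUIREMENTS` values touched above (and repeated in the generated workflow files below) are just pinned `package==version` requirements joined with ` | ` and a Linux environment marker. A rough sketch of how such a string could be assembled from a pin map is shown here; `build_extra_requirements` and the truncated pin map are illustrative, not the repository's actual generator code.

```python
# Illustrative only: the real CUDA 13.0 pins live in generate_binary_build_matrix.py.
CUDA_130_PINS = {
    "nvidia-cuda-nvrtc": "13.0.48",
    "nvidia-cuda-runtime": "13.0.48",
    "nvidia-cudnn-cu13": "9.13.0.50",
    "nvidia-nccl-cu13": "2.27.7",
}


def build_extra_requirements(pins: dict[str, str]) -> str:
    """Join pinned packages into the ' | '-separated requirement string shown above."""
    return " | ".join(
        f"{pkg}=={version}; platform_system == 'Linux'" for pkg, version in pins.items()
    )


print(build_extra_requirements(CUDA_130_PINS))
```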
							
								
								
									
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (14 changes, generated, vendored)

| @ -270,7 +270,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_10-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -519,7 +519,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_11-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -768,7 +768,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_12-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1017,7 +1017,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_13-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1266,7 +1266,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_13t-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1515,7 +1515,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_14-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1764,7 +1764,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_14t-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|  | ||||
							
								
								
									
.github/workflows/generated-linux-binary-manywheel-nightly.yml (14 changes, generated, vendored)

| @ -325,7 +325,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_10-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_10-cuda13_0-test:  # Testing | ||||
| @ -991,7 +991,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_11-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_11-cuda13_0-test:  # Testing | ||||
| @ -1657,7 +1657,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_12-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_12-cuda13_0-test:  # Testing | ||||
| @ -2323,7 +2323,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_13-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_13-cuda13_0-test:  # Testing | ||||
| @ -2989,7 +2989,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_13t-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_13t-cuda13_0-test:  # Testing | ||||
| @ -3655,7 +3655,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_14-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_14-cuda13_0-test:  # Testing | ||||
| @ -4321,7 +4321,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_14t-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_14t-cuda13_0-test:  # Testing | ||||
|  | ||||
| @ -1,4 +1,4 @@ | ||||
|  | ||||
|  | ||||
|  | ||||
| -------------------------------------------------------------------------------- | ||||
|  | ||||
| @ -72,7 +72,7 @@ Elaborating Further: | ||||
|  | ||||
| If you use NumPy, then you have used Tensors (a.k.a. ndarray). | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the | ||||
| computation by a huge amount. | ||||
| @ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc | ||||
| While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date. | ||||
| You get the best of speed and flexibility for your crazy research. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| ### Python First | ||||
|  | ||||
|  | ||||
										
											
File diff suppressed because it is too large

| @ -1,171 +0,0 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include <ATen/core/Tensor.h> | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| using at::blas::ScalingType; | ||||
| using at::blas::SwizzleType; | ||||
|  | ||||
| namespace { | ||||
|  | ||||
| // TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492 | ||||
| c10::MaybeOwned<Tensor> inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) { | ||||
|   if (resolve_conj && tensor.is_conj()) { | ||||
|     return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj()); | ||||
|   } else { | ||||
|     return c10::MaybeOwned<Tensor>::borrowed(tensor); | ||||
|   } | ||||
| } | ||||
|  | ||||
| c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) { | ||||
|   if (tensor.is_non_overlapping_and_dense()) { // common case | ||||
|       transpose_tensor = tensor.is_contiguous(); | ||||
|       return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor); | ||||
|   } | ||||
|   IntArrayRef tensor_strides = tensor.strides(); | ||||
|   IntArrayRef tensor_sizes = tensor.sizes(); | ||||
|   if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) { | ||||
|     transpose_tensor = false; | ||||
|     return resolve_conj_if_indicated(tensor, !transpose_result); | ||||
|   } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) { | ||||
|     transpose_tensor = true; | ||||
|     return resolve_conj_if_indicated(tensor, transpose_result); | ||||
|   } else { | ||||
|     transpose_tensor = true; | ||||
|     return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous)); | ||||
|   } | ||||
| } | ||||
|  | ||||
| c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) { | ||||
|   if (tensor.is_non_overlapping_and_dense()) { // common case | ||||
|       transpose_tensor = tensor.is_contiguous(); | ||||
|       return resolve_conj_if_indicated(tensor, true); | ||||
|   } | ||||
|  | ||||
|   IntArrayRef tensor_strides = tensor.strides(); | ||||
|   IntArrayRef tensor_sizes = tensor.sizes(); | ||||
|   if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) { | ||||
|     transpose_tensor = false; | ||||
|     return resolve_conj_if_indicated(tensor, true); | ||||
|   } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) { | ||||
|     transpose_tensor = true; | ||||
|     return resolve_conj_if_indicated(tensor, true); | ||||
|   } else { | ||||
|     transpose_tensor = true; | ||||
|     return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous)); | ||||
|   } | ||||
| } | ||||
|  | ||||
| } // namespace | ||||
|  | ||||
| /** | ||||
|  * @brief Prepares matrices for CUBLAS operation | ||||
|  * | ||||
|  * This constructor prepares tensors for CUBLAS | ||||
|  * The main difference is that PyTorch uses row-major as the default and | ||||
|  * CUBLAS expects column-major. | ||||
|  * | ||||
|  * @details | ||||
|  * To enable row-major output while using CUBLAS, | ||||
|  * we use the mathematical identity that (A × B)^T = B^T × A^T. | ||||
|  * | ||||
|  * Transpose in this context refers to cuBLAS's (Fortran) definition of transpose (row-major) | ||||
|  * T = row-major, N = col-major | ||||
|  * | ||||
|  * Example: | ||||
|  * For matrices A (M×K)(row-major) and B (K×N)(row-major): | ||||
|  *   - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major) | ||||
|  *   - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N) | ||||
|  *   - However, since the output from cuBLAS is column-major, this is | ||||
|  *   - equivalent to an output of size MxN row-major as expected | ||||
|  * | ||||
|  * The transpose flags are derived from the layouts of the passed in tensors | ||||
|  * | ||||
|  * If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted | ||||
|  * to their unpacked values to match what cuBLAS expects. | ||||
|  * | ||||
|  * @param mat1 First input matrix | ||||
|  * @param mat2 Second input matrix | ||||
|  * @param c Output matrix (result) | ||||
|  * @param scale_a Optional scaling factor for first matrix | ||||
|  * @param scale_b Optional scaling factor for second matrix | ||||
|  * @param scale_result Optional scaling factor for result | ||||
|  */ | ||||
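// Worked example of the transpose trick documented above: for row-major
// A (2x3) and B (3x4), the buffer of B reinterpreted as column-major is a
// 4x3 matrix (= B^T) and the buffer of A is 3x2 (= A^T). A gemm call with
// transa = transb = 'n', m = 4, n = 2, k = 3 then writes a column-major 4x2
// result, which occupies memory exactly as the desired 2x4 row-major A x B.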
| struct cublasCommonArgs { | ||||
|   cublasCommonArgs( | ||||
|       const Tensor& mat1, | ||||
|       const Tensor& mat2, | ||||
|       Tensor& c, | ||||
|       const std::optional<Tensor>& scale_a = std::nullopt, | ||||
|       const std::optional<Tensor>& scale_b = std::nullopt, | ||||
|       const std::optional<Tensor>& scale_result = std::nullopt, | ||||
|       const std::optional<ScalingType>& scaling_choice_a = std::nullopt, | ||||
|       const std::optional<ScalingType>& scaling_choice_b = std::nullopt) { | ||||
|     bool transpose_result = false, transpose_a = false, transpose_b = false; | ||||
|     result = prepare_matrix_for_cublas(c, transpose_result); | ||||
|     mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); | ||||
|     matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result); | ||||
|  | ||||
|     // Handle scale tensors if provided | ||||
|     if (scale_a && scale_b) { | ||||
|       // By default since we return in row-major we run the gemm | ||||
|       // as B.T @ A.T, check transpose_result to determine if we flip the scales | ||||
|       scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr(); | ||||
|       scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); | ||||
|       scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a; | ||||
|       scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); | ||||
|       scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); | ||||
|       scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b; | ||||
|     } | ||||
|  | ||||
|     if (scale_result) { | ||||
|       scale_result_ptr = scale_result->data_ptr(); | ||||
|       scale_result_dtype = scale_result->scalar_type(); | ||||
|     } | ||||
|  | ||||
|     // Update transpose flags | ||||
|     if (transpose_result) { | ||||
|       transpose_a = !transpose_a; | ||||
|       transpose_b = !transpose_b; | ||||
|     } | ||||
|  | ||||
|     auto sizes_a = mata->sizes(); | ||||
|     auto sizes_b = matb->sizes(); | ||||
|  | ||||
|     m = sizes_a[transpose_result ? 1 : 0]; | ||||
|     k = sizes_a[transpose_result ? 0 : 1]; | ||||
|     n = sizes_b[transpose_result ? 0 : 1]; | ||||
|     lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); | ||||
|     ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); | ||||
|     result_ld = result->stride(transpose_result ? 0 : 1); | ||||
|     transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; | ||||
|     transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n'; | ||||
|  | ||||
|     // cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing | ||||
|     // if the gemm operands are in packed float4 | ||||
|     if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) { | ||||
|       k = k * 2; | ||||
|       lda = lda * 2; | ||||
|       ldb = ldb * 2; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // Matrix members | ||||
|   char transa, transb; | ||||
|   int64_t m, n, k; | ||||
|   int64_t lda, ldb, result_ld; | ||||
|   c10::MaybeOwned<Tensor> mata, matb, result; | ||||
|  | ||||
|   // Scale members | ||||
|   void* scale_mata_ptr = nullptr; | ||||
|   void* scale_matb_ptr = nullptr; | ||||
|   void* scale_result_ptr = nullptr; | ||||
|   std::optional<c10::ScalarType> scale_mata_dtype; | ||||
|   std::optional<ScalingType> scaling_mata_type; | ||||
|   std::optional<c10::ScalarType> scale_matb_dtype; | ||||
|   std::optional<ScalingType> scaling_matb_type; | ||||
|   std::optional<c10::ScalarType> scale_result_dtype; | ||||
| }; | ||||
|  | ||||
| } // namespace at::native | ||||
| @ -57,7 +57,6 @@ Tensor& random_mps_impl(Tensor& self, | ||||
|   if (self.numel() == 0) { | ||||
|     return self; | ||||
|   } | ||||
|   at::assert_no_internal_overlap(self); | ||||
|   // MPS random is broken for 5D+ tensors, see https://github.com/pytorch/pytorch/issues/147624 | ||||
|   const auto need_reshape = self.ndimension() > 4; | ||||
|   auto mps_gen = get_generator_or_default<MPSGeneratorImpl>(gen, at::mps::detail::getDefaultMPSGenerator()); | ||||
| @ -154,16 +153,8 @@ Tensor& random_mps_impl(Tensor& self, | ||||
|       feeds[meanPlaceholder.getMPSGraphTensor()] = meanPlaceholder.getMPSGraphTensorData(); | ||||
|     } | ||||
|  | ||||
|     // Handle non-contiguous output tensors by creating a contiguous temporary | ||||
|     const auto needs_gather = needsGather(self); | ||||
|     Tensor self_ = needs_gather ? at::empty_like(self, MemoryFormat::Contiguous) : self; | ||||
|     Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self_); | ||||
|     Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self); | ||||
|     runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); | ||||
|  | ||||
|     // Copy results back to original non-contiguous output | ||||
|     if (needs_gather) { | ||||
|       self.copy_(self_); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   return self; | ||||
|  | ||||
| @ -1358,15 +1358,9 @@ if(BUILD_TEST) | ||||
|     ) | ||||
|   else() | ||||
|     add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) | ||||
|     add_subdirectory(${TORCH_ROOT}/test/cpp/lazy ${CMAKE_BINARY_DIR}/test_lazy) | ||||
|     # NativeRT is disabled | ||||
|     # add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert) | ||||
|     add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor) | ||||
|     add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_abi_check ${CMAKE_BINARY_DIR}/test_aoti_abi_check) | ||||
|     if(BUILD_AOT_INDUCTOR_TEST) | ||||
|       add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_inference ${CMAKE_BINARY_DIR}/test_aoti_inference) | ||||
|     endif() | ||||
|  | ||||
|     if(USE_DISTRIBUTED) | ||||
|       add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d) | ||||
|       if(NOT WIN32) | ||||
| @ -1384,6 +1378,16 @@ if(BUILD_TEST) | ||||
|         ${CMAKE_BINARY_DIR}/test_mobile_nnc | ||||
|       ) | ||||
|     endif() | ||||
|     add_subdirectory(${TORCH_ROOT}/test/cpp/lazy | ||||
|                      ${CMAKE_BINARY_DIR}/test_lazy) | ||||
|   endif() | ||||
|   if(BUILD_AOT_INDUCTOR_TEST) | ||||
|     add_subdirectory( | ||||
|       ${TORCH_ROOT}/test/cpp/aoti_abi_check | ||||
|       ${CMAKE_BINARY_DIR}/test_aoti_abi_check) | ||||
|     add_subdirectory( | ||||
|       ${TORCH_ROOT}/test/cpp/aoti_inference | ||||
|       ${CMAKE_BINARY_DIR}/test_aoti_inference) | ||||
|   endif() | ||||
| endif() | ||||
|  | ||||
|  | ||||
| @ -1,8 +1,3 @@ | ||||
| # Skip on windows | ||||
| if(WIN32) | ||||
|   return() | ||||
| endif() | ||||
|  | ||||
| set(AOTI_ABI_CHECK_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_abi_check) | ||||
|  | ||||
| # Build the cpp gtest binary containing the cpp-only tests. | ||||
| @ -35,15 +30,8 @@ target_compile_definitions(test_aoti_abi_check PRIVATE USE_GTEST) | ||||
|  | ||||
| # WARNING: DO NOT LINK torch!!! | ||||
| # The purpose is to check if the used aten/c10 headers are written in a header-only way | ||||
| target_link_libraries(test_aoti_abi_check PRIVATE gtest_main sleef) | ||||
| target_link_libraries(test_aoti_abi_check PRIVATE gtest_main) | ||||
| target_include_directories(test_aoti_abi_check PRIVATE ${ATen_CPU_INCLUDE}) | ||||
| if(NOT USE_SYSTEM_SLEEF) | ||||
|   target_include_directories(test_aoti_abi_check PRIVATE ${CMAKE_BINARY_DIR}/include) | ||||
| endif() | ||||
|  | ||||
| # Disable unused-variable warnings for variables that are only used to test compilation | ||||
| target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-variable) | ||||
| target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-but-set-variable) | ||||
|  | ||||
| foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS}) | ||||
|   foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES}) | ||||
| @ -53,17 +41,12 @@ foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS}) | ||||
|     separate_arguments(FLAGS UNIX_COMMAND "${FLAGS}") | ||||
|     add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") | ||||
|  | ||||
|     target_link_libraries(${test_name}_${CPU_CAPABILITY} PRIVATE gtest_main sleef) | ||||
|     target_link_libraries(${test_name}_${CPU_CAPABILITY} PRIVATE gtest_main) | ||||
|     target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE ${ATen_CPU_INCLUDE}) | ||||
|     if(NOT USE_SYSTEM_SLEEF) | ||||
|       target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE ${CMAKE_BINARY_DIR}/include) | ||||
|     endif() | ||||
|  | ||||
|     # Define CPU_CAPABILITY and CPU_CAPABILITY_XXX macros for conditional compilation | ||||
|     target_compile_definitions(${test_name}_${CPU_CAPABILITY} PRIVATE CPU_CAPABILITY=${CPU_CAPABILITY} CPU_CAPABILITY_${CPU_CAPABILITY}) | ||||
|     target_compile_options(${test_name}_${CPU_CAPABILITY} PRIVATE ${FLAGS}) | ||||
|     target_compile_options_if_supported(${test_name}_${CPU_CAPABILITY} -Wno-unused-variable) | ||||
|     target_compile_options_if_supported(${test_name}_${CPU_CAPABILITY} -Wno-unused-but-set-variable) | ||||
|   endforeach() | ||||
| endforeach() | ||||
|  | ||||
|  | ||||
| @ -2,27 +2,10 @@ | ||||
|  | ||||
| #include <ATen/cpu/vec/vec.h> | ||||
|  | ||||
| #include <iostream> | ||||
| namespace torch { | ||||
| namespace aot_inductor { | ||||
|  | ||||
| template <typename T> | ||||
| void ExpectVecEqual( | ||||
|     const at::vec::Vectorized<T>& expected, | ||||
|     const at::vec::Vectorized<T>& actual) { | ||||
|   using Vec = at::vec::Vectorized<T>; | ||||
|   // Have to use std::vector for comparison because at::vec::Vectorized doesn't | ||||
|   // support operator[] on aarch64 | ||||
|   std::vector<T> expected_data(Vec::size()); | ||||
|   std::vector<T> actual_data(Vec::size()); | ||||
|  | ||||
|   expected.store(expected_data.data()); | ||||
|   actual.store(actual_data.data()); | ||||
|  | ||||
|   for (int i = 0; i < Vec::size(); i++) { | ||||
|     EXPECT_EQ(expected_data[i], actual_data[i]); | ||||
|   } | ||||
| } | ||||
|  | ||||
| TEST(TestVec, TestAdd) { | ||||
|   using Vec = at::vec::Vectorized<int>; | ||||
|   std::vector<int> a(1024, 1); | ||||
| @ -33,7 +16,9 @@ TEST(TestVec, TestAdd) { | ||||
|   std::vector<int> expected(1024, 3); | ||||
|   Vec expected_vec = Vec::loadu(expected.data()); | ||||
|  | ||||
|   ExpectVecEqual(expected_vec, actual_vec); | ||||
|   for (int i = 0; i < Vec::size(); i++) { | ||||
|     EXPECT_EQ(expected_vec[i], actual_vec[i]); | ||||
|   } | ||||
| } | ||||
|  | ||||
| TEST(TestVec, TestMax) { | ||||
| @ -45,7 +30,9 @@ TEST(TestVec, TestMax) { | ||||
|   Vec actual_vec = at::vec::maximum(a_vec, b_vec); | ||||
|   Vec expected_vec = b_vec; | ||||
|  | ||||
|   ExpectVecEqual(expected_vec, actual_vec); | ||||
|   for (int i = 0; i < Vec::size(); i++) { | ||||
|     EXPECT_EQ(expected_vec[i], actual_vec[i]); | ||||
|   } | ||||
| } | ||||
|  | ||||
| TEST(TestVec, TestMin) { | ||||
| @ -57,7 +44,9 @@ TEST(TestVec, TestMin) { | ||||
|   Vec actual_vec = at::vec::minimum(a_vec, b_vec); | ||||
|   Vec expected_vec = a_vec; | ||||
|  | ||||
|   ExpectVecEqual(expected_vec, actual_vec); | ||||
|   for (int i = 0; i < Vec::size(); i++) { | ||||
|     EXPECT_EQ(expected_vec[i], actual_vec[i]); | ||||
|   } | ||||
| } | ||||
|  | ||||
| TEST(TestVec, TestConvert) { | ||||
| @ -69,7 +58,9 @@ TEST(TestVec, TestConvert) { | ||||
|   auto actual_vec = at::vec::convert<float>(a_vec); | ||||
|   auto expected_vec = b_vec; | ||||
|  | ||||
|   ExpectVecEqual(expected_vec, actual_vec); | ||||
|   for (int i = 0; i < at::vec::Vectorized<int>::size(); i++) { | ||||
|     EXPECT_EQ(expected_vec[i], actual_vec[i]); | ||||
|   } | ||||
| } | ||||
|  | ||||
| TEST(TestVec, TestClampMin) { | ||||
| @ -81,7 +72,9 @@ TEST(TestVec, TestClampMin) { | ||||
|   Vec actual_vec = at::vec::clamp_min(a_vec, min_vec); | ||||
|   Vec expected_vec = min_vec; | ||||
|  | ||||
|   ExpectVecEqual(expected_vec, actual_vec); | ||||
|   for (int i = 0; i < Vec::size(); i++) { | ||||
|     EXPECT_EQ(expected_vec[i], actual_vec[i]); | ||||
|   } | ||||
| } | ||||
|  | ||||
| } // namespace aot_inductor | ||||
|  | ||||
| @ -1,3 +1,4 @@ | ||||
|  | ||||
| set(AOT_INDUCTOR_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_inference) | ||||
|  | ||||
| # Build custom TorchScript op for AOTInductor | ||||
| @ -7,12 +8,27 @@ set_target_properties(aoti_custom_class PROPERTIES | ||||
| if(USE_CUDA) | ||||
|   target_compile_definitions(aoti_custom_class PRIVATE USE_CUDA) | ||||
| elseif(USE_ROCM) | ||||
|   target_compile_definitions(aoti_custom_class PRIVATE USE_ROCM) | ||||
|     target_compile_definitions(aoti_custom_class PRIVATE USE_ROCM) | ||||
| endif() | ||||
|  | ||||
| # Link against LibTorch | ||||
| target_link_libraries(aoti_custom_class torch) | ||||
|  | ||||
| # the custom command that generates the TorchScript module | ||||
| add_custom_command( | ||||
|     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/script_data.pt | ||||
|            ${CMAKE_CURRENT_BINARY_DIR}/script_model_cpu.pt | ||||
|            ${CMAKE_CURRENT_BINARY_DIR}/script_model_cuda.pt | ||||
|     # This script requires the torch package to be installed. | ||||
|     COMMAND python ${AOT_INDUCTOR_TEST_ROOT}/compile_model.py | ||||
|     DEPENDS torch torch_python aoti_custom_class ${AOT_INDUCTOR_TEST_ROOT}/compile_model.py | ||||
| ) | ||||
| add_custom_target(aoti_script_model ALL | ||||
|     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_data.pt | ||||
|     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_model_cpu.pt | ||||
|     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_model_cuda.pt | ||||
| ) | ||||
| add_dependencies(aoti_script_model aoti_custom_class) | ||||
|  | ||||
| # Build the cpp gtest binary containing the cpp-only tests. | ||||
| set(INDUCTOR_TEST_SRCS | ||||
|   ${AOT_INDUCTOR_TEST_ROOT}/test.cpp | ||||
| @ -21,12 +37,23 @@ set(INDUCTOR_TEST_SRCS | ||||
| add_executable(test_aoti_inference | ||||
|   ${TORCH_ROOT}/test/cpp/common/main.cpp | ||||
|   ${INDUCTOR_TEST_SRCS} | ||||
|   data.pt | ||||
|   script_data.pt | ||||
|   script_model_cpu.pt | ||||
|   script_model_cuda.pt | ||||
| ) | ||||
| add_dependencies(test_aoti_inference aoti_custom_class) | ||||
| add_dependencies(test_aoti_inference aoti_custom_class aoti_script_model) | ||||
|  | ||||
| # TODO temporary until we can delete the old gtest polyfills. | ||||
| target_compile_definitions(test_aoti_inference PRIVATE USE_GTEST) | ||||
|  | ||||
| # Define a custom command to generate the library | ||||
| add_custom_command( | ||||
|         OUTPUT data.pt | ||||
|         COMMAND python ${AOT_INDUCTOR_TEST_ROOT}/test.py | ||||
|         DEPENDS ${AOT_INDUCTOR_TEST_ROOT}/test.py | ||||
| ) | ||||
|  | ||||
| target_link_libraries(test_aoti_inference PRIVATE | ||||
|   torch | ||||
|   gtest_main | ||||
| @ -44,10 +71,6 @@ target_compile_definitions(test_aoti_inference PRIVATE | ||||
|     CMAKE_CURRENT_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR} | ||||
| ) | ||||
|  | ||||
| target_compile_options_if_supported(test_aoti_inference -Wno-unused-variable) | ||||
| target_compile_options_if_supported(test_aoti_inference -Wno-unused-but-set-variable) | ||||
| target_compile_options_if_supported(test_aoti_inference -Wno-unused-function) | ||||
|  | ||||
| if(INSTALL_TEST) | ||||
|   install(TARGETS test_aoti_inference DESTINATION bin) | ||||
|   # Install PDB files for MSVC builds | ||||
|  | ||||
| @ -2,9 +2,7 @@ | ||||
| #include <gtest/gtest.h> | ||||
| #include <atomic> | ||||
| #include <condition_variable> | ||||
| #include <cstdlib> | ||||
| #include <filesystem> | ||||
| #include <fstream> | ||||
| #include <functional> | ||||
| #include <mutex> | ||||
| #include <queue> | ||||
| @ -30,64 +28,6 @@ | ||||
|  | ||||
| namespace { | ||||
|  | ||||
| // Function to check if test data files exist and are valid | ||||
| bool testDataFilesExist() { | ||||
|   std::string bindir = STRINGIZE(CMAKE_CURRENT_BINARY_DIR); | ||||
|   std::array<std::string, 4> required_files = { | ||||
|       "data.pt", | ||||
|       "script_data.pt", | ||||
|       "script_model_cpu.pt", | ||||
|       "script_model_cuda.pt"}; | ||||
|  | ||||
|   for (const auto& filename : required_files) { | ||||
|     std::string filepath = bindir + "/" + filename; | ||||
|     std::ifstream file(filepath); | ||||
|     if (!file.good()) { | ||||
|       return false; | ||||
|     } | ||||
|   } | ||||
|   return true; | ||||
| } | ||||
|  | ||||
| // Function to ensure test data files are generated at runtime | ||||
| void ensureTestDataGenerated() { | ||||
|   static std::once_flag generated_flag; | ||||
|   std::call_once(generated_flag, []() { | ||||
|     // Only generate if files don't exist or are placeholders | ||||
|     if (testDataFilesExist()) { | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     std::string bindir = STRINGIZE(CMAKE_CURRENT_BINARY_DIR); | ||||
|  | ||||
|     // Calculate path to source directory: build/test_aoti_inference -> build -> | ||||
|     // pytorch | ||||
|     std::string pytorch_root = bindir.substr(0, bindir.find_last_of("/")); | ||||
|     pytorch_root = pytorch_root.substr(0, pytorch_root.find_last_of("/")); | ||||
|     std::string source_dir = pytorch_root + "/test/cpp/aoti_inference"; | ||||
|  | ||||
|     // Generate test data files (data.pt, etc.) by running test.py directly | ||||
|     std::string test_script = source_dir + "/test.py"; | ||||
|     std::string test_data_cmd = "cd " + bindir + " && python " + test_script; | ||||
|     std::cout << "Generating test data: " << test_data_cmd << std::endl; | ||||
|     int result1 = std::system(test_data_cmd.c_str()); | ||||
|     if (result1 != 0) { | ||||
|       std::cerr << "Warning: Test data generation failed with code " << result1 | ||||
|                 << std::endl; | ||||
|     } | ||||
|  | ||||
|     // Generate model files (script_*.pt) by running compile_model.py directly | ||||
|     std::string compile_script = source_dir + "/compile_model.py"; | ||||
|     std::string models_cmd = "cd " + bindir + " && python " + compile_script; | ||||
|     std::cout << "Generating model files: " << models_cmd << std::endl; | ||||
|     int result2 = std::system(models_cmd.c_str()); | ||||
|     if (result2 != 0) { | ||||
|       std::cerr << "Warning: Model generation failed with code " << result2 | ||||
|                 << std::endl; | ||||
|     } | ||||
|   }); | ||||
| } | ||||
|  | ||||
| const std::unordered_map<std::string, at::Tensor> derefTensorConstantMap( | ||||
|     torch::inductor::TensorConstantMap tensor_constant_map) { | ||||
|   std::unordered_map<std::string, at::Tensor> ret; | ||||
| @ -915,6 +855,7 @@ void test_aoti_free_buffer(bool use_runtime_constant_folding) { | ||||
|   } | ||||
| } | ||||
|  | ||||
| #if defined(USE_CUDA) || defined(USE_ROCM) | ||||
| void test_cuda_alloc_test() { | ||||
|   torch::NoGradGuard no_grad; | ||||
|  | ||||
| @ -954,8 +895,8 @@ void test_cuda_alloc_test() { | ||||
|       runner->run(data_loader.attr(inputs_attr.c_str()).toTensorList().vec()); | ||||
|   ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0])); | ||||
| } | ||||
| #endif | ||||
|  | ||||
| #ifdef USE_CUDA | ||||
| class ThreadPool { | ||||
|  private: | ||||
|   struct Task { | ||||
| @ -1096,96 +1037,86 @@ void test_multi_cuda_streams(const std::string& device) { | ||||
|     ASSERT_TRUE(torch::allclose(ref_output_tensors[0], all_outputs[i][0])); | ||||
|   } | ||||
| } | ||||
| #endif // USE_CUDA | ||||
| #endif // USE_CUDA || USE_ROCM | ||||
| #endif | ||||
| } // namespace | ||||
|  | ||||
| namespace torch::aot_inductor { | ||||
|  | ||||
| // Test fixture that ensures test data is generated once for all tests | ||||
| class AotInductorTest : public ::testing::Test { | ||||
|  public: | ||||
|   // This runs once before all tests in this test suite | ||||
|   static void SetUpTestSuite() { | ||||
|     ensureTestDataGenerated(); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| TEST_F(AotInductorTest, BasicTestCpu) { | ||||
| TEST(AotInductorTest, BasicTestCpu) { | ||||
|   test_aoti("cpu", false); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, BasicScriptTestCpu) { | ||||
| TEST(AotInductorTest, BasicScriptTestCpu) { | ||||
|   test_aoti_script("cpu"); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, BasicPackageLoaderTestCpu) { | ||||
| TEST(AotInductorTest, BasicPackageLoaderTestCpu) { | ||||
|   test_aoti_package_loader("cpu", false); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, ExtractConstantsMapCpu) { | ||||
| TEST(AotInductorTest, ExtractConstantsMapCpu) { | ||||
|   test_aoti_extract_constants_map("cpu"); | ||||
| } | ||||
|  | ||||
| #ifdef USE_CUDA | ||||
| TEST_F(AotInductorTest, BasicTestCuda) { | ||||
| TEST(AotInductorTest, BasicTestCuda) { | ||||
|   test_aoti("cuda", true); | ||||
|   test_aoti("cuda", false); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, BasicScriptTestCuda) { | ||||
| TEST(AotInductorTest, BasicScriptTestCuda) { | ||||
|   test_aoti_script("cuda"); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, BasicPackageLoaderTestCuda) { | ||||
| TEST(AotInductorTest, BasicPackageLoaderTestCuda) { | ||||
|   test_aoti_package_loader("cuda", false); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) { | ||||
| TEST(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) { | ||||
|   test_aoti_package_loader_multi_gpu("cuda", false); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, UpdateUserManagedConstantsCuda) { | ||||
| TEST(AotInductorTest, UpdateUserManagedConstantsCuda) { | ||||
|   test_aoti_user_managed_buffer(); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, RuntimeUpdateConstantsCuda) { | ||||
| TEST(AotInductorTest, RuntimeUpdateConstantsCuda) { | ||||
|   test_aoti_constants_update("cuda", true); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, UpdateConstantsCuda) { | ||||
| TEST(AotInductorTest, UpdateConstantsCuda) { | ||||
|   test_aoti_constants_update("cuda", false); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, ExtractConstantsMapCuda) { | ||||
| TEST(AotInductorTest, ExtractConstantsMapCuda) { | ||||
|   test_aoti_extract_constants_map("cuda"); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) { | ||||
| TEST(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) { | ||||
|   test_aoti_double_buffering("cuda", true); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, UpdateInactiveConstantsCuda) { | ||||
| TEST(AotInductorTest, UpdateInactiveConstantsCuda) { | ||||
|   test_aoti_double_buffering("cuda", false); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) { | ||||
| TEST(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) { | ||||
|   test_aoti_double_buffering_with_tensor_constants(); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, FreeInactiveConstantBufferCuda) { | ||||
| TEST(AotInductorTest, FreeInactiveConstantBufferCuda) { | ||||
|   test_aoti_free_buffer(false); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) { | ||||
| TEST(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) { | ||||
|   test_aoti_free_buffer(true); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, MultiStreamTestCuda) { | ||||
| TEST(AotInductorTest, MultiStreamTestCuda) { | ||||
|   test_multi_cuda_streams("cuda"); | ||||
| } | ||||
|  | ||||
| TEST_F(AotInductorTest, CudaAllocTestCuda) { | ||||
| TEST(AotInductorTest, CudaAllocTestCuda) { | ||||
|   test_cuda_alloc_test(); | ||||
| } | ||||
| #endif | ||||
|  | ||||
| @ -279,7 +279,6 @@ class SymmetricMemoryTest(MultiProcContinuousTest): | ||||
| # MultiProcContinuousTest will skip all the following tests if a test fails ( | ||||
| # we should fix this too). We still want to get the test signals for the core | ||||
| # symmetric memory APIs when Async TP ops fail. | ||||
| @skip_if_rocm_multiprocess  # AsyncTP is not yet supported on ROCm | ||||
| @instantiate_parametrized_tests | ||||
| @requires_cuda_p2p_access() | ||||
| class AsyncTPTest(MultiProcContinuousTest): | ||||
|  | ||||
| @ -892,16 +892,10 @@ fn(torch.randn(5)) | ||||
|                 os.remove( | ||||
|                     file_path | ||||
|                 )  # Delete temp file manually, due to setup NamedTemporaryFile as delete=False. | ||||
|                 orig_maxDiff = unittest.TestCase.maxDiff | ||||
|                 unittest.TestCase.maxDiff = None | ||||
|                 try: | ||||
|                     self.assertEqual(  # process wrap difference: /r/n on Windows, /n on posix. | ||||
|                         empty_line_normalizer(lines), | ||||
|                         empty_line_normalizer(stderr.decode("utf-8")), | ||||
|                     ) | ||||
|                 except Exception: | ||||
|                     unittest.TestCase.maxDiff = orig_maxDiff | ||||
|                     raise | ||||
|                 self.assertEqual(  # process wrap difference: /r/n on Windows, /n on posix. | ||||
|                     empty_line_normalizer(lines), | ||||
|                     empty_line_normalizer(stderr.decode("utf-8")), | ||||
|                 ) | ||||
|  | ||||
|     @make_settings_test("torch._dynamo.eval_frame") | ||||
|     def test_log_traced_frames(self, records): | ||||
|  | ||||
| @ -1000,18 +1000,6 @@ class ReproTests(torch._dynamo.test_case.TestCase): | ||||
|         self.exit_stack.close() | ||||
|         super().tearDown() | ||||
|  | ||||
|     def test_compiled_module_truthiness(self): | ||||
|         # Test with empty ModuleList | ||||
|         original_empty = nn.ModuleList() | ||||
|         compiled_empty = torch.compile(original_empty) | ||||
|         self.assertEqual(bool(original_empty), bool(compiled_empty)) | ||||
|         self.assertFalse(bool(compiled_empty)) | ||||
|         # Test with non-empty ModuleList | ||||
|         original_filled = nn.ModuleList([nn.Linear(10, 5)]) | ||||
|         compiled_filled = torch.compile(original_filled) | ||||
|         self.assertEqual(bool(original_filled), bool(compiled_filled)) | ||||
|         self.assertTrue(bool(compiled_filled)) | ||||
|  | ||||
|     def guard_manager_clone_hook_fn(self, guard_manager_wrapper, f_locals, builder): | ||||
|         root = guard_manager_wrapper.root | ||||
|         cloned_root = root.clone_manager(lambda x: True) | ||||
|  | ||||
| @ -2,11 +2,8 @@ | ||||
| # flake8: noqa: B950 | ||||
|  | ||||
| import functools | ||||
| import json | ||||
| import os | ||||
| import random | ||||
| import string | ||||
| import tempfile | ||||
| import unittest | ||||
| import warnings | ||||
| from collections import namedtuple | ||||
| @ -7048,120 +7045,6 @@ class TestLearnableBiases(InductorTestCase): | ||||
|     def test_flex_attention_with_dynamic_max_autotune_graph_partition(self, device): | ||||
|         self._test_flex_attention_with_dynamic_max_autotune(device) | ||||
|  | ||||
|     @skip_on_cpu | ||||
|     def test_flex_attention_logging(self, device): | ||||
|         with tempfile.TemporaryDirectory() as tmpdir: | ||||
|             log_file = os.path.join(tmpdir, "flex_attention_configs") | ||||
|  | ||||
|             with patch.dict( | ||||
|                 os.environ, {"TORCHINDUCTOR_FLEX_ATTENTION_LOGGING_FILE": log_file} | ||||
|             ): | ||||
|                 query = torch.randn( | ||||
|                     1, | ||||
|                     2, | ||||
|                     128, | ||||
|                     64, | ||||
|                     device=device, | ||||
|                     dtype=torch.float16, | ||||
|                     requires_grad=True, | ||||
|                 ) | ||||
|                 key = torch.randn( | ||||
|                     1, | ||||
|                     2, | ||||
|                     128, | ||||
|                     64, | ||||
|                     device=device, | ||||
|                     dtype=torch.float16, | ||||
|                     requires_grad=True, | ||||
|                 ) | ||||
|                 value = torch.randn( | ||||
|                     1, | ||||
|                     2, | ||||
|                     128, | ||||
|                     64, | ||||
|                     device=device, | ||||
|                     dtype=torch.float16, | ||||
|                     requires_grad=True, | ||||
|                 ) | ||||
|  | ||||
|                 def score_mod(score, b, h, q_idx, kv_idx): | ||||
|                     return score * 2 | ||||
|  | ||||
|                 def causal_mask(b, h, q_idx, kv_idx): | ||||
|                     return q_idx >= kv_idx | ||||
|  | ||||
|                 block_mask = torch.compile(create_block_mask)( | ||||
|                     causal_mask, 1, 1, 128, 128, device=device | ||||
|                 ) | ||||
|  | ||||
|                 compiled_flex = torch.compile( | ||||
|                     flex_attention, mode="max-autotune-no-cudagraphs" | ||||
|                 ) | ||||
|  | ||||
|                 out = compiled_flex( | ||||
|                     query=query, | ||||
|                     key=key, | ||||
|                     value=value, | ||||
|                     score_mod=score_mod, | ||||
|                     block_mask=block_mask, | ||||
|                 ) | ||||
|  | ||||
|                 out.sum().backward() | ||||
|  | ||||
|                 json_file = log_file + ".json" | ||||
|                 self.assertTrue( | ||||
|                     os.path.exists(json_file), f"Log file {json_file} was not created" | ||||
|                 ) | ||||
|  | ||||
|                 with open(json_file) as f: | ||||
|                     log_data = json.load(f) | ||||
|  | ||||
|                 self.assertIsInstance(log_data, list) | ||||
|                 self.assertEqual(len(log_data), 2) | ||||
|  | ||||
|                 keys_seen = [next(iter(entry.keys())) for entry in log_data] | ||||
|  | ||||
|                 expected_fwd_key = "('forward', 1, 2, 2, 128, 128, 64, 64)" | ||||
|                 expected_bwd_key = "('backward', 1, 2, 2, 128, 128, 64, 64)" | ||||
|  | ||||
|                 self.assertIn(expected_fwd_key, keys_seen) | ||||
|                 self.assertIn(expected_bwd_key, keys_seen) | ||||
|  | ||||
|                 for entry in log_data: | ||||
|                     self.assertIsInstance(entry, dict) | ||||
|                     self.assertEqual(len(entry), 1) | ||||
|  | ||||
|                     dims_key = next(iter(entry.keys())) | ||||
|                     choices = entry[dims_key] | ||||
|  | ||||
|                     kernel_type = eval(dims_key)[0] | ||||
|  | ||||
|                     self.assertIsInstance(choices, list) | ||||
|                     self.assertGreater(len(choices), 0) | ||||
|  | ||||
|                     for i, choice in enumerate(choices): | ||||
|                         self.assertIn("type", choice) | ||||
|                         self.assertIn("time", choice) | ||||
|  | ||||
|                         if choice["type"] == "triton": | ||||
|                             self.assertIn("num_warps", choice) | ||||
|                             self.assertIn("num_stages", choice) | ||||
|  | ||||
|                             if kernel_type == "forward": | ||||
|                                 self.assertIn("BLOCK_M", choice) | ||||
|                                 self.assertIn("BLOCK_N", choice) | ||||
|                                 self.assertNotIn("BLOCK_M1", choice) | ||||
|                             elif kernel_type == "backward": | ||||
|                                 self.assertIn("BLOCK_M1", choice) | ||||
|                                 self.assertIn("BLOCK_N1", choice) | ||||
|                                 self.assertIn("BLOCK_M2", choice) | ||||
|                                 self.assertIn("BLOCK_N2", choice) | ||||
|                                 self.assertNotIn("BLOCK_M", choice) | ||||
|                                 self.assertNotIn("BLOCK_N", choice) | ||||
|  | ||||
|                         if i > 0: | ||||
|                             self.assertLessEqual(choices[0]["time"], choice["time"]) | ||||
|  | ||||
|     @skip_on_cpu | ||||
|     def test_inspect_bug(self, device): | ||||
|         # https://github.com/pytorch/pytorch/issues/139374 | ||||
|  | ||||
| @ -12,6 +12,7 @@ from torch.testing._internal.common_device_type import ( | ||||
|     dtypes, | ||||
|     dtypesIfMPS, | ||||
|     expectedFailureMPS, | ||||
|     expectedFailureMPSPre15, | ||||
|     expectedFailureXLA, | ||||
|     instantiate_device_type_tests, | ||||
| ) | ||||
| @ -172,6 +173,7 @@ class TestDropoutNNDeviceType(NNTestCase): | ||||
|                     else: | ||||
|                         self.assertNotEqual(permuted_inp, out) | ||||
|  | ||||
|     @expectedFailureMPSPre15 | ||||
|     def test_Dropout(self, device): | ||||
|         input = torch.empty(1000) | ||||
|         self._test_dropout(nn.Dropout, device, input) | ||||
|  | ||||
| @ -529,7 +529,7 @@ class TestProfiler(TestCase): | ||||
|                 found_mm = True | ||||
|             if "gemm" in e.name.lower() or "Cijk" in e.name: | ||||
|                 found_gemm = True | ||||
|             if "memcpy" in e.name.lower() or "__amd_rocclr_copyBuffer" in e.name: | ||||
|             if "memcpy" in e.name.lower(): | ||||
|                 found_memcpy = True | ||||
|         if use_cuda: | ||||
|             self.assertTrue(found_gemm) | ||||
|  | ||||
| @ -27,7 +27,6 @@ import torch | ||||
| import torch.distributed as dist | ||||
| from torch.multiprocessing import current_process, get_context | ||||
| from torch.testing._internal.common_utils import ( | ||||
|     get_report_dir, | ||||
|     get_report_path, | ||||
|     IS_CI, | ||||
|     IS_MACOS, | ||||
| @ -35,6 +34,7 @@ from torch.testing._internal.common_utils import ( | ||||
|     set_cwd, | ||||
|     shell, | ||||
|     TEST_CUDA, | ||||
|     TEST_SAVE_XML, | ||||
|     TEST_WITH_ASAN, | ||||
|     TEST_WITH_ROCM, | ||||
|     TEST_WITH_SLOW_GRADCHECK, | ||||
| @ -529,14 +529,6 @@ def run_test( | ||||
|         replacement = {"-f": "-x", "-dist=loadfile": "--dist=loadfile"} | ||||
|         unittest_args = [replacement.get(arg, arg) for arg in unittest_args] | ||||
|  | ||||
|     xml_report_dir = get_report_dir(test_file, None, options.pytest) | ||||
|     if is_cpp_test: | ||||
|         unittest_args.append( | ||||
|             f"--junit-xml-reruns={get_report_path(xml_report_dir, test_file)}" | ||||
|         ) | ||||
|     else: | ||||
|         unittest_args.append(f"--save-xml={xml_report_dir}") | ||||
|  | ||||
|     if options.showlocals: | ||||
|         if options.pytest: | ||||
|             unittest_args.extend(["--showlocals", "--tb=long", "--color=yes"]) | ||||
| @ -1234,6 +1226,12 @@ def get_pytest_args(options, is_cpp_test=False, is_distributed_test=False): | ||||
|         # is much slower than running them directly | ||||
|         pytest_args.extend(["-n", str(NUM_PROCS)]) | ||||
|  | ||||
|         if TEST_SAVE_XML: | ||||
|             # Add the option to generate XML test report here as C++ tests | ||||
|             # won't go into common_utils | ||||
|             test_report_path = get_report_path(pytest=True) | ||||
|             pytest_args.extend(["--junit-xml-reruns", test_report_path]) | ||||
|  | ||||
|     if options.pytest_k_expr: | ||||
|         pytest_args.extend(["-k", options.pytest_k_expr]) | ||||
|  | ||||
|  | ||||
| @ -7846,45 +7846,6 @@ class TestMPS(TestCaseMPS): | ||||
|         y = torch.normal(torch.zeros(shape, device="mps"), torch.ones(shape, device="mps")) | ||||
|         self.assertNotEqual(y[0], y[1]) | ||||
|  | ||||
|     def test_random_ops_noncontiguous(self): | ||||
|         """Test random in-place operations on non-contiguous tensors. | ||||
|  | ||||
|         All random in-place operations should work on non-contiguous tensors. | ||||
|         See issues #165257 and #124029. | ||||
|         """ | ||||
|         # Test each random in-place operation | ||||
|         ops = [ | ||||
|             ("normal_", lambda t: t.normal_(0, 1)), | ||||
|             ("uniform_", lambda t: t.uniform_(0, 1)), | ||||
|             ("exponential_", lambda t: t.exponential_(1.0)), | ||||
|             ("bernoulli_", lambda t: t.bernoulli_(0.5)), | ||||
|             ("random_", lambda t: t.random_()), | ||||
|             ("random_with_to", lambda t: t.random_(10)), | ||||
|             ("random_with_range", lambda t: t.random_(0, 10)), | ||||
|         ] | ||||
|  | ||||
|         for name, op_func in ops: | ||||
|             with self.subTest(operation=name): | ||||
|                 # Create non-contiguous tensor via transpose | ||||
|                 t_mps = torch.zeros(50, 50, device='mps').T.clone() | ||||
|                 self.assertFalse(t_mps.is_contiguous(), | ||||
|                                  f"{name}: tensor should be non-contiguous") | ||||
|  | ||||
|                 # Apply operation | ||||
|                 op_func(t_mps) | ||||
|  | ||||
|                 # Verify tensor was modified (not all zeros) | ||||
|                 max_val = t_mps.max().item() | ||||
|                 self.assertNotEqual(max_val, 0.0, | ||||
|                                     f"{name}: operation failed to modify non-contiguous tensor") | ||||
|  | ||||
|         # Test rand_like specifically (issue #124029) | ||||
|         t = torch.ones((3, 2, 2), device='mps').permute(2, 0, 1) | ||||
|         self.assertFalse(t.is_contiguous(), "rand_like input should be non-contiguous") | ||||
|         result = torch.rand_like(t) | ||||
|         self.assertFalse(result.is_contiguous(), "rand_like result should be non-contiguous") | ||||
|         self.assertNotEqual(result.max().item(), 0.0, "rand_like should generate non-zero values") | ||||
|  | ||||
|     # Test exponential | ||||
|     @unittest.skip("This does not test anything") | ||||
|     def test_exponential(self): | ||||
|  | ||||
| @ -5,22 +5,29 @@ from collections import namedtuple | ||||
| import torch | ||||
| import torch.nn as nn | ||||
| import torch.nn.functional as F | ||||
| from torch.nn.attention import varlen_attn | ||||
| from torch.nn.attention.varlen import varlen_attn | ||||
| from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION | ||||
| from torch.testing._internal.common_device_type import instantiate_device_type_tests | ||||
| from torch.testing._internal.common_nn import NNTestCase | ||||
| from torch.testing._internal.common_utils import parametrize, run_tests | ||||
| from torch.testing._internal.common_utils import parametrize, run_tests, skipIfRocm | ||||
| from torch.utils._python_dispatch import TorchDispatchMode | ||||
|  | ||||
|  | ||||
| VarlenShape = namedtuple( | ||||
|     "VarlenShape", ["batch_size", "max_seq_len", "embed_dim", "num_heads"] | ||||
| ) | ||||
|  | ||||
| default_tolerances = { | ||||
|     torch.float16: {"atol": 1e-1, "rtol": 1e-1}, | ||||
|     torch.bfloat16: {"atol": 9e-2, "rtol": 5e-2}, | ||||
|     torch.float32: {"atol": 1e-5, "rtol": 1.3e-6}, | ||||
| } | ||||
|  | ||||
| class OpLoggingMode(TorchDispatchMode): | ||||
|     """Logging mode that captures all dispatched operations""" | ||||
|  | ||||
|     def __init__(self): | ||||
|         self.called_ops = [] | ||||
|  | ||||
|     def __torch_dispatch__(self, func, types, args=(), kwargs=None): | ||||
|         op_name = str(func) | ||||
|         self.called_ops.append(op_name) | ||||
|         return func(*args, **(kwargs or {})) | ||||
|  | ||||
|  | ||||
| class AttentionBlock(nn.Module): | ||||
| @ -39,12 +46,9 @@ class AttentionBlock(nn.Module): | ||||
|             embed_dim, embed_dim, bias=False, device=device, dtype=dtype | ||||
|         ) | ||||
|  | ||||
|     def forward_varlen( | ||||
|     def get_varlen_qkv( | ||||
|         self, | ||||
|         x_packed: torch.Tensor, | ||||
|         cu_seq: torch.Tensor, | ||||
|         max_len: int, | ||||
|         is_causal: bool = False, | ||||
|     ): | ||||
|         qkv = self.qkv_proj(x_packed) | ||||
|         q, k, v = qkv.chunk(3, dim=-1) | ||||
| @ -53,24 +57,56 @@ class AttentionBlock(nn.Module): | ||||
|         k = k.view(-1, self.num_heads, self.head_dim) | ||||
|         v = v.view(-1, self.num_heads, self.head_dim) | ||||
|  | ||||
|         attn_out = varlen_attn( | ||||
|             q, k, v, cu_seq, cu_seq, max_len, max_len, is_causal=is_causal | ||||
|         ) | ||||
|         return q, k, v | ||||
|  | ||||
|     def forward_varlen( | ||||
|         self, | ||||
|         x_packed: torch.Tensor, | ||||
|         cu_seq: torch.Tensor, | ||||
|         max_len: int, | ||||
|         is_causal: bool = False, | ||||
|     ): | ||||
|         q, k, v = self.get_varlen_qkv(x_packed) | ||||
|  | ||||
|         attn_out = varlen_attn(q, k, v, cu_seq, cu_seq, max_len, max_len, is_causal) | ||||
|         attn_out = attn_out.view(-1, self.embed_dim) | ||||
|  | ||||
|         return self.out_proj(attn_out) | ||||
|  | ||||
|     def forward_sdpa(self, x_padded: torch.Tensor, is_causal: bool = False): | ||||
|     def forward_sdpa( | ||||
|         self, | ||||
|         x_padded: torch.Tensor, | ||||
|         seq_lengths: torch.Tensor, | ||||
|         is_causal: bool = False, | ||||
|     ): | ||||
|         batch_size, seq_len, _ = x_padded.shape | ||||
|  | ||||
|         qkv = self.qkv_proj(x_padded) | ||||
|         q, k, v = qkv.chunk(3, dim=-1) | ||||
|  | ||||
|         mask = ( | ||||
|             torch.arange(seq_len, device=x_padded.device)[None, :] | ||||
|             < seq_lengths[:, None] | ||||
|         ) | ||||
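        # mask[b, j] is True for the first seq_lengths[b] (non-padded) positions of batch element b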
|  | ||||
|         attn_mask = mask[:, None, None, :].expand( | ||||
|             batch_size, self.num_heads, seq_len, seq_len | ||||
|         ) | ||||
|  | ||||
|         q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) | ||||
|         k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) | ||||
|         v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) | ||||
|  | ||||
|         attn_out = F.scaled_dot_product_attention(q, k, v, is_causal=is_causal) | ||||
|         if is_causal: | ||||
|             causal_mask = torch.triu( | ||||
|                 torch.ones(seq_len, seq_len, device=x_padded.device, dtype=torch.bool), | ||||
|                 diagonal=1, | ||||
|             ) | ||||
|             combined_mask = causal_mask[None, None, :, :] | ~attn_mask | ||||
|             attn_out = F.scaled_dot_product_attention(q, k, v, attn_mask=~combined_mask) | ||||
|         else: | ||||
|             attn_out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask) | ||||
|  | ||||
|         attn_out = ( | ||||
|             attn_out.transpose(1, 2) | ||||
|             .contiguous() | ||||
| @ -91,7 +127,9 @@ def create_variable_length_batch( | ||||
|     seq_lengths = torch.tensor(seq_lengths, device=device) | ||||
|     total_tokens = seq_lengths.sum().item() | ||||
|  | ||||
|     x_packed = torch.randn(total_tokens, shape.embed_dim, device=device, dtype=dtype) | ||||
|     x_packed = torch.randn( | ||||
|         total_tokens, shape.embed_dim, device=device, dtype=dtype, requires_grad=True | ||||
|     ) | ||||
|  | ||||
|     cu_seq = torch.zeros(shape.batch_size + 1, device=device, dtype=torch.int32) | ||||
|     cu_seq[1:] = seq_lengths.cumsum(0) | ||||
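    # cu_seq holds cumulative sequence offsets, e.g. lengths [3, 5] -> cu_seq [0, 3, 8]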
| @ -106,6 +144,7 @@ def create_variable_length_batch( | ||||
|         end_idx = start_idx + seq_len | ||||
|         x_padded[i, :seq_len] = x_packed[start_idx:end_idx] | ||||
|         start_idx = end_idx | ||||
|     x_padded = x_padded.clone().detach().requires_grad_() | ||||
|  | ||||
|     return { | ||||
|         "seq_lengths": seq_lengths, | ||||
| @ -118,6 +157,7 @@ def create_variable_length_batch( | ||||
|  | ||||
|  | ||||
| class TestVarlenAttention(NNTestCase): | ||||
|     @skipIfRocm(msg="ROCM does not support variable length attention") | ||||
|     @unittest.skipIf( | ||||
|         not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention not supported" | ||||
|     ) | ||||
| @ -133,7 +173,11 @@ class TestVarlenAttention(NNTestCase): | ||||
|  | ||||
|         total_tokens = shape.batch_size * shape.max_seq_len | ||||
|         x_packed = torch.randn( | ||||
|             total_tokens, shape.embed_dim, device=device, dtype=dtype | ||||
|             total_tokens, | ||||
|             shape.embed_dim, | ||||
|             device=device, | ||||
|             dtype=dtype, | ||||
|             requires_grad=True, | ||||
|         ) | ||||
|         cu_seq = torch.tensor( | ||||
|             [0, shape.max_seq_len, total_tokens], device=device, dtype=torch.int32 | ||||
| @ -147,6 +191,131 @@ class TestVarlenAttention(NNTestCase): | ||||
|         self.assertEqual(output.device, torch.device(device)) | ||||
|         self.assertEqual(output.dtype, dtype) | ||||
|  | ||||
|         varlen_grad_out = torch.ones_like(output) | ||||
|  | ||||
|         varlen_grad = torch.autograd.grad( | ||||
|             outputs=output, | ||||
|             inputs=x_packed, | ||||
|             grad_outputs=varlen_grad_out, | ||||
|             retain_graph=True, | ||||
|             create_graph=False, | ||||
|             allow_unused=False, | ||||
|         )[0] | ||||
|  | ||||
|         self.assertIsNotNone(varlen_grad) | ||||
|         self.assertEqual(varlen_grad.shape, x_packed.shape) | ||||
|         self.assertEqual(varlen_grad.dtype, x_packed.dtype) | ||||
|  | ||||
|     @skipIfRocm(msg="ROCM does not support variable length attention") | ||||
|     @unittest.skipIf( | ||||
|         not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention not supported" | ||||
|     ) | ||||
|     @parametrize("dtype", [torch.bfloat16, torch.float16]) | ||||
|     def test_custom_op_compliance(self, device, dtype): | ||||
|         torch.manual_seed(42) | ||||
|  | ||||
|         shape = VarlenShape(batch_size=2, max_seq_len=512, embed_dim=1024, num_heads=16) | ||||
|  | ||||
|         attention_block = AttentionBlock( | ||||
|             shape.embed_dim, shape.num_heads, device, dtype | ||||
|         ) | ||||
|  | ||||
|         total_tokens = shape.batch_size * shape.max_seq_len | ||||
|         x_packed = torch.randn( | ||||
|             total_tokens, | ||||
|             shape.embed_dim, | ||||
|             device=device, | ||||
|             dtype=dtype, | ||||
|         ) | ||||
|         cu_seq = torch.tensor( | ||||
|             [0, shape.max_seq_len, total_tokens], device=device, dtype=torch.int32 | ||||
|         ) | ||||
|  | ||||
|         q, k, v = attention_block.get_varlen_qkv(x_packed) | ||||
|  | ||||
|         torch.library.opcheck( | ||||
|             torch.ops.torch_attn._varlen_attn, | ||||
|             (q, k, v, cu_seq, cu_seq, shape.max_seq_len, shape.max_seq_len, False), | ||||
|         ) | ||||
|  | ||||
|         out, lse, rng_state = torch.ops.torch_attn._varlen_attn( | ||||
|             q, k, v, cu_seq, cu_seq, shape.max_seq_len, shape.max_seq_len, False | ||||
|         ) | ||||
|         grad_out = torch.randn_like(out) | ||||
|  | ||||
|         # we don't support double backward | ||||
|         # skipping test_autograd_registration, test_aot_dispatch_dynamic, test_aot_dispatch_static | ||||
|         torch.library.opcheck( | ||||
|             torch.ops.torch_attn._varlen_attn_backward, | ||||
|             ( | ||||
|                 grad_out, | ||||
|                 q, | ||||
|                 k, | ||||
|                 v, | ||||
|                 out, | ||||
|                 lse, | ||||
|                 cu_seq, | ||||
|                 cu_seq, | ||||
|                 shape.max_seq_len, | ||||
|                 shape.max_seq_len, | ||||
|                 False, | ||||
|                 rng_state, | ||||
|             ), | ||||
|             test_utils=["test_schema", "test_faketensor"], | ||||
|         ) | ||||
|  | ||||
|     @skipIfRocm(msg="ROCM does not support variable length attention") | ||||
|     @unittest.skipIf( | ||||
|         not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention not supported" | ||||
|     ) | ||||
|     @parametrize("dtype", [torch.bfloat16, torch.float16]) | ||||
|     def test_custom_op_registration(self, device, dtype): | ||||
|         torch.manual_seed(42) | ||||
|  | ||||
|         shape = VarlenShape(batch_size=2, max_seq_len=512, embed_dim=1024, num_heads=16) | ||||
|  | ||||
|         attention_block = AttentionBlock( | ||||
|             shape.embed_dim, shape.num_heads, device, dtype | ||||
|         ) | ||||
|  | ||||
|         total_tokens = shape.batch_size * shape.max_seq_len | ||||
|         x_packed = torch.randn( | ||||
|             total_tokens, | ||||
|             shape.embed_dim, | ||||
|             device=device, | ||||
|             dtype=dtype, | ||||
|             requires_grad=True, | ||||
|         ) | ||||
|         cu_seq = torch.tensor( | ||||
|             [0, shape.max_seq_len, total_tokens], device=device, dtype=torch.int32 | ||||
|         ) | ||||
|  | ||||
|         compiled_forward = torch.compile( | ||||
|             attention_block.forward_varlen, backend="eager", fullgraph=True | ||||
|         ) | ||||
|         with OpLoggingMode() as mode: | ||||
|             output = compiled_forward( | ||||
|                 x_packed, cu_seq, shape.max_seq_len, is_causal=False | ||||
|             ) | ||||
|  | ||||
|             varlen_grad_out = torch.ones_like(output) | ||||
|             _ = torch.autograd.grad( | ||||
|                 outputs=output, | ||||
|                 inputs=x_packed, | ||||
|                 grad_outputs=varlen_grad_out, | ||||
|                 retain_graph=True, | ||||
|                 create_graph=False, | ||||
|                 allow_unused=False, | ||||
|             )[0] | ||||
|  | ||||
|         called_ops = mode.called_ops | ||||
|  | ||||
|         custom_ops_called = any( | ||||
|             "torch_attn._varlen_attn" in op for op in called_ops | ||||
|         ) and any("torch_attn._varlen_attn_backward" in op for op in called_ops) | ||||
|         assert custom_ops_called | ||||
|  | ||||
|     @skipIfRocm(msg="ROCM does not support variable length attention") | ||||
|     @unittest.skipIf( | ||||
|         not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention not supported" | ||||
|     ) | ||||
| @ -156,14 +325,21 @@ class TestVarlenAttention(NNTestCase): | ||||
|         torch.manual_seed(42) | ||||
|  | ||||
|         shape = VarlenShape( | ||||
|             batch_size=8, max_seq_len=2048, embed_dim=1024, num_heads=16 | ||||
|             batch_size=2, max_seq_len=128, embed_dim=32, num_heads=4 | ||||
|         ) | ||||
|  | ||||
|         attention_block = AttentionBlock( | ||||
|             shape.embed_dim, shape.num_heads, device, dtype | ||||
|         ) | ||||
|  | ||||
|         golden_attention_block = AttentionBlock( | ||||
|             shape.embed_dim, shape.num_heads, device, torch.float64 | ||||
|         ) | ||||
|  | ||||
|         variable_length_batch_data = create_variable_length_batch(shape, device, dtype) | ||||
|         golden_variable_length_batch_data = create_variable_length_batch( | ||||
|             shape, device, torch.float64 | ||||
|         ) | ||||
|  | ||||
|         varlen_output = attention_block.forward_varlen( | ||||
|             variable_length_batch_data["x_packed"], | ||||
| @ -172,18 +348,89 @@ class TestVarlenAttention(NNTestCase): | ||||
|             is_causal=is_causal, | ||||
|         ) | ||||
|         sdpa_output = attention_block.forward_sdpa( | ||||
|             variable_length_batch_data["x_padded"], is_causal=is_causal | ||||
|             variable_length_batch_data["x_padded"], | ||||
|             variable_length_batch_data["seq_lengths"], | ||||
|             is_causal=is_causal, | ||||
|         ) | ||||
|  | ||||
|         golden_sdpa_output = golden_attention_block.forward_sdpa( | ||||
|             golden_variable_length_batch_data["x_padded"], | ||||
|             golden_variable_length_batch_data["seq_lengths"], | ||||
|             is_causal=is_causal, | ||||
|         ) | ||||
|  | ||||
|         tolerances = default_tolerances[dtype] | ||||
|         start_idx = 0 | ||||
|         for i, seq_len in enumerate(variable_length_batch_data["seq_lengths"]): | ||||
|             end_idx = start_idx + seq_len | ||||
|  | ||||
|             varlen_seq = varlen_output[start_idx:end_idx] | ||||
|             sdpa_seq = sdpa_output[i, :seq_len] | ||||
|             golden_sdpa_seq = golden_sdpa_output[i, :seq_len] | ||||
|  | ||||
|             fwd_atol = ( | ||||
|                 2 * (golden_sdpa_seq + 0.3 - 0.3 - golden_sdpa_seq).abs().max().item() | ||||
|             ) | ||||
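            # (golden + 0.3 - 0.3 - golden) isolates float64 rounding noise at this
            # magnitude; doubling it gives the numerical-error floor used below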
|  | ||||
|             varlen_error = (varlen_seq - golden_sdpa_seq).abs().max().item() | ||||
|             sdpa_error = (sdpa_seq - golden_sdpa_seq).abs().max().item() | ||||
|             assert varlen_error <= sdpa_error + fwd_atol | ||||
|  | ||||
|             start_idx = end_idx | ||||
|  | ||||
|         varlen_grad_out = torch.ones_like(varlen_output) | ||||
|         sdpa_grad_out = torch.ones_like(sdpa_output) | ||||
|         golden_sdpa_grad_out = torch.ones_like(golden_sdpa_output) | ||||
|  | ||||
|         start_idx = 0 | ||||
|         for i, seq_len in enumerate(variable_length_batch_data["seq_lengths"]): | ||||
|             end_idx = start_idx + seq_len | ||||
|             sdpa_grad_out[i, :seq_len] = varlen_grad_out[start_idx:end_idx] | ||||
|             start_idx = end_idx | ||||
|  | ||||
|         varlen_grad = torch.autograd.grad( | ||||
|             outputs=varlen_output, | ||||
|             inputs=variable_length_batch_data["x_packed"], | ||||
|             grad_outputs=varlen_grad_out, | ||||
|             retain_graph=True, | ||||
|             create_graph=False, | ||||
|             allow_unused=False, | ||||
|         )[0] | ||||
|  | ||||
|         sdpa_grad = torch.autograd.grad( | ||||
|             outputs=sdpa_output, | ||||
|             inputs=variable_length_batch_data["x_padded"], | ||||
|             grad_outputs=sdpa_grad_out, | ||||
|             retain_graph=True, | ||||
|             create_graph=False, | ||||
|             allow_unused=False, | ||||
|         )[0] | ||||
|  | ||||
|         golden_sdpa_grad = torch.autograd.grad( | ||||
|             outputs=golden_sdpa_output, | ||||
|             inputs=golden_variable_length_batch_data["x_padded"], | ||||
|             grad_outputs=golden_sdpa_grad_out, | ||||
|             retain_graph=True, | ||||
|             create_graph=False, | ||||
|             allow_unused=False, | ||||
|         )[0] | ||||
|  | ||||
|         start_idx = 0 | ||||
|         for i, seq_len in enumerate(variable_length_batch_data["seq_lengths"]): | ||||
|             end_idx = start_idx + seq_len | ||||
|  | ||||
|             varlen_grad_seq = varlen_grad[start_idx:end_idx] | ||||
|             sdpa_grad_seq = sdpa_grad[i, :seq_len] | ||||
|             golden_sdpa_seq = golden_sdpa_grad[i, :seq_len] | ||||
|  | ||||
|             fwd_atol = ( | ||||
|                 2 * (golden_sdpa_seq + 0.3 - 0.3 - golden_sdpa_seq).abs().max().item() | ||||
|             ) | ||||
|  | ||||
|             varlen_error = (varlen_grad_seq - golden_sdpa_seq).abs().max().item() | ||||
|             sdpa_error = (sdpa_grad_seq - golden_sdpa_seq).abs().max().item() | ||||
|  | ||||
|             assert varlen_error <= sdpa_error + fwd_atol | ||||
|  | ||||
|             torch.testing.assert_close(varlen_grad_seq, sdpa_grad_seq, **tolerances) | ||||
|             start_idx = end_idx | ||||
|  | ||||
|  | ||||
|  | ||||
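The comparison loops above rely on a standard numerics-testing trick: `2 * (golden + 0.3 - 0.3 - golden).abs().max()` estimates the rounding-noise floor of the float64 golden reference, and the varlen output is only required to track that reference as closely as the padded SDPA output does, up to that noise. A self-contained sketch of the idea (shapes, dtypes, and the fp16/bf16 stand-ins are illustrative, not part of the PR):

```python
import torch

torch.manual_seed(0)
golden = torch.randn(8, 16, dtype=torch.float64)

# Stand-ins for "kernel under test" and "reference kernel" run at low precision.
candidate = golden.to(torch.float16).to(torch.float64)
baseline = golden.to(torch.bfloat16).to(torch.float64)

# Rounding-noise floor of the float64 reference itself.
fwd_atol = 2 * (golden + 0.3 - 0.3 - golden).abs().max().item()

candidate_err = (candidate - golden).abs().max().item()
baseline_err = (baseline - golden).abs().max().item()

# The candidate may not deviate from the golden reference by more than the
# baseline does, plus the noise floor.
assert candidate_err <= baseline_err + fwd_atol
```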
| @ -42,7 +42,7 @@ import weakref | ||||
| from dataclasses import dataclass | ||||
| from enum import Enum | ||||
| from os.path import dirname, join | ||||
| from typing import Any, NamedTuple, Optional, Sized, TYPE_CHECKING, Union | ||||
| from typing import Any, NamedTuple, Optional, TYPE_CHECKING, Union | ||||
| from unittest.mock import patch | ||||
|  | ||||
| import sympy | ||||
| @ -395,13 +395,6 @@ class OptimizedModule(torch.nn.Module): | ||||
|         self._initialize() | ||||
|         self.training = self._orig_mod.training | ||||
|  | ||||
|     def __len__(self) -> int: | ||||
|         # Proxy the len call to the original module | ||||
|         if isinstance(self._orig_mod, Sized): | ||||
|             return len(self._orig_mod) | ||||
|         # Mimic python's default behavior for objects without a length | ||||
|         raise TypeError(f"{type(self._orig_mod).__name__} does not support len()") | ||||
|  | ||||
|     def _initialize(self) -> None: | ||||
|         # Do this stuff in constructor to lower overhead slightly | ||||
|         if isinstance(self.dynamo_ctx, DisableContext): | ||||
|  | ||||
| @ -1793,6 +1793,14 @@ def _aot_stage2b_bw_compile( | ||||
|                     # tensor which is wrong. | ||||
|  | ||||
|                     ph_size = ph_arg.size() | ||||
|                     # pyrefly: ignore  # bad-argument-type | ||||
|                     if len(ph_size) == 0 and len(real_stride) > 0: | ||||
|                         # Fix for 0-dimensional tensors: When a tensor becomes 0-d | ||||
|                         # (e.g., via squeeze), its stride should be () not (1,). | ||||
|                         # This mismatch can occur when dynamic shape operations produce | ||||
|                         # tensors that are later squeezed to 0-d. The stride metadata | ||||
|                         # may get preserved, causing a dimension mismatch (#164814) | ||||
|                         real_stride = () | ||||
|  | ||||
|                     # pyrefly: ignore  # bad-argument-type | ||||
|                     placeholder_list[i] = ph_arg.as_strided(ph_size, real_stride) | ||||
|  | ||||
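As a side note on the placeholder fix above, here is a minimal sketch of the 0-d stride mismatch it guards against (plain PyTorch, independent of the AOT compilation path): once a tensor has been squeezed down to zero dimensions its stride is the empty tuple, so `as_strided` with a stale `(1,)` stride is rejected.

```python
import torch

x = torch.randn(1).squeeze()  # 0-d tensor
print(x.size(), x.stride())   # torch.Size([]) ()

# Size () with stride (1,) is inconsistent, which is why the fix above resets
# real_stride to () whenever the placeholder has become 0-dimensional.
try:
    x.as_strided((), (1,))
except RuntimeError as err:
    print("rejected:", err)
```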
| @ -409,10 +409,9 @@ class SchedulerDonatedBuffer(SchedulerBuffer): | ||||
|  | ||||
|  | ||||
| class BaseSchedulerNode: | ||||
|     ancestors: OrderedSet[str] | ||||
|     debug_device_str: Callable[[BaseSchedulerNode], list[str]] | ||||
|     group: tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]] | ||||
|     last_usage: OrderedSet[str] | ||||
|     read_writes: dependencies.ReadWrites | ||||
|     unmet_dependencies: OrderedSet[Dep] | ||||
|     # .min_order and .max_order are only relevant for "grouped" nodes such as FusedSchedulerNode. | ||||
|     # e.g. if the FusedSchedulerNode includes nodes (op_1, op_2, op_3), and op_X is X-th node | ||||
|     # in `self.scheduler.nodes`, then for this FusedSchedulerNode, .min_order is 1 and .max_order is 3. | ||||
| @ -421,24 +420,22 @@ class BaseSchedulerNode: | ||||
|     min_order: int | ||||
|     max_order: int | ||||
|     mpi_node: MemoryPlanningInfoForNode | ||||
|     mutation_renames: dict[str, str] | ||||
|     node: Optional[ir.Operation] | ||||
|     outputs: list[SchedulerBuffer] | ||||
|     outputs_by_name: dict[str, SchedulerBuffer] | ||||
|     override_estimated_runtime: Optional[float] = None | ||||
|     read_writes: dependencies.ReadWrites | ||||
|     unmet_dependencies: OrderedSet[Dep] | ||||
|  | ||||
|     def __init__(self, scheduler: Scheduler) -> None: | ||||
|         self.scheduler = scheduler | ||||
|         self.debug_device_str = lambda *args, **kwargs: [] | ||||
|         self.scheduler: Scheduler = scheduler | ||||
|         self.debug_device_str: Callable[[BaseSchedulerNode], list[str]] = ( | ||||
|             lambda *args, **kwargs: [] | ||||
|         ) | ||||
|  | ||||
|     def _init_from_node(self, node: ir.Operation) -> None: | ||||
|         self.node = node | ||||
|         self.ancestors = OrderedSet() | ||||
|         self.last_usage = OrderedSet()  # buffers that won't be used after this kernel | ||||
|         self.node: Optional[ir.Operation] = node | ||||
|         self.ancestors: OrderedSet[str] = OrderedSet() | ||||
|         self.last_usage = OrderedSet[ | ||||
|             str | ||||
|         ]()  # buffers that won't be used after this kernel | ||||
|         self.written = False | ||||
|         self.outputs = [ | ||||
|         self.outputs: list[SchedulerBuffer] = [ | ||||
|             SchedulerBuffer( | ||||
|                 scheduler=self.scheduler, | ||||
|                 node=output, | ||||
| @ -446,14 +443,16 @@ class BaseSchedulerNode: | ||||
|             ) | ||||
|             for output in node.get_outputs() | ||||
|         ] | ||||
|         self.outputs_by_name = {buf.get_name(): buf for buf in self.outputs} | ||||
|         self.outputs_by_name: dict[str, SchedulerBuffer] = { | ||||
|             buf.get_name(): buf for buf in self.outputs | ||||
|         } | ||||
|  | ||||
|         # mutation_renames for the current node. Due to potential | ||||
|         # more mutations happening later, this can be different | ||||
|         # to Scheduler.mutation_renames. Also this dict should be small | ||||
|         # since only mutation information relevant to the deps for this | ||||
|         # node is stored here. | ||||
|         self.mutation_renames = {} | ||||
|         self.mutation_renames: dict[str, str] = {} | ||||
|  | ||||
|     def __repr__(self) -> str: | ||||
|         return f"{type(self).__name__}(name={self.get_name()!r})" | ||||
| @ -2436,34 +2435,6 @@ def pick_loop_order( | ||||
|     return order | ||||
|  | ||||
|  | ||||
| def _replace_operation_buffer( | ||||
|     orig_node: ir.MultiTemplateBuffer, new_node: ir.OperationBuffer | ||||
| ) -> None: | ||||
|     replaced_buf_name = new_node.get_name() | ||||
|     orig_buf_name = orig_node.get_name() | ||||
|     assert isinstance(orig_buf_name, str) and isinstance(replaced_buf_name, str) | ||||
|  | ||||
|     replaced_op_name = new_node.get_operation_name() | ||||
|     orig_op_name = orig_node.get_operation_name() | ||||
|     assert isinstance(orig_op_name, str) and isinstance(replaced_op_name, str) | ||||
|  | ||||
|     del V.graph.name_to_buffer[replaced_buf_name] | ||||
|     new_node.name = orig_buf_name | ||||
|  | ||||
|     del V.graph.name_to_op[replaced_op_name] | ||||
|     new_node.operation_name = orig_op_name | ||||
|  | ||||
|     orig = V.graph.buffers.index(orig_node) | ||||
|     V.graph.buffers.remove(new_node) | ||||
|     V.graph.buffers[orig] = new_node | ||||
|     V.graph.name_to_buffer[orig_buf_name] = new_node | ||||
|  | ||||
|     orig = V.graph.operations.index(orig_node) | ||||
|     V.graph.operations.remove(new_node) | ||||
|     V.graph.operations[orig] = new_node | ||||
|     V.graph.name_to_op[orig_op_name] = new_node | ||||
|  | ||||
|  | ||||
| @dataclasses.dataclass | ||||
| class NodeUser: | ||||
|     node: Union[BaseSchedulerNode, OutputNode] | ||||
| @ -3365,6 +3336,33 @@ class Scheduler: | ||||
|         will force completion of compilation and benchmarking. | ||||
|         """ | ||||
|  | ||||
|         def replace_operation_buffer( | ||||
|             orig_node: ir.MultiTemplateBuffer, new_node: ir.OperationBuffer | ||||
|         ) -> None: | ||||
|             replaced_buf_name = new_node.get_name() | ||||
|             orig_buf_name = orig_node.get_name() | ||||
|             assert isinstance(orig_buf_name, str) and isinstance(replaced_buf_name, str) | ||||
|  | ||||
|             replaced_op_name = new_node.get_operation_name() | ||||
|             orig_op_name = orig_node.get_operation_name() | ||||
|             assert isinstance(orig_op_name, str) and isinstance(replaced_op_name, str) | ||||
|  | ||||
|             del V.graph.name_to_buffer[replaced_buf_name] | ||||
|             new_node.name = orig_buf_name | ||||
|  | ||||
|             del V.graph.name_to_op[replaced_op_name] | ||||
|             new_node.operation_name = orig_op_name | ||||
|  | ||||
|             orig = V.graph.buffers.index(orig_node) | ||||
|             V.graph.buffers.remove(new_node) | ||||
|             V.graph.buffers[orig] = new_node | ||||
|             V.graph.name_to_buffer[orig_buf_name] = new_node | ||||
|  | ||||
|             orig = V.graph.operations.index(orig_node) | ||||
|             V.graph.operations.remove(new_node) | ||||
|             V.graph.operations[orig] = new_node | ||||
|             V.graph.name_to_op[orig_op_name] = new_node | ||||
|  | ||||
|         for i, node in enumerate(self.nodes): | ||||
|             if isinstance(node, SchedulerNode) and isinstance( | ||||
|                 node.node, ir.MultiTemplateBuffer | ||||
| @ -3418,47 +3416,40 @@ class Scheduler: | ||||
|                     assign_origin_node(out_tensorbox, multi_node.origin_node) | ||||
|  | ||||
|                 out_buffer.layout = multi_node.layout | ||||
|                 self._replace_node(out_buffer, multi_node, i, node) | ||||
|                 replace_operation_buffer(multi_node, out_buffer) | ||||
|                 new_scheduler_node = self.create_scheduler_node(out_buffer) | ||||
|  | ||||
|     def _replace_node( | ||||
|         self, | ||||
|         out_buffer: ir.OperationBuffer, | ||||
|         multi_node: ir.MultiTemplateBuffer, | ||||
|         i: int, | ||||
|         node: SchedulerNode, | ||||
|     ) -> None: | ||||
|         _replace_operation_buffer(multi_node, out_buffer) | ||||
|         new_scheduler_node = self.create_scheduler_node(out_buffer) | ||||
|                 self.nodes[i] = new_scheduler_node | ||||
|                 self.name_to_node[node.get_name()] = new_scheduler_node | ||||
|                 self.name_to_fused_node[node.get_name()] = new_scheduler_node | ||||
|  | ||||
|         self.nodes[i] = new_scheduler_node | ||||
|         self.name_to_node[node.get_name()] = new_scheduler_node | ||||
|         self.name_to_fused_node[node.get_name()] = new_scheduler_node | ||||
|                 # We need to reflect the mutation renames that were recorded in the original node | ||||
|                 mutation_renames = {} | ||||
|                 for dep in itertools.chain( | ||||
|                     node.read_writes.reads, node.unmet_dependencies | ||||
|                 ): | ||||
|                     if real_name := self.mutation_real_name.get(dep.name, None): | ||||
|                         mutation_renames[real_name] = dep.name | ||||
|  | ||||
|         # We need to reflect the mutation renames that were recorded in the original node | ||||
|         mutation_renames = {} | ||||
|         for dep in itertools.chain(node.read_writes.reads, node.unmet_dependencies): | ||||
|             if real_name := self.mutation_real_name.get(dep.name, None): | ||||
|                 mutation_renames[real_name] = dep.name | ||||
|                 def rename_deps(deps: OrderedSet[Dep]) -> OrderedSet[Dep]: | ||||
|                     return OrderedSet(dep.rename(mutation_renames) for dep in deps) | ||||
|  | ||||
|         def rename_deps(deps: OrderedSet[Dep]) -> OrderedSet[Dep]: | ||||
|             return OrderedSet(dep.rename(mutation_renames) for dep in deps) | ||||
|                 new_scheduler_node.unmet_dependencies = rename_deps( | ||||
|                     new_scheduler_node.unmet_dependencies | ||||
|                 ) | ||||
|                 new_scheduler_node.read_writes.reads = rename_deps( | ||||
|                     new_scheduler_node.read_writes.reads | ||||
|                 ) | ||||
|  | ||||
|         new_scheduler_node.unmet_dependencies = rename_deps( | ||||
|             new_scheduler_node.unmet_dependencies | ||||
|         ) | ||||
|         new_scheduler_node.read_writes.reads = rename_deps( | ||||
|             new_scheduler_node.read_writes.reads | ||||
|         ) | ||||
|                 for new_out, old_out in zip( | ||||
|                     new_scheduler_node.get_outputs(), node.get_outputs() | ||||
|                 ): | ||||
|                     self.name_to_buf[old_out.get_name()] = new_out | ||||
|                     new_out.users = old_out.users | ||||
|  | ||||
|         for new_out, old_out in zip( | ||||
|             new_scheduler_node.get_outputs(), node.get_outputs() | ||||
|         ): | ||||
|             self.name_to_buf[old_out.get_name()] = new_out | ||||
|             new_out.users = old_out.users | ||||
|  | ||||
|         new_scheduler_node.min_order = node.min_order | ||||
|         new_scheduler_node.max_order = node.max_order | ||||
|         new_scheduler_node.last_usage = node.last_usage | ||||
|                 new_scheduler_node.min_order = node.min_order | ||||
|                 new_scheduler_node.max_order = node.max_order | ||||
|                 new_scheduler_node.last_usage = node.last_usage | ||||
|  | ||||
|     def _any_atomic_add(self, node_list: Sequence[BaseSchedulerNode]) -> bool: | ||||
|         return any( | ||||
|  | ||||
| @ -17,7 +17,6 @@ import time | ||||
| from collections.abc import Sequence | ||||
| from concurrent.futures import as_completed, ThreadPoolExecutor | ||||
| from io import StringIO | ||||
| from pathlib import Path | ||||
| from types import ModuleType | ||||
| from typing import Any, Callable, NamedTuple, Optional, TYPE_CHECKING, Union | ||||
| from typing_extensions import Self | ||||
| @ -2105,11 +2104,6 @@ class TritonTemplate(KernelTemplate): | ||||
|                 "matrix_instr_nonkdim": kwargs.get("matrix_instr_nonkdim", 0), | ||||
|                 "waves_per_eu": kwargs.get("waves_per_eu", 0), | ||||
|                 "kpack": kwargs.get("kpack", 2), | ||||
|                 **{ | ||||
|                     k: kwargs[k] | ||||
|                     for k in AlgorithmSelectorCache.FLEX_ATTENTION_TUNABLE_KEYS | ||||
|                     if k in kwargs | ||||
|                 }, | ||||
|             }, | ||||
|             mutated_inputs=mutated_inputs, | ||||
|             workspace_arg=workspace_arg, | ||||
| @ -2403,17 +2397,6 @@ def get_mm_log_filename() -> Optional[str]: | ||||
|     return mm_file_name | ||||
|  | ||||
|  | ||||
| @functools.cache | ||||
| def get_flex_attention_log_filename() -> Optional[str]: | ||||
|     flex_attention_file_name = os.environ.get( | ||||
|         "TORCHINDUCTOR_FLEX_ATTENTION_LOGGING_FILE", None | ||||
|     ) | ||||
|     if not flex_attention_file_name: | ||||
|         return None | ||||
|  | ||||
|     return str(Path(flex_attention_file_name).with_suffix(".json")) | ||||
|  | ||||
|  | ||||
| def append_to_log(filename, data): | ||||
|     lock_file = filename.replace(".json", ".lock") | ||||
|     lock = FileLock(lock_file) | ||||
| @ -2624,25 +2607,6 @@ class AlgorithmSelectorCache(PersistentCache): | ||||
|     doesn't depend on the output layout. | ||||
|     """ | ||||
|  | ||||
|     FLEX_ATTENTION_TUNABLE_KEYS = tuple( | ||||
|         dict.fromkeys( | ||||
|             [ | ||||
|                 "num_warps", | ||||
|                 "num_stages", | ||||
|                 "BLOCK_M", | ||||
|                 "BLOCK_N", | ||||
|                 "BLOCK_M1", | ||||
|                 "BLOCK_N1", | ||||
|                 "BLOCK_M2", | ||||
|                 "BLOCK_N2", | ||||
|                 "USE_TMA", | ||||
|                 "kpack", | ||||
|                 "matrix_instr_nonkdim", | ||||
|                 "waves_per_eu", | ||||
|             ] | ||||
|         ) | ||||
|     ) | ||||
|  | ||||
|     def __init__(self, *args, **kwargs) -> None: | ||||
|         super().__init__(*args, **kwargs) | ||||
|  | ||||
| @ -3576,73 +3540,6 @@ class AlgorithmSelectorCache(PersistentCache): | ||||
|         ) | ||||
|         return pruned_choices | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_flex_attention_choice_info( | ||||
|         choice: ChoiceCaller, timings: dict[ChoiceCaller, float] | ||||
|     ) -> dict[str, Any]: | ||||
|         if isinstance(choice, torch._inductor.select_algorithm.ExternKernelCaller): | ||||
|             return {"type": "extern", "time": timings[choice]} | ||||
|  | ||||
|         assert isinstance(choice, torch._inductor.select_algorithm.TritonTemplateCaller) | ||||
|  | ||||
|         info = choice.info_dict() | ||||
|         result = { | ||||
|             "type": "triton", | ||||
|             "time": timings[choice], | ||||
|         } | ||||
|  | ||||
|         for key in AlgorithmSelectorCache.FLEX_ATTENTION_TUNABLE_KEYS: | ||||
|             if key in info: | ||||
|                 result[key] = info[key] | ||||
|  | ||||
|         return result | ||||
|  | ||||
|     @staticmethod | ||||
|     def maybe_log_flex_attention_results( | ||||
|         name: str, input_nodes: list[ir.IRNode], timings: dict[ChoiceCaller, float] | ||||
|     ) -> None: | ||||
|         flex_attention_filename = get_flex_attention_log_filename() | ||||
|         if not flex_attention_filename or "flex_attention" not in name: | ||||
|             return | ||||
|  | ||||
|         if len(input_nodes) < 3: | ||||
|             return | ||||
|  | ||||
|         query_size = input_nodes[0].get_size() | ||||
|         key_size = input_nodes[1].get_size() | ||||
|         value_size = input_nodes[2].get_size() | ||||
|  | ||||
|         B = query_size[0] | ||||
|         Hq = query_size[1] | ||||
|         seq_len_q = query_size[2] | ||||
|         qk_head_dim = query_size[3] | ||||
|         Hkv = key_size[1] | ||||
|         seq_len_kv = key_size[2] | ||||
|         v_head_dim = value_size[3] | ||||
|  | ||||
|         kernel_type = "backward" if "backward" in name else "forward" | ||||
|         dims_key = str( | ||||
|             ( | ||||
|                 kernel_type, | ||||
|                 B, | ||||
|                 Hq, | ||||
|                 Hkv, | ||||
|                 seq_len_q, | ||||
|                 seq_len_kv, | ||||
|                 qk_head_dim, | ||||
|                 v_head_dim, | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|         sorted_choices = sorted(timings, key=timings.__getitem__) | ||||
|         out_dict = { | ||||
|             dims_key: [ | ||||
|                 AlgorithmSelectorCache.get_flex_attention_choice_info(choice, timings) | ||||
|                 for choice in sorted_choices | ||||
|             ] | ||||
|         } | ||||
|         append_to_log(flex_attention_filename, out_dict) | ||||
|  | ||||
|     @staticmethod | ||||
|     def log_results( | ||||
|         name: str, | ||||
| @ -3653,7 +3550,6 @@ class AlgorithmSelectorCache(PersistentCache): | ||||
|         prescreening_elapse: Optional[float] = None, | ||||
|         hint_override: Optional[int] = None, | ||||
|     ): | ||||
|         """Log the autotuning results, currently only handles mm and flex""" | ||||
|         V.debug.log_autotuning_results( | ||||
|             name, input_nodes, timings, elapse, precompile_elapse | ||||
|         ) | ||||
| @ -3722,10 +3618,6 @@ class AlgorithmSelectorCache(PersistentCache): | ||||
|  | ||||
|             append_to_log(mm_filename, out_dict) | ||||
|  | ||||
|         AlgorithmSelectorCache.maybe_log_flex_attention_results( | ||||
|             name, input_nodes, timings | ||||
|         ) | ||||
|  | ||||
|         best_time = timings[best] | ||||
|         sys.stderr.write(f"AUTOTUNE {name}({sizes})\n") | ||||
|         sys.stderr.write(f"strides: {strides}\n") | ||||
|  | ||||
| @ -14,14 +14,11 @@ from torch.backends.cuda import ( | ||||
|     SDPAParams, | ||||
| ) | ||||
|  | ||||
| from .varlen import varlen_attn | ||||
|  | ||||
|  | ||||
| __all__: list[str] = [ | ||||
|     "SDPBackend", | ||||
|     "sdpa_kernel", | ||||
|     "WARN_FOR_UNFUSED_KERNELS", | ||||
|     "varlen_attn", | ||||
| ] | ||||
|  | ||||
| # Note: [SDPA warnings] | ||||
|  | ||||
| @ -7,7 +7,7 @@ that calls into the optimized Flash Attention kernels. | ||||
|  | ||||
| import logging | ||||
| from functools import lru_cache | ||||
| from typing import NamedTuple, Optional, Union | ||||
| from typing import Any, NamedTuple, Optional, Union | ||||
|  | ||||
| import torch | ||||
|  | ||||
| @ -20,7 +20,7 @@ __all__ = ["varlen_attn", "AuxRequest"] | ||||
| @lru_cache(maxsize=8) | ||||
| def _should_use_cudnn(device_index: int) -> bool: | ||||
|     """Cache device capability check to avoid repeated CUDA calls.""" | ||||
|     return False | ||||
|     return True | ||||
|  | ||||
|  | ||||
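The `_should_use_cudnn` helper above currently hardcodes its answer. As a hedged sketch only (the compute-capability threshold here is an assumption for illustration, not what the PR ships), a real per-device check cached with `lru_cache` might look like this:

```python
from functools import lru_cache

import torch


@lru_cache(maxsize=8)
def should_use_cudnn_sketch(device_index: int) -> bool:
    # Cache the capability query so repeated dispatches don't re-hit the CUDA API.
    major, minor = torch.cuda.get_device_capability(device_index)
    return (major, minor) >= (8, 0)  # assumed threshold, illustration only
```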
| class AuxRequest(NamedTuple): | ||||
| @ -33,8 +33,7 @@ class AuxRequest(NamedTuple): | ||||
|     lse: bool = False | ||||
|  | ||||
|  | ||||
| # import failures when I try to register as custom op | ||||
| # @torch.library.custom_op("torch_nn_attention::_varlen_attn", mutates_args={}) | ||||
| @torch.library.custom_op("torch_attn::_varlen_attn", mutates_args={}) | ||||
| def _varlen_attn( | ||||
|     query: torch.Tensor, | ||||
|     key: torch.Tensor, | ||||
| @ -44,7 +43,7 @@ def _varlen_attn( | ||||
|     max_q: int, | ||||
|     max_k: int, | ||||
|     is_causal: bool = False, | ||||
| ) -> tuple[torch.Tensor, torch.Tensor]: | ||||
| ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: | ||||
|     """ | ||||
|     Private custom op for variable-length attention. | ||||
|  | ||||
| @ -52,9 +51,9 @@ def _varlen_attn( | ||||
|     """ | ||||
|  | ||||
|     use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index) | ||||
|  | ||||
|     if use_cudnn: | ||||
|         log.info("Using cuDNN backend for varlen_attn") | ||||
|  | ||||
|         result = torch.ops.aten._cudnn_attention_forward( | ||||
|             query, | ||||
|             key, | ||||
| @ -70,7 +69,7 @@ def _varlen_attn( | ||||
|             False,  # return_debug_mask | ||||
|         ) | ||||
|         # cuDNN returns: (output, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, philox_seed, philox_offset, debug_attn_mask) | ||||
|         output, softmax_lse = result[0], result[1] | ||||
|         output, softmax_lse, rng_state, philox_offset = result[0], result[1], result[6], result[7] | ||||
|     else: | ||||
|         log.info("Using Flash Attention backend for varlen_attn") | ||||
|         output, softmax_lse, rng_state, _, _ = torch.ops.aten._flash_attention_forward( | ||||
| @ -85,11 +84,16 @@ def _varlen_attn( | ||||
|             is_causal, | ||||
|             return_debug_mask=False, | ||||
|         ) | ||||
|         philox_offset = torch.zeros((), dtype=torch.int64, device=query.device) | ||||
|  | ||||
|     return output, softmax_lse | ||||
|     rng_state_ = torch.zeros( | ||||
|         (2,), dtype=torch.uint64, device=query.device | ||||
|     )  # hardcoded since dropout is hardcoded to 0 | ||||
|  | ||||
|     return output, softmax_lse, rng_state_, philox_offset | ||||
|  | ||||
|  | ||||
| # @_varlen_attn.register_fake | ||||
| @_varlen_attn.register_fake | ||||
| def _varlen_attn_fake( | ||||
|     query: torch.Tensor, | ||||
|     key: torch.Tensor, | ||||
| @ -99,7 +103,7 @@ def _varlen_attn_fake( | ||||
|     max_q: int, | ||||
|     max_k: int, | ||||
|     is_causal: bool = False, | ||||
| ) -> tuple[torch.Tensor, torch.Tensor]: | ||||
| ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: | ||||
|     """ | ||||
|     Fake implementation for meta tensor computation and tracing. | ||||
|  | ||||
| @ -110,14 +114,24 @@ def _varlen_attn_fake( | ||||
|     # Output has same shape as query | ||||
|     output = torch.empty_like(query) | ||||
|  | ||||
|     # For varlen path: logsumexp shape is (num_heads, total_q) | ||||
|     # For varlen path with cuDNN: logsumexp shape is (total_q, num_heads, 1) | ||||
|     total_q = query.size(0) | ||||
|     num_heads = query.size(1) | ||||
|     logsumexp = torch.empty( | ||||
|         (num_heads, total_q), dtype=torch.float, device=query.device | ||||
|     ) | ||||
|  | ||||
|     return output, logsumexp | ||||
|     use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index) | ||||
|     if use_cudnn: | ||||
|         logsumexp = torch.empty( | ||||
|             (total_q, num_heads, 1), dtype=torch.float, device=query.device | ||||
|         ) | ||||
|     else: | ||||
|         logsumexp = torch.empty( | ||||
|             (num_heads, total_q), dtype=torch.float, device=query.device | ||||
|         ) | ||||
|  | ||||
|     rng_state = torch.empty((2,), dtype=torch.uint64, device=query.device) | ||||
|     philox_offset = torch.zeros((), dtype=torch.int64, device=query.device) | ||||
|  | ||||
|     return output, logsumexp, rng_state, philox_offset | ||||
|  | ||||
|  | ||||
| def varlen_attn( | ||||
| @ -191,9 +205,142 @@ def varlen_attn( | ||||
|         ...     query, key, value, cu_seq, cu_seq, max_len, max_len, is_causal=False | ||||
|         ... ) | ||||
|     """ | ||||
|     out, lse = _varlen_attn( | ||||
|     out, lse, _, _ = torch.ops.torch_attn._varlen_attn( | ||||
|         query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, is_causal | ||||
|     ) | ||||
|     if return_aux is not None and return_aux.lse: | ||||
|         return out, lse | ||||
|     return out | ||||
|  | ||||
|  | ||||
| def _setup_context(ctx: Any, inputs: tuple[Any, ...], output: Any) -> None: | ||||
|     query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, is_causal = inputs | ||||
|     out, lse, rng_state, philox_offset = output | ||||
|  | ||||
|     ctx.save_for_backward(query, key, value, cu_seq_q, cu_seq_k, out, lse, rng_state, philox_offset) | ||||
|  | ||||
|     ctx.max_q = max_q | ||||
|     ctx.max_k = max_k | ||||
|     ctx.is_causal = is_causal | ||||
|  | ||||
|  | ||||
| @torch.library.custom_op("torch_attn::_varlen_attn_backward", mutates_args={}) | ||||
| def _varlen_attn_backward( | ||||
|     grad_out: torch.Tensor, | ||||
|     query: torch.Tensor, | ||||
|     key: torch.Tensor, | ||||
|     value: torch.Tensor, | ||||
|     out: torch.Tensor, | ||||
|     lse: torch.Tensor, | ||||
|     cu_seq_q: torch.Tensor, | ||||
|     cu_seq_k: torch.Tensor, | ||||
|     max_q: int, | ||||
|     max_k: int, | ||||
|     is_causal: bool, | ||||
|     rng_state: torch.Tensor, | ||||
|     philox_offset: torch.Tensor, | ||||
| ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: | ||||
|     unused = torch.empty(0, device=query.device) | ||||
|  | ||||
|     use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index) | ||||
|     if use_cudnn: | ||||
|         log.info("Using cuDNN backend for varlen_attn") | ||||
|  | ||||
|         head_dim = query.size(-1) | ||||
|         scale = 1.0 / (head_dim ** 0.5) | ||||
|  | ||||
|         dq, dk, dv = torch.ops.aten._cudnn_attention_backward( | ||||
|             grad_out=grad_out, | ||||
|             query=query, | ||||
|             key=key, | ||||
|             value=value, | ||||
|             out=out, | ||||
|             logsumexp=lse, | ||||
|             philox_seed=rng_state, | ||||
|             philox_offset=philox_offset, | ||||
|             attn_bias=None, | ||||
|             cum_seq_q=cu_seq_q, | ||||
|             cum_seq_k=cu_seq_k, | ||||
|             max_q=max_q, | ||||
|             max_k=max_k, | ||||
|             dropout_p=0.0, | ||||
|             is_causal=is_causal, | ||||
|             # passing in scale doesn't change the value of the gradients | ||||
|             # scale=scale | ||||
|         ) | ||||
|     else: | ||||
|         log.info("Using Flash Attention backend for varlen_attn") | ||||
|         dq, dk, dv = torch.ops.aten._flash_attention_backward( | ||||
|             grad_out, | ||||
|             query, | ||||
|             key, | ||||
|             value, | ||||
|             out, | ||||
|             lse, | ||||
|             cu_seq_q, | ||||
|             cu_seq_k, | ||||
|             max_q, | ||||
|             max_k, | ||||
|             0.0, | ||||
|             is_causal, | ||||
|             rng_state, | ||||
|             unused, | ||||
|         ) | ||||
|     return dq, dk, dv | ||||
|  | ||||
|  | ||||
| @_varlen_attn_backward.register_fake | ||||
| def _varlen_attn_backward_fake( | ||||
|     grad_out: torch.Tensor, | ||||
|     query: torch.Tensor, | ||||
|     key: torch.Tensor, | ||||
|     value: torch.Tensor, | ||||
|     out: torch.Tensor, | ||||
|     lse: torch.Tensor, | ||||
|     cu_seq_q: torch.Tensor, | ||||
|     cu_seq_k: torch.Tensor, | ||||
|     max_q: int, | ||||
|     max_k: int, | ||||
|     is_causal: bool, | ||||
|     rng_state: torch.Tensor, | ||||
|     philox_offset: torch.Tensor, | ||||
| ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: | ||||
|     """ | ||||
|     Fake implementation for meta tensor computation and tracing. | ||||
|     """ | ||||
|  | ||||
|     grad_query = torch.empty_like(query) | ||||
|     grad_key = torch.empty_like(key) | ||||
|     grad_value = torch.empty_like(value) | ||||
|  | ||||
|     return grad_query, grad_key, grad_value | ||||
|  | ||||
|  | ||||
| def _backward( | ||||
|     ctx: Any, grad_out: torch.Tensor, grad_lse: torch.Tensor, grad_rng: torch.Tensor, grad_philox_offset: torch.Tensor | ||||
| ) -> tuple[Optional[torch.Tensor], ...]: | ||||
|     query, key, value, cu_seq_q, cu_seq_k, out, lse, rng_state, philox_offset = ctx.saved_tensors | ||||
|  | ||||
|     max_q = ctx.max_q | ||||
|     max_k = ctx.max_k | ||||
|     is_causal = ctx.is_causal | ||||
|  | ||||
|     dq, dk, dv = torch.ops.torch_attn._varlen_attn_backward( | ||||
|         grad_out, | ||||
|         query, | ||||
|         key, | ||||
|         value, | ||||
|         out, | ||||
|         lse, | ||||
|         cu_seq_q, | ||||
|         cu_seq_k, | ||||
|         max_q, | ||||
|         max_k, | ||||
|         is_causal, | ||||
|         rng_state, | ||||
|         philox_offset | ||||
|     ) | ||||
|     return dq, dk, dv, None, None, None, None, None, None | ||||
|  | ||||
|  | ||||
| _varlen_attn.register_autograd(_backward, setup_context=_setup_context) | ||||
|  | ||||
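Pulling the pieces of this file together, here is a minimal usage sketch of the packed varlen API. It assumes a CUDA build with the flash/cuDNN kernels available; the import path `torch.nn.attention.varlen` and the `return_aux=AuxRequest(lse=True)` keyword follow the definitions above, while the shapes, dtype, and sequence lengths are purely illustrative.

```python
import torch
from torch.nn.attention.varlen import AuxRequest, varlen_attn

device, dtype = "cuda", torch.bfloat16
num_heads, head_dim = 8, 64

# Two sequences of different lengths packed into one (total_tokens, H, D) tensor.
seq_lens = torch.tensor([5, 3], device=device, dtype=torch.int32)
cu_seq = torch.zeros(seq_lens.numel() + 1, device=device, dtype=torch.int32)
cu_seq[1:] = torch.cumsum(seq_lens, dim=0)
total_q, max_len = int(cu_seq[-1]), int(seq_lens.max())

q = torch.randn(total_q, num_heads, head_dim, device=device, dtype=dtype, requires_grad=True)
k = torch.randn_like(q, requires_grad=True)
v = torch.randn_like(q, requires_grad=True)

# Forward: packed output plus the logsumexp requested via AuxRequest.
out, lse = varlen_attn(q, k, v, cu_seq, cu_seq, max_len, max_len,
                       is_causal=True, return_aux=AuxRequest(lse=True))

# Backward flows through the custom-op autograd registered at the end of this file.
out.sum().backward()
print(q.grad.shape)
```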
| @ -74,17 +74,6 @@ def export_compat( | ||||
|     if opset_version is None: | ||||
|         opset_version = onnx_constants.ONNX_DEFAULT_OPSET | ||||
|  | ||||
|     if isinstance(model, torch.nn.Module): | ||||
|         if model.training: | ||||
|             warnings.warn( | ||||
|                 "Exporting a model while it is in training mode. " | ||||
|                 "Please ensure that this is intended, as it may lead to " | ||||
|                 "different behavior during inference. " | ||||
|                 "Calling model.eval() before export is recommended.", | ||||
|                 UserWarning, | ||||
|                 stacklevel=2, | ||||
|             ) | ||||
|  | ||||
|     if isinstance(model, torch.export.ExportedProgram): | ||||
|         # We know the model is already exported program, so the args, kwargs, and dynamic_shapes | ||||
|         # are not used | ||||
|  | ||||
| @ -812,6 +812,7 @@ if torch.backends.mps.is_available(): | ||||
|             "__rmod__", | ||||
|             "__rsub__", | ||||
|             "__rpow__", | ||||
|             "bernoulli", | ||||
|             "clamp_max", | ||||
|             "clamp_min", | ||||
|             "masked_scatter", | ||||
|  | ||||
| @ -950,6 +950,13 @@ def prof_meth_call(*args, **kwargs): | ||||
| torch._C.ScriptFunction.__call__ = prof_func_call  # type: ignore[method-assign] | ||||
| torch._C.ScriptMethod.__call__ = prof_meth_call  # type: ignore[method-assign] | ||||
|  | ||||
| def _get_test_report_path(): | ||||
|     # allow users to override the test file location. We need this | ||||
|     # because the distributed tests run the same test file multiple | ||||
|     # times with different configurations. | ||||
|     override = os.environ.get('TEST_REPORT_SOURCE_OVERRIDE') | ||||
|     test_source = override if override is not None else 'python-unittest' | ||||
|     return os.path.join('test-reports', test_source) | ||||
|  | ||||
| def parse_cmd_line_args(): | ||||
|     global CI_FUNCTORCH_ROOT | ||||
| @ -980,7 +987,9 @@ def parse_cmd_line_args(): | ||||
|     parser.add_argument('--repeat', type=int, default=1) | ||||
|     parser.add_argument('--test-bailouts', '--test_bailouts', action='store_true') | ||||
|     parser.add_argument('--use-pytest', action='store_true') | ||||
|     parser.add_argument('--save-xml', type=str) | ||||
|     parser.add_argument('--save-xml', nargs='?', type=str, | ||||
|                         const=_get_test_report_path(), | ||||
|                         default=_get_test_report_path() if IS_CI else None) | ||||
|     parser.add_argument('--discover-tests', action='store_true') | ||||
|     parser.add_argument('--log-suffix', type=str, default="") | ||||
|     parser.add_argument('--run-parallel', type=int, default=1) | ||||
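The `--save-xml` change above leans on argparse's `nargs='?'` semantics, where `default` covers the flag being absent and `const` covers the flag being passed with no value. A standalone sketch (the path strings here are placeholders):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--save-xml', nargs='?', type=str,
                    const='test-reports/python-unittest',  # flag given with no value
                    default=None)                          # flag omitted entirely

print(parser.parse_args([]).save_xml)                        # None
print(parser.parse_args(['--save-xml']).save_xml)            # test-reports/python-unittest
print(parser.parse_args(['--save-xml', 'custom']).save_xml)  # custom
```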
| @ -1010,9 +1019,6 @@ def parse_cmd_line_args(): | ||||
|         # infer flags based on the default settings | ||||
|         GRAPH_EXECUTOR = cppProfilingFlagsToProfilingMode() | ||||
|  | ||||
|     if args.save_xml is None and IS_CI: | ||||
|         args.xml_dir = get_report_dir(sys.argv[0], args.log_suffix, args.use_pytest) | ||||
|  | ||||
|     RERUN_DISABLED_TESTS = args.rerun_disabled_tests | ||||
|  | ||||
|     SLOW_TESTS_FILE = args.import_slow_tests | ||||
| @ -1185,37 +1191,19 @@ def lint_test_case_extension(suite): | ||||
|     return succeed | ||||
|  | ||||
|  | ||||
| def get_report_dir(test_name: str, log_suffix: Optional[str], is_pytest: bool) -> str: | ||||
|     """Generates a test report directory path. Test name does not need to be | ||||
|     sanitized.""" | ||||
|     # total path = test-reports/test_source+log_suffix/test_filename | ||||
|     # Base path | ||||
|     test_source = "python-unittest" | ||||
|     if is_pytest: | ||||
|         test_source = "python-pytest" | ||||
|     # allow users to override the test file location. We need this | ||||
|     # because the distributed tests run the same test file multiple | ||||
|     # times with different configurations. | ||||
|     override = os.environ.get('TEST_REPORT_SOURCE_OVERRIDE') | ||||
|     if override is not None: | ||||
|         test_source = override | ||||
|  | ||||
|     # Add log suffix to if provided | ||||
|     if log_suffix and log_suffix != "": | ||||
|         test_source = test_source + log_suffix | ||||
|  | ||||
|     test_report_dir = os.path.join('test-reports', test_source) | ||||
|  | ||||
|     # Add test file name to path | ||||
|     test_filename = sanitize_test_filename(test_name) | ||||
|     test_report_dir = os.path.join(test_report_dir, test_filename) | ||||
|  | ||||
|     os.makedirs(test_report_dir, exist_ok=True) | ||||
|     return test_report_dir | ||||
|  | ||||
|  | ||||
| def get_report_path(report_dir: str, test_filename: str) -> str: | ||||
|     return os.path.join(report_dir, f"{sanitize_test_filename(test_filename)}-{os.urandom(8).hex()}.xml") | ||||
| def get_report_path(argv=None, pytest=False): | ||||
|     if argv is None: | ||||
|         argv = UNITTEST_ARGS | ||||
|     test_filename = sanitize_test_filename(argv[0]) | ||||
|     test_report_path = TEST_SAVE_XML + LOG_SUFFIX | ||||
|     test_report_path = os.path.join(test_report_path, test_filename) | ||||
|     if pytest: | ||||
|         test_report_path = test_report_path.replace('python-unittest', 'python-pytest') | ||||
|         os.makedirs(test_report_path, exist_ok=True) | ||||
|         test_report_path = os.path.join(test_report_path, f"{test_filename}-{os.urandom(8).hex()}.xml") | ||||
|         return test_report_path | ||||
|     os.makedirs(test_report_path, exist_ok=True) | ||||
|     return test_report_path | ||||
|  | ||||
|  | ||||
| def sanitize_pytest_xml(xml_file: str): | ||||
| @ -1358,7 +1346,7 @@ def run_tests(argv=None): | ||||
|         pytest_args = argv + ["--use-main-module"] | ||||
|         test_report_path = "" | ||||
|         if TEST_SAVE_XML: | ||||
|             test_report_path = get_report_path(TEST_SAVE_XML, argv[0]) | ||||
|             test_report_path = get_report_path(pytest=True) | ||||
|             print(f'Test results will be stored in {test_report_path}') | ||||
|             pytest_args.append(f'--junit-xml-reruns={test_report_path}') | ||||
|         if PYTEST_SINGLE_TEST: | ||||
| @ -1402,7 +1390,7 @@ def run_tests(argv=None): | ||||
|             def printErrors(self) -> None: | ||||
|                 super().printErrors() | ||||
|                 self.printErrorList("XPASS", self.unexpectedSuccesses) | ||||
|         test_report_path = get_report_path(TEST_SAVE_XML, argv[0]) | ||||
|         test_report_path = get_report_path() | ||||
|         verbose = '--verbose' in argv or '-v' in argv | ||||
|         if verbose: | ||||
|             print(f'Test results will be stored in {test_report_path}') | ||||
|  | ||||