Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-31 04:04:57 +08:00)

Compare commits: ciflow/ind...revert-cpp (84 commits)
| SHA1 |
|---|
| 2eacbe792a |
| 8110ce02a2 |
| 43c30f607e |
| 5ebf74a655 |
| acd936cc1a |
| a4a0378e6b |
| ac841267a1 |
| 0eacd934bc |
| 5016e7b2eb |
| 544b443ea1 |
| 3041ede082 |
| 34d6ef7022 |
| 110efe4df4 |
| e137cd0a10 |
| be28329710 |
| 85a7c745aa |
| 32fe4f681e |
| ebb2b2e894 |
| 13413b3b07 |
| 5d0b3e28dc |
| 9139368b64 |
| 02095cc09d |
| 65868156c6 |
| f93ea7dab1 |
| a77f5d9a00 |
| ff46d5a79b |
| f452edd782 |
| ea698e8bfc |
| 7f7a28046b |
| d8283a317a |
| e0ca3049c0 |
| 8417981c96 |
| 06e71c8558 |
| a76b59cc45 |
| 74336f8c77 |
| 236ce736a1 |
| 17bdb232e1 |
| add37bacda |
| 1425b40f29 |
| 8af9ed0824 |
| 7045aab143 |
| 7ae8aaf4c0 |
| f2450798cd |
| 46d17e8871 |
| dc011d3203 |
| e95920e3e6 |
| 5e769ff867 |
| 0ae3e30621 |
| 47f50cfd45 |
| a51f877287 |
| b44423bbb4 |
| 8e1e4ee8e0 |
| 1e836bc769 |
| 9a91486e45 |
| 92381a5aa7 |
| 2a5f87decf |
| 840d63c12d |
| 2ce894bb1d |
| 47ec1e9990 |
| 904abfc2ca |
| 7d16fcf2df |
| 483845a9c4 |
| 60bcb4ee88 |
| ee7434be82 |
| d049ed2cb1 |
| 9901d44418 |
| 6096c0fc74 |
| f6951cb8ea |
| 8887a33ede |
| 36a48e7e6d |
| c6a02eae5b |
| 6ecd6b23b6 |
| 3f69b4d9b4 |
| a04edcb27a |
| eb2bad5bb5 |
| a076b4d7ac |
| a988510c33 |
| 99e07c39ec |
| 610c09f8f4 |
| 61bad3c1ea |
| f89a7e9fe8 |
| f2c81635c8 |
| e214af6ae8 |
| 7ce723d21c |
| @ -150,7 +150,7 @@ function install_130 { | ||||
|   CUDNN_VERSION=9.13.0.50 | ||||
|   echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" | ||||
|   # install CUDA 13.0 in the same container | ||||
|   install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux | ||||
|   install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux | ||||
|  | ||||
|   # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement | ||||
|   install_cudnn 13 $CUDNN_VERSION | ||||
|  | ||||
| @ -1,3 +1,8 @@ | ||||
| --- | ||||
| name: docstring | ||||
| description: Write docstrings for PyTorch functions and methods following PyTorch conventions. Use when writing or updating docstrings in PyTorch code. | ||||
| --- | ||||
| 
 | ||||
| # PyTorch Docstring Writing Guide | ||||
| 
 | ||||
| This skill describes how to write docstrings for functions and methods in the PyTorch project, following the conventions in `torch/_tensor_docs.py` and `torch/nn/functional.py`. | ||||
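For reference, here is a minimal sketch of the docstring layout that guide refers to (signature line, Args, Keyword args, and an `Example::` block). The function below is hypothetical and exists only to show the structure:

```python
import torch


# Hypothetical helper, used only to illustrate the docstring layout
# (signature line, Args, Keyword args, Example::) found in torch/_tensor_docs.py.
def scaled_add(input, other, *, alpha=1):
    r"""
    scaled_add(input, other, *, alpha=1) -> Tensor

    Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`.

    Args:
        input (Tensor): the input tensor.
        other (Tensor or Number): the tensor or number to add to :attr:`input`.

    Keyword args:
        alpha (Number): the multiplier for :attr:`other`. Default: ``1``.

    Example::

        >>> scaled_add(torch.ones(2), torch.ones(2), alpha=2).tolist()
        [3.0, 3.0]
    """
    return input + alpha * other
```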
							
								
								
									
File: .claude/skills/skill-writer/SKILL.md (new file, 385 lines)
							| @ -0,0 +1,385 @@ | ||||
| --- | ||||
| name: skill-writer | ||||
| description: Guide users through creating Agent Skills for Claude Code. Use when the user wants to create, write, author, or design a new Skill, or needs help with SKILL.md files, frontmatter, or skill structure. | ||||
| --- | ||||
|  | ||||
| # Skill Writer | ||||
|  | ||||
| This Skill helps you create well-structured Agent Skills for Claude Code that follow best practices and validation requirements. | ||||
|  | ||||
| ## When to use this Skill | ||||
|  | ||||
| Use this Skill when: | ||||
| - Creating a new Agent Skill | ||||
| - Writing or updating SKILL.md files | ||||
| - Designing skill structure and frontmatter | ||||
| - Troubleshooting skill discovery issues | ||||
| - Converting existing prompts or workflows into Skills | ||||
|  | ||||
| ## Instructions | ||||
|  | ||||
| ### Step 1: Determine Skill scope | ||||
|  | ||||
| First, understand what the Skill should do: | ||||
|  | ||||
| 1. **Ask clarifying questions**: | ||||
|    - What specific capability should this Skill provide? | ||||
|    - When should Claude use this Skill? | ||||
|    - What tools or resources does it need? | ||||
|    - Is this for personal use or team sharing? | ||||
|  | ||||
| 2. **Keep it focused**: One Skill = one capability | ||||
|    - Good: "PDF form filling", "Excel data analysis" | ||||
|    - Too broad: "Document processing", "Data tools" | ||||
|  | ||||
| ### Step 2: Choose Skill location | ||||
|  | ||||
| Determine where to create the Skill: | ||||
|  | ||||
| **Personal Skills** (`~/.claude/skills/`): | ||||
| - Individual workflows and preferences | ||||
| - Experimental Skills | ||||
| - Personal productivity tools | ||||
|  | ||||
| **Project Skills** (`.claude/skills/`): | ||||
| - Team workflows and conventions | ||||
| - Project-specific expertise | ||||
| - Shared utilities (committed to git) | ||||
|  | ||||
| ### Step 3: Create Skill structure | ||||
|  | ||||
| Create the directory and files: | ||||
|  | ||||
| ```bash | ||||
| # Personal | ||||
| mkdir -p ~/.claude/skills/skill-name | ||||
|  | ||||
| # Project | ||||
| mkdir -p .claude/skills/skill-name | ||||
| ``` | ||||
|  | ||||
| For multi-file Skills: | ||||
| ``` | ||||
| skill-name/ | ||||
| ├── SKILL.md (required) | ||||
| ├── reference.md (optional) | ||||
| ├── examples.md (optional) | ||||
| ├── scripts/ | ||||
| │   └── helper.py (optional) | ||||
| └── templates/ | ||||
|     └── template.txt (optional) | ||||
| ``` | ||||
|  | ||||
| ### Step 4: Write SKILL.md frontmatter | ||||
|  | ||||
| Create YAML frontmatter with required fields: | ||||
|  | ||||
| ```yaml | ||||
| --- | ||||
| name: skill-name | ||||
| description: Brief description of what this does and when to use it | ||||
| --- | ||||
| ``` | ||||
|  | ||||
| **Field requirements**: | ||||
|  | ||||
| - **name**: | ||||
|   - Lowercase letters, numbers, hyphens only | ||||
|   - Max 64 characters | ||||
|   - Must match directory name | ||||
|   - Good: `pdf-processor`, `git-commit-helper` | ||||
|   - Bad: `PDF_Processor`, `Git Commits!` | ||||
|  | ||||
| - **description**: | ||||
|   - Max 1024 characters | ||||
|   - Include BOTH what it does AND when to use it | ||||
|   - Use specific trigger words users would say | ||||
|   - Mention file types, operations, and context | ||||
|  | ||||
| **Optional frontmatter fields**: | ||||
|  | ||||
| - **allowed-tools**: Restrict tool access (comma-separated list) | ||||
|   ```yaml | ||||
|   allowed-tools: Read, Grep, Glob | ||||
|   ``` | ||||
|   Use for: | ||||
|   - Read-only Skills | ||||
|   - Security-sensitive workflows | ||||
|   - Limited-scope operations | ||||
|  | ||||
| ### Step 5: Write effective descriptions | ||||
|  | ||||
| The description is critical for Claude to discover your Skill. | ||||
|  | ||||
| **Formula**: `[What it does] + [When to use it] + [Key triggers]` | ||||
|  | ||||
| **Examples**: | ||||
|  | ||||
| ✅ **Good**: | ||||
| ```yaml | ||||
| description: Extract text and tables from PDF files, fill forms, merge documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction. | ||||
| ``` | ||||
|  | ||||
| ✅ **Good**: | ||||
| ```yaml | ||||
| description: Analyze Excel spreadsheets, create pivot tables, and generate charts. Use when working with Excel files, spreadsheets, or analyzing tabular data in .xlsx format. | ||||
| ``` | ||||
|  | ||||
| ❌ **Too vague**: | ||||
| ```yaml | ||||
| description: Helps with documents | ||||
| description: For data analysis | ||||
| ``` | ||||
|  | ||||
| **Tips**: | ||||
| - Include specific file extensions (.pdf, .xlsx, .json) | ||||
| - Mention common user phrases ("analyze", "extract", "generate") | ||||
| - List concrete operations (not generic verbs) | ||||
| - Add context clues ("Use when...", "For...") | ||||
|  | ||||
| ### Step 6: Structure the Skill content | ||||
|  | ||||
| Use clear Markdown sections: | ||||
|  | ||||
| ```markdown | ||||
| # Skill Name | ||||
|  | ||||
| Brief overview of what this Skill does. | ||||
|  | ||||
| ## Quick start | ||||
|  | ||||
| Provide a simple example to get started immediately. | ||||
|  | ||||
| ## Instructions | ||||
|  | ||||
| Step-by-step guidance for Claude: | ||||
| 1. First step with clear action | ||||
| 2. Second step with expected outcome | ||||
| 3. Handle edge cases | ||||
|  | ||||
| ## Examples | ||||
|  | ||||
| Show concrete usage examples with code or commands. | ||||
|  | ||||
| ## Best practices | ||||
|  | ||||
| - Key conventions to follow | ||||
| - Common pitfalls to avoid | ||||
| - When to use vs. not use | ||||
|  | ||||
| ## Requirements | ||||
|  | ||||
| List any dependencies or prerequisites: | ||||
| ```bash | ||||
| pip install package-name | ||||
| ``` | ||||
|  | ||||
| ## Advanced usage | ||||
|  | ||||
| For complex scenarios, see [reference.md](reference.md). | ||||
| ``` | ||||
|  | ||||
| ### Step 7: Add supporting files (optional) | ||||
|  | ||||
| Create additional files for progressive disclosure: | ||||
|  | ||||
| **reference.md**: Detailed API docs, advanced options | ||||
| **examples.md**: Extended examples and use cases | ||||
| **scripts/**: Helper scripts and utilities | ||||
| **templates/**: File templates or boilerplate | ||||
|  | ||||
| Reference them from SKILL.md: | ||||
| ```markdown | ||||
| For advanced usage, see [reference.md](reference.md). | ||||
|  | ||||
| Run the helper script: | ||||
| \`\`\`bash | ||||
| python scripts/helper.py input.txt | ||||
| \`\`\` | ||||
| ``` | ||||
|  | ||||
| ### Step 8: Validate the Skill | ||||
|  | ||||
| Check these requirements: | ||||
|  | ||||
| ✅ **File structure**: | ||||
| - [ ] SKILL.md exists in correct location | ||||
| - [ ] Directory name matches frontmatter `name` | ||||
|  | ||||
| ✅ **YAML frontmatter**: | ||||
| - [ ] Opening `---` on line 1 | ||||
| - [ ] Closing `---` before content | ||||
| - [ ] Valid YAML (no tabs, correct indentation) | ||||
| - [ ] `name` follows naming rules | ||||
| - [ ] `description` is specific and < 1024 chars | ||||
|  | ||||
| ✅ **Content quality**: | ||||
| - [ ] Clear instructions for Claude | ||||
| - [ ] Concrete examples provided | ||||
| - [ ] Edge cases handled | ||||
| - [ ] Dependencies listed (if any) | ||||
|  | ||||
| ✅ **Testing**: | ||||
| - [ ] Description matches user questions | ||||
| - [ ] Skill activates on relevant queries | ||||
| - [ ] Instructions are clear and actionable | ||||
|  | ||||
| ### Step 9: Test the Skill | ||||
|  | ||||
| 1. **Restart Claude Code** (if running) to load the Skill | ||||
|  | ||||
| 2. **Ask relevant questions** that match the description: | ||||
|    ``` | ||||
|    Can you help me extract text from this PDF? | ||||
|    ``` | ||||
|  | ||||
| 3. **Verify activation**: Claude should use the Skill automatically | ||||
|  | ||||
| 4. **Check behavior**: Confirm Claude follows the instructions correctly | ||||
|  | ||||
| ### Step 10: Debug if needed | ||||
|  | ||||
| If Claude doesn't use the Skill: | ||||
|  | ||||
| 1. **Make description more specific**: | ||||
|    - Add trigger words | ||||
|    - Include file types | ||||
|    - Mention common user phrases | ||||
|  | ||||
| 2. **Check file location**: | ||||
|    ```bash | ||||
|    ls ~/.claude/skills/skill-name/SKILL.md | ||||
|    ls .claude/skills/skill-name/SKILL.md | ||||
|    ``` | ||||
|  | ||||
| 3. **Validate YAML**: | ||||
|    ```bash | ||||
|    cat SKILL.md | head -n 10 | ||||
|    ``` | ||||
|  | ||||
| 4. **Run debug mode**: | ||||
|    ```bash | ||||
|    claude --debug | ||||
|    ``` | ||||
|  | ||||
| ## Common patterns | ||||
|  | ||||
| ### Read-only Skill | ||||
|  | ||||
| ```yaml | ||||
| --- | ||||
| name: code-reader | ||||
| description: Read and analyze code without making changes. Use for code review, understanding codebases, or documentation. | ||||
| allowed-tools: Read, Grep, Glob | ||||
| --- | ||||
| ``` | ||||
|  | ||||
| ### Script-based Skill | ||||
|  | ||||
| ```yaml | ||||
| --- | ||||
| name: data-processor | ||||
| description: Process CSV and JSON data files with Python scripts. Use when analyzing data files or transforming datasets. | ||||
| --- | ||||
|  | ||||
| # Data Processor | ||||
|  | ||||
| ## Instructions | ||||
|  | ||||
| 1. Use the processing script: | ||||
| \`\`\`bash | ||||
| python scripts/process.py input.csv --output results.json | ||||
| \`\`\` | ||||
|  | ||||
| 2. Validate output with: | ||||
| \`\`\`bash | ||||
| python scripts/validate.py results.json | ||||
| \`\`\` | ||||
| ``` | ||||
|  | ||||
| ### Multi-file Skill with progressive disclosure | ||||
|  | ||||
| ```yaml | ||||
| --- | ||||
| name: api-designer | ||||
| description: Design REST APIs following best practices. Use when creating API endpoints, designing routes, or planning API architecture. | ||||
| --- | ||||
|  | ||||
| # API Designer | ||||
|  | ||||
| Quick start: See [examples.md](examples.md) | ||||
|  | ||||
| Detailed reference: See [reference.md](reference.md) | ||||
|  | ||||
| ## Instructions | ||||
|  | ||||
| 1. Gather requirements | ||||
| 2. Design endpoints (see examples.md) | ||||
| 3. Document with OpenAPI spec | ||||
| 4. Review against best practices (see reference.md) | ||||
| ``` | ||||
|  | ||||
| ## Best practices for Skill authors | ||||
|  | ||||
| 1. **One Skill, one purpose**: Don't create mega-Skills | ||||
| 2. **Specific descriptions**: Include trigger words users will say | ||||
| 3. **Clear instructions**: Write for Claude, not humans | ||||
| 4. **Concrete examples**: Show real code, not pseudocode | ||||
| 5. **List dependencies**: Mention required packages in description | ||||
| 6. **Test with teammates**: Verify activation and clarity | ||||
| 7. **Version your Skills**: Document changes in content | ||||
| 8. **Use progressive disclosure**: Put advanced details in separate files | ||||
|  | ||||
| ## Validation checklist | ||||
|  | ||||
| Before finalizing a Skill, verify: | ||||
|  | ||||
| - [ ] Name is lowercase, hyphens only, max 64 chars | ||||
| - [ ] Description is specific and < 1024 chars | ||||
| - [ ] Description includes "what" and "when" | ||||
| - [ ] YAML frontmatter is valid | ||||
| - [ ] Instructions are step-by-step | ||||
| - [ ] Examples are concrete and realistic | ||||
| - [ ] Dependencies are documented | ||||
| - [ ] File paths use forward slashes | ||||
| - [ ] Skill activates on relevant queries | ||||
| - [ ] Claude follows instructions correctly | ||||
|  | ||||
| ## Troubleshooting | ||||
|  | ||||
| **Skill doesn't activate**: | ||||
| - Make description more specific with trigger words | ||||
| - Include file types and operations in description | ||||
| - Add "Use when..." clause with user phrases | ||||
|  | ||||
| **Multiple Skills conflict**: | ||||
| - Make descriptions more distinct | ||||
| - Use different trigger words | ||||
| - Narrow the scope of each Skill | ||||
|  | ||||
| **Skill has errors**: | ||||
| - Check YAML syntax (no tabs, proper indentation) | ||||
| - Verify file paths (use forward slashes) | ||||
| - Ensure scripts have execute permissions | ||||
| - List all dependencies | ||||
|  | ||||
| ## Examples | ||||
|  | ||||
| See the documentation for complete examples: | ||||
| - Simple single-file Skill (commit-helper) | ||||
| - Skill with tool permissions (code-reviewer) | ||||
| - Multi-file Skill (pdf-processing) | ||||
|  | ||||
| ## Output format | ||||
|  | ||||
| When creating a Skill, I will: | ||||
|  | ||||
| 1. Ask clarifying questions about scope and requirements | ||||
| 2. Suggest a Skill name and location | ||||
| 3. Create the SKILL.md file with proper frontmatter | ||||
| 4. Include clear instructions and examples | ||||
| 5. Add supporting files if needed | ||||
| 6. Provide testing instructions | ||||
| 7. Validate against all requirements | ||||
|  | ||||
| The result will be a complete, working Skill that follows all best practices and validation rules. | ||||
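To make the naming and frontmatter rules above concrete, the following is an illustrative validator sketch (not part of Claude Code; the helper and its messages are invented) that checks a SKILL.md against the constraints from Step 8 and the validation checklist:

```python
import re
import sys
from pathlib import Path


def validate_skill(skill_md: Path) -> list[str]:
    """Check a SKILL.md against the frontmatter rules described above."""
    errors = []
    text = skill_md.read_text()
    if not text.startswith("---\n"):
        return ["frontmatter must open with --- on line 1"]
    parts = text.split("---\n")
    if len(parts) < 3:
        return ["frontmatter must be closed with --- before the content"]
    # Naive key: value parsing; enough for the two required fields.
    fields = dict(
        line.split(":", 1) for line in parts[1].splitlines() if ":" in line
    )
    name = fields.get("name", "").strip()
    description = fields.get("description", "").strip()
    if not re.fullmatch(r"[a-z0-9]+(-[a-z0-9]+)*", name) or len(name) > 64:
        errors.append(f"name must be lowercase letters/numbers/hyphens, max 64 chars: {name!r}")
    if name != skill_md.parent.name:
        errors.append("name must match the skill directory name")
    if not description:
        errors.append("description is required")
    elif len(description) > 1024:
        errors.append("description must be under 1024 characters")
    return errors


if __name__ == "__main__":
    problems = validate_skill(Path(sys.argv[1]))
    print("OK" if not problems else "\n".join(problems))
```

For example, `python validate_skill.py .claude/skills/skill-writer/SKILL.md` prints `OK` when the frontmatter satisfies these rules.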
							
								
								
									
File: .github/scripts/generate_binary_build_matrix.py (22 changed lines, vendored)
							| @ -22,7 +22,7 @@ CUDA_ARCHES_FULL_VERSION = { | ||||
|     "12.6": "12.6.3", | ||||
|     "12.8": "12.8.1", | ||||
|     "12.9": "12.9.1", | ||||
|     "13.0": "13.0.0", | ||||
|     "13.0": "13.0.2", | ||||
| } | ||||
| CUDA_ARCHES_CUDNN_VERSION = { | ||||
|     "12.6": "9", | ||||
| @ -96,21 +96,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { | ||||
|         "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'" | ||||
|     ), | ||||
|     "13.0": ( | ||||
|         "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | " | ||||
|         "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | " | ||||
|         "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | " | ||||
|         "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | " | ||||
|         "nvidia-cublas==13.1.0.3; platform_system == 'Linux' | " | ||||
|         "nvidia-cufft==12.0.0.61; platform_system == 'Linux' | " | ||||
|         "nvidia-curand==10.4.0.35; platform_system == 'Linux' | " | ||||
|         "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | " | ||||
|         "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | " | ||||
|         "nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | " | ||||
|         "nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | " | ||||
|         "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | " | ||||
|         "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | " | ||||
|         "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | " | ||||
|         "nvidia-nvtx==13.0.39; platform_system == 'Linux' | " | ||||
|         "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | " | ||||
|         "nvidia-cufile==1.15.0.42; platform_system == 'Linux'" | ||||
|         "nvidia-nvtx==13.0.85; platform_system == 'Linux' | " | ||||
|         "nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | " | ||||
|         "nvidia-cufile==1.15.1.6; platform_system == 'Linux'" | ||||
|     ), | ||||
|     "xpu": ( | ||||
|         "intel-cmplr-lib-rt==2025.2.1 | " | ||||
|  | ||||
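As a rough sketch of what this version table feeds (the helper below is hypothetical; only the pinned versions are taken from the diff above), per-CUDA pins can be rendered into the single `|`-separated string that appears as `PYTORCH_EXTRA_INSTALL_REQUIREMENTS` in the generated workflows that follow:

```python
# Hypothetical sketch: render a per-CUDA pin table into the '|'-separated
# requirements string used by the generated nightly workflows. The helper and
# table shape are illustrative; only the pinned versions come from the diff above.
CUDA_PINS = {
    "13.0": [
        ("nvidia-cuda-nvrtc", "13.0.88"),
        ("nvidia-cuda-runtime", "13.0.96"),
        ("nvidia-cudnn-cu13", "9.13.0.50"),
        ("nvidia-nccl-cu13", "2.27.7"),
    ],
}


def extra_install_requirements(cuda_version: str) -> str:
    return " | ".join(
        f"{pkg}=={ver}; platform_system == 'Linux'"
        for pkg, ver in CUDA_PINS[cuda_version]
    )


if __name__ == "__main__":
    # Prints: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | ...
    print(extra_install_requirements("13.0"))
```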
							
								
								
									
File: .github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (14 changed lines, generated, vendored)
							| @ -270,7 +270,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_10-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -519,7 +519,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_11-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -768,7 +768,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_12-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1017,7 +1017,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_13-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1266,7 +1266,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_13t-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1515,7 +1515,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_14-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1764,7 +1764,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_14t-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|  | ||||
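Assuming the `|`-separated value is split back into standalone PEP 508 specifiers by the downstream packaging step (an assumption, not something shown in this diff), each pin can be parsed and its platform marker evaluated with the `packaging` library:

```python
# Sketch under an assumption: PYTORCH_EXTRA_INSTALL_REQUIREMENTS is split on '|'
# into standalone PEP 508 requirement specifiers somewhere downstream. This only
# demonstrates parsing two of the pins from the workflow above.
from packaging.requirements import Requirement

raw = (
    "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
    "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux'"
)

for spec in raw.split(" | "):
    req = Requirement(spec)
    applies = req.marker.evaluate() if req.marker else True
    print(f"{req.name}{req.specifier}  (applies on this platform: {applies})")
```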
							
								
								
									
File: .github/workflows/generated-linux-binary-manywheel-nightly.yml (14 changed lines, generated, vendored)
							| @ -325,7 +325,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_10-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_10-cuda13_0-test:  # Testing | ||||
| @ -991,7 +991,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_11-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_11-cuda13_0-test:  # Testing | ||||
| @ -1657,7 +1657,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_12-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_12-cuda13_0-test:  # Testing | ||||
| @ -2323,7 +2323,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_13-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_13-cuda13_0-test:  # Testing | ||||
| @ -2989,7 +2989,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_13t-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_13t-cuda13_0-test:  # Testing | ||||
| @ -3655,7 +3655,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_14-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_14-cuda13_0-test:  # Testing | ||||
| @ -4321,7 +4321,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_14t-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_14t-cuda13_0-test:  # Testing | ||||
|  | ||||
| @ -31,9 +31,9 @@ Be careful when running untrusted models. This classification includes models cr | ||||
|  | ||||
| **Prefer to execute untrusted models within a secure, isolated environment such as a sandbox** (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. You can find further details and instructions in [this page](https://developers.google.com/code-sandboxing). | ||||
|  | ||||
| **Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) with `weights_only=True` is also secure to our knowledge even though it offers significantly larger surface of attack. Loading un-trusted checkpoint with `weights_only=False` MUST never be done. | ||||
|  | ||||
| **Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger surface of attack but is more flexible in what it can serialize. See the documentation for more details. | ||||
|  | ||||
| Even for more secure serialization formats, unexpected inputs to the downstream system can cause diverse security threats (e.g. denial of service, out of bound reads/writes) and thus we recommend extensive validation of any untrusted inputs. | ||||
|  | ||||
| Important Note: The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance. | ||||
|  | ||||
|  | ||||
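As a minimal illustration of this guidance (file paths are placeholders and `safetensors` is an optional dependency), prefer safetensors or `weights_only=True` when loading checkpoints you did not produce:

```python
# Minimal illustration of the loading guidance above; file paths are placeholders
# and the `safetensors` package is an optional dependency.
import torch
from safetensors.torch import load_file

# Safest option: safetensors stores raw tensors only, with no pickled Python objects.
state_dict = load_file("model.safetensors")

# Also acceptable for plain state dicts: weights_only=True makes torch.load refuse
# to unpickle arbitrary objects from the checkpoint.
state_dict = torch.load("model.ckpt", map_location="cpu", weights_only=True)

# MUST NOT be done with untrusted checkpoints: full unpickling can run arbitrary code.
# state_dict = torch.load("model.ckpt", weights_only=False)
```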
| @ -38,7 +38,7 @@ set_bool(AT_HIPSPARSELT_ENABLED CAFFE2_USE_HIPSPARSELT) | ||||
|  | ||||
| configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h") | ||||
| # TODO: Do not generate CUDAConfig.h for ROCm BUILDS | ||||
| # At the moment, `jit_macors.h` include CUDAConfig.h for both CUDA and HIP builds | ||||
| # At the moment, `jit_macros.h` include CUDAConfig.h for both CUDA and HIP builds | ||||
| if(USE_CUDA OR USE_ROCM) | ||||
|   configure_file(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h") | ||||
| endif() | ||||
|  | ||||
| @ -122,7 +122,7 @@ void FunctionalTensorWrapper::freeze_storage() const { | ||||
| //          |   have their own storages, but backends like functorch      | | ||||
| //         \/   are allowed to re-alias underneath the pass               \/ | ||||
| // . - - - - - - - - - - - - - .                             . - - - - - - - - - - - - - - - . | ||||
| // |    underyling_storage     |                             |      underyling_storage       | | ||||
| // |    underlying_storage     |                             |      underlying_storage       | | ||||
| // . - - - - - - - - - - - - - .                             . - - - - - - - - - - - - - - - . | ||||
| // | ||||
| // This constructor is only used by view ops. | ||||
|  | ||||
| @ -1534,7 +1534,7 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { | ||||
|  | ||||
|   // XLA and lazy tensors don't have storage, so they don't have an underlying data pointer. | ||||
|   // Nothing beyond this point is important for meta functions, so it's fine to exit early here. | ||||
|   // Extend the condition to MAIA tesnors as MAIA tensors also don't have storage. | ||||
|   // Extend the condition to MAIA tensors as MAIA tensors also don't have storage. | ||||
|   if (privateuse1_without_storage  || | ||||
|       common_device_.type() == DeviceType::XLA  || | ||||
|       common_device_.type() == DeviceType::IPU  || | ||||
|  | ||||
| @ -94,11 +94,11 @@ struct PinnedReserveSegment { | ||||
| struct TORCH_API HostStats { | ||||
|   // COUNT: total allocations (active) | ||||
|   Stat active_requests; | ||||
|   // SUM: bytes allocated/reserved by this memory alocator. (active) | ||||
|   // SUM: bytes allocated/reserved by this memory allocator. (active) | ||||
|   Stat active_bytes; | ||||
|   // COUNT: total allocations (active + free) | ||||
|   Stat allocations; | ||||
|   // SUM: bytes allocated/reserved by this memory alocator. This accounts | ||||
|   // SUM: bytes allocated/reserved by this memory allocator. This accounts | ||||
|   // for both free and in-use blocks. | ||||
|   Stat allocated_bytes; | ||||
|  | ||||
| @ -127,7 +127,7 @@ struct alignas(hardware_destructive_interference_size) HostStatsStaged { | ||||
|   // COUNT: total allocations (active + free) | ||||
|   // LOCK: access to this stat is protected by the allocator's blocks_mutex_ | ||||
|   Stat allocations; | ||||
|   // SUM: bytes allocated/reserved by this memory alocator. This accounts | ||||
|   // SUM: bytes allocated/reserved by this memory allocator. This accounts | ||||
|   // for both free and in-use blocks. | ||||
|   Stat allocated_bytes; | ||||
|   // COUNT: number of allocations per bucket (active) | ||||
| @ -455,7 +455,7 @@ struct CachingHostAllocatorImpl { | ||||
|   } | ||||
|  | ||||
|   void resetAccumulatedStats() { | ||||
|     // Reseting accumulated memory stats requires concurrently holding both the | ||||
|     // Resetting accumulated memory stats requires concurrently holding both the | ||||
|     // free list mutexes and the blocks mutex. Previously, this was only done in | ||||
|     // empty_cache function. | ||||
|     for (size_t i = 0; i < free_list_.size(); ++i) { | ||||
| @ -482,7 +482,7 @@ struct CachingHostAllocatorImpl { | ||||
|   } | ||||
|  | ||||
|   void resetPeakStats() { | ||||
|     // Reseting peak memory stats requires concurrently holding both the | ||||
|     // Resetting peak memory stats requires concurrently holding both the | ||||
|     // free list mutexes and the blocks mutex. Previously, this was only done in | ||||
|     // empty_cache function. | ||||
|     for (size_t i = 0; i < free_list_.size(); ++i) { | ||||
|  | ||||
| @ -148,7 +148,7 @@ struct TORCH_API ClassType : public NamedType { | ||||
|  | ||||
|   void checkNotExist(const std::string& name, const std::string& what) const; | ||||
|  | ||||
|   // Attributes are stored in a specific slot at runtime for effiency. | ||||
|   // Attributes are stored in a specific slot at runtime for efficiency. | ||||
|   // When emitting instructions we specify the slot so that attribute access is | ||||
|   // a constant lookup | ||||
|   std::optional<size_t> findAttributeSlot(const std::string& name) const { | ||||
| @ -412,7 +412,7 @@ struct TORCH_API ClassType : public NamedType { | ||||
|   // Holds method attributes | ||||
|   std::weak_ptr<CompilationUnit> compilation_unit_; | ||||
|  | ||||
|   // Holds all atrributes, attribute details are found on ClassAttribute | ||||
|   // Holds all attributes, attribute details are found on ClassAttribute | ||||
|   std::vector<ClassAttribute> attributes_; | ||||
|   // Construct mirroring attributes_, only around due to the fact that `containedTypes()` method returns an ArrayRef. | ||||
|   // Never fill this without using the appropriate provideNewClassAttribute method | ||||
|  | ||||
| @ -537,7 +537,7 @@ int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchK | ||||
|  | ||||
|   // Note: this records a sequence number for both Autograd keys, and for | ||||
|   // non-Autograd keys where the dispatchKeySet still contains an autograd key. | ||||
|   // This means that we might collect the same sequence nubmer two different | ||||
|   // This means that we might collect the same sequence number two different | ||||
|   // events if they all occurred above Autograd and still had the Autograd | ||||
|   // dispatch key in the dispatch key set. | ||||
|   // However, this usually doesn't happen: normally the first call will | ||||
|  | ||||
| @ -585,7 +585,7 @@ class TORCH_API OperatorHandle { | ||||
|  | ||||
|   // We need to store this iterator in order to make | ||||
|   // Dispatcher::cleanup() fast -- it runs a lot on program | ||||
|   // termination (and presuambly library unloading). | ||||
|   // termination (and presumably library unloading). | ||||
|   std::list<Dispatcher::OperatorDef>::iterator operatorIterator_; | ||||
| }; | ||||
|  | ||||
|  | ||||
| @ -365,7 +365,7 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab | ||||
|   //          For autograd keys, we only use kernel from CompositeImplicitAutograd when there's no direct registration | ||||
|   //          to its corresponding backend key or CompositeExplicitAutograd. See Note [CompositeExplicitAutograd and CompositeImplicitAutograd]. | ||||
|   //          For AutogradOther, we eagerly return ambiguousAutogradOtherKernel() if there's registration to any of | ||||
|   //          its backends and ask backend extender to request a decicated Autograd key for the backend. | ||||
|   //          its backends and ask backend extender to request a dedicated Autograd key for the backend. | ||||
|   //          See Note [Ambiguity in AutogradOther kernel] for more details. | ||||
|   //          A CompositeExplicitAutograd kernel prevents CompositeImplicitAutograd kernel being used for Autograd keys, but it doesn't | ||||
|   //          cause confusion for AutogradOther. It's pretty straightforward to use Autograd (if available) | ||||
|  | ||||
| @ -261,7 +261,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { | ||||
|     // | ||||
|     // There are 2 cases | ||||
|     // 1. something like 'aten::items.str(Dict(str, t) self) -> ((str, t)[])'. | ||||
|     // without the extra parenthesis, the c++ schem parser can not parse it. | ||||
|     // without the extra parenthesis, the c++ scheme parser can not parse it. | ||||
|     // 2. something like '-> ((str, str))'. Need extra parenthesis so the return | ||||
|     // type is a single tuple rather than two strings. | ||||
|     // PR (https://github.com/pytorch/pytorch/pull/23204) has more context about | ||||
|  | ||||
| @ -1176,7 +1176,7 @@ struct TORCH_API IValue final { | ||||
|   using HashIdentityIValueMap = | ||||
|       std::unordered_map<IValue, IValue, HashIdentityIValue, CompIdentityIValues>; | ||||
|  | ||||
|   // Chechs if this and rhs has a subvalues in common. | ||||
|   // Checks if this and rhs has a subvalues in common. | ||||
|   // [t1,t2] and [t2, t3] returns true. | ||||
|   bool overlaps(const IValue& rhs) const; | ||||
|  | ||||
|  | ||||
| @ -1501,7 +1501,7 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target { | ||||
|   // However, the CompilationUnit holds ownership of the type's graphs, so | ||||
|   // inserting a constant object into a Graph would create a reference cycle if | ||||
|   // that constant object held a shared_ptr to its CU. For these objects we | ||||
|   // instatiate them with non-owning references to its CU | ||||
|   // instantiate them with non-owning references to its CU | ||||
|   Object(WeakOrStrongTypePtr type, size_t numSlots) : type_(std::move(type)) { | ||||
|     slots_.resize(numSlots); | ||||
|   } | ||||
|  | ||||
| @ -373,7 +373,7 @@ struct TORCH_API SymbolicShape { | ||||
|   // Unranked shape constructor. | ||||
|   SymbolicShape() : dims_(std::nullopt) {} | ||||
|  | ||||
|   // Known rank but unknown dimentions. | ||||
|   // Known rank but unknown dimensions. | ||||
|   SymbolicShape(std::optional<size_t> rank) : dims_(std::nullopt) { | ||||
|     if(!rank) { | ||||
|       return; | ||||
| @ -884,9 +884,9 @@ struct TORCH_API ListType | ||||
|  | ||||
|   // global singleton | ||||
|   // Given an inner type T and an identifier, | ||||
|   // this function wil return the global singleton type pointer | ||||
|   // this function will return the global singleton type pointer | ||||
|   // the type List<T>. | ||||
|   // The extra "identifier" argument is needed beccause we have multiple container types | ||||
|   // The extra "identifier" argument is needed because we have multiple container types | ||||
|   // that all re-use this function (List<T>, array<T, N>, etc.) | ||||
|   static TypePtr get(const std::string& identifier, TypePtr inner); | ||||
|  | ||||
|  | ||||
| @ -21,7 +21,7 @@ namespace c10 { | ||||
|  | ||||
| namespace detail { | ||||
| // The first argument of the schema might be of type DispatchKeySet, in which case we remove it. | ||||
| // We do this because every argument in a function schema is expected to be convertable | ||||
| // We do this because every argument in a function schema is expected to be convertible | ||||
| // to an ivalue, but DispatchKeySet is not a type we want the jit to be aware of. | ||||
| // See Note [Plumbing Keys Through The Dispatcher] | ||||
| template<class KernelFunctor> | ||||
|  | ||||
| @ -251,7 +251,7 @@ TEST(OperatorRegistrationTest, whenRegisteringCPUTensorType_thenCanOnlyCallUnbox | ||||
|   callOpUnboxedWithPrecomputedDispatchKeySet<void, Tensor>(*op, c10::DispatchKeySet(c10::DispatchKey::CPU), dummyTensor(c10::DispatchKey::CUDA)); | ||||
|   EXPECT_TRUE(called_kernel_cpu); | ||||
|  | ||||
|   // Ensure that disptach key from tensor is not used here. | ||||
|   // Ensure that dispatch key from tensor is not used here. | ||||
|   called_kernel_cpu = false; | ||||
|   expectThrows<c10::Error>([&] { | ||||
|     callOpUnboxedWithPrecomputedDispatchKeySet<void, Tensor>(*op, c10::DispatchKeySet(c10::DispatchKey::CUDA), dummyTensor(c10::DispatchKey::CPU)); | ||||
|  | ||||
| @ -172,7 +172,7 @@ VaryingShape<Stride> TensorType::computeStrideProps( | ||||
|   // The logic below follows what TensorIterator uses in its logic: | ||||
|   //   1. Fast_set_up is the short-cut to identify a. channels_last and | ||||
|   //      b. contiguous format, which is what we have in the below logic. | ||||
|   //   2. In more generla cases, it does best effort to preserve permutatoin. | ||||
|   //   2. In more general cases, it does best effort to preserve permutatoin. | ||||
|   if (is_channels_last_strides_2d(sizes, strides) || is_channels_last_strides_3d(sizes, strides)) { | ||||
|     // case 1.a. short cut channels last | ||||
|     std::iota(stride_indices.rbegin() + 1, stride_indices.rend() - 1, 2); | ||||
|  | ||||
| @ -77,21 +77,36 @@ CONVERT_TEMPLATE(double, int64_t) | ||||
| CONVERT_TEMPLATE(double, float) | ||||
| CONVERT_TEMPLATE(double, double) | ||||
| #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | ||||
| CONVERT_TEMPLATE(float16_t, uint8_t) | ||||
| CONVERT_TEMPLATE(float16_t, int8_t) | ||||
| CONVERT_TEMPLATE(float16_t, int16_t) | ||||
| CONVERT_TEMPLATE(float16_t, int32_t) | ||||
| CONVERT_TEMPLATE(float16_t, int64_t) | ||||
| CONVERT_TEMPLATE(float16_t, float16_t) | ||||
| CONVERT_TEMPLATE(float16_t, float) | ||||
| CONVERT_TEMPLATE(float16_t, double) | ||||
| CONVERT_TEMPLATE(uint8_t, float16_t) | ||||
| CONVERT_TEMPLATE(int8_t, float16_t) | ||||
| CONVERT_TEMPLATE(int16_t, float16_t) | ||||
| CONVERT_TEMPLATE(int32_t, float16_t) | ||||
| CONVERT_TEMPLATE(int64_t, float16_t) | ||||
| CONVERT_TEMPLATE(float, float16_t) | ||||
| CONVERT_TEMPLATE(double, float16_t) | ||||
|  | ||||
| #define CONVERT_FROM_FP16_TEMPLATE(to_type)                            \ | ||||
|   template <>                                                          \ | ||||
|   inline void convert(const at::Half* src, to_type* dst, int64_t n) {  \ | ||||
|     const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src); \ | ||||
|     return convertImpl<float16_t, to_type>(srcPtr, dst, n);            \ | ||||
|   } | ||||
|  | ||||
| #define CONVERT_TO_FP16_TEMPLATE(from_type)                             \ | ||||
|   template <>                                                           \ | ||||
|   inline void convert(const from_type* src, at::Half* dst, int64_t n) { \ | ||||
|     float16_t* dstPtr = reinterpret_cast<float16_t*>(dst);              \ | ||||
|     return convertImpl<from_type, float16_t>(src, dstPtr, n);           \ | ||||
|   } | ||||
|  | ||||
| CONVERT_FROM_FP16_TEMPLATE(uint8_t) | ||||
| CONVERT_FROM_FP16_TEMPLATE(int8_t) | ||||
| CONVERT_FROM_FP16_TEMPLATE(int16_t) | ||||
| CONVERT_FROM_FP16_TEMPLATE(int32_t) | ||||
| CONVERT_FROM_FP16_TEMPLATE(int64_t) | ||||
| CONVERT_FROM_FP16_TEMPLATE(float16_t) | ||||
| CONVERT_FROM_FP16_TEMPLATE(float) | ||||
| CONVERT_FROM_FP16_TEMPLATE(double) | ||||
| CONVERT_TO_FP16_TEMPLATE(uint8_t) | ||||
| CONVERT_TO_FP16_TEMPLATE(int8_t) | ||||
| CONVERT_TO_FP16_TEMPLATE(int16_t) | ||||
| CONVERT_TO_FP16_TEMPLATE(int32_t) | ||||
| CONVERT_TO_FP16_TEMPLATE(int64_t) | ||||
| CONVERT_TO_FP16_TEMPLATE(float) | ||||
| CONVERT_TO_FP16_TEMPLATE(double) | ||||
| #endif | ||||
| #ifdef __ARM_FEATURE_BF16 | ||||
| CONVERT_TEMPLATE(bfloat16_t, uint8_t) | ||||
|  | ||||
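For readability, each of the new macro instantiations above stamps out one `convert` specialization. As a sketch, `CONVERT_FROM_FP16_TEMPLATE(float)` expands to roughly the following (it relies on the `convertImpl` helper already defined earlier in the surrounding file):

```cpp
// Approximate expansion of CONVERT_FROM_FP16_TEMPLATE(float): the at::Half
// pointer is reinterpreted as the native float16_t type so the generic
// convertImpl loop can operate on it directly.
template <>
inline void convert(const at::Half* src, float* dst, int64_t n) {
  const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src);
  return convertImpl<float16_t, float>(srcPtr, dst, n);
}
```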
| @ -634,8 +634,7 @@ inline Vectorized<float> Vectorized<float>::erf() const { | ||||
|   // - exp(- x * x) | ||||
|   auto pow_2 = (*this) * (*this); | ||||
|   auto neg_pow_2 = pow_2 ^ neg_zero_vec; | ||||
|   auto tmp4 = neg_pow_2.map( | ||||
|       std::exp); // This can be swapped for a faster implementation of exp. | ||||
|   auto tmp4 = neg_pow_2.exp(); | ||||
|   auto tmp5 = tmp4 ^ neg_zero_vec; | ||||
|   // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) | ||||
|   auto tmp6 = t * tmp5; | ||||
|  | ||||
| @ -234,7 +234,7 @@ class Vectorized<c10::Half> : public Vectorized16< | ||||
|         vshlq_u16(vandq_u16(is_zero_vec, vdupq_n_u16(1)), shift); | ||||
|     return vaddvq_u16(bits_vec); | ||||
| #else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | ||||
|     // use known working implmentation. | ||||
|     // use known working implementation. | ||||
|     __at_align__ value_type tmp[size()]; | ||||
|     store(tmp); | ||||
|     int mask = 0; | ||||
|  | ||||
| @ -1740,7 +1740,7 @@ Vectorized<int16_t> inline shift_256_16( | ||||
|  | ||||
|   // Control masks for shuffle operation, treating 256 bits as an | ||||
|   // array of 16-bit elements, and considering pairs of neighboring | ||||
|   // elements.  Specifially, a mask named "ctl_M_N" (M,N in [0,1], and | ||||
|   // elements.  Specifically, a mask named "ctl_M_N" (M,N in [0,1], and | ||||
|   // M!=N) is set so that shuffle will move element with index M from | ||||
|   // input pair into element with index N in output pair, and element | ||||
|   // with index M in output pair will be set to all 0s. | ||||
| @ -1875,7 +1875,7 @@ Vectorized<T> inline shift_256_8( | ||||
|  | ||||
|   // Control masks for shuffle operation, treating 256 bits as an | ||||
|   // array of 8-bit elements, and considering quadruples of | ||||
|   // neighboring elements.  Specifially, a mask named "ctl_M_N" (M,N | ||||
|   // neighboring elements.  Specifically, a mask named "ctl_M_N" (M,N | ||||
|   // in [0,1,2,3], and M!=N) is set so that shuffle will move element | ||||
|   // with index M from input quadruple into element with index N in | ||||
|   // output quadruple, and other elements in output quadruple will be | ||||
|  | ||||
| @ -143,7 +143,7 @@ class Vectorized<double> { | ||||
|       const Vectorized<double>& a, | ||||
|       const Vectorized<double>& b, | ||||
|       const Vectorized<double>& mask) { | ||||
|     // the mask used here returned by comparision of vec256 | ||||
|     // the mask used here returned by comparison of vec256 | ||||
|  | ||||
|     return { | ||||
|         vec_sel(a._vec0, b._vec0, mask._vecb0), | ||||
|  | ||||
| @ -142,7 +142,7 @@ class Vectorized<float> { | ||||
|       const Vectorized<float>& a, | ||||
|       const Vectorized<float>& b, | ||||
|       const Vectorized<float>& mask) { | ||||
|     // the mask used here returned by comparision of vec256 | ||||
|     // the mask used here returned by comparison of vec256 | ||||
|     // assuming this we can use the same mask directly with vec_sel | ||||
|     return { | ||||
|         vec_sel(a._vec0, b._vec0, mask._vecb0), | ||||
|  | ||||
| @ -202,7 +202,7 @@ class Vectorized<int16_t> { | ||||
|       const Vectorized<int16_t>& a, | ||||
|       const Vectorized<int16_t>& b, | ||||
|       const Vectorized<int16_t>& mask) { | ||||
|     // the mask used here returned by comparision of vec256 | ||||
|     // the mask used here returned by comparison of vec256 | ||||
|     // assuming this we can use the same mask directly with vec_sel | ||||
|     // warning intel style mask will not work properly | ||||
|     return { | ||||
|  | ||||
| @ -155,7 +155,7 @@ class Vectorized<int32_t> { | ||||
|       const Vectorized<int32_t>& a, | ||||
|       const Vectorized<int32_t>& b, | ||||
|       const Vectorized<int32_t>& mask) { | ||||
|     // the mask used here returned by comparision of vec256 | ||||
|     // the mask used here returned by comparison of vec256 | ||||
|     // assuming this we can use the same mask directly with vec_sel | ||||
|     // warning intel style mask will not work properly | ||||
|     return { | ||||
|  | ||||
| @ -119,7 +119,7 @@ class Vectorized<int64_t> { | ||||
|       const Vectorized<int64_t>& a, | ||||
|       const Vectorized<int64_t>& b, | ||||
|       const Vectorized<int64_t>& mask) { | ||||
|     // the mask used here returned by comparision of vec256 | ||||
|     // the mask used here returned by comparison of vec256 | ||||
|  | ||||
|     return { | ||||
|         vec_sel(a._vec0, b._vec0, mask._vecb0), | ||||
|  | ||||
| @ -397,7 +397,7 @@ inline Vectorized<bool> operator&&( | ||||
|   const __m512i* other_ = reinterpret_cast<const __m512i*>(other.as_bytes()); | ||||
|   __m512i out = _mm512_and_si512(*self_, *other_); | ||||
|   Vectorized<bool> ret; | ||||
|   // We do not have a constructer that takes __m512i, so we need to memcpy | ||||
|   // We do not have a constructor that takes __m512i, so we need to memcpy | ||||
|   std::memcpy(ret, &out, ret.size() * sizeof(bool)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| @ -1852,7 +1852,7 @@ Vectorized<T> inline shift_512_8( | ||||
|  | ||||
|   // Control masks for shuffle operation, treating 512 bits as an | ||||
|   // array of 8-bit elements, and considering pairs of neighboring | ||||
|   // elements.  Specifially, a mask named "ctl_M_N" (M,N in [0,1], and | ||||
|   // elements.  Specifically, a mask named "ctl_M_N" (M,N in [0,1], and | ||||
|   // M!=N) is set so that shuffle will move element with index M from | ||||
|   // input pair into element with index N in output pair, and element | ||||
|   // with index M in output pair will be set to all 0s. | ||||
|  | ||||
| @ -634,7 +634,7 @@ struct Vectorized { | ||||
|   } | ||||
|   Vectorized<T> neg() const { | ||||
|     // NB: the trailing return type is needed because we need to coerce the | ||||
|     // return value back to T in the case of unary operator- incuring a | ||||
|     // return value back to T in the case of unary operator- incurring a | ||||
|     // promotion | ||||
|     return map([](T x) -> T { return -x; }); | ||||
|   } | ||||
|  | ||||
| @ -1958,7 +1958,7 @@ void scaled_gemm( | ||||
|     ScalarType result_dtype, | ||||
|     bool use_fast_accum, | ||||
|     const std::optional<Tensor>& alpha) { | ||||
|   // Note: see `cublasCommonArgs` for various non-intuitive manupulations | ||||
|   // Note: see `cublasCommonArgs` for various non-intuitive manipulations | ||||
|   // of input arguments to this function. | ||||
|   const auto computeType = CUBLAS_COMPUTE_32F; | ||||
|   const auto scaleType = CUDA_R_32F; | ||||
|  | ||||
| @ -2,10 +2,10 @@ | ||||
|  | ||||
| #include <ATen/cuda/ATenCUDAGeneral.h> | ||||
| #include <ATen/cuda/CUDAContext.h> | ||||
| #include <c10/core/impl/GPUTrace.h> | ||||
| #include <c10/cuda/CUDAStream.h> | ||||
| #include <c10/cuda/CUDAGuard.h> | ||||
| #include <ATen/cuda/Exceptions.h> | ||||
| #include <c10/core/impl/GPUTrace.h> | ||||
| #include <c10/cuda/CUDAGuard.h> | ||||
| #include <c10/cuda/CUDAStream.h> | ||||
| #include <c10/util/Exception.h> | ||||
|  | ||||
| #include <cuda_runtime_api.h> | ||||
| @ -246,4 +246,79 @@ private: | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // EventPool - Thread-safe pool of CUDA events to avoid expensive cudaEventCreate | ||||
| // calls. cudaEventCreate when concurrently invoked from multiple threads can be | ||||
| // very expensive (especially on certain device/driver combinations). | ||||
| using CUDAEventPtr = | ||||
|     std::unique_ptr<CUDAEvent, std::function<void(CUDAEvent*)>>; | ||||
|  | ||||
| class EventPool { | ||||
|  public: | ||||
|   EventPool() : pools_(at::cuda::device_count()) {} | ||||
|  | ||||
|   CUDAEventPtr get(const DeviceIndex device) { | ||||
|     // If the device is invalid, return a default event and no pooling | ||||
|     if (device < 0 || device >= (DeviceIndex)pools_.size()) { | ||||
|       auto deleter = [](CUDAEvent* event) { | ||||
|         delete event; | ||||
|       }; | ||||
|       return CUDAEventPtr( | ||||
|         std::make_unique<CUDAEvent>(cudaEventDisableTiming).release(), deleter); | ||||
|     } | ||||
|  | ||||
|     auto& pool = pools_[device]; | ||||
|  | ||||
|     // Create a destructor that returns the event to the appropriate device pool | ||||
|     auto destructor = [&pool](CUDAEvent* event) noexcept { | ||||
|       if (event != nullptr) { | ||||
|         std::lock_guard<std::mutex> lock(pool.mutex_); | ||||
|         pool.event_pool_.emplace_back(event); | ||||
|       } | ||||
|     }; | ||||
|  | ||||
|     { | ||||
|       std::lock_guard<std::mutex> lock(pool.mutex_); | ||||
|       if (!pool.event_pool_.empty()) { | ||||
|         auto event = std::move(pool.event_pool_.back()); | ||||
|         pool.event_pool_.pop_back(); | ||||
|         return CUDAEventPtr(event.release(), destructor); | ||||
|       } | ||||
|     } | ||||
|  | ||||
|     return CUDAEventPtr( | ||||
|         std::make_unique<CUDAEvent>(cudaEventDisableTiming).release(), | ||||
|         destructor); | ||||
|   } | ||||
|  | ||||
|   void empty_cache() { | ||||
|     for (auto& pool : pools_) { | ||||
|       std::lock_guard<std::mutex> lock(pool.mutex_); | ||||
|       pool.event_pool_.clear(); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   void init_num_events(const size_t num_events) { | ||||
|     for (DeviceIndex device_idx = 0; device_idx < at::cuda::device_count(); ++device_idx) { | ||||
|         CUDAGuard device_guard(device_idx); | ||||
|         std::vector<CUDAEventPtr> temp_events; | ||||
|         temp_events.reserve(num_events); | ||||
|         for (size_t i = 0; i < num_events; ++i) { | ||||
|           auto event = get(device_idx); | ||||
|           // Record the event to ensure it's properly initialized | ||||
|           event->record(); | ||||
|           temp_events.emplace_back(std::move(event)); | ||||
|         } | ||||
|         // Events will be returned to pool when temp_events is destroyed | ||||
|     } | ||||
|   } | ||||
|  | ||||
|  private: | ||||
|   struct alignas(64) PerDevicePool { | ||||
|     alignas(64) std::mutex mutex_; | ||||
|     std::vector<std::unique_ptr<CUDAEvent>> event_pool_; | ||||
|   }; | ||||
|  | ||||
|   std::vector<PerDevicePool> pools_; | ||||
| }; | ||||
|  | ||||
| } // namespace at::cuda | ||||
|  | ||||
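A minimal usage sketch for the pool added above (assumptions: `EventPool` is reachable through `ATen/cuda/CUDAEvent.h`, the caller keeps a long-lived pool instance, and the function name is illustrative):

```cpp
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/CUDAContext.h>

// Record a pooled event on the current stream of `dev`; the unique_ptr's
// custom deleter returns the event to the per-device cache on scope exit
// instead of destroying it.
void record_marker(at::DeviceIndex dev) {
  static at::cuda::EventPool pool;            // stand-in for a process-wide pool
  at::cuda::CUDAEventPtr ev = pool.get(dev);  // reuses a cached cudaEvent_t when available
  ev->record(at::cuda::getCurrentCUDAStream(dev));
  // ... other streams could then wait on it via ev->block(stream) ...
}
```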
| @ -307,7 +307,7 @@ CUDAGraph::~CUDAGraph() { | ||||
| // There are recent HIP changes where hipGraphExecDestroy doesn't immediately free memory. | ||||
| // They wait for next sync point in order to free the memory, this is to ensure that all | ||||
| // hipGraphLaunch are finished before we release any memory. This feature was enabled in rocm6.2. | ||||
| // We need to ensure all async opreations finish before deleting the object. | ||||
| // We need to ensure all async operations finish before deleting the object. | ||||
| #if (defined(USE_ROCM) && ROCM_VERSION >= 60200) | ||||
|   if (capture_dev_ != UNDEFINED_DEVICE) // check if capture_dev_ contains the real device id | ||||
|   { | ||||
|  | ||||
| @ -137,7 +137,7 @@ struct CUDACachingHostAllocatorImpl | ||||
|   void free_block_slowpath(Block* block) { | ||||
|     auto start = std::chrono::steady_clock::now(); | ||||
|     // Users may change the allocator config at will. torch unit tests do this. | ||||
|     // However, allocations using cudaHostRegister should use corresonding | ||||
|     // However, allocations using cudaHostRegister should use corresponding | ||||
|     // cudaHostUnregister and similarly for cudaHostAlloc / cudaFreeHost. | ||||
|     void* ptr = block->ptr_; | ||||
|     bool use_register = false; | ||||
|  | ||||
| @ -4,7 +4,7 @@ | ||||
| #include <ATen/cuda/CUDAConfig.h> | ||||
|  | ||||
| // NOTE: These templates are intentionally not defined in this header, | ||||
| // which aviods re-compiling them for each translation unit. If you get | ||||
| // which avoids re-compiling them for each translation unit. If you get | ||||
| // a link error, you need to add an explicit instantiation for your | ||||
| // types in cub.cu | ||||
|  | ||||
|  | ||||
| @ -38,7 +38,7 @@ GemmTunableOp_float_NT,nt_25088_4096_64,1219,1.262 | ||||
| GemmTunableOp_float_NT,nt_4096_4096_64,1216,0.033 | ||||
| ``` | ||||
|  | ||||
| Note the "Validator" lines. If you change a library verison, or ROCm version, or PyTorch version, TunableOp will detect | ||||
| Note the "Validator" lines. If you change a library version, or ROCm version, or PyTorch version, TunableOp will detect | ||||
| this and reject the tunings file because the prior tunings are likely affected by other software changes. | ||||
|  | ||||
| The remaining lines are the tuned solutions for each TunableOp encountered during your execution. Each line consists of | ||||
|  | ||||
| @ -235,7 +235,7 @@ class TunableOp { | ||||
|       // numeric check option is controlled by non-static env var, so check it once per tuned operator | ||||
|       bool do_numerics_check = ctx->IsNumericsCheckEnabled(); | ||||
|  | ||||
|       // calcaulte a reference answer for numerical check | ||||
|       // calculate a reference answer for numerical check | ||||
|       if (do_numerics_check) { | ||||
|         reference_params = params->DeepCopy(false); | ||||
|         TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK); | ||||
|  | ||||
| @ -12,7 +12,7 @@ namespace at { | ||||
|  | ||||
| // AcceleratorHooksInterface is a shared interface provided by all | ||||
| // accelerators to allow generic code. | ||||
| // This inferface is hook-based as it corresponds to all the functions | ||||
| // This interface is hook-based as it corresponds to all the functions | ||||
| // that are going to be called in a generic way from the CPU code. | ||||
|  | ||||
| struct TORCH_API AcceleratorHooksInterface { | ||||
|  | ||||
| @ -38,7 +38,7 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface { | ||||
|  | ||||
|   Generator getNewGenerator( | ||||
|       [[maybe_unused]] DeviceIndex device_index = -1) const override { | ||||
|     // TODO(FFFrog): Perserved for BC and will be removed in the future. | ||||
|     // TODO(FFFrog): Preserved for BC and will be removed in the future. | ||||
|     if (at::GetGeneratorPrivate().has_value()) | ||||
|       return at::GetGeneratorForPrivateuse1(device_index); | ||||
|  | ||||
|  | ||||
| @ -283,7 +283,7 @@ inline void boxed_existing_bdim_all_batch_rule( | ||||
| // Use when all tensors arguments accept one (normal) batch dim. | ||||
| // This batching rule expands the batch dim on all Tensors, reshapes it into | ||||
| // dim 0, calls the op, and then reshapes the batch dim out of dim 0. | ||||
| // This is not the most efficient thing; if there are alternatives, plese try | ||||
| // This is not the most efficient thing; if there are alternatives, please try | ||||
| // to use them. Use this only as a last resort. | ||||
| #define EXISTING_BDIM_ALL_BOXED(op) \ | ||||
|   m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_existing_bdim_all_batch_rule>()); | ||||
|  | ||||
| @ -384,7 +384,7 @@ fourOutputs solve_ex_batch_rule( | ||||
|  | ||||
|   // NOTE [ solve_ex Batch Rule Contiguity ] | ||||
|   // A determines whether or not linalg_solve takes an optimized path. We need the check on A_ to match the one run on | ||||
|   // A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behvaior | ||||
|   // A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behavior | ||||
|   // differs based on whether or not the optimized path was taken | ||||
|   const auto batched_A_was_contiguous = A_bdim.has_value() ? at::select(A, *A_bdim, 0).is_contiguous() : A.is_contiguous(); | ||||
|   if (batched_A_was_contiguous && !A.is_complex()) { | ||||
|  | ||||
| @ -282,7 +282,7 @@ static std::tuple<Tensor, std::optional<int64_t>> _softmax_backward_batch_rule( | ||||
|  | ||||
|   dim = getPhysicalDim(output_, /*has_batch_dim*/true, dim); | ||||
|  | ||||
|   // Not sure why output_ needs to be marked as .contiguous(). Someting must | ||||
|   // Not sure why output_ needs to be marked as .contiguous(). Something must | ||||
|   // have changed in PyTorch (and output of softmax is probably always contiguous) | ||||
|   return std::make_tuple(at::_softmax_backward_data(grad_output_, output_.contiguous(), dim, input_dtype), 0); | ||||
| } | ||||
|  | ||||
| @ -224,7 +224,7 @@ static Tensor safeStack(TensorList tensors) { | ||||
|   // is possible for the backward function to return an undefined grad for some | ||||
|   // grad_input for each example. In that case, we return an undefined grad. | ||||
|   // | ||||
|   // It is theoretically posssible for *some* of the examples to produce an | ||||
|   // It is theoretically possible for *some* of the examples to produce an | ||||
|   // undefined grad (a kernel could peek at the gradient values and return an | ||||
|   // undefined tensor if it determines the gradient is full of zeros). We | ||||
|   // could handle this by treating the undefined grad as a zero-filled tensor | ||||
|  | ||||
| @ -113,7 +113,7 @@ SymIntArrayRef BatchedTensorImpl::sym_sizes_custom() const { | ||||
|   return sym_sizes_default(); | ||||
| } | ||||
|  | ||||
| // The following are publically exposed as methods of Tensor | ||||
| // The following are publicly exposed as methods of Tensor | ||||
|  | ||||
| IntArrayRef BatchedTensorImpl::strides_custom() const { | ||||
|   return strides_default(); | ||||
|  | ||||
| @ -37,7 +37,7 @@ namespace at::functorch  { | ||||
| // how to perform the transform. | ||||
| // | ||||
| // TODO: we can excise DynamicLayer in favor of Interpreter, | ||||
| // But I am going to leave it for now as a compatiblity shim to avoid | ||||
| // But I am going to leave it for now as a compatibility shim to avoid | ||||
| // needing to refactor a lot of callsites... | ||||
| struct TORCH_API DynamicLayer { | ||||
|   explicit DynamicLayer( | ||||
|  | ||||
| @ -88,7 +88,7 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t); | ||||
| // >>> VmapInterpreterPtr(&interpreter).batchSize() | ||||
| // | ||||
| // Finally, Interpreter::process switches on the type of the interpreter | ||||
| // and calls one of {Transform}Intepreter::processImpl under the hood. | ||||
| // and calls one of {Transform}Interpreter::processImpl under the hood. | ||||
| // Same for Interpreter::sendToNextInterpreter :) | ||||
|  | ||||
| struct VmapInterpreterMeta { | ||||
|  | ||||
| @ -733,7 +733,7 @@ TORCH_LIBRARY_IMPL(_, FuncTorchBatched, m) { | ||||
| } | ||||
|  | ||||
| TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { | ||||
|   // still legacy b/c teturns multiple tensors | ||||
|   // still legacy b/c returns multiple tensors | ||||
|   m.impl("split.Tensor", split_batching_rule); | ||||
|   m.impl("split_with_sizes", split_with_sizes_batching_rule); | ||||
|   m.impl("split_with_sizes_copy", split_with_sizes_copy_batching_rule); | ||||
|  | ||||
| @ -158,7 +158,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t | ||||
|       endKernelCoalescing(); | ||||
|       id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder]; | ||||
|  | ||||
|       // For some reason fillBufferfor stopped working for lengh > 4Gb on MacOS 26 | ||||
|       // For some reason fillBufferfor stopped working for length > 4Gb on MacOS 26 | ||||
|       // See https://github.com/pytorch/pytorch/issues/163962 | ||||
|       // Workaround by batching copy commands into 4Gb chunks | ||||
|       constexpr size_t max_copy_size = 0x100000000; // 4GB | ||||
|  | ||||
| @ -148,7 +148,7 @@ inline void checkInputsSolver(const Tensor& A, | ||||
|  | ||||
| inline bool is_row_or_column_contiguous(const Tensor& t) { | ||||
|   // This could be made more general, similar to how it's checked in matmul, which would allow to | ||||
|   // ellide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky. | ||||
|   // elide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky. | ||||
|   // We choose to be conservative for simplicity | ||||
|   return t.is_contiguous() || t.transpose(-2, -1).is_contiguous(); | ||||
| } | ||||
|  | ||||
| @ -21,7 +21,7 @@ enum class fft_norm_mode { | ||||
| // NOTE [ Fourier Transform Conjugate Symmetry ] | ||||
| // | ||||
| // Real-to-complex Fourier transform satisfies the conjugate symmetry. That is, | ||||
| // assuming X is the transformed K-dimensionsal signal, we have | ||||
| // assuming X is the transformed K-dimensional signal, we have | ||||
| // | ||||
| //     X[i_1, ..., i_K] = X[j_i, ..., j_K]*, | ||||
| // | ||||
|  | ||||
| @ -128,7 +128,7 @@ at::Tensor PackedLinearWeight::apply_impl( | ||||
|   auto* input_tr_ptr = | ||||
|       reinterpret_cast<uint8_t*>(input_tr.data_ptr<c10::quint8>()); | ||||
|   // TODO: Activation transpose before and after the kernel can be removed if we | ||||
|   // keep activation tensor always tranposed. | ||||
|   // keep activation tensor always transposed. | ||||
|   fbgemm::transpose_simd<uint8_t>( | ||||
|       batch_size, K, input_ptr, K, input_tr_ptr, batch_size); | ||||
|  | ||||
|  | ||||
| @ -520,7 +520,7 @@ cpu_adaptive_avg_pool3d_channels_last( | ||||
|       scalar_t* out = output_data + i * channels; | ||||
|       int64_t size = channels; | ||||
|  | ||||
|       // Note: For oridinary usage scenario, each out lane should | ||||
|       // Note: For ordinary usage scenario, each out lane should | ||||
|       //   fit in L1 cache; otherwise consider block dim C. | ||||
|       // Pass I: zero the out lane | ||||
|       int64_t d1 = 0; | ||||
|  | ||||
| @ -34,7 +34,7 @@ struct Dist { | ||||
|   //     finish :   This tells what to do with the aggregated value to compute | ||||
|   //                the norm. Generally this is the result of val ^ (1 / p). | ||||
|   //     backward : This is the gradient for that norm. Arguments are pretty | ||||
|   //                self explanitory. | ||||
|   //                self explanatory. | ||||
|   // | ||||
|   // There are a few cases where these aren't used. The 0 norm has no backward, | ||||
|   // because it's always 0, so that's shortcircuited earlier. There's a special | ||||
|  | ||||
| @ -30,7 +30,7 @@ vec::Vectorized<scalar_t> is_nan_vec(vec::Vectorized<scalar_t> vec) { | ||||
|   return vec.isnan(); | ||||
| } | ||||
|  | ||||
| // TODO: use is_integeral/is_same to check the scalar_t and simplify the implementation | ||||
| // TODO: use is_integral/is_same to check the scalar_t and simplify the implementation | ||||
| // currently it does not work | ||||
| template <> | ||||
| vec::Vectorized<unsigned char> is_nan_vec<unsigned char>(vec::Vectorized<unsigned char> vec) { | ||||
|  | ||||
| @ -74,7 +74,7 @@ it to sum up the entire array into a single value. | ||||
|  | ||||
| `ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which | ||||
| compiler flags it is currently compiled. This allows the programmer to write | ||||
| generic code, which will be compiled under multipled compilation settings. | ||||
| generic code, which will be compiled under multiplied compilation settings. | ||||
|  | ||||
| `../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains | ||||
| a generic definition of `sumImplAll`. This function allows the user to reduce | ||||
|  | ||||
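As a rough illustration of that pattern (hypothetical names, not the actual `ReduceOpsKernel.cpp`): the same translation unit is built once per vectorization level, each build sees a different `CPU_CAPABILITY_*` define, and each contributes its own copy of the kernel for a dispatcher to pick at runtime.

```cpp
// Illustrative sketch only. Each build of this file emits a differently
// suffixed symbol; the body is written once, and the compiler flags used for
// each build decide how aggressively the loop is vectorized.
#include <cstdint>

#if defined(CPU_CAPABILITY_AVX512)
  #define CAP_SUFFIX _avx512
#elif defined(CPU_CAPABILITY_AVX2)
  #define CAP_SUFFIX _avx2
#else
  #define CAP_SUFFIX _default
#endif
#define CAP_CONCAT2(a, b) a##b
#define CAP_CONCAT(a, b) CAP_CONCAT2(a, b)

float CAP_CONCAT(sumImplAll, CAP_SUFFIX)(const float* data, int64_t n) {
  float acc = 0.0f;
  for (int64_t i = 0; i < n; ++i) {
    acc += data[i];
  }
  return acc;
}
```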
| @ -889,7 +889,7 @@ void ImagingResampleHorizontalConvolution8u( | ||||
|             _mm_loadu_si128((__m128i *) (lineIn_min + stride * i))), | ||||
|             _mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1); | ||||
|  | ||||
|         // Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA | ||||
|         // Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA | ||||
|         // RGBA: pix1 = [ | ||||
|         //   r0 0 r1 0  g0 0 g1 0  b0 0 b1 0  a0 0 a1 0 | ||||
|         //   r4 0 r5 0  g4 0 g5 0  b4 0 b5 0  a4 0 a5 0 | ||||
|  | ||||
| @ -240,7 +240,7 @@ _PS256_CONST(coscof_p2,  4.166664568298827E-002); | ||||
| _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI | ||||
|  | ||||
|  | ||||
| /* evaluation of 8 sines at onces using AVX intrinsics | ||||
| /* evaluation of 8 sines at once using AVX intrinsics | ||||
|  | ||||
|    The code is the exact rewriting of the cephes sinf function. | ||||
|    Precision is excellent as long as x < 8192 (I did not bother to | ||||
|  | ||||
| @ -311,7 +311,7 @@ void GroupNormKernelImplChannelsLastInternal( | ||||
|   const bool gamma_null = (gamma_data == nullptr); | ||||
|   const bool beta_null = beta_data == nullptr; | ||||
|  | ||||
|   // NB: About algorithm choosen: | ||||
|   // NB: About algorithm chosen: | ||||
|   // | ||||
|   // On channels last, GroupNorm has a input shape of {N, H, W, GD}, | ||||
|   // Mean and rstd are collected per each n and g, which involves reduction | ||||
|  | ||||
| @ -930,7 +930,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( | ||||
|         } | ||||
|       }; | ||||
|  | ||||
|   // Dynamically Quantize the float32 input to 8 bit assymetric | ||||
|   // Dynamically Quantize the float32 input to 8 bit asymmetric | ||||
|   input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx); | ||||
|  | ||||
|   const size_t lhs_stride = | ||||
| @ -1163,7 +1163,7 @@ void dyn_quant_matmul_4bit_kernel( | ||||
|   const int64_t weight_packed_size = | ||||
|       kleidiai::kai_pack_rhs_int4_size(N, K, block_size); | ||||
|   if (weight_packed_size == packed_weights.numel()) { | ||||
|     // KleidiAI interface intenally handles the Channelwise and groupwise | ||||
|     // KleidiAI interface internally handles the Channelwise and groupwise | ||||
|     // distinction | ||||
|     kleidiai::kai_quant_pack_lhs_int4_mm( | ||||
|         output, inp, packed_weights, M, N, K, block_size); | ||||
|  | ||||
										
											
File diff suppressed because it is too large
							| @ -1,11 +1,11 @@ | ||||
| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS | ||||
| #include <ATen/core/Tensor.h> | ||||
| #include <ATen/Context.h> | ||||
| #include <ATen/Dispatch.h> | ||||
| #include <ATen/Dispatch_v2.h> | ||||
| #include <ATen/cuda/CachingHostAllocator.h> | ||||
| #include <ATen/core/Tensor.h> | ||||
| #include <ATen/cuda/CUDAContext.h> | ||||
| #include <ATen/cuda/CUDAEvent.h> | ||||
| #include <ATen/cuda/CachingHostAllocator.h> | ||||
| #include <ATen/cuda/PeerToPeerAccess.h> | ||||
| #include <ATen/native/Copy.h> | ||||
| #include <ATen/native/TensorIterator.h> | ||||
| @ -27,6 +27,24 @@ | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| namespace { | ||||
|  | ||||
| // Initial pool size for CUDA events per device. | ||||
| constexpr size_t kInitialEventPoolSize = 8; | ||||
|  | ||||
| at::cuda::CUDAEventPtr getEventFromPool(const at::DeviceIndex device_idx) { | ||||
|   static auto* event_pool = []() { | ||||
|     auto* pool = new at::cuda::EventPool(); | ||||
|     // Pre-populate the pool with events to avoid stalls in creating events | ||||
|     pool->init_num_events(kInitialEventPoolSize); | ||||
|     return pool; | ||||
|   }(); | ||||
|  | ||||
|   return event_pool->get(device_idx); | ||||
| } | ||||
|  | ||||
| } // namespace | ||||
|  | ||||
| void neg_kernel_cuda(TensorIteratorBase &iter); | ||||
| void conj_kernel_cuda(TensorIteratorBase &iter); | ||||
|  | ||||
| @ -263,12 +281,14 @@ void copy_device_to_device(TensorIterator& iter, | ||||
|     // write-after-read dependencies on the destination side are handled, so | ||||
|     // that no one is operating on the dst memory when we perform the copy. | ||||
|     // src waits on dst barrier (src already waits on src) | ||||
|     CUDAEvent dst_ready; | ||||
|  | ||||
|     // Use event pool for better performance instead of creating new events | ||||
|     auto dst_ready = getEventFromPool(dst_device.index()); | ||||
|     device_guard.set_device(dst_device); | ||||
|     dst_ready.record(getCurrentCUDAStream(dst_device.index())); | ||||
|     dst_ready->record(getCurrentCUDAStream(dst_device.index())); | ||||
|  | ||||
|     device_guard.set_device(src_device); | ||||
|     dst_ready.block(copy_stream); | ||||
|     dst_ready->block(copy_stream); | ||||
|   } | ||||
|  | ||||
|   if (memcpy_eligible) { | ||||
| @ -307,11 +327,11 @@ void copy_device_to_device(TensorIterator& iter, | ||||
|     // operate on dst's copy until the copy is complete. | ||||
|  | ||||
|     // Still on src_device, record stream event | ||||
|     CUDAEvent src_ready; | ||||
|     src_ready.record(copy_stream); | ||||
|     auto src_ready = getEventFromPool(src_device.index()); | ||||
|     src_ready->record(copy_stream); | ||||
|  | ||||
|     device_guard.set_device(dst_device); | ||||
|     src_ready.block(getCurrentCUDAStream(dst_device.index())); | ||||
|     src_ready->block(getCurrentCUDAStream(dst_device.index())); | ||||
|   } | ||||
|  | ||||
|   AT_CUDA_CHECK(cudaGetLastError()); | ||||
|  | ||||
| @ -494,7 +494,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen) | ||||
|       auto value = static_cast<scalar_t>(rand * range + from); | ||||
|       // reverse the bounds of curand4 from (0, 1] to [0, 1) | ||||
|       // Note that this method is from legacy THCTensorRandom and is likely to give | ||||
|       // you more 0-s, since, the probability of gettings 1-s is higher than 0-s and | ||||
|       // you more 0-s, since, the probability of getting 1-s is higher than 0-s and | ||||
|       // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s. | ||||
|       // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706 | ||||
|       auto reverse_bound_value = value == to ? from : value; | ||||
|  | ||||
| @ -154,7 +154,7 @@ REGISTER_CUDA_DISPATCH(lstsq_stub, &lazy_lstsq_kernel) | ||||
|  | ||||
| // Old style dispatches | ||||
| // torch_cuda_linalg dynamic library should have a global constructor | ||||
| // that calls regiserLinaglDispatch so in order ot lazy bind | ||||
| // that calls registerLinalgDispatch so in order ot lazy bind | ||||
| // old style dispatch all one have to do is to load library and call disp.func_name | ||||
| // Protect from infinite recursion by initializing dispatch to self and checking | ||||
| // that values are different after linalg library were loaded | ||||
|  | ||||
| @ -311,7 +311,7 @@ __global__ void batch_norm_collect_statistics_kernel( | ||||
|     stat_accscalar_t v_[UNRL]; | ||||
|     for (int x = threadIdx.x; x < input.size(2); x += blockDim.x*UNRL) { | ||||
|       for (int u = 0; u < UNRL; u++) | ||||
|         v_[u] = input[batch][plane][min(x+u*blockDim.x, input.size(2)-1)]; | ||||
|         v_[u] = input[batch][plane][std::min(x+u*blockDim.x, input.size(2)-1)]; | ||||
|       for (int u = 0; u < UNRL; u++) { | ||||
|         if (x+u*blockDim.x < input.size(2)) { | ||||
|           stat_accscalar_t d1 = v_[u] - avg; | ||||
|  | ||||
							
								
								
									
1284  aten/src/ATen/native/cuda/ScaledBlas.cpp  Normal file  (File diff suppressed because it is too large)

171  aten/src/ATen/native/cuda/cuBlasCommonArgs.h  Normal file
							| @ -0,0 +1,171 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include <ATen/core/Tensor.h> | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| using at::blas::ScalingType; | ||||
| using at::blas::SwizzleType; | ||||
|  | ||||
| namespace { | ||||
|  | ||||
| // TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492 | ||||
| c10::MaybeOwned<Tensor> inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) { | ||||
|   if (resolve_conj && tensor.is_conj()) { | ||||
|     return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj()); | ||||
|   } else { | ||||
|     return c10::MaybeOwned<Tensor>::borrowed(tensor); | ||||
|   } | ||||
| } | ||||
|  | ||||
| c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) { | ||||
|   if (tensor.is_non_overlapping_and_dense()) { // common case | ||||
|       transpose_tensor = tensor.is_contiguous(); | ||||
|       return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor); | ||||
|   } | ||||
|   IntArrayRef tensor_strides = tensor.strides(); | ||||
|   IntArrayRef tensor_sizes = tensor.sizes(); | ||||
|   if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) { | ||||
|     transpose_tensor = false; | ||||
|     return resolve_conj_if_indicated(tensor, !transpose_result); | ||||
|   } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) { | ||||
|     transpose_tensor = true; | ||||
|     return resolve_conj_if_indicated(tensor, transpose_result); | ||||
|   } else { | ||||
|     transpose_tensor = true; | ||||
|     return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous)); | ||||
|   } | ||||
| } | ||||
|  | ||||
| c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) { | ||||
|   if (tensor.is_non_overlapping_and_dense()) { // common case | ||||
|       transpose_tensor = tensor.is_contiguous(); | ||||
|       return resolve_conj_if_indicated(tensor, true); | ||||
|   } | ||||
|  | ||||
|   IntArrayRef tensor_strides = tensor.strides(); | ||||
|   IntArrayRef tensor_sizes = tensor.sizes(); | ||||
|   if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) { | ||||
|     transpose_tensor = false; | ||||
|     return resolve_conj_if_indicated(tensor, true); | ||||
|   } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) { | ||||
|     transpose_tensor = true; | ||||
|     return resolve_conj_if_indicated(tensor, true); | ||||
|   } else { | ||||
|     transpose_tensor = true; | ||||
|     return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous)); | ||||
|   } | ||||
| } | ||||
|  | ||||
| } // namespace | ||||
|  | ||||
| /** | ||||
|  * @brief Prepares matrices for CUBLAS operation | ||||
|  * | ||||
|  * This constructor prepares tensors for CUBLAS | ||||
|  * The main difference is that PyTorch uses row-major as the default and | ||||
|  * CUBLAS expects column-major. | ||||
|  * | ||||
|  * @details | ||||
|  * To enable row-major output while using CUBLAS, | ||||
|  * we use the mathematical identity that (A × B)^T = B^T × A^T. | ||||
|  * | ||||
|  * Transpose in this context refers to Cublas's(Fortran) definition of transpose (row-major) | ||||
|  * T = row-major, N = col-major | ||||
|  * | ||||
|  * Example: | ||||
|  * For matrices A (M×K)(row-major) and B (K×N)(row-major): | ||||
|  *   - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major) | ||||
|  *   - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N) | ||||
|  *   - However, since the output form cublas is column-major this is | ||||
|  *   - equivalent to an output of size MxN row-major as expected | ||||
|  * | ||||
|  * The transpose flags are derived from the layouts of the passed in tensors | ||||
|  * | ||||
|  * If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted | ||||
|  * to their unpacked values to match what cuBLAS expects. | ||||
|  * | ||||
|  * @param mat1 First input matrix | ||||
|  * @param mat2 Second input matrix | ||||
|  * @param c Output matrix (result) | ||||
|  * @param scale_a Optional scaling factor for first matrix | ||||
|  * @param scale_b Optional scaling factor for second matrix | ||||
|  * @param scale_result Optional scaling factor for result | ||||
|  */ | ||||
| struct cublasCommonArgs { | ||||
|   cublasCommonArgs( | ||||
|       const Tensor& mat1, | ||||
|       const Tensor& mat2, | ||||
|       Tensor& c, | ||||
|       const std::optional<Tensor>& scale_a = std::nullopt, | ||||
|       const std::optional<Tensor>& scale_b = std::nullopt, | ||||
|       const std::optional<Tensor>& scale_result = std::nullopt, | ||||
|       const std::optional<ScalingType>& scaling_choice_a = std::nullopt, | ||||
|       const std::optional<ScalingType>& scaling_choice_b = std::nullopt) { | ||||
|     bool transpose_result = false, transpose_a = false, transpose_b = false; | ||||
|     result = prepare_matrix_for_cublas(c, transpose_result); | ||||
|     mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); | ||||
|     matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result); | ||||
|  | ||||
|     // Handle scale tensors if provided | ||||
|     if (scale_a && scale_b) { | ||||
|       // By default since we return in row-major we run the gemm | ||||
|       // as B.T @ A.T, check transpose_result to determine if we flip the scales | ||||
|       scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr(); | ||||
|       scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); | ||||
|       scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a; | ||||
|       scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); | ||||
|       scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); | ||||
|       scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b; | ||||
|     } | ||||
|  | ||||
|     if (scale_result) { | ||||
|       scale_result_ptr = scale_result->data_ptr(); | ||||
|       scale_result_dtype = scale_result->scalar_type(); | ||||
|     } | ||||
|  | ||||
|     // Update transpose flags | ||||
|     if (transpose_result) { | ||||
|       transpose_a = !transpose_a; | ||||
|       transpose_b = !transpose_b; | ||||
|     } | ||||
|  | ||||
|     auto sizes_a = mata->sizes(); | ||||
|     auto sizes_b = matb->sizes(); | ||||
|  | ||||
|     m = sizes_a[transpose_result ? 1 : 0]; | ||||
|     k = sizes_a[transpose_result ? 0 : 1]; | ||||
|     n = sizes_b[transpose_result ? 0 : 1]; | ||||
|     lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); | ||||
|     ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); | ||||
|     result_ld = result->stride(transpose_result ? 0 : 1); | ||||
|     transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; | ||||
|     transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n'; | ||||
|  | ||||
|     // cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing | ||||
|     // if the gemm operands are in packed float4 | ||||
|     if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) { | ||||
|       k = k * 2; | ||||
|       lda = lda * 2; | ||||
|       ldb = ldb * 2; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // Matrix members | ||||
|   char transa, transb; | ||||
|   int64_t m, n, k; | ||||
|   int64_t lda, ldb, result_ld; | ||||
|   c10::MaybeOwned<Tensor> mata, matb, result; | ||||
|  | ||||
|   // Scale members | ||||
|   void* scale_mata_ptr = nullptr; | ||||
|   void* scale_matb_ptr = nullptr; | ||||
|   void* scale_result_ptr = nullptr; | ||||
|   std::optional<c10::ScalarType> scale_mata_dtype; | ||||
|   std::optional<ScalingType> scaling_mata_type; | ||||
|   std::optional<c10::ScalarType> scale_matb_dtype; | ||||
|   std::optional<ScalingType> scaling_matb_type; | ||||
|   std::optional<c10::ScalarType> scale_result_dtype; | ||||
| }; | ||||
|  | ||||
| } // namespace at::native | ||||
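The row-major/column-major trick described in the `cublasCommonArgs` comment can be sanity-checked with a tiny standalone column-major GEMM (a sketch only; nothing here is cuBLAS, and the matrix sizes and values are purely illustrative):

```cpp
// C = A * B for row-major A (MxK) and B (KxN), computed with a column-major
// kernel via (A*B)^T = B^T * A^T. A row-major buffer reinterpreted as
// column-major is already the transpose, so no data is copied or moved.
#include <vector>
#include <cstdio>

// naive column-major GEMM: C(m x n) = X(m x k) * Y(k x n), no transposes
void gemm_colmajor(int m, int n, int k,
                   const float* X, int ldx,
                   const float* Y, int ldy,
                   float* C, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p)
        acc += X[i + p * ldx] * Y[p + j * ldy];
      C[i + j * ldc] = acc;
    }
}

int main() {
  const int M = 2, K = 3, N = 2;
  std::vector<float> A = {1, 2, 3,  4, 5, 6};        // row-major MxK
  std::vector<float> B = {7, 8,  9, 10,  11, 12};    // row-major KxN
  std::vector<float> C(M * N);                       // row-major MxN result
  // Column-major call computes C^T = B^T * A^T; its output buffer, read
  // back row-major, is exactly C = A * B.
  gemm_colmajor(N, M, K, B.data(), N, A.data(), K, C.data(), N);
  std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  // 58 64 / 139 154
}
```

Because a reinterpretation is all that is required, the struct above only has to swap `mat1`/`mat2` (and their scales) when `transpose_result` is set, then adjust the leading dimensions and `transa`/`transb`; no tensor data is rearranged.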
| @ -1532,7 +1532,7 @@ NvrtcFunction jit_pwise_function( | ||||
|  | ||||
|   std::string file_path; | ||||
|   if (cache_dir.has_value()) { | ||||
|     // Attemps to read from the cache. | ||||
|     // Attempts to read from the cache. | ||||
|     // Cubin name is <kernel name>_arch<major>.<minor>_nvrtc<major>.<minor>_<ptx or sass>_<program length>_<string hash> | ||||
|     // Note that the SHA1 hash used in the file name is NOT the SHA1 hash of the file's contents, | ||||
|     //   because we hash on the CUDA code, but we save the compiled ptx or sass | ||||
|  | ||||
| @ -1346,7 +1346,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info) | ||||
|     }); | ||||
|  | ||||
|   if (input.dim() > 2) { | ||||
|     // if upper=true we need to tranpose and conjugate the result tensor | ||||
|     // if upper=true we need to transpose and conjugate the result tensor | ||||
|     // because the cholesky decomposition is stored in the lower triangular part | ||||
|     if (upper) { | ||||
|       input.copy_(result.mH()); | ||||
| @ -1857,7 +1857,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) { | ||||
|  | ||||
|   auto preferred_backend = at::globalContext().linalgPreferredBackend(); | ||||
|   switch (preferred_backend) { | ||||
|   // TODO Investigate whether the following magma bug is still occuring. | ||||
|   // TODO Investigate whether the following magma bug is still occurring. | ||||
|   // It may be the case that geqrf followed by orgqr is wrong for the magma backend | ||||
|   // geqrf_magma currently uses geqrf2_gpu | ||||
|   // | ||||
|  | ||||
| @ -82,7 +82,7 @@ void lu_factor_looped_cusolver(const Tensor& self, const Tensor& pivots, const T | ||||
| #if defined(BUILD_LAZY_CUDA_LINALG) | ||||
| namespace cuda { namespace detail { | ||||
| // This is only used for an old-style dispatches | ||||
| // Please do not add any new entires to it | ||||
| // Please do not add any new entries to it | ||||
| struct LinalgDispatch { | ||||
|    Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper); | ||||
| }; | ||||
|  | ||||
| @ -147,7 +147,7 @@ static void check_shape_forward(const Tensor& input, | ||||
| //  blocked format will propagate between layers. Input, output will be in blocked format. | ||||
| // | ||||
| //  For inference case, weight can be prepacked into blocked format by | ||||
| //  (so as to save weight reoder overhead): | ||||
| //  (so as to save weight reorder overhead): | ||||
| //      model = torch.utils.mkldnn.to_mkldnn(model) | ||||
| // | ||||
| //  For training case, grad_output can be CPU tensor or MKLDNN tensor, | ||||
| @ -723,7 +723,7 @@ Tensor _mkldnn_convolution_transpose( | ||||
|   ideep::tensor w = itensor_from_tensor(weight, /*from_const_data_ptr*/true); | ||||
|   if (!weight.is_mkldnn()) { | ||||
|     // mkldnn transposed convolution has weight in logical order of OIHW or OIDHW, | ||||
|     // while PyTorch has IOHW or IODHW, `._tranpose()` switches strides (no memory copy). | ||||
|     // while PyTorch has IOHW or IODHW, `._transpose()` switches strides (no memory copy). | ||||
|     w.transpose_(0, 1); | ||||
|   } | ||||
|  | ||||
|  | ||||
| @ -540,7 +540,7 @@ static void _mkldnn_matmul_i8i8i32_with_primitive( | ||||
|   args.insert({DNNL_ARG_WEIGHTS, expected_weight}); | ||||
|   args.insert({DNNL_ARG_DST, dst}); | ||||
|   args.insert({DNNL_ARG_SCRATCHPAD, scratchpad}); | ||||
|   // Create primitve and execute | ||||
|   // Create primitive and execute | ||||
|   auto primitive = dnnl::matmul(prim_desc); | ||||
|   primitive.execute(ideep::stream::default_stream(), args); | ||||
| } | ||||
|  | ||||
| @ -439,7 +439,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> mkldnn_rnn_la | ||||
| // I. Memory Formats | ||||
| //   a. mkldnn will use plain formats for input, hx/cx, output, hy/cy | ||||
| //      and possibly use blocked formats for weights depending shape info. | ||||
| //   b. All mkldnn memorys are created (in plain format) as views on ATen tensor, | ||||
| //   b. All mkldnn memories are created (in plain format) as views on ATen tensor, | ||||
| //      the weight reorder(if any) is handed automatically inside ideep (mkldnn bridge) | ||||
| // | ||||
| // II. MKLDNN Primitive Mapping | ||||
|  | ||||
| @ -39,7 +39,7 @@ void check_mkldnn_binary_fusion_inputs( | ||||
| inline std::vector<int64_t> padding_r( | ||||
|     IntArrayRef padding, IntArrayRef output_padding) | ||||
| { | ||||
|   // ConvTranpose padding adjustment | ||||
|   // ConvTranspose padding adjustment | ||||
|   // | ||||
|   // PyTorch uses padding/output_padding: | ||||
|   //   osize = (isize - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1 | ||||
|  | ||||
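A quick numeric check of that output-size formula (the parameter values below are chosen only for illustration):

```cpp
#include <cstdint>

constexpr int64_t conv_transpose_osize(
    int64_t isize, int64_t stride, int64_t padding,
    int64_t dilation, int64_t kernel_size, int64_t output_padding) {
  return (isize - 1) * stride - 2 * padding +
         dilation * (kernel_size - 1) + output_padding + 1;
}

// isize=5, stride=2, padding=1, dilation=1, kernel_size=3, output_padding=1
static_assert(conv_transpose_osize(5, 2, 1, 1, 3, 1) == 10,
              "matches the output-size formula quoted above");
```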
| @ -75,7 +75,7 @@ bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) { | ||||
| } | ||||
|  | ||||
| bool can_use_flash_attention(sdp::sdp_params const& params, bool debug) { | ||||
|   // Currently, XPU fallbacks flash attention to overrideable | ||||
|   // Currently, XPU fallbacks flash attention to overridable | ||||
|   return can_use_overrideable_attention(params, debug); | ||||
| } | ||||
|  | ||||
| @ -115,7 +115,7 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) { | ||||
|   // 1. Flash Attention | ||||
|   // 2. Math fallback | ||||
|   auto& ctx = at::globalContext(); | ||||
|   // use overrideable linked to onednn as overrideable implementation | ||||
|   // use overridable linked to onednn as overridable implementation | ||||
|   if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP() && | ||||
|       !ctx.userEnabledFlashSDP()) { | ||||
|     return sdp::SDPBackend::error; | ||||
| @ -165,7 +165,7 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) { | ||||
|     } | ||||
|   } | ||||
|   // If we have gotten to this point then two things have happened: | ||||
|   // 1. can_use_overrideable_attention did not satisfy the constraints to be ran | ||||
|   // 1. can_use_overridable_attention did not satisfy the constraints to be ran | ||||
|   // 2. The user has explicitly disabled the math kernel | ||||
|   // We then re-run the kernel checks with debug enabled to print out the | ||||
|   // reason why the kernel was not selected | ||||
|  | ||||
| @ -215,7 +215,7 @@ partition create_sdpa_graph_partition( | ||||
|   // For optional additive mask | ||||
|   std::optional<op> mask_add; | ||||
|  | ||||
|   // For optional implicite causal mask | ||||
|   // For optional implicit causal mask | ||||
|   std::optional<op> mask_gen_idx_row; | ||||
|   std::optional<logical_tensor> mask_row_idx; | ||||
|   std::optional<op> mask_gen_idx_col; | ||||
| @ -556,7 +556,7 @@ partition create_sdpa_backward_graph_partition( | ||||
|   // For optional additive mask | ||||
|   std::optional<op> mask_add; | ||||
|  | ||||
|   // For optional implicite causal mask | ||||
|   // For optional implicit causal mask | ||||
|   std::optional<op> mask_gen_idx_row; | ||||
|   std::optional<logical_tensor> mask_row_idx; | ||||
|   std::optional<op> mask_gen_idx_col; | ||||
|  | ||||
| @ -345,7 +345,7 @@ class Attr { | ||||
|         dnnl::memory binary_m; | ||||
|         auto binary = ops_params_[i].binary_; | ||||
|         auto md = ops_params_[i].meta_; | ||||
|         // qeury expected_md to achieve peak performance | ||||
|         // query expected_md to achieve peak performance | ||||
|         auto expected_md = pd.query_md( | ||||
|             dnnl::query::exec_arg_md, | ||||
|             DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1); | ||||
|  | ||||
| @ -301,7 +301,7 @@ bool is_onednn_matmul_strides(const at::Tensor& tensor) { | ||||
|       return false; | ||||
|   } | ||||
|  | ||||
|   // the overlaped cases are not supported | ||||
|   // the overlapped cases are not supported | ||||
|   dnnl::memory::dims strides = get_onednn_strides(tensor); | ||||
|   int64_t storage_size = 1; | ||||
|   for (size_t dim = 0; dim < tensor_dim; ++dim) | ||||
|  | ||||
| @ -29,7 +29,7 @@ | ||||
|                                                             secondaryTensor:(MPSGraphTensor*)secondaryTensor | ||||
|                                                                        name:(NSString*)name { | ||||
|   // As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral | ||||
|   // agruments results in | ||||
|   // arguments results in | ||||
|   //  /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805: | ||||
|   //  failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library' | ||||
|   if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) { | ||||
| @ -42,7 +42,7 @@ | ||||
|                                                             secondaryTensor:(MPSGraphTensor*)secondaryTensor | ||||
|                                                                        name:(NSString*)name { | ||||
|   // As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral | ||||
|   // agruments results in | ||||
|   // arguments results in | ||||
|   //  /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805: | ||||
|   //  failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library' | ||||
|   if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) { | ||||
| @ -539,7 +539,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, | ||||
|  | ||||
|   static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS); | ||||
|   // Use gather kernel to solve strides for macOS < 15.0 | ||||
|   // Starting with macOS 15.0, MPS supports native strides direclty in the kernels | ||||
|   // Starting with macOS 15.0, MPS supports native strides directly in the kernels | ||||
|   if (!is_macOS_15_0_or_newer || !useMPSStridedAPI) { | ||||
|     if ((!src.is_contiguous() || src.storage_offset()) && gatherTensorData) { | ||||
|       Tensor emptyShell = Tensor(); | ||||
|  | ||||
| @ -1,4 +1,4 @@ | ||||
| #pragma onces | ||||
| #pragma once | ||||
| #include <c10/metal/common.h> | ||||
|  | ||||
| template <unsigned N = c10::metal::max_ndim> | ||||
|  | ||||
| @ -1,3 +1,5 @@ | ||||
| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS | ||||
| #include <ATen/native/Resize.h> | ||||
| #include <ATen/native/SpectralOpsUtils.h> | ||||
| #include <ATen/native/mps/OperationUtils.h> | ||||
|  | ||||
| @ -37,25 +39,12 @@ NSArray<NSNumber*>* IntArrayToNSArray(IntArrayRef arr) { | ||||
| } // anonymous namespace | ||||
|  | ||||
| Tensor _fft_c2r_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t last_dim_size) { | ||||
|   TORCH_CHECK(self.is_complex()); | ||||
|   auto in_sizes = self.sizes(); | ||||
|   DimVector out_sizes(in_sizes.begin(), in_sizes.end()); | ||||
|   out_sizes[dim.back()] = last_dim_size; | ||||
|   auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type()))); | ||||
|   auto out = at::empty({}, self.options().dtype(c10::toRealValueType(self.scalar_type()))); | ||||
|   return _fft_c2r_mps_out(self, dim, normalization, last_dim_size, out); | ||||
| } | ||||
|  | ||||
| Tensor _fft_r2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) { | ||||
|   TORCH_CHECK(self.is_floating_point()); | ||||
|   auto input_sizes = self.sizes(); | ||||
|   DimVector out_sizes(input_sizes.begin(), input_sizes.end()); | ||||
|   auto last_dim = dim.back(); | ||||
|   auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1; | ||||
|   if (onesided) { | ||||
|     out_sizes[last_dim] = last_dim_halfsize; | ||||
|   } | ||||
|  | ||||
|   auto out = at::empty(out_sizes, self.options().dtype(c10::toComplexType(self.scalar_type()))); | ||||
|   auto out = at::empty({}, self.options().dtype(c10::toComplexType(self.scalar_type()))); | ||||
|   return _fft_r2c_mps_out(self, dim, normalization, onesided, out); | ||||
| } | ||||
|  | ||||
| @ -72,6 +61,17 @@ using namespace mps; | ||||
|  | ||||
| // TODO: Investigate numerical discrepancies see https://github.com/pytorch/pytorch/issues/120237 | ||||
| Tensor& _fft_r2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided, Tensor& out) { | ||||
|   TORCH_CHECK(self.scalar_type() == kFloat || self.scalar_type() == kHalf, "Only float and half dtypes are supported"); | ||||
|   TORCH_CHECK(out.scalar_type() == c10::toComplexType(self.scalar_type())); | ||||
|   const auto input_sizes = self.sym_sizes(); | ||||
|   SymDimVector out_sizes(input_sizes.begin(), input_sizes.end()); | ||||
|   auto last_dim = dim.back(); | ||||
|   auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1; | ||||
|   if (onesided) { | ||||
|     out_sizes[last_dim] = last_dim_halfsize; | ||||
|   } | ||||
|   at::native::resize_output_symint(out, out_sizes); | ||||
|  | ||||
|   auto key = __func__ + getTensorsStringKey({self, out}) + ":" + getArrayRefString(dim) + ":" + | ||||
|       std::to_string(normalization) + ":" + std::to_string(onesided); | ||||
|   @autoreleasepool { | ||||
| @ -112,6 +112,12 @@ Tensor& _fft_c2r_mps_out(const Tensor& self, | ||||
|                          int64_t normalization, | ||||
|                          int64_t last_dim_size, | ||||
|                          Tensor& out) { | ||||
|   TORCH_CHECK(self.is_complex(), "Input must be complex"); | ||||
|   TORCH_CHECK(out.scalar_type() == c10::toRealValueType(self.scalar_type()), "Unexpected output type"); | ||||
|   const auto in_sizes = self.sym_sizes(); | ||||
|   SymDimVector out_sizes(in_sizes.begin(), in_sizes.end()); | ||||
|   out_sizes[dim.back()] = last_dim_size; | ||||
|   at::native::resize_output_symint(out, out_sizes); | ||||
|   auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" + | ||||
|       std::to_string(normalization) + ":" + std::to_string(last_dim_size); | ||||
|   @autoreleasepool { | ||||
|  | ||||
| @ -158,7 +158,7 @@ static void reduction_out_mps(const Tensor& input_t, | ||||
|     IntArrayRef dim = opt_dim.value(); | ||||
|     for (const auto dim_val : dim) { | ||||
|       auto wrap_dim = maybe_wrap_dim(dim_val, input_shape.size()); | ||||
|       // canSqueeze logic is broken when dim is negative, it introduces off-by-one-erros or crashes | ||||
|       // canSqueeze logic is broken when dim is negative, it introduces off-by-one-errors or crashes | ||||
|       // See https://github.com/pytorch/pytorch/issues/136132#issuecomment-2354482608 | ||||
|       if (wrap_dim >= 4 || dim_val < 0) { | ||||
|         canSqueezeLastDim = false; | ||||
| @ -1282,7 +1282,7 @@ static void all_any_common_impl_mps(const Tensor& input_t, | ||||
|       auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); | ||||
|  | ||||
|       auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); | ||||
|       // reductionOrWithTensor:axis: will throw an internal assert if number of dimentions is more than 4 | ||||
|       // reductionOrWithTensor:axis: will throw an internal assert if number of dimensions is more than 4 | ||||
|       // See https://github.com/pytorch/pytorch/issues/95538 | ||||
|       MPSGraphTensor* outputTensor = nil; | ||||
|       if (input_t.ndimension() > 4) { | ||||
| @ -1352,7 +1352,7 @@ TORCH_IMPL_FUNC(any_all_out_mps)(const Tensor& input_t, const Tensor& output_t) | ||||
|     auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { | ||||
|       auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); | ||||
|       auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); | ||||
|       // reductionOrWithTensor:axes: will throw an internal assert if number of dimentions is more than 4 | ||||
|       // reductionOrWithTensor:axes: will throw an internal assert if number of dimensions is more than 4 | ||||
|       // See https://github.com/pytorch/pytorch/issues/95538 | ||||
|       if (input_t.dim() > 4) { | ||||
|         castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil]; | ||||
| @ -1400,7 +1400,7 @@ TORCH_IMPL_FUNC(all_all_out_mps)(const Tensor& input_t, const Tensor& output_t) | ||||
|     auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { | ||||
|       auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); | ||||
|       auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); | ||||
|       // reductionAndWithTensor:axes: will throw an internal assert if number of dimentions is more than 4 | ||||
|       // reductionAndWithTensor:axes: will throw an internal assert if number of dimensions is more than 4 | ||||
|       // See https://github.com/pytorch/pytorch/issues/95538 | ||||
|       if (input_t.ndimension() > 4) { | ||||
|         castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil]; | ||||
|  | ||||
| @ -19,7 +19,7 @@ namespace at::native::mps { | ||||
|  | ||||
| // For both scatter and gather kernels, there are 4 specized ones (for 1D to 4D tensor) | ||||
| // and one generic, for 5+D ones. Assumption (to be tested) about specialized kernels | ||||
| // is that reduction of n-dimentional vector, where n is 2, should be slower | ||||
| // is that reduction of n-dimensional vector, where n is 2, should be slower | ||||
| // than reduction of 2D one, as n is not known at compiler time, therefore compiler | ||||
| // could not do loop unrolls, that is | ||||
| // float sum(float* v, int n) { | ||||
|  | ||||
| @ -53,7 +53,7 @@ C10_ALWAYS_INLINE std::pair<int64_t, int64_t> _check_nested_layer_norm_inputs( | ||||
|       normalized_shape); | ||||
|  | ||||
|   // Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input | ||||
|   // Also, compute M and N considering the idiosyncracies of NestedTensors | ||||
|   // Also, compute M and N considering the idiosyncrasies of NestedTensors | ||||
|   int64_t N = 1; | ||||
|   for (const auto i: c10::irange(normalized_ndim)) { | ||||
|     TORCH_CHECK( | ||||
|  | ||||
| @ -95,7 +95,7 @@ std::vector<Tensor> chunk_nested_tensor(const Tensor& self, int64_t chunks, int6 | ||||
|   for (const auto split_idx : c10::irange(chunks)) { | ||||
|       auto new_sizes = sizes.clone(); | ||||
|       auto new_strides = strides.clone(); | ||||
|       // This copys offsets so we are safe to move | ||||
|       // This copies offsets so we are safe to move | ||||
|       auto new_offsets = offsets.clone(); | ||||
|       int64_t *size_ptr = new_sizes.data_ptr<int64_t>(); | ||||
|       int64_t *new_offsets_ptr = new_offsets.data_ptr<int64_t>(); | ||||
|  | ||||
| @ -245,7 +245,7 @@ int64_t get_nnz(const Tensor& nestedtensor) { | ||||
|     //     this is because needs_broadcast indicates that the batch_size is 1 | ||||
|     //     and hence there is only 1 value for seq_len | ||||
|     // (2) The cum_seq_lens are given by [0, {*}_t.size(1), 2 * {*}_t.size(1), | ||||
|     // ..., outut_batch_size * {*}_t.size(1)] (3) Nnz_{*} is given by | ||||
|     // ..., output_batch_size * {*}_t.size(1)] (3) Nnz_{*} is given by | ||||
|     // output_batch_size * {*}_t.size(1); | ||||
|  | ||||
|     int64_t max_seqlen_batch_q = 0, Nnz_q = 0; | ||||
|  | ||||
| @ -193,12 +193,12 @@ vTensor pack_biases_quantized_weights( | ||||
|         src_kw_sz = b_sizes[Layout::BatchMatrices::width]; | ||||
|         src_kh_sz = b_sizes[Layout::BatchMatrices::height]; | ||||
|       } else if (bias.sizes().size() == 2) { | ||||
|         // skip batch dim for boardcasting; index -1 | ||||
|         // skip batch dim for broadcasting; index -1 | ||||
|         src_kb_sz = 1; | ||||
|         src_kw_sz = b_sizes[Layout::BatchMatrices::height]; | ||||
|         src_kh_sz = b_sizes[Layout::BatchMatrices::batch]; | ||||
|       } else { | ||||
|         // skip batch & height dim for boardcasting; index -2 | ||||
|         // skip batch & height dim for broadcasting; index -2 | ||||
|         src_kb_sz = 1; | ||||
|         src_kw_sz = b_sizes[Layout::BatchMatrices::batch]; | ||||
|         src_kh_sz = 1; | ||||
| @ -327,13 +327,13 @@ bool available_check_with_batch( | ||||
|              weight.size(Layout::BatchMatrices::batch) || | ||||
|          bias->size(Layout::BatchMatrices::batch) == 1); | ||||
|   } else if (bias->ndimension() == 2) { | ||||
|     // skip batch dim for boardcasting; index -1 | ||||
|     // skip batch dim for broadcasting; index -1 | ||||
|     bias_available &= | ||||
|         (bias->size(Layout::BatchMatrices::height) == | ||||
|              weight.size(Layout::BatchMatrices::width) || | ||||
|          bias->size(Layout::BatchMatrices::height) == 1); | ||||
|   } else { | ||||
|     // skip batch & height dim for boardcasting; index -2 | ||||
|     // skip batch & height dim for broadcasting; index -2 | ||||
|     bias_available &= | ||||
|         (bias->size(Layout::BatchMatrices::batch) == | ||||
|              weight.size(Layout::BatchMatrices::width) || | ||||
|  | ||||
| @ -158,7 +158,7 @@ class TORCH_API Tensor: public TensorBase { | ||||
|   // will only lead to trouble and dangling references. | ||||
|   c10::MaybeOwned<Tensor> expect_contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) && = delete; | ||||
|  | ||||
|   // The following overloads are very intruiging.  Consider the following | ||||
|   // The following overloads are very intriguing.  Consider the following | ||||
|   // program: | ||||
|   // | ||||
|   //    x[1] = 3; | ||||
|  | ||||
| @ -6894,7 +6894,7 @@ TEST_F(VulkanAPITest, slice_height_success) { | ||||
|     {2, {2, 3, 40, 50}},  // 4D tensors with dim=height | ||||
|     {1, {3, 40, 50}},     // 3D tensors with dim=height | ||||
|     {0, {40, 50}},        // 2D tensors with dim=height | ||||
|                           // 1D tesnors don't have height dim for test | ||||
|                           // 1D tensors don't have height dim for test | ||||
|   }; | ||||
|  | ||||
|   // Act/Assert | ||||
| @ -6906,7 +6906,7 @@ TEST_F(VulkanAPITest, slice_feature_success) { | ||||
|   std::unordered_map<int64_t, std::vector<int64_t>> dim2sizes { | ||||
|     {1, {2, 40, 13, 14}}, // 4D tensors with dim=feature(channel) | ||||
|     {0, {40, 13, 14}},    // 3D tensors with dim=feature(channel) | ||||
|                           // 1D and 2D tesnors don't have feature(channel) dim for test | ||||
|                           // 1D and 2D tensors don't have feature(channel) dim for test | ||||
|   }; | ||||
|  | ||||
|   // Act/Assert | ||||
| @ -6917,7 +6917,7 @@ TEST_F(VulkanAPITest, slice_batch_success) { | ||||
|   // Arrange | ||||
|   std::unordered_map<int64_t, std::vector<int64_t>> dim2sizes { | ||||
|     {0, {40, 3, 13, 14}}, // 4D tensors with dim=batch | ||||
|                           // 1D, 2D and 3D tesnors don't have batch dim for test | ||||
|                           // 1D, 2D and 3D tensors don't have batch dim for test | ||||
|   }; | ||||
|  | ||||
|   // Act/Assert | ||||
|  | ||||
| @ -916,6 +916,7 @@ libtorch_python_core_sources = [ | ||||
|     "torch/csrc/autograd/python_torch_functions_manual.cpp", | ||||
|     "torch/csrc/autograd/python_variable.cpp", | ||||
|     "torch/csrc/autograd/python_variable_indexing.cpp", | ||||
|     "torch/csrc/distributed/python_placement.cpp", | ||||
|     "torch/csrc/dynamo/python_compiled_autograd.cpp", | ||||
|     "torch/csrc/dynamo/cache_entry.cpp", | ||||
|     "torch/csrc/dynamo/cpp_shim.cpp", | ||||
| @ -1073,6 +1074,7 @@ aten_cpu_non_globed_sources = [ | ||||
|     "aten/src/ATen/detail/MPSHooksInterface.cpp", | ||||
|     "aten/src/ATen/detail/MAIAHooksInterface.cpp", | ||||
|     "aten/src/ATen/detail/PrivateUse1HooksInterface.cpp", | ||||
|     "aten/src/ATen/detail/XLAHooksInterface.cpp", | ||||
|     "aten/src/ATen/detail/XPUHooksInterface.cpp", | ||||
|     "aten/src/ATen/detail/MTIAHooksInterface.cpp", | ||||
|     "aten/src/ATen/detail/IPUHooksInterface.cpp", | ||||
| @ -1091,6 +1093,7 @@ aten_cpu_non_globed_headers = [ | ||||
|     "aten/src/ATen/detail/HPUHooksInterface.h", | ||||
|     "aten/src/ATen/detail/MAIAHooksInterface.h", | ||||
|     "aten/src/ATen/detail/PrivateUse1HooksInterface.h", | ||||
|     "aten/src/ATen/detail/XLAHooksInterface.h", | ||||
|     "aten/src/ATen/detail/XPUHooksInterface.h", | ||||
|     "aten/src/ATen/detail/MTIAHooksInterface.h", | ||||
|     "aten/src/ATen/detail/IPUHooksInterface.h", | ||||
|  | ||||
| @ -329,17 +329,17 @@ struct pair { | ||||
| }; | ||||
|  | ||||
| template <typename T> | ||||
| static T conj(T a) { | ||||
| inline T conj(T a) { | ||||
|   return a; | ||||
| } | ||||
|  | ||||
| template <> | ||||
| half2 conj(half2 a) { | ||||
| inline half2 conj(half2 a) { | ||||
|   return half2(a.x, -a.y); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| float2 conj(float2 a) { | ||||
| inline float2 conj(float2 a) { | ||||
|   return float2(a.x, -a.y); | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -1638,7 +1638,38 @@ if(USE_KINETO) | ||||
|   message(STATUS "  KINETO_LIBRARY_TYPE = ${KINETO_LIBRARY_TYPE}") | ||||
|  | ||||
|   if(NOT LIBKINETO_NOCUPTI) | ||||
|     if(TARGET CUDA::cupti) | ||||
|     set(CUDA_SOURCE_DIR "${CUDA_TOOLKIT_ROOT_DIR}" CACHE STRING "") | ||||
|     message(STATUS "  CUDA_SOURCE_DIR = ${CUDA_SOURCE_DIR}") | ||||
|     message(STATUS "  CUDA_INCLUDE_DIRS = ${CUDA_INCLUDE_DIRS}") | ||||
|  | ||||
|     if(NOT MSVC) | ||||
|       if(USE_CUPTI_SO) | ||||
|         set(CUPTI_LIB_NAME "libcupti.so") | ||||
|       else() | ||||
|         set(CUPTI_LIB_NAME "libcupti_static.a") | ||||
|       endif() | ||||
|     else() | ||||
|       set(CUPTI_LIB_NAME "cupti.lib") | ||||
|     endif() | ||||
|  | ||||
|     find_library(CUPTI_LIBRARY_PATH ${CUPTI_LIB_NAME} PATHS | ||||
|         ${CUDA_SOURCE_DIR} | ||||
|         ${CUDA_SOURCE_DIR}/extras/CUPTI/lib64 | ||||
|         ${CUDA_SOURCE_DIR}/lib | ||||
|         ${CUDA_SOURCE_DIR}/lib64 | ||||
|         NO_DEFAULT_PATH) | ||||
|  | ||||
|     find_path(CUPTI_INCLUDE_DIR cupti.h PATHS | ||||
|         ${CUDA_SOURCE_DIR}/extras/CUPTI/include | ||||
|         ${CUDA_INCLUDE_DIRS} | ||||
|         ${CUDA_SOURCE_DIR} | ||||
|         ${CUDA_SOURCE_DIR}/include | ||||
|         NO_DEFAULT_PATH) | ||||
|  | ||||
|     if(CUPTI_LIBRARY_PATH AND CUPTI_INCLUDE_DIR) | ||||
|       message(STATUS "  CUPTI_INCLUDE_DIR = ${CUPTI_INCLUDE_DIR}") | ||||
|       set(CUDA_cupti_LIBRARY ${CUPTI_LIBRARY_PATH}) | ||||
|       message(STATUS "  CUDA_cupti_LIBRARY = ${CUDA_cupti_LIBRARY}") | ||||
|       message(STATUS "Found CUPTI") | ||||
|       set(LIBKINETO_NOCUPTI OFF CACHE STRING "" FORCE) | ||||
|  | ||||
| @ -1651,7 +1682,7 @@ if(USE_KINETO) | ||||
|         if(NOT APPLE) | ||||
|           set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} "dl" "pthread") | ||||
|         endif() | ||||
|         set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} $<LINK_LIBRARY:WHOLE_ARCHIVE,CUDA::cupti_static>) | ||||
|         set(CMAKE_REQUIRED_LINK_OPTIONS "-Wl,--whole-archive,${CUPTI_LIBRARY_PATH},--no-whole-archive") | ||||
|         check_cxx_source_runs("#include <stdexcept> | ||||
|   int main() { | ||||
|     try { | ||||
|  | ||||
| @ -29,10 +29,15 @@ SET(Open_BLAS_LIB_SEARCH_PATHS | ||||
|         $ENV{OpenBLAS}/lib | ||||
|         $ENV{OpenBLAS_HOME} | ||||
|         $ENV{OpenBLAS_HOME}/lib | ||||
|  ) | ||||
| ) | ||||
|  | ||||
| SET(Open_BLAS_LIB_NAME openblas) | ||||
| IF(DEFINED ENV{OpenBLAS_LIB_NAME}) | ||||
|   SET(Open_BLAS_LIB_NAME $ENV{OpenBLAS_LIB_NAME}) | ||||
| ENDIF() | ||||
|  | ||||
| FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS}) | ||||
| FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS}) | ||||
| FIND_LIBRARY(OpenBLAS_LIB NAMES ${Open_BLAS_LIB_NAME} PATHS ${Open_BLAS_LIB_SEARCH_PATHS}) | ||||
|  | ||||
| SET(OpenBLAS_FOUND ON) | ||||
|  | ||||
|  | ||||
| @ -263,12 +263,31 @@ offers a comprehensive example of using these features to manipulate a checkpoin | ||||
| Starting in version 2.6, ``torch.load`` will use ``weights_only=True`` if the ``pickle_module`` | ||||
| argument is not passed. | ||||
|  | ||||
| .. _weights-only-security: | ||||
|  | ||||
| weights_only security | ||||
| ^^^^^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| As discussed in the documentation for :func:`torch.load`, ``weights_only=True`` restricts | ||||
| the unpickler used in ``torch.load`` to only executing functions/building classes required for | ||||
| ``state_dicts`` of plain ``torch.Tensors`` as well as some other primitive types. Further, | ||||
| unlike the default ``Unpickler`` provided by the ``pickle`` module, the ``weights_only`` Unpickler | ||||
| is not allowed to dynamically import anything during unpickling. | ||||
|  | ||||
| ``weights_only=True`` narrows the surface of remote code execution attacks but has the following limitations: | ||||
|  | ||||
| 1. ``weights_only=True`` does not guard against denial of service attacks. | ||||
| 2. We try to prevent memory corruptions during ``torch.load(weights_only=True)`` but they might still be possible. | ||||
|  | ||||
| Note that even if memory corruption does not occur during ``torch.load`` itself, loading CAN create | ||||
| unexpected objects for the downstream code that can also lead to memory corruption (e.g. a Tensor of | ||||
| indices and values made to a sparse Tensor in user code might write/read out of bounds). | ||||
|  | ||||
| .. _weights-only-allowlist: | ||||
|  | ||||
| weights_only allowlist | ||||
| ^^^^^^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| As mentioned above, saving a module's ``state_dict`` is a best practice when using ``torch.save``. If loading an old | ||||
| checkpoint that contains an ``nn.Module``, we recommend ``weights_only=False``. When loading a checkpoint that contains | ||||
| tensor subclasses, there will likely be functions/classes that need to be allowlisted, see below for further details. | ||||
|  | ||||
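A minimal usage sketch of the behavior documented above, assuming a recent PyTorch release that provides ``torch.serialization.add_safe_globals`` (the file path, dict keys, and the subclass name are illustrative only):

    import os
    import tempfile
    import torch

    path = os.path.join(tempfile.mkdtemp(), "checkpoint.pt")
    torch.save({"weight": torch.randn(2, 2)}, path)

    # The restricted unpickler only rebuilds tensors, state_dict-style containers,
    # and a handful of primitive types; it never imports code from the checkpoint.
    state = torch.load(path, weights_only=True)

    # Checkpoints containing other classes (e.g. tensor subclasses) fail under
    # weights_only=True unless those classes are explicitly allowlisted, e.g.:
    # torch.serialization.add_safe_globals([MyTensorSubclass])  # hypothetical class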
| @ -8,7 +8,8 @@ class TestAutocast(TestCase): | ||||
|     def test_autocast_with_unsupported_type(self): | ||||
|         with self.assertWarnsRegex( | ||||
|             UserWarning, | ||||
|             "In openreg autocast, but the target dtype torch.float32 is not supported.", | ||||
|             "In openreg autocast, but the target dtype is not supported. Disabling autocast.\n" | ||||
|             "openreg Autocast only supports dtypes of torch.float16, torch.bfloat16 currently.", | ||||
|         ): | ||||
|             with torch.autocast(device_type="openreg", dtype=torch.float32): | ||||
|                 _ = torch.ones(10) | ||||
|  | ||||
| @ -101,14 +101,14 @@ class ComposabilityTest(MultiProcessTestCase): | ||||
|  | ||||
|     @property | ||||
|     def world_size(self): | ||||
|         return 4 | ||||
|         return 8 | ||||
|  | ||||
|     @property | ||||
|     def device(self): | ||||
|         return self.rank | ||||
|  | ||||
|     @requires_accelerator_dist_backend(["nccl", "xccl"]) | ||||
|     @skip_if_lt_x_gpu(4) | ||||
|     @skip_if_lt_x_gpu(8) | ||||
|     @skip_but_pass_in_sandcastle_if( | ||||
|         not TEST_MULTIGPU and not TEST_XPU, "Test requires 4+ GPUs" | ||||
|     ) | ||||
| @ -169,8 +169,8 @@ class ComposabilityTest(MultiProcessTestCase): | ||||
|             {f"{i}": MLPModule(dim) for i in range(total_layers)} | ||||
|         ) | ||||
|         # Calculate start and end indices based on rank | ||||
|         start_index = self.rank * 2 | ||||
|         end_index = start_index + 2 | ||||
|         start_index = self.rank | ||||
|         end_index = start_index + 1 | ||||
|         pp_model = PPModelChunk(full_model, start_index, end_index) | ||||
|  | ||||
|         pp_model.to(self.device) | ||||
| @ -224,7 +224,6 @@ class ComposabilityTest(MultiProcessTestCase): | ||||
|         ], | ||||
|     ) | ||||
|     def test_3d_with_tp_dp_pp(self, ScheduleClass, MixedPrecisionParam): | ||||
|         _device_raii = torch.device(device_type, self.device) | ||||
|         torch.accelerator.set_device_index(self.device) | ||||
|         store = torch.distributed.FileStore(self.file_name, self.world_size) | ||||
|         torch.distributed.init_process_group( | ||||
| @ -286,56 +285,44 @@ class ComposabilityTest(MultiProcessTestCase): | ||||
|                 parallelize_module(layer, tp_mesh, parallelize_plan) | ||||
|             return model | ||||
|  | ||||
|         # Attach to a schedule | ||||
|         if issubclass(ScheduleClass, PipelineScheduleSingle): | ||||
|             stage_idx = pp_group.rank() | ||||
|             partial_model = nn.Sequential( | ||||
|                 *full_model[stage_idx * 2 : stage_idx * 2 + 2] | ||||
|             ) | ||||
|             partial_model.to(self.device) | ||||
|             n_virtual = 1 | ||||
|         else: | ||||
|             n_virtual = 2 | ||||
|  | ||||
|         num_stages = pp_group.size() * n_virtual | ||||
|         layers_per_stage = total_layers // num_stages | ||||
|         stages = [] | ||||
|         for i in range(n_virtual): | ||||
|             stage_idx = pp_group.rank() + pp_group.size() * i | ||||
|             start_layer = stage_idx * layers_per_stage | ||||
|             end_layer = start_layer + layers_per_stage | ||||
|             # divide the model layers by the number of stages | ||||
|             partial_model = nn.Sequential(*full_model[start_layer:end_layer]) | ||||
|             partial_model.to(self.device) | ||||
|             tp_model = apply_tp(partial_model, tp_mesh) | ||||
|             dp_model = apply_fsdp(tp_model) | ||||
|             pipeline_stage = PipelineStage( | ||||
|  | ||||
|             stage = PipelineStage( | ||||
|                 dp_model, | ||||
|                 stage_idx, | ||||
|                 pp_group.size(), | ||||
|                 num_stages, | ||||
|                 self.device, | ||||
|                 group=pp_group, | ||||
|             ) | ||||
|             partial_models = [pipeline_stage.submod] | ||||
|             pipeline_schedule = ScheduleClass( | ||||
|                 pipeline_stage, | ||||
|                 n_microbatches=num_microbatches, | ||||
|                 loss_fn=loss_fn, | ||||
|             ) | ||||
|         else: | ||||
|             n_virtual = 2 | ||||
|             num_stages = pp_group.size() * n_virtual | ||||
|             stages = [] | ||||
|             for i in range(n_virtual): | ||||
|                 stage_idx = pp_group.rank() + n_virtual * i | ||||
|                 # divide the model layers by the number of stages | ||||
|                 partial_model = nn.Sequential(*full_model[stage_idx : stage_idx + 1]) | ||||
|                 partial_model.to(self.device) | ||||
|  | ||||
|                 tp_model = apply_tp(partial_model, tp_mesh) | ||||
|                 dp_model = apply_fsdp(tp_model) | ||||
|                 stage = PipelineStage( | ||||
|                     dp_model, | ||||
|                     stage_idx, | ||||
|                     num_stages, | ||||
|                     self.device, | ||||
|                     group=pp_group, | ||||
|                 ) | ||||
|             stages.append(stage) | ||||
|             partial_models = [pipeline_stage.submod for pipeline_stage in stages] | ||||
|  | ||||
|                 stages.append(stage) | ||||
|                 partial_models = [pipeline_stage.submod for pipeline_stage in stages] | ||||
|             pipeline_schedule = ScheduleClass( | ||||
|                 stages, | ||||
|                 n_microbatches=num_microbatches, | ||||
|                 loss_fn=loss_fn, | ||||
|             ) | ||||
|         if issubclass(ScheduleClass, PipelineScheduleSingle): | ||||
|             stages = stages[0] | ||||
|  | ||||
|         pipeline_schedule = ScheduleClass( | ||||
|             stages, | ||||
|             n_microbatches=num_microbatches, | ||||
|             loss_fn=loss_fn, | ||||
|             scale_grads=False, | ||||
|         ) | ||||
|  | ||||
|         optimizer_kwargs = { | ||||
|             "lr": 0.01, | ||||
| @ -369,7 +356,7 @@ class ComposabilityTest(MultiProcessTestCase): | ||||
|         torch.distributed.destroy_process_group() | ||||
|  | ||||
|     @requires_accelerator_dist_backend(["nccl", "xccl"]) | ||||
|     @skip_if_lt_x_gpu(4) | ||||
|     @skip_if_lt_x_gpu(8) | ||||
|     @skip_but_pass_in_sandcastle_if( | ||||
|         not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs" | ||||
|     ) | ||||
| @ -447,109 +434,71 @@ class ComposabilityTest(MultiProcessTestCase): | ||||
|                 partial_model = partial_model.to(dtype=MixedPrecisionParam) | ||||
|             return partial_model | ||||
|  | ||||
|         # Attach to a schedule | ||||
|         if issubclass(ScheduleClass, PipelineScheduleSingle): | ||||
|             stage_idx = pp_group.rank() | ||||
|             partial_model = nn.Sequential( | ||||
|                 *full_model[stage_idx * 2 : stage_idx * 2 + 2] | ||||
|             ) | ||||
|             partial_model.to(self.device) | ||||
|  | ||||
|             dp_model = apply_replicate(partial_model) | ||||
|             pipeline_stage = PipelineStage( | ||||
|                 dp_model, | ||||
|                 stage_idx, | ||||
|                 pp_group.size(), | ||||
|                 self.device, | ||||
|                 group=pp_group, | ||||
|             ) | ||||
|             partial_models = [pipeline_stage.submod] | ||||
|             pipeline_schedule = ScheduleClass( | ||||
|                 pipeline_stage, | ||||
|                 n_microbatches=num_microbatches, | ||||
|                 loss_fn=loss_fn, | ||||
|                 scale_grads=False, | ||||
|             ) | ||||
|  | ||||
|             ref_partial_model = nn.Sequential( | ||||
|                 *ref_full_model[stage_idx * 2 : stage_idx * 2 + 2] | ||||
|             ) | ||||
|             ref_partial_model.to(self.device) | ||||
|             ref_partial_model = apply_same_precision( | ||||
|                 ref_partial_model | ||||
|             )  # Apply same precision | ||||
|  | ||||
|             ref_pipeline_stage = PipelineStage( | ||||
|                 ref_partial_model, | ||||
|                 stage_idx, | ||||
|                 pp_group.size(), | ||||
|                 self.device, | ||||
|                 group=pp_group, | ||||
|             ) | ||||
|             ref_partial_models = [ref_pipeline_stage.submod] | ||||
|             ref_pipeline_schedule = ScheduleClass( | ||||
|                 ref_pipeline_stage, | ||||
|                 n_microbatches=num_microbatches, | ||||
|                 loss_fn=loss_fn, | ||||
|                 scale_grads=False, | ||||
|             ) | ||||
|             n_virtual = 1 | ||||
|         else: | ||||
|             n_virtual = 2 | ||||
|             num_stages = pp_group.size() * n_virtual | ||||
|             stages = [] | ||||
|             ref_stages = [] | ||||
|             for i in range(n_virtual): | ||||
|                 stage_idx = pp_group.rank() + n_virtual * i | ||||
|                 # divide the model layers by the number of stages | ||||
|                 partial_model = nn.Sequential(*full_model[stage_idx : stage_idx + 1]) | ||||
|                 partial_model.to(self.device) | ||||
|  | ||||
|                 dp_model = apply_replicate(partial_model) | ||||
|                 stage = PipelineStage( | ||||
|                     dp_model, | ||||
|                     stage_idx, | ||||
|                     num_stages, | ||||
|                     self.device, | ||||
|                     group=pp_group, | ||||
|                 ) | ||||
|         num_stages = pp_group.size() * n_virtual | ||||
|         layers_per_stage = total_layers // num_stages | ||||
|         stages = [] | ||||
|         ref_stages = [] | ||||
|         for i in range(n_virtual): | ||||
|             stage_idx = pp_group.rank() + pp_group.size() * i | ||||
|             start_layer = stage_idx * layers_per_stage | ||||
|             end_layer = start_layer + layers_per_stage | ||||
|             # divide the model layers by the number of stages | ||||
|             partial_model = nn.Sequential(*full_model[start_layer:end_layer]) | ||||
|             partial_model.to(self.device) | ||||
|  | ||||
|                 stages.append(stage) | ||||
|                 partial_models = [pipeline_stage.submod for pipeline_stage in stages] | ||||
|             ref_partial_model = nn.Sequential(*ref_full_model[start_layer:end_layer]) | ||||
|             ref_partial_model.to(self.device) | ||||
|  | ||||
|                 ref_partial_model = nn.Sequential( | ||||
|                     *ref_full_model[stage_idx : stage_idx + 1] | ||||
|                 ) | ||||
|                 ref_partial_model.to(self.device) | ||||
|                 ref_partial_model = apply_same_precision( | ||||
|                     ref_partial_model | ||||
|                 )  # Apply same precision | ||||
|             dp_model = apply_replicate(partial_model) | ||||
|             ref_dp_model = apply_same_precision(ref_partial_model) | ||||
|  | ||||
|                 ref_stage = PipelineStage( | ||||
|                     ref_partial_model, | ||||
|                     stage_idx, | ||||
|                     num_stages, | ||||
|                     self.device, | ||||
|                     group=pp_group, | ||||
|                 ) | ||||
|  | ||||
|                 ref_stages.append(ref_stage) | ||||
|                 ref_partial_models = [ | ||||
|                     pipeline_stage.submod for pipeline_stage in ref_stages | ||||
|                 ] | ||||
|             pipeline_schedule = ScheduleClass( | ||||
|                 stages, | ||||
|                 n_microbatches=num_microbatches, | ||||
|                 loss_fn=loss_fn, | ||||
|                 scale_grads=False, | ||||
|             stage = PipelineStage( | ||||
|                 dp_model, | ||||
|                 stage_idx, | ||||
|                 num_stages, | ||||
|                 self.device, | ||||
|                 group=pp_group, | ||||
|             ) | ||||
|  | ||||
|             ref_pipeline_schedule = ScheduleClass( | ||||
|                 ref_stages, | ||||
|                 n_microbatches=num_microbatches, | ||||
|                 loss_fn=loss_fn, | ||||
|                 scale_grads=False, | ||||
|             ref_stage = PipelineStage( | ||||
|                 ref_dp_model, | ||||
|                 stage_idx, | ||||
|                 num_stages, | ||||
|                 self.device, | ||||
|                 group=pp_group, | ||||
|             ) | ||||
|  | ||||
|             stages.append(stage) | ||||
|             ref_stages.append(ref_stage) | ||||
|  | ||||
|             partial_models = [pipeline_stage.submod for pipeline_stage in stages] | ||||
|             ref_partial_models = [ | ||||
|                 pipeline_stage.submod for pipeline_stage in ref_stages | ||||
|             ] | ||||
|  | ||||
|         if issubclass(ScheduleClass, PipelineScheduleSingle): | ||||
|             stages = stages[0] | ||||
|             ref_stages = ref_stages[0] | ||||
|  | ||||
|         pipeline_schedule = ScheduleClass( | ||||
|             stages, | ||||
|             n_microbatches=num_microbatches, | ||||
|             loss_fn=loss_fn, | ||||
|             scale_grads=False, | ||||
|         ) | ||||
|  | ||||
|         ref_pipeline_schedule = ScheduleClass( | ||||
|             ref_stages, | ||||
|             n_microbatches=num_microbatches, | ||||
|             loss_fn=loss_fn, | ||||
|             scale_grads=False, | ||||
|         ) | ||||
|  | ||||
|         optimizer_kwargs = { | ||||
|             "lr": 0.01, | ||||
|             "betas": (0.9, 0.95), | ||||
| @ -604,7 +553,7 @@ class ComposabilityTest(MultiProcessTestCase): | ||||
|         torch.distributed.destroy_process_group() | ||||
|  | ||||
|     @requires_accelerator_dist_backend(["nccl", "xccl"]) | ||||
|     @skip_if_lt_x_gpu(4) | ||||
|     @skip_if_lt_x_gpu(8) | ||||
|     @skip_but_pass_in_sandcastle_if( | ||||
|         not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs" | ||||
|     ) | ||||
| @ -736,67 +685,44 @@ class ComposabilityTest(MultiProcessTestCase): | ||||
|  | ||||
|         pipeline_model_parameter_dict = {} | ||||
|  | ||||
|         # Attach to a schedule | ||||
|         if issubclass(ScheduleClass, PipelineScheduleSingle): | ||||
|             stage_idx = pp_group.rank() | ||||
|             # Calculate layers per stage correctly | ||||
|             layers_per_stage = total_layers // pp_group.size()  # 8 // 2 = 4 | ||||
|             n_virtual = 1 | ||||
|         else: | ||||
|             n_virtual = 2 | ||||
|  | ||||
|         num_stages = pp_group.size() * n_virtual | ||||
|         layers_per_stage = total_layers // num_stages | ||||
|         stages = [] | ||||
|         for i in range(n_virtual): | ||||
|             stage_idx = pp_group.rank() + pp_group.size() * i | ||||
|             start_layer = stage_idx * layers_per_stage | ||||
|             end_layer = start_layer + layers_per_stage | ||||
|  | ||||
|             # divide the model layers by the number of stages | ||||
|             partial_model = nn.Sequential(*full_model[start_layer:end_layer]) | ||||
|             partial_model.to(self.device) | ||||
|  | ||||
|             dp_model = apply_replicate(partial_model) | ||||
|             pipelined_models_parameters(start_layer, dp_model) | ||||
|  | ||||
|             pipeline_stage = PipelineStage( | ||||
|             stage = PipelineStage( | ||||
|                 dp_model, | ||||
|                 stage_idx, | ||||
|                 pp_group.size(), | ||||
|                 num_stages, | ||||
|                 self.device, | ||||
|                 group=pp_group, | ||||
|             ) | ||||
|             partial_models = [pipeline_stage.submod] | ||||
|             pipeline_schedule = ScheduleClass( | ||||
|                 pipeline_stage, | ||||
|                 n_microbatches=num_microbatches, | ||||
|                 loss_fn=loss_fn, | ||||
|                 scale_grads=False, | ||||
|             ) | ||||
|  | ||||
|         else: | ||||
|             n_virtual = 2 | ||||
|             num_stages = pp_group.size() * n_virtual | ||||
|             layers_per_stage = total_layers // num_stages | ||||
|             stages = [] | ||||
|             for i in range(n_virtual): | ||||
|                 stage_idx = pp_group.rank() + pp_group.size() * i | ||||
|                 start_layer = stage_idx * layers_per_stage | ||||
|                 end_layer = start_layer + layers_per_stage | ||||
|                 # divide the model layers by the number of stages | ||||
|                 partial_model = nn.Sequential(*full_model[start_layer:end_layer]) | ||||
|                 partial_model.to(self.device) | ||||
|             stages.append(stage) | ||||
|             partial_models = [pipeline_stage.submod for pipeline_stage in stages] | ||||
|  | ||||
|                 dp_model = apply_replicate(partial_model) | ||||
|                 pipelined_models_parameters(start_layer, dp_model) | ||||
|                 stage = PipelineStage( | ||||
|                     dp_model, | ||||
|                     stage_idx, | ||||
|                     num_stages, | ||||
|                     self.device, | ||||
|                     group=pp_group, | ||||
|                 ) | ||||
|         if issubclass(ScheduleClass, PipelineScheduleSingle): | ||||
|             stages = stages[0] | ||||
|  | ||||
|                 stages.append(stage) | ||||
|                 partial_models = [pipeline_stage.submod for pipeline_stage in stages] | ||||
|  | ||||
|             pipeline_schedule = ScheduleClass( | ||||
|                 stages, | ||||
|                 n_microbatches=num_microbatches, | ||||
|                 loss_fn=loss_fn, | ||||
|                 scale_grads=False, | ||||
|             ) | ||||
|         pipeline_schedule = ScheduleClass( | ||||
|             stages, | ||||
|             n_microbatches=num_microbatches, | ||||
|             loss_fn=loss_fn, | ||||
|             scale_grads=False, | ||||
|         ) | ||||
|  | ||||
|         optimizer_kwargs = { | ||||
|             "lr": 0.01, | ||||
|  | ||||
| @ -18,7 +18,9 @@ from torch.distributed.tensor import ( | ||||
| from torch.distributed.tensor.debug import CommDebugMode | ||||
| from torch.testing._internal.common_utils import run_tests | ||||
| from torch.testing._internal.distributed._tensor.common_dtensor import ( | ||||
|     create_local_tensor_test_class, | ||||
|     DTensorTestBase, | ||||
|     map_local_tensor_for_rank, | ||||
|     with_comms, | ||||
| ) | ||||
|  | ||||
| @ -78,17 +80,21 @@ class DTensorAPITest(DTensorTestBase): | ||||
|         self.assertEqual(dist_tensor.placements[0].dim, 1) | ||||
|  | ||||
|         placement_combs = [[Shard(0)], [Shard(1)], [Replicate()]] | ||||
|         # test src_data_rank == 1 | ||||
|         # set seed differently for each rank | ||||
|         torch.manual_seed(self.rank) | ||||
|         for placement in placement_combs: | ||||
|             tensor_to_distribute = torch.randn(3 * self.world_size, 3 * self.world_size) | ||||
|             dtensor = distribute_tensor( | ||||
|                 tensor_to_distribute, device_mesh, placement, src_data_rank=1 | ||||
|             ) | ||||
|             full_dtensor = dtensor.full_tensor() | ||||
|             if self.rank == 1: | ||||
|                 self.assertEqual(full_dtensor, tensor_to_distribute) | ||||
|  | ||||
|         if not self.is_local_tensor_enabled: | ||||
|             # test src_data_rank == 1 | ||||
|             # set seed differently for each rank | ||||
|             self.init_manual_seed_for_rank() | ||||
|             for placement in placement_combs: | ||||
|                 tensor_to_distribute = torch.randn( | ||||
|                     3 * self.world_size, 3 * self.world_size | ||||
|                 ) | ||||
|                 dtensor = distribute_tensor( | ||||
|                     tensor_to_distribute, device_mesh, placement, src_data_rank=1 | ||||
|                 ) | ||||
|                 full_dtensor = dtensor.full_tensor() | ||||
|                 if self.rank == 1: | ||||
|                     self.assertEqual(full_dtensor, tensor_to_distribute) | ||||
|  | ||||
|         # test src_data_rank = None, make sure it does not have communication | ||||
|         with comm_mode: | ||||
| @ -156,7 +162,12 @@ class DTensorAPITest(DTensorTestBase): | ||||
|             dist_tensor = distribute_tensor(tensor_to_shard, device_mesh, shard_spec) | ||||
|             self.assertEqual(dist_tensor.size(), torch.Size(input_size)) | ||||
|             local_tensor = dist_tensor.to_local() | ||||
|             self.assertEqual(local_tensor, splitted_tensor_list[self.rank]) | ||||
|             self.assertEqual( | ||||
|                 local_tensor, | ||||
|                 map_local_tensor_for_rank( | ||||
|                     splitted_tensor_list, self.rank, lambda tl, r: tl[r] | ||||
|                 ), | ||||
|             ) | ||||
|  | ||||
|     @with_comms | ||||
|     def test_distribute_module(self): | ||||
| @ -388,5 +399,9 @@ class DTensorAPITest(DTensorTestBase): | ||||
|             dcp.save({"fqn": dtensor}, checkpoint_id=tempfile.mkdtemp()) | ||||
|  | ||||
|  | ||||
| DTensorAPITestWithLocalTensor = create_local_tensor_test_class( | ||||
|     DTensorAPITest, skipped_tests=["test_checkpoint_apis_check_partial_placement"] | ||||
| ) | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     run_tests() | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff.