mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-30 03:34:56 +08:00
Compare commits
98 Commits
ciflow/ind ... revert-cpp
| Author | SHA1 | Date | |
|---|---|---|---|
| 2eacbe792a | |||
| 8110ce02a2 | |||
| 43c30f607e | |||
| 5ebf74a655 | |||
| acd936cc1a | |||
| a4a0378e6b | |||
| ac841267a1 | |||
| 0eacd934bc | |||
| 5016e7b2eb | |||
| 544b443ea1 | |||
| 3041ede082 | |||
| 34d6ef7022 | |||
| 110efe4df4 | |||
| e137cd0a10 | |||
| be28329710 | |||
| 85a7c745aa | |||
| 32fe4f681e | |||
| ebb2b2e894 | |||
| 13413b3b07 | |||
| 5d0b3e28dc | |||
| 9139368b64 | |||
| 02095cc09d | |||
| 65868156c6 | |||
| f93ea7dab1 | |||
| a77f5d9a00 | |||
| ff46d5a79b | |||
| f452edd782 | |||
| ea698e8bfc | |||
| 7f7a28046b | |||
| d8283a317a | |||
| e0ca3049c0 | |||
| 8417981c96 | |||
| 06e71c8558 | |||
| a76b59cc45 | |||
| 74336f8c77 | |||
| 236ce736a1 | |||
| 17bdb232e1 | |||
| add37bacda | |||
| 1425b40f29 | |||
| 8af9ed0824 | |||
| 7045aab143 | |||
| 7ae8aaf4c0 | |||
| f2450798cd | |||
| 46d17e8871 | |||
| dc011d3203 | |||
| e95920e3e6 | |||
| 5e769ff867 | |||
| 0ae3e30621 | |||
| 47f50cfd45 | |||
| a51f877287 | |||
| b44423bbb4 | |||
| 8e1e4ee8e0 | |||
| 1e836bc769 | |||
| 9a91486e45 | |||
| 92381a5aa7 | |||
| 2a5f87decf | |||
| 840d63c12d | |||
| 2ce894bb1d | |||
| 47ec1e9990 | |||
| 904abfc2ca | |||
| 7d16fcf2df | |||
| 483845a9c4 | |||
| 60bcb4ee88 | |||
| ee7434be82 | |||
| d049ed2cb1 | |||
| 9901d44418 | |||
| 6096c0fc74 | |||
| f6951cb8ea | |||
| 8887a33ede | |||
| 36a48e7e6d | |||
| c6a02eae5b | |||
| 6ecd6b23b6 | |||
| 3f69b4d9b4 | |||
| a04edcb27a | |||
| eb2bad5bb5 | |||
| a076b4d7ac | |||
| a988510c33 | |||
| 99e07c39ec | |||
| 610c09f8f4 | |||
| 61bad3c1ea | |||
| f89a7e9fe8 | |||
| f2c81635c8 | |||
| e214af6ae8 | |||
| 7ce723d21c | |||
| 4295a9a158 | |||
| 90d7be35e9 | |||
| 8d4e48831e | |||
| 90b30ebf7e | |||
| 173bcda436 | |||
| 6530bc70fb | |||
| 4c38887346 | |||
| 81fa4a204c | |||
| 4e6afa8c07 | |||
| 79aa88cc5d | |||
| fa4cb91846 | |||
| c58d0ad85d | |||
| 000f49551b | |||
| 9940e894ea |
@ -150,7 +150,7 @@ function install_130 {
  CUDNN_VERSION=9.13.0.50
  echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 13.0 in the same container
- install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
+ install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  install_cudnn 13 $CUDNN_VERSION
@ -1,3 +1,8 @@
---
name: docstring
description: Write docstrings for PyTorch functions and methods following PyTorch conventions. Use when writing or updating docstrings in PyTorch code.
---

# PyTorch Docstring Writing Guide

This skill describes how to write docstrings for functions and methods in the PyTorch project, following the conventions in `torch/_tensor_docs.py` and `torch/nn/functional.py`.
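As a rough illustration of the conventions that skill refers to (this example is not part of the diff, and the function shown is hypothetical), a PyTorch-style docstring typically opens with a one-line summary in a raw string and then uses Args / Returns / Example:: sections in reStructuredText:

```python
# Hypothetical function, shown only to illustrate the docstring layout the
# skill refers to; it is not part of the PR.
import torch


def scaled_add(input, other, alpha=1.0):
    r"""Adds ``other``, scaled by ``alpha``, to ``input``.

    .. math::
        \text{out} = \text{input} + \alpha \times \text{other}

    Args:
        input (Tensor): the input tensor.
        other (Tensor): the tensor to add to ``input``.
        alpha (Number, optional): the multiplier for ``other``. Default: 1.0

    Returns:
        Tensor: the result, with the same shape as ``input``.

    Example::

        >>> a = torch.ones(3)
        >>> b = torch.arange(3.0)
        >>> scaled_add(a, b, alpha=2.0)
        tensor([1., 3., 5.])
    """
    return torch.add(input, other, alpha=alpha)
```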
.claude/skills/skill-writer/SKILL.md (new file, 385 lines)
@ -0,0 +1,385 @@
---
name: skill-writer
description: Guide users through creating Agent Skills for Claude Code. Use when the user wants to create, write, author, or design a new Skill, or needs help with SKILL.md files, frontmatter, or skill structure.
---

# Skill Writer

This Skill helps you create well-structured Agent Skills for Claude Code that follow best practices and validation requirements.

## When to use this Skill

Use this Skill when:
- Creating a new Agent Skill
- Writing or updating SKILL.md files
- Designing skill structure and frontmatter
- Troubleshooting skill discovery issues
- Converting existing prompts or workflows into Skills

## Instructions

### Step 1: Determine Skill scope

First, understand what the Skill should do:

1. **Ask clarifying questions**:
   - What specific capability should this Skill provide?
   - When should Claude use this Skill?
   - What tools or resources does it need?
   - Is this for personal use or team sharing?

2. **Keep it focused**: One Skill = one capability
   - Good: "PDF form filling", "Excel data analysis"
   - Too broad: "Document processing", "Data tools"

### Step 2: Choose Skill location

Determine where to create the Skill:

**Personal Skills** (`~/.claude/skills/`):
- Individual workflows and preferences
- Experimental Skills
- Personal productivity tools

**Project Skills** (`.claude/skills/`):
- Team workflows and conventions
- Project-specific expertise
- Shared utilities (committed to git)
### Step 3: Create Skill structure

Create the directory and files:

```bash
# Personal
mkdir -p ~/.claude/skills/skill-name

# Project
mkdir -p .claude/skills/skill-name
```

For multi-file Skills:
```
skill-name/
├── SKILL.md (required)
├── reference.md (optional)
├── examples.md (optional)
├── scripts/
│   └── helper.py (optional)
└── templates/
    └── template.txt (optional)
```

### Step 4: Write SKILL.md frontmatter

Create YAML frontmatter with required fields:

```yaml
---
name: skill-name
description: Brief description of what this does and when to use it
---
```

**Field requirements**:

- **name**:
  - Lowercase letters, numbers, hyphens only
  - Max 64 characters
  - Must match directory name
  - Good: `pdf-processor`, `git-commit-helper`
  - Bad: `PDF_Processor`, `Git Commits!`

- **description**:
  - Max 1024 characters
  - Include BOTH what it does AND when to use it
  - Use specific trigger words users would say
  - Mention file types, operations, and context

**Optional frontmatter fields**:

- **allowed-tools**: Restrict tool access (comma-separated list)
  ```yaml
  allowed-tools: Read, Grep, Glob
  ```
  Use for:
  - Read-only Skills
  - Security-sensitive workflows
  - Limited-scope operations
### Step 5: Write effective descriptions

The description is critical for Claude to discover your Skill.

**Formula**: `[What it does] + [When to use it] + [Key triggers]`

**Examples**:

✅ **Good**:
```yaml
description: Extract text and tables from PDF files, fill forms, merge documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction.
```

✅ **Good**:
```yaml
description: Analyze Excel spreadsheets, create pivot tables, and generate charts. Use when working with Excel files, spreadsheets, or analyzing tabular data in .xlsx format.
```

❌ **Too vague**:
```yaml
description: Helps with documents
description: For data analysis
```

**Tips**:
- Include specific file extensions (.pdf, .xlsx, .json)
- Mention common user phrases ("analyze", "extract", "generate")
- List concrete operations (not generic verbs)
- Add context clues ("Use when...", "For...")

### Step 6: Structure the Skill content

Use clear Markdown sections:

```markdown
# Skill Name

Brief overview of what this Skill does.

## Quick start

Provide a simple example to get started immediately.

## Instructions

Step-by-step guidance for Claude:
1. First step with clear action
2. Second step with expected outcome
3. Handle edge cases

## Examples

Show concrete usage examples with code or commands.

## Best practices

- Key conventions to follow
- Common pitfalls to avoid
- When to use vs. not use

## Requirements

List any dependencies or prerequisites:
```bash
pip install package-name
```

## Advanced usage

For complex scenarios, see [reference.md](reference.md).
```

### Step 7: Add supporting files (optional)

Create additional files for progressive disclosure:

**reference.md**: Detailed API docs, advanced options
**examples.md**: Extended examples and use cases
**scripts/**: Helper scripts and utilities
**templates/**: File templates or boilerplate

Reference them from SKILL.md:
```markdown
For advanced usage, see [reference.md](reference.md).

Run the helper script:
\`\`\`bash
python scripts/helper.py input.txt
\`\`\`
```
### Step 8: Validate the Skill

Check these requirements:

✅ **File structure**:
- [ ] SKILL.md exists in correct location
- [ ] Directory name matches frontmatter `name`

✅ **YAML frontmatter**:
- [ ] Opening `---` on line 1
- [ ] Closing `---` before content
- [ ] Valid YAML (no tabs, correct indentation)
- [ ] `name` follows naming rules
- [ ] `description` is specific and < 1024 chars

✅ **Content quality**:
- [ ] Clear instructions for Claude
- [ ] Concrete examples provided
- [ ] Edge cases handled
- [ ] Dependencies listed (if any)

✅ **Testing**:
- [ ] Description matches user questions
- [ ] Skill activates on relevant queries
- [ ] Instructions are clear and actionable

### Step 9: Test the Skill

1. **Restart Claude Code** (if running) to load the Skill

2. **Ask relevant questions** that match the description:
   ```
   Can you help me extract text from this PDF?
   ```

3. **Verify activation**: Claude should use the Skill automatically

4. **Check behavior**: Confirm Claude follows the instructions correctly

### Step 10: Debug if needed

If Claude doesn't use the Skill:

1. **Make description more specific**:
   - Add trigger words
   - Include file types
   - Mention common user phrases

2. **Check file location**:
   ```bash
   ls ~/.claude/skills/skill-name/SKILL.md
   ls .claude/skills/skill-name/SKILL.md
   ```

3. **Validate YAML**:
   ```bash
   cat SKILL.md | head -n 10
   ```

4. **Run debug mode**:
   ```bash
   claude --debug
   ```
## Common patterns

### Read-only Skill

```yaml
---
name: code-reader
description: Read and analyze code without making changes. Use for code review, understanding codebases, or documentation.
allowed-tools: Read, Grep, Glob
---
```

### Script-based Skill

```yaml
---
name: data-processor
description: Process CSV and JSON data files with Python scripts. Use when analyzing data files or transforming datasets.
---

# Data Processor

## Instructions

1. Use the processing script:
\`\`\`bash
python scripts/process.py input.csv --output results.json
\`\`\`

2. Validate output with:
\`\`\`bash
python scripts/validate.py results.json
\`\`\`
```

### Multi-file Skill with progressive disclosure

```yaml
---
name: api-designer
description: Design REST APIs following best practices. Use when creating API endpoints, designing routes, or planning API architecture.
---

# API Designer

Quick start: See [examples.md](examples.md)

Detailed reference: See [reference.md](reference.md)

## Instructions

1. Gather requirements
2. Design endpoints (see examples.md)
3. Document with OpenAPI spec
4. Review against best practices (see reference.md)
```

## Best practices for Skill authors

1. **One Skill, one purpose**: Don't create mega-Skills
2. **Specific descriptions**: Include trigger words users will say
3. **Clear instructions**: Write for Claude, not humans
4. **Concrete examples**: Show real code, not pseudocode
5. **List dependencies**: Mention required packages in description
6. **Test with teammates**: Verify activation and clarity
7. **Version your Skills**: Document changes in content
8. **Use progressive disclosure**: Put advanced details in separate files

## Validation checklist

Before finalizing a Skill, verify:

- [ ] Name is lowercase, hyphens only, max 64 chars
- [ ] Description is specific and < 1024 chars
- [ ] Description includes "what" and "when"
- [ ] YAML frontmatter is valid
- [ ] Instructions are step-by-step
- [ ] Examples are concrete and realistic
- [ ] Dependencies are documented
- [ ] File paths use forward slashes
- [ ] Skill activates on relevant queries
- [ ] Claude follows instructions correctly

## Troubleshooting

**Skill doesn't activate**:
- Make description more specific with trigger words
- Include file types and operations in description
- Add "Use when..." clause with user phrases

**Multiple Skills conflict**:
- Make descriptions more distinct
- Use different trigger words
- Narrow the scope of each Skill

**Skill has errors**:
- Check YAML syntax (no tabs, proper indentation)
- Verify file paths (use forward slashes)
- Ensure scripts have execute permissions
- List all dependencies

## Examples

See the documentation for complete examples:
- Simple single-file Skill (commit-helper)
- Skill with tool permissions (code-reviewer)
- Multi-file Skill (pdf-processing)

## Output format

When creating a Skill, I will:

1. Ask clarifying questions about scope and requirements
2. Suggest a Skill name and location
3. Create the SKILL.md file with proper frontmatter
4. Include clear instructions and examples
5. Add supporting files if needed
6. Provide testing instructions
7. Validate against all requirements

The result will be a complete, working Skill that follows all best practices and validation rules.
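The Step 8 requirements and the validation checklist in this new SKILL.md are mechanical enough to script. As a hedged sketch (not part of the diff; the script name and the exact rules are assumptions taken from the text above), a frontmatter check could look like this:

```python
#!/usr/bin/env python3
"""Hypothetical SKILL.md frontmatter checker based on the rules stated above."""
import re
import sys
from pathlib import Path

# Lowercase letters, digits, hyphens; at most 64 characters.
NAME_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,63}$")


def check_skill(skill_md: Path) -> list[str]:
    errors = []
    lines = skill_md.read_text(encoding="utf-8").splitlines()
    if not lines or lines[0].strip() != "---":
        return ["frontmatter must open with '---' on line 1"]
    try:
        end = lines[1:].index("---") + 1  # index of the closing '---'
    except ValueError:
        return ["frontmatter is missing the closing '---'"]
    fields = {}
    for line in lines[1:end]:
        if ":" in line:
            key, _, value = line.partition(":")
            fields[key.strip()] = value.strip()
    name = fields.get("name", "")
    description = fields.get("description", "")
    if not NAME_RE.match(name):
        errors.append(f"invalid name: {name!r}")
    if name != skill_md.parent.name:
        errors.append(f"name {name!r} does not match directory {skill_md.parent.name!r}")
    if not description:
        errors.append("description is missing")
    elif len(description) > 1024:
        errors.append("description exceeds 1024 characters")
    return errors


if __name__ == "__main__":
    problems = check_skill(Path(sys.argv[1]))
    for p in problems:
        print(f"ERROR: {p}")
    sys.exit(1 if problems else 0)
```

Run as, for example, `python check_skill.py .claude/skills/skill-writer/SKILL.md` (hypothetical script name) to flag a name/directory mismatch or an oversized description before committing a Skill.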
.github/ci_commit_pins/xla.txt (vendored, 2 changed lines)
@ -1 +1 @@
- 0fa6e3129e61143224663e1ec67980d12b7ec4eb
+ df6798dfb931ce7c7fe5bed2447cd1092a5981af
.github/scripts/generate_binary_build_matrix.py (vendored, 22 changed lines)
@ -22,7 +22,7 @@ CUDA_ARCHES_FULL_VERSION = {
    "12.6": "12.6.3",
    "12.8": "12.8.1",
    "12.9": "12.9.1",
-   "13.0": "13.0.0",
+   "13.0": "13.0.2",
}
CUDA_ARCHES_CUDNN_VERSION = {
    "12.6": "9",
@ -96,21 +96,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
    ),
    "13.0": (
-       "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
-       "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
-       "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
+       "nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | "
+       "nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | "
+       "nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | "
        "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
-       "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
-       "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
+       "nvidia-cublas==13.1.0.3; platform_system == 'Linux' | "
+       "nvidia-cufft==12.0.0.61; platform_system == 'Linux' | "
        "nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
-       "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
-       "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
+       "nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | "
+       "nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
-       "nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
-       "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
-       "nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
+       "nvidia-nvtx==13.0.85; platform_system == 'Linux' | "
+       "nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | "
+       "nvidia-cufile==1.15.1.6; platform_system == 'Linux'"
    ),
    "xpu": (
        "intel-cmplr-lib-rt==2025.2.1 | "
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (generated, vendored, 14 changed lines)
@ -270,7 +270,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -519,7 +519,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -768,7 +768,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1017,7 +1017,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1266,7 +1266,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1515,7 +1515,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1764,7 +1764,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/generated-linux-binary-manywheel-nightly.yml (generated, vendored, 14 changed lines)
@ -325,7 +325,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda13_0-test: # Testing
@ -991,7 +991,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda13_0-test: # Testing
@ -1657,7 +1657,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing
@ -2323,7 +2323,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda13_0-test: # Testing
@ -2989,7 +2989,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda13_0-test: # Testing
@ -3655,7 +3655,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda13_0-test: # Testing
@ -4321,7 +4321,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda13_0-test: # Testing
@ -31,9 +31,9 @@ Be careful when running untrusted models. This classification includes models cr

**Prefer to execute untrusted models within a secure, isolated environment such as a sandbox** (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. You can find further details and instructions in [this page](https://developers.google.com/code-sandboxing).

- **Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) with `weights_only=True` is also secure to our knowledge even though it offers significantly larger surface of attack. Loading un-trusted checkpoint with `weights_only=False` MUST never be done.
+ **Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger surface of attack but is more flexible in what it can serialize. See the documentation for more details.

Even for more secure serialization formats, unexpected inputs to the downstream system can cause diverse security threats (e.g. denial of service, out of bound reads/writes) and thus we recommend extensive validation of any untrusted inputs.

Important Note: The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.
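To make the recommendation above concrete, here is a minimal, hedged sketch (not part of the SECURITY.md diff) of the two loading paths it contrasts; the file names are placeholders, and `safetensors` is a separate package:

```python
import torch
from safetensors.torch import load_file  # pip install safetensors

# safetensors stores raw tensors only, with no pickle step, so it is the safer
# default for weights that come from an untrusted source.
state_dict = load_file("model.safetensors")

# torch.load on an untrusted checkpoint: keep weights_only=True so that only
# tensors and plain containers are unpickled; do not pass weights_only=False
# for files you do not fully trust.
state_dict = torch.load("model.pt", map_location="cpu", weights_only=True)

# `model` is assumed to be an already-constructed torch.nn.Module whose
# parameters match the loaded state_dict.
model.load_state_dict(state_dict)
```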
@ -38,7 +38,7 @@ set_bool(AT_HIPSPARSELT_ENABLED CAFFE2_USE_HIPSPARSELT)

configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
# TODO: Do not generate CUDAConfig.h for ROCm BUILDS
- # At the moment, `jit_macors.h` include CUDAConfig.h for both CUDA and HIP builds
+ # At the moment, `jit_macros.h` include CUDAConfig.h for both CUDA and HIP builds
if(USE_CUDA OR USE_ROCM)
  configure_file(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h")
endif()
@ -122,7 +122,7 @@ void FunctionalTensorWrapper::freeze_storage() const {
// | have their own storages, but backends like functorch |
// \/ are allowed to re-alias underneath the pass \/
// . - - - - - - - - - - - - - . . - - - - - - - - - - - - - - - .
- // | underyling_storage | | underyling_storage |
+ // | underlying_storage | | underlying_storage |
// . - - - - - - - - - - - - - . . - - - - - - - - - - - - - - - .
//
// This constructor is only used by view ops.
@ -1534,7 +1534,7 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) {

// XLA and lazy tensors don't have storage, so they don't have an underlying data pointer.
// Nothing beyond this point is important for meta functions, so it's fine to exit early here.
- // Extend the condition to MAIA tesnors as MAIA tensors also don't have storage.
+ // Extend the condition to MAIA tensors as MAIA tensors also don't have storage.
if (privateuse1_without_storage ||
    common_device_.type() == DeviceType::XLA ||
    common_device_.type() == DeviceType::IPU ||
@ -94,11 +94,11 @@ struct PinnedReserveSegment {
struct TORCH_API HostStats {
  // COUNT: total allocations (active)
  Stat active_requests;
- // SUM: bytes allocated/reserved by this memory alocator. (active)
+ // SUM: bytes allocated/reserved by this memory allocator. (active)
  Stat active_bytes;
  // COUNT: total allocations (active + free)
  Stat allocations;
- // SUM: bytes allocated/reserved by this memory alocator. This accounts
+ // SUM: bytes allocated/reserved by this memory allocator. This accounts
  // for both free and in-use blocks.
  Stat allocated_bytes;
@ -127,7 +127,7 @@ struct alignas(hardware_destructive_interference_size) HostStatsStaged {
  // COUNT: total allocations (active + free)
  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
  Stat allocations;
- // SUM: bytes allocated/reserved by this memory alocator. This accounts
+ // SUM: bytes allocated/reserved by this memory allocator. This accounts
  // for both free and in-use blocks.
  Stat allocated_bytes;
  // COUNT: number of allocations per bucket (active)
@ -455,7 +455,7 @@ struct CachingHostAllocatorImpl {
  }

  void resetAccumulatedStats() {
-   // Reseting accumulated memory stats requires concurrently holding both the
+   // Resetting accumulated memory stats requires concurrently holding both the
    // free list mutexes and the blocks mutex. Previously, this was only done in
    // empty_cache function.
    for (size_t i = 0; i < free_list_.size(); ++i) {
@ -482,7 +482,7 @@ struct CachingHostAllocatorImpl {
  }

  void resetPeakStats() {
-   // Reseting peak memory stats requires concurrently holding both the
+   // Resetting peak memory stats requires concurrently holding both the
    // free list mutexes and the blocks mutex. Previously, this was only done in
    // empty_cache function.
    for (size_t i = 0; i < free_list_.size(); ++i) {
@ -148,7 +148,7 @@ struct TORCH_API ClassType : public NamedType {

  void checkNotExist(const std::string& name, const std::string& what) const;

- // Attributes are stored in a specific slot at runtime for effiency.
+ // Attributes are stored in a specific slot at runtime for efficiency.
  // When emitting instructions we specify the slot so that attribute access is
  // a constant lookup
  std::optional<size_t> findAttributeSlot(const std::string& name) const {
@ -412,7 +412,7 @@ struct TORCH_API ClassType : public NamedType {
  // Holds method attributes
  std::weak_ptr<CompilationUnit> compilation_unit_;

- // Holds all atrributes, attribute details are found on ClassAttribute
+ // Holds all attributes, attribute details are found on ClassAttribute
  std::vector<ClassAttribute> attributes_;
  // Construct mirroring attributes_, only around due to the fact that `containedTypes()` method returns an ArrayRef.
  // Never fill this without using the appropriate provideNewClassAttribute method
@ -537,7 +537,7 @@ int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchK

  // Note: this records a sequence number for both Autograd keys, and for
  // non-Autograd keys where the dispatchKeySet still contains an autograd key.
- // This means that we might collect the same sequence nubmer two different
+ // This means that we might collect the same sequence number two different
  // events if they all occurred above Autograd and still had the Autograd
  // dispatch key in the dispatch key set.
  // However, this usually doesn't happen: normally the first call will
@ -585,7 +585,7 @@ class TORCH_API OperatorHandle {

  // We need to store this iterator in order to make
  // Dispatcher::cleanup() fast -- it runs a lot on program
- // termination (and presuambly library unloading).
+ // termination (and presumably library unloading).
  std::list<Dispatcher::OperatorDef>::iterator operatorIterator_;
};
@ -365,7 +365,7 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
  // For autograd keys, we only use kernel from CompositeImplicitAutograd when there's no direct registration
  // to its corresponding backend key or CompositeExplicitAutograd. See Note [CompositeExplicitAutograd and CompositeImplicitAutograd].
  // For AutogradOther, we eagerly return ambiguousAutogradOtherKernel() if there's registration to any of
- // its backends and ask backend extender to request a decicated Autograd key for the backend.
+ // its backends and ask backend extender to request a dedicated Autograd key for the backend.
  // See Note [Ambiguity in AutogradOther kernel] for more details.
  // A CompositeExplicitAutograd kernel prevents CompositeImplicitAutograd kernel being used for Autograd keys, but it doesn't
  // cause confusion for AutogradOther. It's pretty straightforward to use Autograd (if available)
@ -261,7 +261,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
  //
  // There are 2 cases
  // 1. something like 'aten::items.str(Dict(str, t) self) -> ((str, t)[])'.
- // without the extra parenthesis, the c++ schem parser can not parse it.
+ // without the extra parenthesis, the c++ scheme parser can not parse it.
  // 2. something like '-> ((str, str))'. Need extra parenthesis so the return
  // type is a single tuple rather than two strings.
  // PR (https://github.com/pytorch/pytorch/pull/23204) has more context about
@ -1176,7 +1176,7 @@ struct TORCH_API IValue final {
  using HashIdentityIValueMap =
      std::unordered_map<IValue, IValue, HashIdentityIValue, CompIdentityIValues>;

- // Chechs if this and rhs has a subvalues in common.
+ // Checks if this and rhs has a subvalues in common.
  // [t1,t2] and [t2, t3] returns true.
  bool overlaps(const IValue& rhs) const;
@ -1501,7 +1501,7 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target {
  // However, the CompilationUnit holds ownership of the type's graphs, so
  // inserting a constant object into a Graph would create a reference cycle if
  // that constant object held a shared_ptr to its CU. For these objects we
- // instatiate them with non-owning references to its CU
+ // instantiate them with non-owning references to its CU
  Object(WeakOrStrongTypePtr type, size_t numSlots) : type_(std::move(type)) {
    slots_.resize(numSlots);
  }
@ -373,7 +373,7 @@ struct TORCH_API SymbolicShape {
|
||||
// Unranked shape constructor.
|
||||
SymbolicShape() : dims_(std::nullopt) {}
|
||||
|
||||
// Known rank but unknown dimentions.
|
||||
// Known rank but unknown dimensions.
|
||||
SymbolicShape(std::optional<size_t> rank) : dims_(std::nullopt) {
|
||||
if(!rank) {
|
||||
return;
|
||||
@ -884,9 +884,9 @@ struct TORCH_API ListType
|
||||
|
||||
// global singleton
|
||||
// Given an inner type T and an identifier,
|
||||
// this function wil return the global singleton type pointer
|
||||
// this function will return the global singleton type pointer
|
||||
// the type List<T>.
|
||||
// The extra "identifier" argument is needed beccause we have multiple container types
|
||||
// The extra "identifier" argument is needed because we have multiple container types
|
||||
// that all re-use this function (List<T>, array<T, N>, etc.)
|
||||
static TypePtr get(const std::string& identifier, TypePtr inner);
|
||||
|
||||
|
||||
@ -21,7 +21,7 @@ namespace c10 {
|
||||
|
||||
namespace detail {
|
||||
// The first argument of the schema might be of type DispatchKeySet, in which case we remove it.
|
||||
// We do this because every argument in a function schema is expected to be convertable
|
||||
// We do this because every argument in a function schema is expected to be convertible
|
||||
// to an ivalue, but DispatchKeySet is not a type we want the jit to be aware of.
|
||||
// See Note [Plumbing Keys Through The Dispatcher]
|
||||
template<class KernelFunctor>
|
||||
|
||||
@ -251,7 +251,7 @@ TEST(OperatorRegistrationTest, whenRegisteringCPUTensorType_thenCanOnlyCallUnbox
|
||||
callOpUnboxedWithPrecomputedDispatchKeySet<void, Tensor>(*op, c10::DispatchKeySet(c10::DispatchKey::CPU), dummyTensor(c10::DispatchKey::CUDA));
|
||||
EXPECT_TRUE(called_kernel_cpu);
|
||||
|
||||
// Ensure that disptach key from tensor is not used here.
|
||||
// Ensure that dispatch key from tensor is not used here.
|
||||
called_kernel_cpu = false;
|
||||
expectThrows<c10::Error>([&] {
|
||||
callOpUnboxedWithPrecomputedDispatchKeySet<void, Tensor>(*op, c10::DispatchKeySet(c10::DispatchKey::CUDA), dummyTensor(c10::DispatchKey::CPU));
|
||||
|
||||
@ -172,7 +172,7 @@ VaryingShape<Stride> TensorType::computeStrideProps(
|
||||
// The logic below follows what TensorIterator uses in its logic:
|
||||
// 1. Fast_set_up is the short-cut to identify a. channels_last and
|
||||
// b. contiguous format, which is what we have in the below logic.
|
||||
// 2. In more generla cases, it does best effort to preserve permutatoin.
|
||||
// 2. In more general cases, it does best effort to preserve permutatoin.
|
||||
if (is_channels_last_strides_2d(sizes, strides) || is_channels_last_strides_3d(sizes, strides)) {
|
||||
// case 1.a. short cut channels last
|
||||
std::iota(stride_indices.rbegin() + 1, stride_indices.rend() - 1, 2);
|
||||
|
||||
@ -77,21 +77,36 @@ CONVERT_TEMPLATE(double, int64_t)
|
||||
CONVERT_TEMPLATE(double, float)
|
||||
CONVERT_TEMPLATE(double, double)
|
||||
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
CONVERT_TEMPLATE(float16_t, uint8_t)
|
||||
CONVERT_TEMPLATE(float16_t, int8_t)
|
||||
CONVERT_TEMPLATE(float16_t, int16_t)
|
||||
CONVERT_TEMPLATE(float16_t, int32_t)
|
||||
CONVERT_TEMPLATE(float16_t, int64_t)
|
||||
CONVERT_TEMPLATE(float16_t, float16_t)
|
||||
CONVERT_TEMPLATE(float16_t, float)
|
||||
CONVERT_TEMPLATE(float16_t, double)
|
||||
CONVERT_TEMPLATE(uint8_t, float16_t)
|
||||
CONVERT_TEMPLATE(int8_t, float16_t)
|
||||
CONVERT_TEMPLATE(int16_t, float16_t)
|
||||
CONVERT_TEMPLATE(int32_t, float16_t)
|
||||
CONVERT_TEMPLATE(int64_t, float16_t)
|
||||
CONVERT_TEMPLATE(float, float16_t)
|
||||
CONVERT_TEMPLATE(double, float16_t)
|
||||
|
||||
#define CONVERT_FROM_FP16_TEMPLATE(to_type) \
|
||||
template <> \
|
||||
inline void convert(const at::Half* src, to_type* dst, int64_t n) { \
|
||||
const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src); \
|
||||
return convertImpl<float16_t, to_type>(srcPtr, dst, n); \
|
||||
}
|
||||
|
||||
#define CONVERT_TO_FP16_TEMPLATE(from_type) \
|
||||
template <> \
|
||||
inline void convert(const from_type* src, at::Half* dst, int64_t n) { \
|
||||
float16_t* dstPtr = reinterpret_cast<float16_t*>(dst); \
|
||||
return convertImpl<from_type, float16_t>(src, dstPtr, n); \
|
||||
}
|
||||
|
||||
CONVERT_FROM_FP16_TEMPLATE(uint8_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(int8_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(int16_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(int32_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(int64_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(float16_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(float)
|
||||
CONVERT_FROM_FP16_TEMPLATE(double)
|
||||
CONVERT_TO_FP16_TEMPLATE(uint8_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int8_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int16_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int32_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int64_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(float)
|
||||
CONVERT_TO_FP16_TEMPLATE(double)
|
||||
#endif
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
CONVERT_TEMPLATE(bfloat16_t, uint8_t)
|
||||
|
||||
@ -634,8 +634,7 @@ inline Vectorized<float> Vectorized<float>::erf() const {
|
||||
// - exp(- x * x)
|
||||
auto pow_2 = (*this) * (*this);
|
||||
auto neg_pow_2 = pow_2 ^ neg_zero_vec;
|
||||
auto tmp4 = neg_pow_2.map(
|
||||
std::exp); // This can be swapped for a faster implementation of exp.
|
||||
auto tmp4 = neg_pow_2.exp();
|
||||
auto tmp5 = tmp4 ^ neg_zero_vec;
|
||||
// erf(x) = sign(x) * (1 - r * t * exp(- x * x))
|
||||
auto tmp6 = t * tmp5;
|
||||
|
||||
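The erf comment in the hunk above ("erf(x) = sign(x) * (1 - r * t * exp(- x * x))") has the shape of the classic Abramowitz and Stegun 7.1.26 polynomial approximation. As a point of reference only, that form is written out below; identifying `r * t` with the polynomial term is an assumption, since the hunk does not show how `r` and `t` are built.

```latex
\operatorname{erf}(x) \approx \operatorname{sign}(x)\left[1 - \bigl(a_1 t + a_2 t^2 + a_3 t^3 + a_4 t^4 + a_5 t^5\bigr)\, e^{-x^2}\right],
\qquad t = \frac{1}{1 + p\,|x|}.
```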
@ -234,7 +234,7 @@ class Vectorized<c10::Half> : public Vectorized16<
|
||||
vshlq_u16(vandq_u16(is_zero_vec, vdupq_n_u16(1)), shift);
|
||||
return vaddvq_u16(bits_vec);
|
||||
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
// use known working implmentation.
|
||||
// use known working implementation.
|
||||
__at_align__ value_type tmp[size()];
|
||||
store(tmp);
|
||||
int mask = 0;
|
||||
|
||||
@ -1740,7 +1740,7 @@ Vectorized<int16_t> inline shift_256_16(
|
||||
|
||||
// Control masks for shuffle operation, treating 256 bits as an
|
||||
// array of 16-bit elements, and considering pairs of neighboring
|
||||
// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
|
||||
// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
|
||||
// M!=N) is set so that shuffle will move element with index M from
|
||||
// input pair into element with index N in output pair, and element
|
||||
// with index M in output pair will be set to all 0s.
|
||||
@ -1875,7 +1875,7 @@ Vectorized<T> inline shift_256_8(
|
||||
|
||||
// Control masks for shuffle operation, treating 256 bits as an
|
||||
// array of 8-bit elements, and considering quadruples of
|
||||
// neighboring elements. Specifially, a mask named "ctl_M_N" (M,N
|
||||
// neighboring elements. Specifically, a mask named "ctl_M_N" (M,N
|
||||
// in [0,1,2,3], and M!=N) is set so that shuffle will move element
|
||||
// with index M from input quadruple into element with index N in
|
||||
// output quadruple, and other elements in output quadruple will be
|
||||
|
||||
@ -143,7 +143,7 @@ class Vectorized<double> {
|
||||
const Vectorized<double>& a,
|
||||
const Vectorized<double>& b,
|
||||
const Vectorized<double>& mask) {
|
||||
// the mask used here returned by comparision of vec256
|
||||
// the mask used here returned by comparison of vec256
|
||||
|
||||
return {
|
||||
vec_sel(a._vec0, b._vec0, mask._vecb0),
|
||||
|
||||
@ -142,7 +142,7 @@ class Vectorized<float> {
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
const Vectorized<float>& mask) {
|
||||
// the mask used here returned by comparision of vec256
|
||||
// the mask used here returned by comparison of vec256
|
||||
// assuming this we can use the same mask directly with vec_sel
|
||||
return {
|
||||
vec_sel(a._vec0, b._vec0, mask._vecb0),
|
||||
|
||||
@ -202,7 +202,7 @@ class Vectorized<int16_t> {
|
||||
const Vectorized<int16_t>& a,
|
||||
const Vectorized<int16_t>& b,
|
||||
const Vectorized<int16_t>& mask) {
|
||||
// the mask used here returned by comparision of vec256
|
||||
// the mask used here returned by comparison of vec256
|
||||
// assuming this we can use the same mask directly with vec_sel
|
||||
// warning intel style mask will not work properly
|
||||
return {
|
||||
|
||||
@ -155,7 +155,7 @@ class Vectorized<int32_t> {
|
||||
const Vectorized<int32_t>& a,
|
||||
const Vectorized<int32_t>& b,
|
||||
const Vectorized<int32_t>& mask) {
|
||||
// the mask used here returned by comparision of vec256
|
||||
// the mask used here returned by comparison of vec256
|
||||
// assuming this we can use the same mask directly with vec_sel
|
||||
// warning intel style mask will not work properly
|
||||
return {
|
||||
|
||||
@ -119,7 +119,7 @@ class Vectorized<int64_t> {
|
||||
const Vectorized<int64_t>& a,
|
||||
const Vectorized<int64_t>& b,
|
||||
const Vectorized<int64_t>& mask) {
|
||||
// the mask used here returned by comparision of vec256
|
||||
// the mask used here returned by comparison of vec256
|
||||
|
||||
return {
|
||||
vec_sel(a._vec0, b._vec0, mask._vecb0),
|
||||
|
||||
@ -397,7 +397,7 @@ inline Vectorized<bool> operator&&(
|
||||
const __m512i* other_ = reinterpret_cast<const __m512i*>(other.as_bytes());
|
||||
__m512i out = _mm512_and_si512(*self_, *other_);
|
||||
Vectorized<bool> ret;
|
||||
// We do not have a constructer that takes __m512i, so we need to memcpy
|
||||
// We do not have a constructor that takes __m512i, so we need to memcpy
|
||||
std::memcpy(ret, &out, ret.size() * sizeof(bool));
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1852,7 +1852,7 @@ Vectorized<T> inline shift_512_8(
|
||||
|
||||
// Control masks for shuffle operation, treating 512 bits as an
|
||||
// array of 8-bit elements, and considering pairs of neighboring
|
||||
// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
|
||||
// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
|
||||
// M!=N) is set so that shuffle will move element with index M from
|
||||
// input pair into element with index N in output pair, and element
|
||||
// with index M in output pair will be set to all 0s.
|
||||
|
||||
@ -634,7 +634,7 @@ struct Vectorized {
|
||||
}
|
||||
Vectorized<T> neg() const {
|
||||
// NB: the trailing return type is needed because we need to coerce the
|
||||
// return value back to T in the case of unary operator- incuring a
|
||||
// return value back to T in the case of unary operator- incurring a
|
||||
// promotion
|
||||
return map([](T x) -> T { return -x; });
|
||||
}
|
||||
|
||||
@ -1958,7 +1958,7 @@ void scaled_gemm(
|
||||
ScalarType result_dtype,
|
||||
bool use_fast_accum,
|
||||
const std::optional<Tensor>& alpha) {
|
||||
// Note: see `cublasCommonArgs` for various non-intuitive manupulations
|
||||
// Note: see `cublasCommonArgs` for various non-intuitive manipulations
|
||||
// of input arguments to this function.
|
||||
const auto computeType = CUBLAS_COMPUTE_32F;
|
||||
const auto scaleType = CUDA_R_32F;
|
||||
|
||||
@ -2,10 +2,10 @@
|
||||
|
||||
#include <ATen/cuda/ATenCUDAGeneral.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/core/impl/GPUTrace.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <ATen/cuda/Exceptions.h>
|
||||
#include <c10/core/impl/GPUTrace.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
@ -246,4 +246,79 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
// EventPool - Thread-safe pool of CUDA events to avoid expensive cudaEventCreate
|
||||
// calls. cudaEventCreate when concurrently invoked from multiple threads can be
|
||||
// very expensive (especially on certain device/driver combinations).
|
||||
using CUDAEventPtr =
|
||||
std::unique_ptr<CUDAEvent, std::function<void(CUDAEvent*)>>;
|
||||
|
||||
class EventPool {
|
||||
public:
|
||||
EventPool() : pools_(at::cuda::device_count()) {}
|
||||
|
||||
CUDAEventPtr get(const DeviceIndex device) {
|
||||
// If the device is invalid, return a default event and no pooling
|
||||
if (device < 0 || device >= (DeviceIndex)pools_.size()) {
|
||||
auto deleter = [](CUDAEvent* event) {
|
||||
delete event;
|
||||
};
|
||||
return CUDAEventPtr(
|
||||
std::make_unique<CUDAEvent>(cudaEventDisableTiming).release(), deleter);
|
||||
}
|
||||
|
||||
auto& pool = pools_[device];
|
||||
|
||||
// Create a destructor that returns the event to the appropriate device pool
|
||||
auto destructor = [&pool](CUDAEvent* event) noexcept {
|
||||
if (event != nullptr) {
|
||||
std::lock_guard<std::mutex> lock(pool.mutex_);
|
||||
pool.event_pool_.emplace_back(event);
|
||||
}
|
||||
};
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(pool.mutex_);
|
||||
if (!pool.event_pool_.empty()) {
|
||||
auto event = std::move(pool.event_pool_.back());
|
||||
pool.event_pool_.pop_back();
|
||||
return CUDAEventPtr(event.release(), destructor);
|
||||
}
|
||||
}
|
||||
|
||||
return CUDAEventPtr(
|
||||
std::make_unique<CUDAEvent>(cudaEventDisableTiming).release(),
|
||||
destructor);
|
||||
}
|
||||
|
||||
void empty_cache() {
|
||||
for (auto& pool : pools_) {
|
||||
std::lock_guard<std::mutex> lock(pool.mutex_);
|
||||
pool.event_pool_.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void init_num_events(const size_t num_events) {
|
||||
for (DeviceIndex device_idx = 0; device_idx < at::cuda::device_count(); ++device_idx) {
|
||||
CUDAGuard device_guard(device_idx);
|
||||
std::vector<CUDAEventPtr> temp_events;
|
||||
temp_events.reserve(num_events);
|
||||
for (size_t i = 0; i < num_events; ++i) {
|
||||
auto event = get(device_idx);
|
||||
// Record the event to ensure it's properly initialized
|
||||
event->record();
|
||||
temp_events.emplace_back(std::move(event));
|
||||
}
|
||||
// Events will be returned to pool when temp_events is destroyed
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
struct alignas(64) PerDevicePool {
|
||||
alignas(64) std::mutex mutex_;
|
||||
std::vector<std::unique_ptr<CUDAEvent>> event_pool_;
|
||||
};
|
||||
|
||||
std::vector<PerDevicePool> pools_;
|
||||
};
|
||||
|
||||
} // namespace at::cuda
|
||||
|
||||
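For readers skimming the new `EventPool` above, a minimal usage sketch follows. It mirrors how the `Copy.cu` hunk later in this compare uses the pool; the stream helpers (`getCurrentCUDAStream`, `getStreamFromPool`) are existing ATen APIs, while the header that exposes `at::cuda::EventPool` is assumed rather than taken from this diff.

```cpp
// Minimal sketch, not a drop-in file: acquire a pooled event, record it on a
// producer stream, make a consumer stream wait on it, and let the CUDAEventPtr
// deleter hand the event back to the pool when it goes out of scope.
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>  // assumed to declare at::cuda::EventPool from this change

void sync_streams_example(at::DeviceIndex device) {
  // Leaked singleton, matching the getEventFromPool() helper added in Copy.cu.
  static auto* pool = new at::cuda::EventPool();
  at::cuda::CUDAEventPtr ready = pool->get(device);  // cudaEventDisableTiming event

  auto producer = at::cuda::getCurrentCUDAStream(device);
  auto consumer = at::cuda::getStreamFromPool(/*isHighPriority=*/false, device);

  ready->record(producer);  // capture the work queued on `producer` so far
  ready->block(consumer);   // `consumer` will not run past this point until that work completes
  // `ready` is returned to the per-device pool by its deleter here.
}
```

The point of the pool is to amortize `cudaEventCreate`, which the comment above notes can be expensive when called concurrently from many threads.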
@ -168,11 +168,9 @@ void CUDAGraph::instantiate() {
|
||||
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597
|
||||
// cudaGraphInstantiateWithFlags
|
||||
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233
|
||||
#if !defined(USE_ROCM) || ROCM_VERSION >= 60200
|
||||
int version = 0;
|
||||
AT_CUDA_CHECK(cudaDriverGetVersion(&version));
|
||||
if (version < 11040) {
|
||||
#endif
|
||||
// Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people,
|
||||
// who prefer not to report error message through these arguments moving forward
|
||||
// (they prefer return value, or errors on api calls internal to the capture)
|
||||
@ -183,13 +181,11 @@ void CUDAGraph::instantiate() {
|
||||
#endif
|
||||
//Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory.
|
||||
//It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch.
|
||||
#if !defined(USE_ROCM) || ROCM_VERSION >= 60200
|
||||
} else {
|
||||
AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
|
||||
graph_,
|
||||
cudaGraphInstantiateFlagAutoFreeOnLaunch));
|
||||
}
|
||||
#endif
|
||||
has_graph_exec_ = true;
|
||||
}
|
||||
|
||||
@ -311,7 +307,7 @@ CUDAGraph::~CUDAGraph() {
|
||||
// There are recent HIP changes where hipGraphExecDestroy doesn't immediately free memory.
|
||||
// They wait for next sync point in order to free the memory, this is to ensure that all
|
||||
// hipGraphLaunch are finished before we release any memory. This feature was enabled in rocm6.2.
|
||||
// We need to ensure all async opreations finish before deleting the object.
|
||||
// We need to ensure all async operations finish before deleting the object.
|
||||
#if (defined(USE_ROCM) && ROCM_VERSION >= 60200)
|
||||
if (capture_dev_ != UNDEFINED_DEVICE) // check if capture_dev_ contains the real device id
|
||||
{
|
||||
|
||||
@ -137,7 +137,7 @@ struct CUDACachingHostAllocatorImpl
|
||||
void free_block_slowpath(Block* block) {
|
||||
auto start = std::chrono::steady_clock::now();
|
||||
// Users may change the allocator config at will. torch unit tests do this.
|
||||
// However, allocations using cudaHostRegister should use corresonding
|
||||
// However, allocations using cudaHostRegister should use corresponding
|
||||
// cudaHostUnregister and similarly for cudaHostAlloc / cudaFreeHost.
|
||||
void* ptr = block->ptr_;
|
||||
bool use_register = false;
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
#include <ATen/cuda/CUDAConfig.h>
|
||||
|
||||
// NOTE: These templates are intentionally not defined in this header,
|
||||
// which aviods re-compiling them for each translation unit. If you get
|
||||
// which avoids re-compiling them for each translation unit. If you get
|
||||
// a link error, you need to add an explicit instantiation for your
|
||||
// types in cub.cu
|
||||
|
||||
|
||||
@ -38,7 +38,7 @@ GemmTunableOp_float_NT,nt_25088_4096_64,1219,1.262
|
||||
GemmTunableOp_float_NT,nt_4096_4096_64,1216,0.033
|
||||
```
|
||||
|
||||
Note the "Validator" lines. If you change a library verison, or ROCm version, or PyTorch version, TunableOp will detect
|
||||
Note the "Validator" lines. If you change a library version, or ROCm version, or PyTorch version, TunableOp will detect
|
||||
this and reject the tunings file because the prior tunings are likely affected by other software changes.
|
||||
|
||||
The remaining lines are the tuned solutions for each TunableOp encountered during your execution. Each line consists of
|
||||
|
||||
@ -235,7 +235,7 @@ class TunableOp {
|
||||
// numeric check option is controlled by non-static env var, so check it once per tuned operator
|
||||
bool do_numerics_check = ctx->IsNumericsCheckEnabled();
|
||||
|
||||
// calcaulte a reference answer for numerical check
|
||||
// calculate a reference answer for numerical check
|
||||
if (do_numerics_check) {
|
||||
reference_params = params->DeepCopy(false);
|
||||
TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK);
|
||||
|
||||
@ -12,7 +12,7 @@ namespace at {
|
||||
|
||||
// AcceleratorHooksInterface is a shared interface provided by all
|
||||
// accelerators to allow generic code.
|
||||
// This inferface is hook-based as it corresponds to all the functions
|
||||
// This interface is hook-based as it corresponds to all the functions
|
||||
// that are going to be called in a generic way from the CPU code.
|
||||
|
||||
struct TORCH_API AcceleratorHooksInterface {
|
||||
|
||||
@ -38,7 +38,7 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
|
||||
|
||||
Generator getNewGenerator(
|
||||
[[maybe_unused]] DeviceIndex device_index = -1) const override {
|
||||
// TODO(FFFrog): Perserved for BC and will be removed in the future.
|
||||
// TODO(FFFrog): Preserved for BC and will be removed in the future.
|
||||
if (at::GetGeneratorPrivate().has_value())
|
||||
return at::GetGeneratorForPrivateuse1(device_index);
|
||||
|
||||
|
||||
@ -283,7 +283,7 @@ inline void boxed_existing_bdim_all_batch_rule(
|
||||
// Use when all tensors arguments accept one (normal) batch dim.
|
||||
// This batching rule expands the batch dim on all Tensors, reshapes it into
|
||||
// dim 0, calls the op, and then reshapes the batch dim out of dim 0.
|
||||
// This is not the most efficient thing; if there are alternatives, plese try
|
||||
// This is not the most efficient thing; if there are alternatives, please try
|
||||
// to use them. Use this only as a last resort.
|
||||
#define EXISTING_BDIM_ALL_BOXED(op) \
|
||||
m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_existing_bdim_all_batch_rule>());
|
||||
|
||||
@ -384,7 +384,7 @@ fourOutputs solve_ex_batch_rule(
|
||||
|
||||
// NOTE [ solve_ex Batch Rule Contiguity ]
|
||||
// A determines whether or not linalg_solve takes an optimized path. We need the check on A_ to match the one run on
|
||||
// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behvaior
|
||||
// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behavior
|
||||
// differs based on whether or not the optimized path was taken
|
||||
const auto batched_A_was_contiguous = A_bdim.has_value() ? at::select(A, *A_bdim, 0).is_contiguous() : A.is_contiguous();
|
||||
if (batched_A_was_contiguous && !A.is_complex()) {
|
||||
|
||||
@ -282,7 +282,7 @@ static std::tuple<Tensor, std::optional<int64_t>> _softmax_backward_batch_rule(
|
||||
|
||||
dim = getPhysicalDim(output_, /*has_batch_dim*/true, dim);
|
||||
|
||||
// Not sure why output_ needs to be marked as .contiguous(). Someting must
|
||||
// Not sure why output_ needs to be marked as .contiguous(). Something must
|
||||
// have changed in PyTorch (and output of softmax is probably always contiguous)
|
||||
return std::make_tuple(at::_softmax_backward_data(grad_output_, output_.contiguous(), dim, input_dtype), 0);
|
||||
}
|
||||
|
||||
@ -224,7 +224,7 @@ static Tensor safeStack(TensorList tensors) {
|
||||
// is possible for the backward function to return an undefined grad for some
|
||||
// grad_input for each example. In that case, we return an undefined grad.
|
||||
//
|
||||
// It is theoretically posssible for *some* of the examples to produce an
|
||||
// It is theoretically possible for *some* of the examples to produce an
|
||||
// undefined grad (a kernel could peek at the gradient values and return an
|
||||
// undefined tensor if it determines the gradient is full of zeros). We
|
||||
// could handle this by treating the undefined grad as a zero-filled tensor
|
||||
|
||||
@ -113,7 +113,7 @@ SymIntArrayRef BatchedTensorImpl::sym_sizes_custom() const {
|
||||
return sym_sizes_default();
|
||||
}
|
||||
|
||||
// The following are publically exposed as methods of Tensor
|
||||
// The following are publicly exposed as methods of Tensor
|
||||
|
||||
IntArrayRef BatchedTensorImpl::strides_custom() const {
|
||||
return strides_default();
|
||||
|
||||
@ -37,7 +37,7 @@ namespace at::functorch {
|
||||
// how to perform the transform.
|
||||
//
|
||||
// TODO: we can excise DynamicLayer in favor of Interpreter,
|
||||
// But I am going to leave it for now as a compatiblity shim to avoid
|
||||
// But I am going to leave it for now as a compatibility shim to avoid
|
||||
// needing to refactor a lot of callsites...
|
||||
struct TORCH_API DynamicLayer {
|
||||
explicit DynamicLayer(
|
||||
|
||||
@ -88,7 +88,7 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t);
|
||||
// >>> VmapInterpreterPtr(&interpreter).batchSize()
|
||||
//
|
||||
// Finally, Interpreter::process switches on the type of the interpreter
|
||||
// and calls one of {Transform}Intepreter::processImpl under the hood.
|
||||
// and calls one of {Transform}Interpreter::processImpl under the hood.
|
||||
// Same for Interpreter::sendToNextInterpreter :)
|
||||
|
||||
struct VmapInterpreterMeta {
|
||||
|
||||
@ -733,7 +733,7 @@ TORCH_LIBRARY_IMPL(_, FuncTorchBatched, m) {
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
|
||||
// still legacy b/c teturns multiple tensors
|
||||
// still legacy b/c returns multiple tensors
|
||||
m.impl("split.Tensor", split_batching_rule);
|
||||
m.impl("split_with_sizes", split_with_sizes_batching_rule);
|
||||
m.impl("split_with_sizes_copy", split_with_sizes_copy_batching_rule);
|
||||
|
||||
@ -158,7 +158,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
|
||||
endKernelCoalescing();
|
||||
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
|
||||
|
||||
// For some reason fillBufferfor stopped working for lengh > 4Gb on MacOS 26
|
||||
// For some reason fillBufferfor stopped working for length > 4Gb on MacOS 26
|
||||
// See https://github.com/pytorch/pytorch/issues/163962
|
||||
// Workaround by batching copy commands into 4Gb chunks
|
||||
constexpr size_t max_copy_size = 0x100000000; // 4GB
|
||||
|
||||
@ -148,7 +148,7 @@ inline void checkInputsSolver(const Tensor& A,
|
||||
|
||||
inline bool is_row_or_column_contiguous(const Tensor& t) {
|
||||
// This could be made more general, similar to how it's checked in matmul, which would allow to
|
||||
// ellide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky.
|
||||
// elide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky.
|
||||
// We choose to be conservative for simplicity
|
||||
return t.is_contiguous() || t.transpose(-2, -1).is_contiguous();
|
||||
}
|
||||
|
||||
@ -21,7 +21,7 @@ enum class fft_norm_mode {
|
||||
// NOTE [ Fourier Transform Conjugate Symmetry ]
|
||||
//
|
||||
// Real-to-complex Fourier transform satisfies the conjugate symmetry. That is,
|
||||
// assuming X is the transformed K-dimensionsal signal, we have
|
||||
// assuming X is the transformed K-dimensional signal, we have
|
||||
//
|
||||
// X[i_1, ..., i_K] = X[j_i, ..., j_K]*,
|
||||
//
|
||||
|
||||
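Spelling out the index relation behind the conjugate-symmetry comment above: with N_k the signal size along dimension k, the standard real-input DFT identity is the one below (the modular form of j_k is supplied here as background, since the hunk is cut off before defining it).

```latex
X[i_1, \dots, i_K] = \overline{X[j_1, \dots, j_K]},
\qquad j_k = (N_k - i_k) \bmod N_k .
```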
@ -128,7 +128,7 @@ at::Tensor PackedLinearWeight::apply_impl(
|
||||
auto* input_tr_ptr =
|
||||
reinterpret_cast<uint8_t*>(input_tr.data_ptr<c10::quint8>());
|
||||
// TODO: Activation transpose before and after the kernel can be removed if we
|
||||
// keep activation tensor always tranposed.
|
||||
// keep activation tensor always transposed.
|
||||
fbgemm::transpose_simd<uint8_t>(
|
||||
batch_size, K, input_ptr, K, input_tr_ptr, batch_size);
|
||||
|
||||
|
||||
@ -520,7 +520,7 @@ cpu_adaptive_avg_pool3d_channels_last(
|
||||
scalar_t* out = output_data + i * channels;
|
||||
int64_t size = channels;
|
||||
|
||||
// Note: For oridinary usage scenario, each out lane should
|
||||
// Note: For ordinary usage scenario, each out lane should
|
||||
// fit in L1 cache; otherwise consider block dim C.
|
||||
// Pass I: zero the out lane
|
||||
int64_t d1 = 0;
|
||||
|
||||
@ -34,7 +34,7 @@ struct Dist {
|
||||
// finish : This tells what to do with the aggregated value to compute
|
||||
// the norm. Generally this is the result of val ^ (1 / p).
|
||||
// backward : This is the gradient for that norm. Arguments are pretty
|
||||
// self explanitory.
|
||||
// self explanatory.
|
||||
//
|
||||
// There are a few cases where these aren't used. The 0 norm has no backward,
|
||||
// because it's always 0, so that's shortcircuited earlier. There's a special
|
||||
|
||||
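As a worked statement of what `finish` and `backward` compute in the p-norm case described above (a sketch of the standard formulas, not quoted from the file), writing z_i = x_i - y_i:

```latex
d_p(x, y) = \Bigl(\sum_i |z_i|^p\Bigr)^{1/p},
\qquad
\frac{\partial d_p}{\partial x_i} = \frac{\operatorname{sign}(z_i)\,|z_i|^{p-1}}{d_p(x, y)^{\,p-1}} .
```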
@ -30,7 +30,7 @@ vec::Vectorized<scalar_t> is_nan_vec(vec::Vectorized<scalar_t> vec) {
|
||||
return vec.isnan();
|
||||
}
|
||||
|
||||
// TODO: use is_integeral/is_same to check the scalar_t and simplify the implementation
|
||||
// TODO: use is_integral/is_same to check the scalar_t and simplify the implementation
|
||||
// currently it does not work
|
||||
template <>
|
||||
vec::Vectorized<unsigned char> is_nan_vec<unsigned char>(vec::Vectorized<unsigned char> vec) {
|
||||
|
||||
@ -74,7 +74,7 @@ it to sum up the entire array into a single value.
|
||||
|
||||
`ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which
|
||||
compiler flags it is currently compiled. This allows the programmer to write
|
||||
generic code, which will be compiled under multipled compilation settings.
|
||||
generic code, which will be compiled under multiplied compilation settings.
|
||||
|
||||
`../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains
|
||||
a generic definition of `sumImplAll`. This function allows the user to reduce
|
||||
|
||||
@ -889,7 +889,7 @@ void ImagingResampleHorizontalConvolution8u(
|
||||
_mm_loadu_si128((__m128i *) (lineIn_min + stride * i))),
|
||||
_mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1);
|
||||
|
||||
// Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA
|
||||
// Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA
|
||||
// RGBA: pix1 = [
|
||||
// r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0
|
||||
// r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0
|
||||
|
||||
@ -240,7 +240,7 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002);
|
||||
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
|
||||
|
||||
|
||||
/* evaluation of 8 sines at onces using AVX intrinsics
|
||||
/* evaluation of 8 sines at once using AVX intrinsics
|
||||
|
||||
The code is the exact rewriting of the cephes sinf function.
|
||||
Precision is excellent as long as x < 8192 (I did not bother to
|
||||
|
||||
@ -311,7 +311,7 @@ void GroupNormKernelImplChannelsLastInternal(
|
||||
const bool gamma_null = (gamma_data == nullptr);
|
||||
const bool beta_null = beta_data == nullptr;
|
||||
|
||||
// NB: About algorithm choosen:
|
||||
// NB: About algorithm chosen:
|
||||
//
|
||||
// On channels last, GroupNorm has a input shape of {N, H, W, GD},
|
||||
// Mean and rstd are collected per each n and g, which involves reduction
|
||||
|
||||
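For context on the channels-last note above: the per-(n, g) mean and rstd feed the usual GroupNorm transform. This is the standard definition rather than anything stated in the hunk, with C = G * D channels and g the group index of channel c:

```latex
y_{n,c,h,w} = \frac{x_{n,c,h,w} - \mu_{n,g}}{\sqrt{\sigma^2_{n,g} + \varepsilon}}\;\gamma_c + \beta_c,
\qquad g = \lfloor c / D \rfloor .
```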
@ -930,7 +930,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
|
||||
}
|
||||
};
|
||||
|
||||
// Dynamically Quantize the float32 input to 8 bit assymetric
|
||||
// Dynamically Quantize the float32 input to 8 bit asymmetric
|
||||
input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx);
|
||||
|
||||
const size_t lhs_stride =
|
||||
@ -1163,7 +1163,7 @@ void dyn_quant_matmul_4bit_kernel(
|
||||
const int64_t weight_packed_size =
|
||||
kleidiai::kai_pack_rhs_int4_size(N, K, block_size);
|
||||
if (weight_packed_size == packed_weights.numel()) {
|
||||
// KleidiAI interface intenally handles the Channelwise and groupwise
|
||||
// KleidiAI interface internally handles the Channelwise and groupwise
|
||||
// distinction
|
||||
kleidiai::kai_quant_pack_lhs_int4_mm(
|
||||
output, inp, packed_weights, M, N, K, block_size);
|
||||
|
||||
File diff suppressed because it is too large.
@ -1,11 +1,11 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/Context.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/cuda/CachingHostAllocator.h>
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <ATen/cuda/CUDAEvent.h>
|
||||
#include <ATen/cuda/CachingHostAllocator.h>
|
||||
#include <ATen/cuda/PeerToPeerAccess.h>
|
||||
#include <ATen/native/Copy.h>
|
||||
#include <ATen/native/TensorIterator.h>
|
||||
@ -27,6 +27,24 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
namespace {
|
||||
|
||||
// Initial pool size for CUDA events per device.
|
||||
constexpr size_t kInitialEventPoolSize = 8;
|
||||
|
||||
at::cuda::CUDAEventPtr getEventFromPool(const at::DeviceIndex device_idx) {
|
||||
static auto* event_pool = []() {
|
||||
auto* pool = new at::cuda::EventPool();
|
||||
// Pre-populate the pool with events to avoid stalls in creating events
|
||||
pool->init_num_events(kInitialEventPoolSize);
|
||||
return pool;
|
||||
}();
|
||||
|
||||
return event_pool->get(device_idx);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void neg_kernel_cuda(TensorIteratorBase &iter);
|
||||
void conj_kernel_cuda(TensorIteratorBase &iter);
|
||||
|
||||
@ -263,12 +281,14 @@ void copy_device_to_device(TensorIterator& iter,
|
||||
// write-after-read dependencies on the destination side are handled, so
|
||||
// that no one is operating on the dst memory when we perform the copy.
|
||||
// src waits on dst barrier (src already waits on src)
|
||||
CUDAEvent dst_ready;
|
||||
|
||||
// Use event pool for better performance instead of creating new events
|
||||
auto dst_ready = getEventFromPool(dst_device.index());
|
||||
device_guard.set_device(dst_device);
|
||||
dst_ready.record(getCurrentCUDAStream(dst_device.index()));
|
||||
dst_ready->record(getCurrentCUDAStream(dst_device.index()));
|
||||
|
||||
device_guard.set_device(src_device);
|
||||
dst_ready.block(copy_stream);
|
||||
dst_ready->block(copy_stream);
|
||||
}
|
||||
|
||||
if (memcpy_eligible) {
|
||||
@ -307,11 +327,11 @@ void copy_device_to_device(TensorIterator& iter,
|
||||
// operate on dst's copy until the copy is complete.
|
||||
|
||||
// Still on src_device, record stream event
|
||||
CUDAEvent src_ready;
|
||||
src_ready.record(copy_stream);
|
||||
auto src_ready = getEventFromPool(src_device.index());
|
||||
src_ready->record(copy_stream);
|
||||
|
||||
device_guard.set_device(dst_device);
|
||||
src_ready.block(getCurrentCUDAStream(dst_device.index()));
|
||||
src_ready->block(getCurrentCUDAStream(dst_device.index()));
|
||||
}
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
@ -494,7 +494,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen)
|
||||
auto value = static_cast<scalar_t>(rand * range + from);
|
||||
// reverse the bounds of curand4 from (0, 1] to [0, 1)
|
||||
// Note that this method is from legacy THCTensorRandom and is likely to give
|
||||
// you more 0-s, since, the probability of gettings 1-s is higher than 0-s and
|
||||
// you more 0-s, since, the probability of getting 1-s is higher than 0-s and
|
||||
// by reversing the bounds, we are flipping the probabilities of 1-s and 0-s.
|
||||
// BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
|
||||
auto reverse_bound_value = value == to ? from : value;
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
#endif
|
||||
|
||||
// ROCm 6.3 is planned to have these functions, but until then here they are.
|
||||
#if defined(USE_ROCM) && ROCM_VERSION >= 60201
|
||||
#if defined(USE_ROCM)
|
||||
#include <device_functions.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <hip/hip_bf16.h>
|
||||
@ -115,9 +115,7 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
|
||||
index_t index,
|
||||
const index_t numel,
|
||||
scalar_t value) {
|
||||
#if ( \
|
||||
(defined(USE_ROCM) && ROCM_VERSION < 60201) || \
|
||||
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))
|
||||
gpuAtomicAddNoReturn(
|
||||
reinterpret_cast<at::Half*>(tensor) + index,
|
||||
static_cast<at::Half>(value));
|
||||
@ -160,9 +158,7 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
|
||||
index_t index,
|
||||
const index_t numel,
|
||||
scalar_t value) {
|
||||
#if ( \
|
||||
(defined(USE_ROCM) && ROCM_VERSION < 60201) || \
|
||||
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
|
||||
gpuAtomicAddNoReturn(
|
||||
reinterpret_cast<at::BFloat16*>(tensor) + index,
|
||||
static_cast<at::BFloat16>(value));
|
||||
|
||||
@ -154,7 +154,7 @@ REGISTER_CUDA_DISPATCH(lstsq_stub, &lazy_lstsq_kernel)
|
||||
|
||||
// Old style dispatches
|
||||
// torch_cuda_linalg dynamic library should have a global constructor
|
||||
// that calls regiserLinaglDispatch so in order ot lazy bind
|
||||
// that calls registerLinalgDispatch so in order ot lazy bind
|
||||
// old style dispatch all one have to do is to load library and call disp.func_name
|
||||
// Protect from infinite recursion by initializing dispatch to self and checking
|
||||
// that values are different after linalg library were loaded
|
||||
|
||||
@ -311,7 +311,7 @@ __global__ void batch_norm_collect_statistics_kernel(
|
||||
stat_accscalar_t v_[UNRL];
|
||||
for (int x = threadIdx.x; x < input.size(2); x += blockDim.x*UNRL) {
|
||||
for (int u = 0; u < UNRL; u++)
|
||||
v_[u] = input[batch][plane][min(x+u*blockDim.x, input.size(2)-1)];
|
||||
v_[u] = input[batch][plane][std::min(x+u*blockDim.x, input.size(2)-1)];
|
||||
for (int u = 0; u < UNRL; u++) {
|
||||
if (x+u*blockDim.x < input.size(2)) {
|
||||
stat_accscalar_t d1 = v_[u] - avg;
|
||||
|
||||
aten/src/ATen/native/cuda/ScaledBlas.cpp (new file, 1284 lines; diff suppressed because it is too large)
aten/src/ATen/native/cuda/cuBlasCommonArgs.h (new file, 171 lines)
@ -0,0 +1,171 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/core/Tensor.h>
|
||||
|
||||
namespace at::native {
|
||||
|
||||
using at::blas::ScalingType;
|
||||
using at::blas::SwizzleType;
|
||||
|
||||
namespace {
|
||||
|
||||
// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492
|
||||
c10::MaybeOwned<Tensor> inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) {
|
||||
if (resolve_conj && tensor.is_conj()) {
|
||||
return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj());
|
||||
} else {
|
||||
return c10::MaybeOwned<Tensor>::borrowed(tensor);
|
||||
}
|
||||
}
|
||||
|
||||
c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) {
|
||||
if (tensor.is_non_overlapping_and_dense()) { // common case
|
||||
transpose_tensor = tensor.is_contiguous();
|
||||
return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor);
|
||||
}
|
||||
IntArrayRef tensor_strides = tensor.strides();
|
||||
IntArrayRef tensor_sizes = tensor.sizes();
|
||||
if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
|
||||
transpose_tensor = false;
|
||||
return resolve_conj_if_indicated(tensor, !transpose_result);
|
||||
} else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
|
||||
transpose_tensor = true;
|
||||
return resolve_conj_if_indicated(tensor, transpose_result);
|
||||
} else {
|
||||
transpose_tensor = true;
|
||||
return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
|
||||
}
|
||||
}
|
||||
|
||||
c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) {
|
||||
if (tensor.is_non_overlapping_and_dense()) { // common case
|
||||
transpose_tensor = tensor.is_contiguous();
|
||||
return resolve_conj_if_indicated(tensor, true);
|
||||
}
|
||||
|
||||
IntArrayRef tensor_strides = tensor.strides();
|
||||
IntArrayRef tensor_sizes = tensor.sizes();
|
||||
if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
|
||||
transpose_tensor = false;
|
||||
return resolve_conj_if_indicated(tensor, true);
|
||||
} else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
|
||||
transpose_tensor = true;
|
||||
return resolve_conj_if_indicated(tensor, true);
|
||||
} else {
|
||||
transpose_tensor = true;
|
||||
return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
|
||||
* @brief Prepares matrices for CUBLAS operation
|
||||
*
|
||||
* This constructor prepares tensors for CUBLAS
|
||||
* The main difference is that PyTorch uses row-major as the default and
|
||||
* CUBLAS expects column-major.
|
||||
*
|
||||
* @details
|
||||
* To enable row-major output while using CUBLAS,
|
||||
* we use the mathematical identity that (A × B)^T = B^T × A^T.
|
||||
*
|
||||
* Transpose in this context refers to Cublas's(Fortran) definition of transpose (row-major)
|
||||
* T = row-major, N = col-major
|
||||
*
|
||||
* Example:
|
||||
* For matrices A (M×K)(row-major) and B (K×N)(row-major):
|
||||
* - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major)
|
||||
* - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N)
|
||||
* - However, since the output form cublas is column-major this is
|
||||
* - equivalent to an output of size MxN row-major as expected
|
||||
*
|
||||
* The transpose flags are derived from the layouts of the passed in tensors
|
||||
*
|
||||
* If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted
|
||||
* to their unpacked values to match what cuBLAS expects.
|
||||
*
|
||||
* @param mat1 First input matrix
|
||||
* @param mat2 Second input matrix
|
||||
* @param c Output matrix (result)
|
||||
* @param scale_a Optional scaling factor for first matrix
|
||||
* @param scale_b Optional scaling factor for second matrix
|
||||
* @param scale_result Optional scaling factor for result
|
||||
*/
|
||||
struct cublasCommonArgs {
|
||||
cublasCommonArgs(
|
||||
const Tensor& mat1,
|
||||
const Tensor& mat2,
|
||||
Tensor& c,
|
||||
const std::optional<Tensor>& scale_a = std::nullopt,
|
||||
const std::optional<Tensor>& scale_b = std::nullopt,
|
||||
const std::optional<Tensor>& scale_result = std::nullopt,
|
||||
const std::optional<ScalingType>& scaling_choice_a = std::nullopt,
|
||||
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
|
||||
bool transpose_result = false, transpose_a = false, transpose_b = false;
|
||||
result = prepare_matrix_for_cublas(c, transpose_result);
|
||||
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
|
||||
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
|
||||
|
||||
// Handle scale tensors if provided
|
||||
if (scale_a && scale_b) {
|
||||
// By default since we return in row-major we run the gemm
|
||||
// as B.T @ A.T, check transpose_result to determine if we flip the scales
|
||||
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
|
||||
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
|
||||
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
|
||||
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
|
||||
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
|
||||
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
|
||||
}
|
||||
|
||||
if (scale_result) {
|
||||
scale_result_ptr = scale_result->data_ptr();
|
||||
scale_result_dtype = scale_result->scalar_type();
|
||||
}
|
||||
|
||||
// Update transpose flags
|
||||
if (transpose_result) {
|
||||
transpose_a = !transpose_a;
|
||||
transpose_b = !transpose_b;
|
||||
}
|
||||
|
||||
auto sizes_a = mata->sizes();
|
||||
auto sizes_b = matb->sizes();
|
||||
|
||||
m = sizes_a[transpose_result ? 1 : 0];
|
||||
k = sizes_a[transpose_result ? 0 : 1];
|
||||
n = sizes_b[transpose_result ? 0 : 1];
|
||||
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
|
||||
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
|
||||
result_ld = result->stride(transpose_result ? 0 : 1);
|
||||
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
|
||||
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
|
||||
|
||||
// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
|
||||
// if the gemm operands are in packed float4
|
||||
if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) {
|
||||
k = k * 2;
|
||||
lda = lda * 2;
|
||||
ldb = ldb * 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Matrix members
|
||||
char transa, transb;
|
||||
int64_t m, n, k;
|
||||
int64_t lda, ldb, result_ld;
|
||||
c10::MaybeOwned<Tensor> mata, matb, result;
|
||||
|
||||
// Scale members
|
||||
void* scale_mata_ptr = nullptr;
|
||||
void* scale_matb_ptr = nullptr;
|
||||
void* scale_result_ptr = nullptr;
|
||||
std::optional<c10::ScalarType> scale_mata_dtype;
|
||||
std::optional<ScalingType> scaling_mata_type;
|
||||
std::optional<c10::ScalarType> scale_matb_dtype;
|
||||
std::optional<ScalingType> scaling_matb_type;
|
||||
std::optional<c10::ScalarType> scale_result_dtype;
|
||||
};
|
||||
|
||||
} // namespace at::native
|
||||
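The doc comment on `cublasCommonArgs` above leans on the identity (A B)^T = B^T A^T to get row-major output from a column-major GEMM. A self-contained sketch of that trick follows; `gemm_colmajor` is a naive stand-in for a cublasSgemm-style routine (alpha = 1, beta = 0), not a real cuBLAS call.

```cpp
#include <cstdio>
#include <vector>

// Naive column-major GEMM stand-in: C[m x n] = A[m x k] * B[k x n], all column-major.
static void gemm_colmajor(int m, int n, int k,
                          const float* A, int lda,
                          const float* B, int ldb,
                          float* C, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p)
        acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = acc;
    }
}

// Row-major C = A * B expressed through the column-major routine via
// (A * B)^T = B^T * A^T: swap the operands and the m/n dimensions, and the
// column-major result buffer is exactly the row-major C we wanted.
static void gemm_rowmajor(int M, int N, int K,
                          const float* A, const float* B, float* C) {
  gemm_colmajor(/*m=*/N, /*n=*/M, /*k=*/K,
                /*A=*/B, /*lda=*/N,
                /*B=*/A, /*ldb=*/K,
                /*C=*/C, /*ldc=*/N);
}

int main() {
  // A is 2x3, B is 3x2, C is 2x2, all row-major.
  std::vector<float> A{1, 2, 3, 4, 5, 6}, B{7, 8, 9, 10, 11, 12}, C(4);
  gemm_rowmajor(2, 2, 3, A.data(), B.data(), C.data());
  std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  // 58 64 / 139 154
  return 0;
}
```

The operand swap and the m/n exchange are what the constructor expresses through `transpose_result` and the derived transpose flags and leading dimensions.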
@ -1,4 +1,4 @@
|
||||
#if (defined(USE_ROCM) && ROCM_VERSION >= 50700) || ((defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)))
|
||||
#if defined(USE_ROCM) || ((defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)))
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
@ -133,7 +133,7 @@ inline __host__ __device__ uint32_t getAlignmentRoundUp(const void* p) {
|
||||
#define CDNA2_OR_LATER 0
|
||||
#endif
|
||||
|
||||
#if (defined(USE_ROCM) && ROCM_VERSION >= 50700) || ((defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)))
|
||||
#if defined(USE_ROCM) || ((defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)))
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
// TODO: Support RDNA
|
||||
@ -1161,7 +1161,7 @@ at::Tensor _weight_int4pack_mm_cuda(
|
||||
auto C_final = at::empty(
|
||||
{m, n}, at::TensorOptions().dtype(at::kBFloat16).device(A.device()));
|
||||
|
||||
#if (defined(USE_ROCM) && ROCM_VERSION >= 50700) || ((defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)))
|
||||
#if defined(USE_ROCM) || ((defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)))
|
||||
auto stream = at::cuda::getCurrentCUDAStream();
|
||||
#define RUN_GEMM(WARPS, K_TILES_PER_WARP, Q_GROUP_SIZE, REDUCE_TYPE) \
|
||||
do { \
|
||||
@ -1327,7 +1327,7 @@ at::Tensor _convert_weight_to_int4pack_cuda(
|
||||
{nTilesTensor, kSuperTiles, 32, innerKTiles / 2},
|
||||
at::TensorOptions().dtype(at::kInt).device(in.device()));
|
||||
|
||||
#if (defined(USE_ROCM) && ROCM_VERSION >= 50700) || ((defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)))
|
||||
#if defined(USE_ROCM) || ((defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)))
|
||||
auto stream = at::cuda::getCurrentCUDAStream();
|
||||
dim3 grid(kSuperTiles, nTiles);
|
||||
|
||||
|
||||
@ -1532,7 +1532,7 @@ NvrtcFunction jit_pwise_function(
|
||||
|
||||
std::string file_path;
|
||||
if (cache_dir.has_value()) {
|
||||
// Attemps to read from the cache.
|
||||
// Attempts to read from the cache.
|
||||
// Cubin name is <kernel name>_arch<major>.<minor>_nvrtc<major>.<minor>_<ptx or sass>_<program length>_<string hash>
|
||||
// Note that the SHA1 hash used in the file name is NOT the SHA1 hash of the file's contents,
|
||||
// because we hash on the CUDA code, but we save the compiled ptx or sass
|
||||
|
||||
@ -1346,7 +1346,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
|
||||
});
|
||||
|
||||
if (input.dim() > 2) {
|
||||
// if upper=true we need to tranpose and conjugate the result tensor
|
||||
// if upper=true we need to transpose and conjugate the result tensor
|
||||
// because the cholesky decomposition is stored in the lower triangular part
|
||||
if (upper) {
|
||||
input.copy_(result.mH());
|
||||
@ -1857,7 +1857,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
|
||||
|
||||
auto preferred_backend = at::globalContext().linalgPreferredBackend();
|
||||
switch (preferred_backend) {
|
||||
// TODO Investigate whether the following magma bug is still occuring.
|
||||
// TODO Investigate whether the following magma bug is still occurring.
|
||||
// It may be the case that geqrf followed by orgqr is wrong for the magma backend
|
||||
// geqrf_magma currently uses geqrf2_gpu
|
||||
//
|
||||
|
||||
@ -82,7 +82,7 @@ void lu_factor_looped_cusolver(const Tensor& self, const Tensor& pivots, const T
|
||||
#if defined(BUILD_LAZY_CUDA_LINALG)
|
||||
namespace cuda { namespace detail {
|
||||
// This is only used for an old-style dispatches
|
||||
// Please do not add any new entires to it
|
||||
// Please do not add any new entries to it
|
||||
struct LinalgDispatch {
|
||||
Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper);
|
||||
};
|
||||
|
||||
@ -147,7 +147,7 @@ static void check_shape_forward(const Tensor& input,
|
||||
// blocked format will propagate between layers. Input, output will be in blocked format.
|
||||
//
|
||||
// For inference case, weight can be prepacked into blocked format by
|
||||
// (so as to save weight reoder overhead):
|
||||
// (so as to save weight reorder overhead):
|
||||
// model = torch.utils.mkldnn.to_mkldnn(model)
|
||||
//
|
||||
// For training case, grad_output can be CPU tensor or MKLDNN tensor,
|
||||
@ -723,7 +723,7 @@ Tensor _mkldnn_convolution_transpose(
|
||||
ideep::tensor w = itensor_from_tensor(weight, /*from_const_data_ptr*/true);
|
||||
if (!weight.is_mkldnn()) {
|
||||
// mkldnn transposed convolution has weight in logical order of OIHW or OIDHW,
|
||||
// while PyTorch has IOHW or IODHW, `._tranpose()` switches strides (no memory copy).
|
||||
// while PyTorch has IOHW or IODHW, `._transpose()` switches strides (no memory copy).
|
||||
w.transpose_(0, 1);
|
||||
}
|
||||
|
||||
|
||||
@ -540,7 +540,7 @@ static void _mkldnn_matmul_i8i8i32_with_primitive(
|
||||
args.insert({DNNL_ARG_WEIGHTS, expected_weight});
|
||||
args.insert({DNNL_ARG_DST, dst});
|
||||
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
|
||||
// Create primitve and execute
|
||||
// Create primitive and execute
|
||||
auto primitive = dnnl::matmul(prim_desc);
|
||||
primitive.execute(ideep::stream::default_stream(), args);
|
||||
}
|
||||
|
||||
@ -439,7 +439,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> mkldnn_rnn_la
|
||||
// I. Memory Formats
|
||||
// a. mkldnn will use plain formats for input, hx/cx, output, hy/cy
|
||||
// and possibly use blocked formats for weights depending shape info.
|
||||
// b. All mkldnn memorys are created (in plain format) as views on ATen tensor,
|
||||
// b. All mkldnn memories are created (in plain format) as views on ATen tensor,
|
||||
// the weight reorder(if any) is handed automatically inside ideep (mkldnn bridge)
|
||||
//
|
||||
// II. MKLDNN Primitive Mapping
|
||||
|
||||
@ -39,7 +39,7 @@ void check_mkldnn_binary_fusion_inputs(
|
||||
inline std::vector<int64_t> padding_r(
|
||||
IntArrayRef padding, IntArrayRef output_padding)
|
||||
{
|
||||
// ConvTranpose padding adjustment
|
||||
// ConvTranspose padding adjustment
|
||||
//
|
||||
// PyTorch uses padding/output_padding:
|
||||
// osize = (isize - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1
|
||||
|
||||
@ -75,7 +75,7 @@ bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) {
|
||||
}
|
||||
|
||||
bool can_use_flash_attention(sdp::sdp_params const& params, bool debug) {
|
||||
// Currently, XPU fallbacks flash attention to overrideable
|
||||
// Currently, XPU fallbacks flash attention to overridable
|
||||
return can_use_overrideable_attention(params, debug);
|
||||
}
|
||||
|
||||
@ -115,7 +115,7 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
|
||||
// 1. Flash Attention
|
||||
// 2. Math fallback
|
||||
auto& ctx = at::globalContext();
|
||||
// use overrideable linked to onednn as overrideable implementation
|
||||
// use overridable linked to onednn as overridable implementation
|
||||
if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP() &&
|
||||
!ctx.userEnabledFlashSDP()) {
|
||||
return sdp::SDPBackend::error;
|
||||
@ -165,7 +165,7 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
|
||||
}
|
||||
}
|
||||
// If we have gotten to this point then two things have happened:
|
||||
// 1. can_use_overrideable_attention did not satisfy the constraints to be ran
|
||||
// 1. can_use_overridable_attention did not satisfy the constraints to be ran
|
||||
// 2. The user has explicitly disabled the math kernel
|
||||
// We then re-run the kernel checks with debug enabled to print out the
|
||||
// reason why the kernel was not selected
|
||||
|
||||
@ -215,7 +215,7 @@ partition create_sdpa_graph_partition(
|
||||
// For optional additive mask
|
||||
std::optional<op> mask_add;
|
||||
|
||||
// For optional implicite causal mask
|
||||
// For optional implicit causal mask
|
||||
std::optional<op> mask_gen_idx_row;
|
||||
std::optional<logical_tensor> mask_row_idx;
|
||||
std::optional<op> mask_gen_idx_col;
|
||||
@ -556,7 +556,7 @@ partition create_sdpa_backward_graph_partition(
|
||||
// For optional additive mask
|
||||
std::optional<op> mask_add;
|
||||
|
||||
// For optional implicite causal mask
|
||||
// For optional implicit causal mask
|
||||
std::optional<op> mask_gen_idx_row;
|
||||
std::optional<logical_tensor> mask_row_idx;
|
||||
std::optional<op> mask_gen_idx_col;
|
||||
|
||||
@ -345,7 +345,7 @@ class Attr {
|
||||
dnnl::memory binary_m;
|
||||
auto binary = ops_params_[i].binary_;
|
||||
auto md = ops_params_[i].meta_;
|
||||
// qeury expected_md to achieve peak performance
|
||||
// query expected_md to achieve peak performance
|
||||
auto expected_md = pd.query_md(
|
||||
dnnl::query::exec_arg_md,
|
||||
DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1);
|
||||
|
||||
@ -301,7 +301,7 @@ bool is_onednn_matmul_strides(const at::Tensor& tensor) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// the overlaped cases are not supported
|
||||
// the overlapped cases are not supported
|
||||
dnnl::memory::dims strides = get_onednn_strides(tensor);
|
||||
int64_t storage_size = 1;
|
||||
for (size_t dim = 0; dim < tensor_dim; ++dim)
|
||||
|
||||
@ -29,7 +29,7 @@
|
||||
secondaryTensor:(MPSGraphTensor*)secondaryTensor
|
||||
name:(NSString*)name {
|
||||
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
|
||||
// agruments results in
|
||||
// arguments results in
|
||||
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
|
||||
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
|
||||
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {
|
||||
@ -42,7 +42,7 @@
|
||||
secondaryTensor:(MPSGraphTensor*)secondaryTensor
|
||||
name:(NSString*)name {
|
||||
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
|
||||
// agruments results in
|
||||
// arguments results in
|
||||
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
|
||||
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
|
||||
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {
|
||||
@ -539,7 +539,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
|
||||
|
||||
static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
|
||||
// Use gather kernel to solve strides for macOS < 15.0
|
||||
// Starting with macOS 15.0, MPS supports native strides direclty in the kernels
|
||||
// Starting with macOS 15.0, MPS supports native strides directly in the kernels
|
||||
if (!is_macOS_15_0_or_newer || !useMPSStridedAPI) {
|
||||
if ((!src.is_contiguous() || src.storage_offset()) && gatherTensorData) {
|
||||
Tensor emptyShell = Tensor();
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#pragma onces
|
||||
#pragma once
|
||||
#include <c10/metal/common.h>
|
||||
|
||||
template <unsigned N = c10::metal::max_ndim>
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <ATen/native/SpectralOpsUtils.h>
|
||||
#include <ATen/native/mps/OperationUtils.h>
|
||||
|
||||
@@ -37,25 +39,12 @@ NSArray<NSNumber*>* IntArrayToNSArray(IntArrayRef arr) {
 } // anonymous namespace

 Tensor _fft_c2r_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
 TORCH_CHECK(self.is_complex());
-auto in_sizes = self.sizes();
-DimVector out_sizes(in_sizes.begin(), in_sizes.end());
-out_sizes[dim.back()] = last_dim_size;
-auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type())));
+auto out = at::empty({}, self.options().dtype(c10::toRealValueType(self.scalar_type())));
 return _fft_c2r_mps_out(self, dim, normalization, last_dim_size, out);
 }

 Tensor _fft_r2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) {
 TORCH_CHECK(self.is_floating_point());
-auto input_sizes = self.sizes();
-DimVector out_sizes(input_sizes.begin(), input_sizes.end());
-auto last_dim = dim.back();
-auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1;
-if (onesided) {
-out_sizes[last_dim] = last_dim_halfsize;
-}
-
-auto out = at::empty(out_sizes, self.options().dtype(c10::toComplexType(self.scalar_type())));
+auto out = at::empty({}, self.options().dtype(c10::toComplexType(self.scalar_type())));
 return _fft_r2c_mps_out(self, dim, normalization, onesided, out);
 }

@@ -72,6 +61,17 @@ using namespace mps;

 // TODO: Investigate numerical discrepancies see https://github.com/pytorch/pytorch/issues/120237
 Tensor& _fft_r2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided, Tensor& out) {
+TORCH_CHECK(self.scalar_type() == kFloat || self.scalar_type() == kHalf, "Only float and half dtypes are supported");
+TORCH_CHECK(out.scalar_type() == c10::toComplexType(self.scalar_type()));
+const auto input_sizes = self.sym_sizes();
+SymDimVector out_sizes(input_sizes.begin(), input_sizes.end());
+auto last_dim = dim.back();
+auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1;
+if (onesided) {
+out_sizes[last_dim] = last_dim_halfsize;
+}
+at::native::resize_output_symint(out, out_sizes);
+
 auto key = __func__ + getTensorsStringKey({self, out}) + ":" + getArrayRefString(dim) + ":" +
 std::to_string(normalization) + ":" + std::to_string(onesided);
 @autoreleasepool {

@@ -112,6 +112,12 @@ Tensor& _fft_c2r_mps_out(const Tensor& self,
 int64_t normalization,
 int64_t last_dim_size,
 Tensor& out) {
+TORCH_CHECK(self.is_complex(), "Input must be complex");
+TORCH_CHECK(out.scalar_type() == c10::toRealValueType(self.scalar_type()), "Unexpected output type");
+const auto in_sizes = self.sym_sizes();
+SymDimVector out_sizes(in_sizes.begin(), in_sizes.end());
+out_sizes[dim.back()] = last_dim_size;
+at::native::resize_output_symint(out, out_sizes);
 auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" +
 std::to_string(normalization) + ":" + std::to_string(last_dim_size);
 @autoreleasepool {

@@ -158,7 +158,7 @@ static void reduction_out_mps(const Tensor& input_t,
 IntArrayRef dim = opt_dim.value();
 for (const auto dim_val : dim) {
 auto wrap_dim = maybe_wrap_dim(dim_val, input_shape.size());
-// canSqueeze logic is broken when dim is negative, it introduces off-by-one-erros or crashes
+// canSqueeze logic is broken when dim is negative, it introduces off-by-one-errors or crashes
 // See https://github.com/pytorch/pytorch/issues/136132#issuecomment-2354482608
 if (wrap_dim >= 4 || dim_val < 0) {
 canSqueezeLastDim = false;

@@ -1282,7 +1282,7 @@ static void all_any_common_impl_mps(const Tensor& input_t,
 auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);

 auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
-// reductionOrWithTensor:axis: will throw an internal assert if number of dimentions is more than 4
+// reductionOrWithTensor:axis: will throw an internal assert if number of dimensions is more than 4
 // See https://github.com/pytorch/pytorch/issues/95538
 MPSGraphTensor* outputTensor = nil;
 if (input_t.ndimension() > 4) {

@@ -1352,7 +1352,7 @@ TORCH_IMPL_FUNC(any_all_out_mps)(const Tensor& input_t, const Tensor& output_t)
 auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
 auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
 auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
-// reductionOrWithTensor:axes: will throw an internal assert if number of dimentions is more than 4
+// reductionOrWithTensor:axes: will throw an internal assert if number of dimensions is more than 4
 // See https://github.com/pytorch/pytorch/issues/95538
 if (input_t.dim() > 4) {
 castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil];

@@ -1400,7 +1400,7 @@ TORCH_IMPL_FUNC(all_all_out_mps)(const Tensor& input_t, const Tensor& output_t)
 auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
 auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
 auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
-// reductionAndWithTensor:axes: will throw an internal assert if number of dimentions is more than 4
+// reductionAndWithTensor:axes: will throw an internal assert if number of dimensions is more than 4
 // See https://github.com/pytorch/pytorch/issues/95538
 if (input_t.ndimension() > 4) {
 castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil];

@@ -19,7 +19,7 @@ namespace at::native::mps {

 // For both scatter and gather kernels, there are 4 specized ones (for 1D to 4D tensor)
 // and one generic, for 5+D ones. Assumption (to be tested) about specialized kernels
-// is that reduction of n-dimentional vector, where n is 2, should be slower
+// is that reduction of n-dimensional vector, where n is 2, should be slower
 // than reduction of 2D one, as n is not known at compiler time, therefore compiler
 // could not do loop unrolls, that is
 // float sum(float* v, int n) {

@@ -53,7 +53,7 @@ C10_ALWAYS_INLINE std::pair<int64_t, int64_t> _check_nested_layer_norm_inputs(
 normalized_shape);

 // Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input
-// Also, compute M and N considering the idiosyncracies of NestedTensors
+// Also, compute M and N considering the idiosyncrasies of NestedTensors
 int64_t N = 1;
 for (const auto i: c10::irange(normalized_ndim)) {
 TORCH_CHECK(

@@ -95,7 +95,7 @@ std::vector<Tensor> chunk_nested_tensor(const Tensor& self, int64_t chunks, int6
 for (const auto split_idx : c10::irange(chunks)) {
 auto new_sizes = sizes.clone();
 auto new_strides = strides.clone();
-// This copys offsets so we are safe to move
+// This copies offsets so we are safe to move
 auto new_offsets = offsets.clone();
 int64_t *size_ptr = new_sizes.data_ptr<int64_t>();
 int64_t *new_offsets_ptr = new_offsets.data_ptr<int64_t>();

@@ -245,7 +245,7 @@ int64_t get_nnz(const Tensor& nestedtensor) {
 // this is because needs_broadcast indicates that the batch_size is 1
 // and hence there is only 1 value for seq_len
 // (2) The cum_seq_lens are given by [0, {*}_t.size(1), 2 * {*}_t.size(1),
-// ..., outut_batch_size * {*}_t.size(1)] (3) Nnz_{*} is given by
+// ..., output_batch_size * {*}_t.size(1)] (3) Nnz_{*} is given by
 // output_batch_size * {*}_t.size(1);

 int64_t max_seqlen_batch_q = 0, Nnz_q = 0;

@@ -575,24 +575,9 @@ void spmm(
 cusparseOperation_t opB = transpose_B ? CUSPARSE_OPERATION_TRANSPOSE
 : CUSPARSE_OPERATION_NON_TRANSPOSE;

-// CUDA < 11.0 doesn't support 64-bit indices and doesn't raise an error about this
-// silently returning incorrect results
-#if defined(USE_ROCM) && (ROCM_VERSION < 60300)
-auto mat1_32 = at::native::_sparse_csr_tensor_unsafe(
-mat1.crow_indices().to(kInt),
-mat1.col_indices().to(kInt),
-mat1.values(),
-mat1.sizes(),
-mat1.scalar_type(),
-mat1.layout(),
-mat1.device());
-auto descA = at::cuda::sparse::CuSparseSpMatCsrDescriptor(mat1_32);
-auto algorithm = CUSPARSE_MM_ALG_DEFAULT;
-#else // defined(USE_ROCM) && (ROCM_VERSION < 60300)
 // TODO: update this to support COO sparse layout
 auto descA = at::cuda::sparse::CuSparseSpMatCsrDescriptor(mat1);
 auto algorithm = CUSPARSE_SPMM_CSR_ALG2;
-#endif // defined(USE_ROCM) && (ROCM_VERSION < 60300)

 auto descB = at::cuda::sparse::CuSparseConstDnMatDescriptor(
 transpose_B ? mat2_->mT() : *mat2_);

@@ -193,12 +193,12 @@ vTensor pack_biases_quantized_weights(
 src_kw_sz = b_sizes[Layout::BatchMatrices::width];
 src_kh_sz = b_sizes[Layout::BatchMatrices::height];
 } else if (bias.sizes().size() == 2) {
-// skip batch dim for boardcasting; index -1
+// skip batch dim for broadcasting; index -1
 src_kb_sz = 1;
 src_kw_sz = b_sizes[Layout::BatchMatrices::height];
 src_kh_sz = b_sizes[Layout::BatchMatrices::batch];
 } else {
-// skip batch & height dim for boardcasting; index -2
+// skip batch & height dim for broadcasting; index -2
 src_kb_sz = 1;
 src_kw_sz = b_sizes[Layout::BatchMatrices::batch];
 src_kh_sz = 1;

@@ -327,13 +327,13 @@ bool available_check_with_batch(
 weight.size(Layout::BatchMatrices::batch) ||
 bias->size(Layout::BatchMatrices::batch) == 1);
 } else if (bias->ndimension() == 2) {
-// skip batch dim for boardcasting; index -1
+// skip batch dim for broadcasting; index -1
 bias_available &=
 (bias->size(Layout::BatchMatrices::height) ==
 weight.size(Layout::BatchMatrices::width) ||
 bias->size(Layout::BatchMatrices::height) == 1);
 } else {
-// skip batch & height dim for boardcasting; index -2
+// skip batch & height dim for broadcasting; index -2
 bias_available &=
 (bias->size(Layout::BatchMatrices::batch) ==
 weight.size(Layout::BatchMatrices::width) ||

@@ -158,7 +158,7 @@ class TORCH_API Tensor: public TensorBase {
 // will only lead to trouble and dangling references.
 c10::MaybeOwned<Tensor> expect_contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) && = delete;

-// The following overloads are very intruiging. Consider the following
+// The following overloads are very intriguing. Consider the following
 // program:
 //
 // x[1] = 3;

@@ -6894,7 +6894,7 @@ TEST_F(VulkanAPITest, slice_height_success) {
 {2, {2, 3, 40, 50}}, // 4D tensors with dim=height
 {1, {3, 40, 50}}, // 3D tensors with dim=height
 {0, {40, 50}}, // 2D tensors with dim=height
-// 1D tesnors don't have height dim for test
+// 1D tensors don't have height dim for test
 };

 // Act/Assert

@@ -6906,7 +6906,7 @@ TEST_F(VulkanAPITest, slice_feature_success) {
 std::unordered_map<int64_t, std::vector<int64_t>> dim2sizes {
 {1, {2, 40, 13, 14}}, // 4D tensors with dim=feature(channel)
 {0, {40, 13, 14}}, // 3D tensors with dim=feature(channel)
-// 1D and 2D tesnors don't have feature(channel) dim for test
+// 1D and 2D tensors don't have feature(channel) dim for test
 };

 // Act/Assert

@@ -6917,7 +6917,7 @@ TEST_F(VulkanAPITest, slice_batch_success) {
 // Arrange
 std::unordered_map<int64_t, std::vector<int64_t>> dim2sizes {
 {0, {40, 3, 13, 14}}, // 4D tensors with dim=batch
-// 1D, 2D and 3D tesnors don't have batch dim for test
+// 1D, 2D and 3D tensors don't have batch dim for test
 };

 // Act/Assert

@@ -916,6 +916,7 @@ libtorch_python_core_sources = [
 "torch/csrc/autograd/python_torch_functions_manual.cpp",
 "torch/csrc/autograd/python_variable.cpp",
 "torch/csrc/autograd/python_variable_indexing.cpp",
+"torch/csrc/distributed/python_placement.cpp",
 "torch/csrc/dynamo/python_compiled_autograd.cpp",
 "torch/csrc/dynamo/cache_entry.cpp",
 "torch/csrc/dynamo/cpp_shim.cpp",

@@ -1073,6 +1074,7 @@ aten_cpu_non_globed_sources = [
 "aten/src/ATen/detail/MPSHooksInterface.cpp",
 "aten/src/ATen/detail/MAIAHooksInterface.cpp",
 "aten/src/ATen/detail/PrivateUse1HooksInterface.cpp",
+"aten/src/ATen/detail/XLAHooksInterface.cpp",
 "aten/src/ATen/detail/XPUHooksInterface.cpp",
 "aten/src/ATen/detail/MTIAHooksInterface.cpp",
 "aten/src/ATen/detail/IPUHooksInterface.cpp",

@@ -1091,6 +1093,7 @@ aten_cpu_non_globed_headers = [
 "aten/src/ATen/detail/HPUHooksInterface.h",
 "aten/src/ATen/detail/MAIAHooksInterface.h",
 "aten/src/ATen/detail/PrivateUse1HooksInterface.h",
+"aten/src/ATen/detail/XLAHooksInterface.h",
 "aten/src/ATen/detail/XPUHooksInterface.h",
 "aten/src/ATen/detail/MTIAHooksInterface.h",
 "aten/src/ATen/detail/IPUHooksInterface.h",

@@ -329,17 +329,17 @@ struct pair {
 };

 template <typename T>
-static T conj(T a) {
+inline T conj(T a) {
 return a;
 }

 template <>
-half2 conj(half2 a) {
+inline half2 conj(half2 a) {
 return half2(a.x, -a.y);
 }

 template <>
-float2 conj(float2 a) {
+inline float2 conj(float2 a) {
 return float2(a.x, -a.y);
 }

@@ -1638,7 +1638,38 @@ if(USE_KINETO)
 message(STATUS " KINETO_LIBRARY_TYPE = ${KINETO_LIBRARY_TYPE}")

 if(NOT LIBKINETO_NOCUPTI)
 if(TARGET CUDA::cupti)
+set(CUDA_SOURCE_DIR "${CUDA_TOOLKIT_ROOT_DIR}" CACHE STRING "")
+message(STATUS " CUDA_SOURCE_DIR = ${CUDA_SOURCE_DIR}")
+message(STATUS " CUDA_INCLUDE_DIRS = ${CUDA_INCLUDE_DIRS}")
+
+if(NOT MSVC)
+if(USE_CUPTI_SO)
+set(CUPTI_LIB_NAME "libcupti.so")
+else()
+set(CUPTI_LIB_NAME "libcupti_static.a")
+endif()
+else()
+set(CUPTI_LIB_NAME "cupti.lib")
+endif()
+
+find_library(CUPTI_LIBRARY_PATH ${CUPTI_LIB_NAME} PATHS
+${CUDA_SOURCE_DIR}
+${CUDA_SOURCE_DIR}/extras/CUPTI/lib64
+${CUDA_SOURCE_DIR}/lib
+${CUDA_SOURCE_DIR}/lib64
+NO_DEFAULT_PATH)
+
+find_path(CUPTI_INCLUDE_DIR cupti.h PATHS
+${CUDA_SOURCE_DIR}/extras/CUPTI/include
+${CUDA_INCLUDE_DIRS}
+${CUDA_SOURCE_DIR}
+${CUDA_SOURCE_DIR}/include
+NO_DEFAULT_PATH)
+
+if(CUPTI_LIBRARY_PATH AND CUPTI_INCLUDE_DIR)
+message(STATUS " CUPTI_INCLUDE_DIR = ${CUPTI_INCLUDE_DIR}")
+set(CUDA_cupti_LIBRARY ${CUPTI_LIBRARY_PATH})
+message(STATUS " CUDA_cupti_LIBRARY = ${CUDA_cupti_LIBRARY}")
 message(STATUS "Found CUPTI")
 set(LIBKINETO_NOCUPTI OFF CACHE STRING "" FORCE)

@@ -1651,7 +1682,7 @@ if(USE_KINETO)
 if(NOT APPLE)
 set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} "dl" "pthread")
 endif()
-set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} $<LINK_LIBRARY:WHOLE_ARCHIVE,CUDA::cupti_static>)
+set(CMAKE_REQUIRED_LINK_OPTIONS "-Wl,--whole-archive,${CUPTI_LIBRARY_PATH},--no-whole-archive")
 check_cxx_source_runs("#include <stdexcept>
 int main() {
 try {

@@ -29,10 +29,15 @@ SET(Open_BLAS_LIB_SEARCH_PATHS
 $ENV{OpenBLAS}/lib
 $ENV{OpenBLAS_HOME}
 $ENV{OpenBLAS_HOME}/lib
-)
+)
+
+SET(Open_BLAS_LIB_NAME openblas)
+IF(DEFINED ENV{OpenBLAS_LIB_NAME})
+SET(Open_BLAS_LIB_NAME $ENV{OpenBLAS_LIB_NAME})
+ENDIF()

 FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS})
-FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS})
+FIND_LIBRARY(OpenBLAS_LIB NAMES ${Open_BLAS_LIB_NAME} PATHS ${Open_BLAS_LIB_SEARCH_PATHS})

 SET(OpenBLAS_FOUND ON)

Some files were not shown because too many files have changed in this diff.