mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-29 19:24:55 +08:00
Compare commits
144 Commits
add_conv3d
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 1dd6b76914 | |||
| 284716a691 | |||
| 8b188647cf | |||
| 96b61844a7 | |||
| 1b655a87ef | |||
| cb6966704c | |||
| 17d5aa4767 | |||
| cde81e92b9 | |||
| bfc2050db9 | |||
| c5701d0ab5 | |||
| 23669d02a6 | |||
| e8d887ae3f | |||
| 774abb018e | |||
| 0e19561e23 | |||
| 1fa520ea65 | |||
| c2e3cc7aed | |||
| 5849eea129 | |||
| 924482a6f6 | |||
| 20be077085 | |||
| 94eaeb9cb8 | |||
| 753d9bd806 | |||
| dd1fe7c22f | |||
| 695cb0d342 | |||
| 1764f3a9c8 | |||
| c9eabadc5e | |||
| c201a1cab1 | |||
| e105a47575 | |||
| aab27b051a | |||
| f8b4c00294 | |||
| 877f126e35 | |||
| 4fada51ada | |||
| 76b2c37045 | |||
| adedf26e21 | |||
| bea89d6060 | |||
| 48e672d149 | |||
| afaaaa314c | |||
| 84fe848503 | |||
| 56afad4eb3 | |||
| 2a058bfecf | |||
| 31e42eb732 | |||
| a9b29caeae | |||
| 0d4992c170 | |||
| b060e5c131 | |||
| 6d5e651a50 | |||
| 3cc5949dc2 | |||
| f167fd09fa | |||
| 68b3984b77 | |||
| a1eb6b5538 | |||
| f36f372acc | |||
| d9483d4c8d | |||
| fea819ed08 | |||
| 84a2715d34 | |||
| 572cc12b42 | |||
| 1fdef664a5 | |||
| 08ae55021e | |||
| 551921d484 | |||
| b5189e269e | |||
| 3895ce093f | |||
| 8aa087a29d | |||
| 7379972cc0 | |||
| b903018c26 | |||
| 21b48f8dfa | |||
| 009ea77234 | |||
| 0e46a10aa7 | |||
| a25818cf7e | |||
| e3e93c7107 | |||
| 1abfa5f70b | |||
| 687c15c0b3 | |||
| 895795f07c | |||
| 2dc56456cb | |||
| 8110ce02a2 | |||
| 43c30f607e | |||
| 5ebf74a655 | |||
| acd936cc1a | |||
| a4a0378e6b | |||
| ac841267a1 | |||
| 0eacd934bc | |||
| 5016e7b2eb | |||
| 544b443ea1 | |||
| 3041ede082 | |||
| 34d6ef7022 | |||
| 110efe4df4 | |||
| e137cd0a10 | |||
| be28329710 | |||
| 85a7c745aa | |||
| 32fe4f681e | |||
| ebb2b2e894 | |||
| 13413b3b07 | |||
| 5d0b3e28dc | |||
| 9139368b64 | |||
| 02095cc09d | |||
| 65868156c6 | |||
| f93ea7dab1 | |||
| a77f5d9a00 | |||
| ff46d5a79b | |||
| f452edd782 | |||
| ea698e8bfc | |||
| 7f7a28046b | |||
| d8283a317a | |||
| e0ca3049c0 | |||
| 8417981c96 | |||
| 06e71c8558 | |||
| a76b59cc45 | |||
| 74336f8c77 | |||
| 236ce736a1 | |||
| 17bdb232e1 | |||
| add37bacda | |||
| 1425b40f29 | |||
| 8af9ed0824 | |||
| 7045aab143 | |||
| 7ae8aaf4c0 | |||
| f2450798cd | |||
| 46d17e8871 | |||
| dc011d3203 | |||
| e95920e3e6 | |||
| 5e769ff867 | |||
| 0ae3e30621 | |||
| 47f50cfd45 | |||
| a51f877287 | |||
| b44423bbb4 | |||
| 8e1e4ee8e0 | |||
| 1e836bc769 | |||
| 9a91486e45 | |||
| 92381a5aa7 | |||
| 2a5f87decf | |||
| 840d63c12d | |||
| 2ce894bb1d | |||
| 47ec1e9990 | |||
| 904abfc2ca | |||
| 7d16fcf2df | |||
| 483845a9c4 | |||
| 60bcb4ee88 | |||
| ee7434be82 | |||
| d049ed2cb1 | |||
| 9901d44418 | |||
| 6096c0fc74 | |||
| f6951cb8ea | |||
| 8887a33ede | |||
| 36a48e7e6d | |||
| c6a02eae5b | |||
| 6ecd6b23b6 | |||
| 3f69b4d9b4 | |||
| a04edcb27a | |||
| eb2bad5bb5 |
@ -10,7 +10,7 @@ else
|
||||
arch_path='sbsa'
|
||||
fi
|
||||
|
||||
NVSHMEM_VERSION=3.3.24
|
||||
NVSHMEM_VERSION=3.4.5
|
||||
|
||||
function install_cuda {
|
||||
version=$1
|
||||
@ -150,7 +150,7 @@ function install_130 {
|
||||
CUDNN_VERSION=9.13.0.50
|
||||
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
|
||||
# install CUDA 13.0 in the same container
|
||||
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
|
||||
install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux
|
||||
|
||||
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
|
||||
install_cudnn 13 $CUDNN_VERSION
|
||||
|
||||
@ -100,6 +100,8 @@ COPY ./common/common_utils.sh common_utils.sh
|
||||
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
|
||||
COPY ci_commit_pins/timm.txt timm.txt
|
||||
COPY ci_commit_pins/torchbench.txt torchbench.txt
|
||||
# Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True
|
||||
ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS}
|
||||
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
|
||||
|
||||
|
||||
@ -460,28 +460,18 @@ test_inductor_shard() {
|
||||
--verbose
|
||||
}
|
||||
|
||||
test_inductor_aoti() {
|
||||
# docker build uses bdist_wheel which does not work with test_aot_inductor
|
||||
# TODO: need a faster way to build
|
||||
test_inductor_aoti_cpp() {
|
||||
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
|
||||
# We need to hipify before building again
|
||||
python3 tools/amd_build/build_amd.py
|
||||
fi
|
||||
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
|
||||
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
|
||||
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
|
||||
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
|
||||
else
|
||||
BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
|
||||
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
|
||||
fi
|
||||
|
||||
# aoti cmake custom command requires `torch` to be installed
|
||||
# initialize the cmake build cache and install torch
|
||||
/usr/bin/env "${BUILD_COMMAND[@]}"
|
||||
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
|
||||
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
|
||||
|
||||
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
|
||||
}
|
||||
|
||||
@ -1659,7 +1649,7 @@ test_operator_microbenchmark() {
|
||||
|
||||
cd "${TEST_DIR}"/benchmarks/operator_benchmark
|
||||
|
||||
for OP_BENCHMARK_TESTS in matmul mm addmm bmm conv; do
|
||||
for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
|
||||
$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
|
||||
--output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
|
||||
--benchmark-name "PyTorch operator microbenchmark" --use-compile
|
||||
@ -1776,7 +1766,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
|
||||
install_torchvision
|
||||
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
|
||||
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
|
||||
test_inductor_aoti
|
||||
test_inductor_aoti_cpp
|
||||
fi
|
||||
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
|
||||
install_torchvision
|
||||
|
||||
@ -7,12 +7,9 @@ if "%DESIRED_PYTHON%" == "3.13t" (
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
|
||||
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
|
||||
set PYTHON_EXEC="python3.13t"
|
||||
) else if "%DESIRED_PYTHON%"=="3.14" (
|
||||
echo Python version is set to 3.14 or 3.14t
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
|
||||
) else if "%DESIRED_PYTHON%"=="3.14t" (
|
||||
echo Python version is set to 3.14 or 3.14t
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe"
|
||||
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
|
||||
set PYTHON_EXEC="python3.14t"
|
||||
) else (
|
||||
|
||||
@ -1,3 +1,8 @@
|
||||
---
|
||||
name: docstring
|
||||
description: Write docstrings for PyTorch functions and methods following PyTorch conventions. Use when writing or updating docstrings in PyTorch code.
|
||||
---
|
||||
|
||||
# PyTorch Docstring Writing Guide
|
||||
|
||||
This skill describes how to write docstrings for functions and methods in the PyTorch project, following the conventions in `torch/_tensor_docs.py` and `torch/nn/functional.py`.
|
||||
385
.claude/skills/skill-writer/SKILL.md
Normal file
385
.claude/skills/skill-writer/SKILL.md
Normal file
@ -0,0 +1,385 @@
|
||||
---
|
||||
name: skill-writer
|
||||
description: Guide users through creating Agent Skills for Claude Code. Use when the user wants to create, write, author, or design a new Skill, or needs help with SKILL.md files, frontmatter, or skill structure.
|
||||
---
|
||||
|
||||
# Skill Writer
|
||||
|
||||
This Skill helps you create well-structured Agent Skills for Claude Code that follow best practices and validation requirements.
|
||||
|
||||
## When to use this Skill
|
||||
|
||||
Use this Skill when:
|
||||
- Creating a new Agent Skill
|
||||
- Writing or updating SKILL.md files
|
||||
- Designing skill structure and frontmatter
|
||||
- Troubleshooting skill discovery issues
|
||||
- Converting existing prompts or workflows into Skills
|
||||
|
||||
## Instructions
|
||||
|
||||
### Step 1: Determine Skill scope
|
||||
|
||||
First, understand what the Skill should do:
|
||||
|
||||
1. **Ask clarifying questions**:
|
||||
- What specific capability should this Skill provide?
|
||||
- When should Claude use this Skill?
|
||||
- What tools or resources does it need?
|
||||
- Is this for personal use or team sharing?
|
||||
|
||||
2. **Keep it focused**: One Skill = one capability
|
||||
- Good: "PDF form filling", "Excel data analysis"
|
||||
- Too broad: "Document processing", "Data tools"
|
||||
|
||||
### Step 2: Choose Skill location
|
||||
|
||||
Determine where to create the Skill:
|
||||
|
||||
**Personal Skills** (`~/.claude/skills/`):
|
||||
- Individual workflows and preferences
|
||||
- Experimental Skills
|
||||
- Personal productivity tools
|
||||
|
||||
**Project Skills** (`.claude/skills/`):
|
||||
- Team workflows and conventions
|
||||
- Project-specific expertise
|
||||
- Shared utilities (committed to git)
|
||||
|
||||
### Step 3: Create Skill structure
|
||||
|
||||
Create the directory and files:
|
||||
|
||||
```bash
|
||||
# Personal
|
||||
mkdir -p ~/.claude/skills/skill-name
|
||||
|
||||
# Project
|
||||
mkdir -p .claude/skills/skill-name
|
||||
```
|
||||
|
||||
For multi-file Skills:
|
||||
```
|
||||
skill-name/
|
||||
├── SKILL.md (required)
|
||||
├── reference.md (optional)
|
||||
├── examples.md (optional)
|
||||
├── scripts/
|
||||
│ └── helper.py (optional)
|
||||
└── templates/
|
||||
└── template.txt (optional)
|
||||
```
|
||||
|
||||
### Step 4: Write SKILL.md frontmatter
|
||||
|
||||
Create YAML frontmatter with required fields:
|
||||
|
||||
```yaml
|
||||
---
|
||||
name: skill-name
|
||||
description: Brief description of what this does and when to use it
|
||||
---
|
||||
```
|
||||
|
||||
**Field requirements**:
|
||||
|
||||
- **name**:
|
||||
- Lowercase letters, numbers, hyphens only
|
||||
- Max 64 characters
|
||||
- Must match directory name
|
||||
- Good: `pdf-processor`, `git-commit-helper`
|
||||
- Bad: `PDF_Processor`, `Git Commits!`
|
||||
|
||||
- **description**:
|
||||
- Max 1024 characters
|
||||
- Include BOTH what it does AND when to use it
|
||||
- Use specific trigger words users would say
|
||||
- Mention file types, operations, and context
|
||||
|
||||
**Optional frontmatter fields**:
|
||||
|
||||
- **allowed-tools**: Restrict tool access (comma-separated list)
|
||||
```yaml
|
||||
allowed-tools: Read, Grep, Glob
|
||||
```
|
||||
Use for:
|
||||
- Read-only Skills
|
||||
- Security-sensitive workflows
|
||||
- Limited-scope operations
|
||||
|
||||
### Step 5: Write effective descriptions
|
||||
|
||||
The description is critical for Claude to discover your Skill.
|
||||
|
||||
**Formula**: `[What it does] + [When to use it] + [Key triggers]`
|
||||
|
||||
**Examples**:
|
||||
|
||||
✅ **Good**:
|
||||
```yaml
|
||||
description: Extract text and tables from PDF files, fill forms, merge documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction.
|
||||
```
|
||||
|
||||
✅ **Good**:
|
||||
```yaml
|
||||
description: Analyze Excel spreadsheets, create pivot tables, and generate charts. Use when working with Excel files, spreadsheets, or analyzing tabular data in .xlsx format.
|
||||
```
|
||||
|
||||
❌ **Too vague**:
|
||||
```yaml
|
||||
description: Helps with documents
|
||||
description: For data analysis
|
||||
```
|
||||
|
||||
**Tips**:
|
||||
- Include specific file extensions (.pdf, .xlsx, .json)
|
||||
- Mention common user phrases ("analyze", "extract", "generate")
|
||||
- List concrete operations (not generic verbs)
|
||||
- Add context clues ("Use when...", "For...")
|
||||
|
||||
### Step 6: Structure the Skill content
|
||||
|
||||
Use clear Markdown sections:
|
||||
|
||||
```markdown
|
||||
# Skill Name
|
||||
|
||||
Brief overview of what this Skill does.
|
||||
|
||||
## Quick start
|
||||
|
||||
Provide a simple example to get started immediately.
|
||||
|
||||
## Instructions
|
||||
|
||||
Step-by-step guidance for Claude:
|
||||
1. First step with clear action
|
||||
2. Second step with expected outcome
|
||||
3. Handle edge cases
|
||||
|
||||
## Examples
|
||||
|
||||
Show concrete usage examples with code or commands.
|
||||
|
||||
## Best practices
|
||||
|
||||
- Key conventions to follow
|
||||
- Common pitfalls to avoid
|
||||
- When to use vs. not use
|
||||
|
||||
## Requirements
|
||||
|
||||
List any dependencies or prerequisites:
|
||||
```bash
|
||||
pip install package-name
|
||||
```
|
||||
|
||||
## Advanced usage
|
||||
|
||||
For complex scenarios, see [reference.md](reference.md).
|
||||
```
|
||||
|
||||
### Step 7: Add supporting files (optional)
|
||||
|
||||
Create additional files for progressive disclosure:
|
||||
|
||||
**reference.md**: Detailed API docs, advanced options
|
||||
**examples.md**: Extended examples and use cases
|
||||
**scripts/**: Helper scripts and utilities
|
||||
**templates/**: File templates or boilerplate
|
||||
|
||||
Reference them from SKILL.md:
|
||||
```markdown
|
||||
For advanced usage, see [reference.md](reference.md).
|
||||
|
||||
Run the helper script:
|
||||
\`\`\`bash
|
||||
python scripts/helper.py input.txt
|
||||
\`\`\`
|
||||
```
|
||||
|
||||
### Step 8: Validate the Skill
|
||||
|
||||
Check these requirements:
|
||||
|
||||
✅ **File structure**:
|
||||
- [ ] SKILL.md exists in correct location
|
||||
- [ ] Directory name matches frontmatter `name`
|
||||
|
||||
✅ **YAML frontmatter**:
|
||||
- [ ] Opening `---` on line 1
|
||||
- [ ] Closing `---` before content
|
||||
- [ ] Valid YAML (no tabs, correct indentation)
|
||||
- [ ] `name` follows naming rules
|
||||
- [ ] `description` is specific and < 1024 chars
|
||||
|
||||
✅ **Content quality**:
|
||||
- [ ] Clear instructions for Claude
|
||||
- [ ] Concrete examples provided
|
||||
- [ ] Edge cases handled
|
||||
- [ ] Dependencies listed (if any)
|
||||
|
||||
✅ **Testing**:
|
||||
- [ ] Description matches user questions
|
||||
- [ ] Skill activates on relevant queries
|
||||
- [ ] Instructions are clear and actionable
|
||||
|
||||
### Step 9: Test the Skill
|
||||
|
||||
1. **Restart Claude Code** (if running) to load the Skill
|
||||
|
||||
2. **Ask relevant questions** that match the description:
|
||||
```
|
||||
Can you help me extract text from this PDF?
|
||||
```
|
||||
|
||||
3. **Verify activation**: Claude should use the Skill automatically
|
||||
|
||||
4. **Check behavior**: Confirm Claude follows the instructions correctly
|
||||
|
||||
### Step 10: Debug if needed
|
||||
|
||||
If Claude doesn't use the Skill:
|
||||
|
||||
1. **Make description more specific**:
|
||||
- Add trigger words
|
||||
- Include file types
|
||||
- Mention common user phrases
|
||||
|
||||
2. **Check file location**:
|
||||
```bash
|
||||
ls ~/.claude/skills/skill-name/SKILL.md
|
||||
ls .claude/skills/skill-name/SKILL.md
|
||||
```
|
||||
|
||||
3. **Validate YAML**:
|
||||
```bash
|
||||
cat SKILL.md | head -n 10
|
||||
```
|
||||
|
||||
4. **Run debug mode**:
|
||||
```bash
|
||||
claude --debug
|
||||
```
|
||||
|
||||
## Common patterns
|
||||
|
||||
### Read-only Skill
|
||||
|
||||
```yaml
|
||||
---
|
||||
name: code-reader
|
||||
description: Read and analyze code without making changes. Use for code review, understanding codebases, or documentation.
|
||||
allowed-tools: Read, Grep, Glob
|
||||
---
|
||||
```
|
||||
|
||||
### Script-based Skill
|
||||
|
||||
```yaml
|
||||
---
|
||||
name: data-processor
|
||||
description: Process CSV and JSON data files with Python scripts. Use when analyzing data files or transforming datasets.
|
||||
---
|
||||
|
||||
# Data Processor
|
||||
|
||||
## Instructions
|
||||
|
||||
1. Use the processing script:
|
||||
\`\`\`bash
|
||||
python scripts/process.py input.csv --output results.json
|
||||
\`\`\`
|
||||
|
||||
2. Validate output with:
|
||||
\`\`\`bash
|
||||
python scripts/validate.py results.json
|
||||
\`\`\`
|
||||
```
|
||||
|
||||
### Multi-file Skill with progressive disclosure
|
||||
|
||||
```yaml
|
||||
---
|
||||
name: api-designer
|
||||
description: Design REST APIs following best practices. Use when creating API endpoints, designing routes, or planning API architecture.
|
||||
---
|
||||
|
||||
# API Designer
|
||||
|
||||
Quick start: See [examples.md](examples.md)
|
||||
|
||||
Detailed reference: See [reference.md](reference.md)
|
||||
|
||||
## Instructions
|
||||
|
||||
1. Gather requirements
|
||||
2. Design endpoints (see examples.md)
|
||||
3. Document with OpenAPI spec
|
||||
4. Review against best practices (see reference.md)
|
||||
```
|
||||
|
||||
## Best practices for Skill authors
|
||||
|
||||
1. **One Skill, one purpose**: Don't create mega-Skills
|
||||
2. **Specific descriptions**: Include trigger words users will say
|
||||
3. **Clear instructions**: Write for Claude, not humans
|
||||
4. **Concrete examples**: Show real code, not pseudocode
|
||||
5. **List dependencies**: Mention required packages in description
|
||||
6. **Test with teammates**: Verify activation and clarity
|
||||
7. **Version your Skills**: Document changes in content
|
||||
8. **Use progressive disclosure**: Put advanced details in separate files
|
||||
|
||||
## Validation checklist
|
||||
|
||||
Before finalizing a Skill, verify:
|
||||
|
||||
- [ ] Name is lowercase, hyphens only, max 64 chars
|
||||
- [ ] Description is specific and < 1024 chars
|
||||
- [ ] Description includes "what" and "when"
|
||||
- [ ] YAML frontmatter is valid
|
||||
- [ ] Instructions are step-by-step
|
||||
- [ ] Examples are concrete and realistic
|
||||
- [ ] Dependencies are documented
|
||||
- [ ] File paths use forward slashes
|
||||
- [ ] Skill activates on relevant queries
|
||||
- [ ] Claude follows instructions correctly
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Skill doesn't activate**:
|
||||
- Make description more specific with trigger words
|
||||
- Include file types and operations in description
|
||||
- Add "Use when..." clause with user phrases
|
||||
|
||||
**Multiple Skills conflict**:
|
||||
- Make descriptions more distinct
|
||||
- Use different trigger words
|
||||
- Narrow the scope of each Skill
|
||||
|
||||
**Skill has errors**:
|
||||
- Check YAML syntax (no tabs, proper indentation)
|
||||
- Verify file paths (use forward slashes)
|
||||
- Ensure scripts have execute permissions
|
||||
- List all dependencies
|
||||
|
||||
## Examples
|
||||
|
||||
See the documentation for complete examples:
|
||||
- Simple single-file Skill (commit-helper)
|
||||
- Skill with tool permissions (code-reviewer)
|
||||
- Multi-file Skill (pdf-processing)
|
||||
|
||||
## Output format
|
||||
|
||||
When creating a Skill, I will:
|
||||
|
||||
1. Ask clarifying questions about scope and requirements
|
||||
2. Suggest a Skill name and location
|
||||
3. Create the SKILL.md file with proper frontmatter
|
||||
4. Include clear instructions and examples
|
||||
5. Add supporting files if needed
|
||||
6. Provide testing instructions
|
||||
7. Validate against all requirements
|
||||
|
||||
The result will be a complete, working Skill that follows all best practices and validation rules.
|
||||
2
.github/ci_commit_pins/vision.txt
vendored
2
.github/ci_commit_pins/vision.txt
vendored
@ -1 +1 @@
|
||||
1752fe6809b74921644866275ab80244b96e80bc
|
||||
218d2ab791d437309f91e0486eb9fa7f00badc17
|
||||
|
||||
20
.github/merge_rules.yaml
vendored
20
.github/merge_rules.yaml
vendored
@ -540,6 +540,26 @@
|
||||
- Lint
|
||||
- pull
|
||||
|
||||
- name: PrivateUse1
|
||||
patterns:
|
||||
- torch/accelerator/**
|
||||
- torch/utils/backend_registration.py
|
||||
- torch/csrc/acc/**
|
||||
- torch/csrc/DeviceAccelerator.*
|
||||
- torch/csrc/profiler/standalone/privateuse1_observer.*
|
||||
- aten/src/ATen/DeviceAccelerator.*
|
||||
- aten/src/ATen/core/GeneratorForPrivateuseone.*
|
||||
- aten/src/ATen/detail/PrivateUse1HooksInterface.*
|
||||
- docs/source/accelerator/**
|
||||
- test/cpp_extensions/open_registration_extension/torch_openreg/**
|
||||
approved_by:
|
||||
- albanD
|
||||
- fffrog
|
||||
mandatory_checks_name:
|
||||
- EasyCLA
|
||||
- Lint
|
||||
- pull
|
||||
|
||||
- name: superuser
|
||||
patterns:
|
||||
- '*'
|
||||
|
||||
30
.github/scripts/generate_binary_build_matrix.py
vendored
30
.github/scripts/generate_binary_build_matrix.py
vendored
@ -22,7 +22,7 @@ CUDA_ARCHES_FULL_VERSION = {
|
||||
"12.6": "12.6.3",
|
||||
"12.8": "12.8.1",
|
||||
"12.9": "12.9.1",
|
||||
"13.0": "13.0.0",
|
||||
"13.0": "13.0.2",
|
||||
}
|
||||
CUDA_ARCHES_CUDNN_VERSION = {
|
||||
"12.6": "9",
|
||||
@ -56,7 +56,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
|
||||
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
|
||||
@ -73,7 +73,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
|
||||
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
|
||||
@ -90,27 +90,27 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
|
||||
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
|
||||
),
|
||||
"13.0": (
|
||||
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | "
|
||||
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
|
||||
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
|
||||
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
|
||||
"nvidia-cublas==13.1.0.3; platform_system == 'Linux' | "
|
||||
"nvidia-cufft==12.0.0.61; platform_system == 'Linux' | "
|
||||
"nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
|
||||
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
|
||||
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
|
||||
"nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | "
|
||||
"nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
|
||||
"nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
|
||||
"nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx==13.0.85; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | "
|
||||
"nvidia-cufile==1.15.1.6; platform_system == 'Linux'"
|
||||
),
|
||||
"xpu": (
|
||||
"intel-cmplr-lib-rt==2025.2.1 | "
|
||||
|
||||
56
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
56
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
@ -132,7 +132,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -178,7 +178,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -224,7 +224,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -270,7 +270,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -381,7 +381,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -427,7 +427,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -473,7 +473,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -519,7 +519,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -630,7 +630,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -676,7 +676,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -722,7 +722,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -768,7 +768,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -879,7 +879,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -925,7 +925,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -971,7 +971,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1017,7 +1017,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1128,7 +1128,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1174,7 +1174,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1220,7 +1220,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1266,7 +1266,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1377,7 +1377,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1423,7 +1423,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1469,7 +1469,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1515,7 +1515,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1626,7 +1626,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1672,7 +1672,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1718,7 +1718,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1764,7 +1764,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
56
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
56
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
@ -127,7 +127,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_6-test: # Testing
|
||||
@ -193,7 +193,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_8-test: # Testing
|
||||
@ -259,7 +259,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_9-test: # Testing
|
||||
@ -325,7 +325,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda13_0-test: # Testing
|
||||
@ -793,7 +793,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_6-test: # Testing
|
||||
@ -859,7 +859,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_8-test: # Testing
|
||||
@ -925,7 +925,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_9-test: # Testing
|
||||
@ -991,7 +991,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda13_0-test: # Testing
|
||||
@ -1459,7 +1459,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_6-test: # Testing
|
||||
@ -1525,7 +1525,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_8-test: # Testing
|
||||
@ -1591,7 +1591,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_9-test: # Testing
|
||||
@ -1657,7 +1657,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda13_0-test: # Testing
|
||||
@ -2125,7 +2125,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_6-test: # Testing
|
||||
@ -2191,7 +2191,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_8-test: # Testing
|
||||
@ -2257,7 +2257,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_9-test: # Testing
|
||||
@ -2323,7 +2323,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda13_0-test: # Testing
|
||||
@ -2791,7 +2791,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13t-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda12_6-test: # Testing
|
||||
@ -2857,7 +2857,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13t-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda12_8-test: # Testing
|
||||
@ -2923,7 +2923,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13t-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda12_9-test: # Testing
|
||||
@ -2989,7 +2989,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13t-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda13_0-test: # Testing
|
||||
@ -3457,7 +3457,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda12_6-test: # Testing
|
||||
@ -3523,7 +3523,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda12_8-test: # Testing
|
||||
@ -3589,7 +3589,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda12_9-test: # Testing
|
||||
@ -3655,7 +3655,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda13_0-test: # Testing
|
||||
@ -4123,7 +4123,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14t-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda12_6-test: # Testing
|
||||
@ -4189,7 +4189,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14t-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda12_8-test: # Testing
|
||||
@ -4255,7 +4255,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14t-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda12_9-test: # Testing
|
||||
@ -4321,7 +4321,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14t-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda13_0-test: # Testing
|
||||
|
||||
20
.github/workflows/xpu.yml
vendored
20
.github/workflows/xpu.yml
vendored
@ -59,14 +59,18 @@ jobs:
|
||||
runner: linux.c7i.12xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||

|
||||

|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
@ -72,7 +72,7 @@ Elaborating Further:
|
||||
|
||||
If you use NumPy, then you have used Tensors (a.k.a. ndarray).
|
||||
|
||||

|
||||

|
||||
|
||||
PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the
|
||||
computation by a huge amount.
|
||||
@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc
|
||||
While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
|
||||
You get the best of speed and flexibility for your crazy research.
|
||||
|
||||

|
||||

|
||||
|
||||
### Python First
|
||||
|
||||
|
||||
@ -31,9 +31,9 @@ Be careful when running untrusted models. This classification includes models cr
|
||||
|
||||
**Prefer to execute untrusted models within a secure, isolated environment such as a sandbox** (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. You can find further details and instructions in [this page](https://developers.google.com/code-sandboxing).
|
||||
|
||||
**Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) with `weights_only=True` is also secure to our knowledge even though it offers significantly larger surface of attack. Loading un-trusted checkpoint with `weights_only=False` MUST never be done.
|
||||
|
||||
**Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger surface of attack but is more flexible in what it can serialize. See the documentation for more details.
|
||||
|
||||
Even for more secure serialization formats, unexpected inputs to the downstream system can cause diverse security threats (e.g. denial of service, out of bound reads/writes) and thus we recommend extensive validation of any untrusted inputs.
|
||||
|
||||
Important Note: The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.
|
||||
|
||||
|
||||
@ -260,7 +260,7 @@ IF(USE_FBGEMM_GENAI)
|
||||
if(USE_CUDA)
|
||||
# To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
|
||||
# If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
|
||||
set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*")
|
||||
set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped).*")
|
||||
file(GLOB_RECURSE fbgemm_genai_native_cuda_cu
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu"
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
|
||||
@ -291,6 +291,7 @@ IF(USE_FBGEMM_GENAI)
|
||||
|
||||
set(fbgemm_genai_cuh
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/f4f4bf16_grouped/"
|
||||
"${FBGEMM_GENAI_SRCS}/"
|
||||
)
|
||||
|
||||
|
||||
@ -677,8 +677,8 @@ struct CachingHostAllocatorImpl {
|
||||
// size. This allows us to quickly find a free block of the right size.
|
||||
// We use deque to store per size free list and guard the list with its own
|
||||
// mutex.
|
||||
alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ =
|
||||
std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
|
||||
alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>>
|
||||
free_list_{MAX_SIZE_INDEX};
|
||||
|
||||
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
|
||||
std::deque<std::pair<E, B*>> events_; // event queue paired with block
|
||||
|
||||
@ -21,12 +21,46 @@ inline void convertImpl(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename to_type>
|
||||
inline void convertFromBool(
|
||||
const bool* __restrict src,
|
||||
to_type* __restrict dst,
|
||||
int64_t n) {
|
||||
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dst[i] = srcPtr[i] != 0 ? static_cast<to_type>(1) : static_cast<to_type>(0);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename from_type>
|
||||
inline void convertToBool(
|
||||
const from_type* __restrict src,
|
||||
bool* __restrict dst,
|
||||
int64_t n) {
|
||||
uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dstPtr[i] = src[i] != static_cast<from_type>(0) ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
#define CONVERT_TEMPLATE(from_type, to_type) \
|
||||
template <> \
|
||||
inline void convert(const from_type* src, to_type* dst, int64_t n) { \
|
||||
return convertImpl<from_type, to_type>(src, dst, n); \
|
||||
}
|
||||
|
||||
#define CONVERT_FROM_BOOL_TEMPLATE(to_type) \
|
||||
inline void convert(const bool* src, to_type* dst, int64_t n) { \
|
||||
return convertFromBool<to_type>(src, dst, n); \
|
||||
}
|
||||
|
||||
#define CONVERT_TO_BOOL_TEMPLATE(from_type) \
|
||||
inline void convert(const from_type* src, bool* dst, int64_t n) { \
|
||||
return convertToBool<from_type>(src, dst, n); \
|
||||
}
|
||||
|
||||
CONVERT_TEMPLATE(uint8_t, uint8_t)
|
||||
CONVERT_TEMPLATE(uint8_t, int8_t)
|
||||
CONVERT_TEMPLATE(uint8_t, int16_t)
|
||||
@ -34,6 +68,7 @@ CONVERT_TEMPLATE(uint8_t, int32_t)
|
||||
CONVERT_TEMPLATE(uint8_t, int64_t)
|
||||
CONVERT_TEMPLATE(uint8_t, float)
|
||||
CONVERT_TEMPLATE(uint8_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(uint8_t)
|
||||
CONVERT_TEMPLATE(int8_t, uint8_t)
|
||||
CONVERT_TEMPLATE(int8_t, int8_t)
|
||||
CONVERT_TEMPLATE(int8_t, int16_t)
|
||||
@ -41,6 +76,7 @@ CONVERT_TEMPLATE(int8_t, int32_t)
|
||||
CONVERT_TEMPLATE(int8_t, int64_t)
|
||||
CONVERT_TEMPLATE(int8_t, float)
|
||||
CONVERT_TEMPLATE(int8_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(int8_t)
|
||||
CONVERT_TEMPLATE(int16_t, uint8_t)
|
||||
CONVERT_TEMPLATE(int16_t, int8_t)
|
||||
CONVERT_TEMPLATE(int16_t, int16_t)
|
||||
@ -48,6 +84,7 @@ CONVERT_TEMPLATE(int16_t, int32_t)
|
||||
CONVERT_TEMPLATE(int16_t, int64_t)
|
||||
CONVERT_TEMPLATE(int16_t, float)
|
||||
CONVERT_TEMPLATE(int16_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(int16_t)
|
||||
CONVERT_TEMPLATE(int32_t, uint8_t)
|
||||
CONVERT_TEMPLATE(int32_t, int8_t)
|
||||
CONVERT_TEMPLATE(int32_t, int16_t)
|
||||
@ -55,6 +92,7 @@ CONVERT_TEMPLATE(int32_t, int32_t)
|
||||
CONVERT_TEMPLATE(int32_t, int64_t)
|
||||
CONVERT_TEMPLATE(int32_t, float)
|
||||
CONVERT_TEMPLATE(int32_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(int32_t)
|
||||
CONVERT_TEMPLATE(int64_t, uint8_t)
|
||||
CONVERT_TEMPLATE(int64_t, int8_t)
|
||||
CONVERT_TEMPLATE(int64_t, int16_t)
|
||||
@ -62,6 +100,7 @@ CONVERT_TEMPLATE(int64_t, int32_t)
|
||||
CONVERT_TEMPLATE(int64_t, int64_t)
|
||||
CONVERT_TEMPLATE(int64_t, float)
|
||||
CONVERT_TEMPLATE(int64_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(int64_t)
|
||||
CONVERT_TEMPLATE(float, uint8_t)
|
||||
CONVERT_TEMPLATE(float, int8_t)
|
||||
CONVERT_TEMPLATE(float, int16_t)
|
||||
@ -69,6 +108,7 @@ CONVERT_TEMPLATE(float, int32_t)
|
||||
CONVERT_TEMPLATE(float, int64_t)
|
||||
CONVERT_TEMPLATE(float, float)
|
||||
CONVERT_TEMPLATE(float, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(float)
|
||||
CONVERT_TEMPLATE(double, uint8_t)
|
||||
CONVERT_TEMPLATE(double, int8_t)
|
||||
CONVERT_TEMPLATE(double, int16_t)
|
||||
@ -76,22 +116,80 @@ CONVERT_TEMPLATE(double, int32_t)
|
||||
CONVERT_TEMPLATE(double, int64_t)
|
||||
CONVERT_TEMPLATE(double, float)
|
||||
CONVERT_TEMPLATE(double, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(double)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(uint8_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(int8_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(int16_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(int32_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(int64_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(float)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(double)
|
||||
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
CONVERT_TEMPLATE(float16_t, uint8_t)
|
||||
CONVERT_TEMPLATE(float16_t, int8_t)
|
||||
CONVERT_TEMPLATE(float16_t, int16_t)
|
||||
CONVERT_TEMPLATE(float16_t, int32_t)
|
||||
CONVERT_TEMPLATE(float16_t, int64_t)
|
||||
CONVERT_TEMPLATE(float16_t, float16_t)
|
||||
CONVERT_TEMPLATE(float16_t, float)
|
||||
CONVERT_TEMPLATE(float16_t, double)
|
||||
CONVERT_TEMPLATE(uint8_t, float16_t)
|
||||
CONVERT_TEMPLATE(int8_t, float16_t)
|
||||
CONVERT_TEMPLATE(int16_t, float16_t)
|
||||
CONVERT_TEMPLATE(int32_t, float16_t)
|
||||
CONVERT_TEMPLATE(int64_t, float16_t)
|
||||
CONVERT_TEMPLATE(float, float16_t)
|
||||
CONVERT_TEMPLATE(double, float16_t)
|
||||
|
||||
#define CONVERT_FROM_FP16_TEMPLATE(to_type) \
|
||||
template <> \
|
||||
inline void convert(const at::Half* src, to_type* dst, int64_t n) { \
|
||||
const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src); \
|
||||
return convertImpl<float16_t, to_type>(srcPtr, dst, n); \
|
||||
}
|
||||
|
||||
#define CONVERT_TO_FP16_TEMPLATE(from_type) \
|
||||
template <> \
|
||||
inline void convert(const from_type* src, at::Half* dst, int64_t n) { \
|
||||
float16_t* dstPtr = reinterpret_cast<float16_t*>(dst); \
|
||||
return convertImpl<from_type, float16_t>(src, dstPtr, n); \
|
||||
}
|
||||
|
||||
CONVERT_FROM_FP16_TEMPLATE(uint8_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(int8_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(int16_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(int32_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(int64_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(float16_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(float)
|
||||
CONVERT_FROM_FP16_TEMPLATE(double)
|
||||
CONVERT_TO_FP16_TEMPLATE(uint8_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int8_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int16_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int32_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int64_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(float)
|
||||
CONVERT_TO_FP16_TEMPLATE(double)
|
||||
|
||||
inline void convertBoolToFp16Impl(
|
||||
const bool* __restrict src,
|
||||
at::Half* __restrict dst,
|
||||
int64_t n) {
|
||||
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
|
||||
float16_t* dstPtr = reinterpret_cast<float16_t*>(dst);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dstPtr[i] = srcPtr[i] != 0 ? 1.0 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const bool* src, at::Half* dst, int64_t n) {
|
||||
return convertBoolToFp16Impl(src, dst, n);
|
||||
}
|
||||
|
||||
inline void convertFp16ToBoolImpl(
|
||||
const at::Half* __restrict src,
|
||||
bool* __restrict dst,
|
||||
int64_t n) {
|
||||
const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src);
|
||||
uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dstPtr[i] = srcPtr[i] != 0.0 ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const at::Half* src, bool* dst, int64_t n) {
|
||||
return convertFp16ToBoolImpl(src, dst, n);
|
||||
}
|
||||
|
||||
#endif
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
CONVERT_TEMPLATE(bfloat16_t, uint8_t)
|
||||
@ -109,6 +207,44 @@ CONVERT_TEMPLATE(int32_t, bfloat16_t)
|
||||
CONVERT_TEMPLATE(int64_t, bfloat16_t)
|
||||
CONVERT_TEMPLATE(float, bfloat16_t)
|
||||
CONVERT_TEMPLATE(double, bfloat16_t)
|
||||
|
||||
inline void convertBoolToBfloat16Impl(
|
||||
const bool* __restrict src,
|
||||
c10::BFloat16* __restrict dst,
|
||||
int64_t n) {
|
||||
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
|
||||
uint16_t* dstPtr = reinterpret_cast<uint16_t*>(dst);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
constexpr uint16_t kBf16One = 0x3f80; // 1.0 in bfloat16
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dstPtr[i] = srcPtr[i] != 0 ? kBf16One : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const bool* src, c10::BFloat16* dst, int64_t n) {
|
||||
return convertBoolToBfloat16Impl(src, dst, n);
|
||||
}
|
||||
|
||||
inline void convertBfloat16ToBoolImpl(
|
||||
const c10::BFloat16* __restrict src,
|
||||
bool* __restrict dst,
|
||||
int64_t n) {
|
||||
uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
|
||||
const uint16_t* srcPtr = reinterpret_cast<const uint16_t*>(src);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
// Check if all non-sign bits are 0
|
||||
bool isBf16Zero = (srcPtr[i] & 0x7fff) == 0;
|
||||
dstPtr[i] = isBf16Zero ? 0 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const c10::BFloat16* src, bool* dst, int64_t n) {
|
||||
return convertBfloat16ToBoolImpl(src, dst, n);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@ -2,10 +2,10 @@
|
||||
|
||||
#include <ATen/cuda/ATenCUDAGeneral.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/core/impl/GPUTrace.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <ATen/cuda/Exceptions.h>
|
||||
#include <c10/core/impl/GPUTrace.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
@ -246,4 +246,79 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
// EventPool - Thread-safe pool of CUDA events to avoid expensive cudaEventCreate
|
||||
// calls. cudaEventCreate when concurrently invoked from multiple threads can be
|
||||
// very expensive (especially on certain device/driver combinations).
|
||||
using CUDAEventPtr =
|
||||
std::unique_ptr<CUDAEvent, std::function<void(CUDAEvent*)>>;
|
||||
|
||||
class EventPool {
|
||||
public:
|
||||
EventPool() : pools_(at::cuda::device_count()) {}
|
||||
|
||||
CUDAEventPtr get(const DeviceIndex device) {
|
||||
// If the device is invalid, return a default event and no pooling
|
||||
if (device < 0 || device >= (DeviceIndex)pools_.size()) {
|
||||
auto deleter = [](CUDAEvent* event) {
|
||||
delete event;
|
||||
};
|
||||
return CUDAEventPtr(
|
||||
std::make_unique<CUDAEvent>(cudaEventDisableTiming).release(), deleter);
|
||||
}
|
||||
|
||||
auto& pool = pools_[device];
|
||||
|
||||
// Create a destructor that returns the event to the appropriate device pool
|
||||
auto destructor = [&pool](CUDAEvent* event) noexcept {
|
||||
if (event != nullptr) {
|
||||
std::lock_guard<std::mutex> lock(pool.mutex_);
|
||||
pool.event_pool_.emplace_back(event);
|
||||
}
|
||||
};
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(pool.mutex_);
|
||||
if (!pool.event_pool_.empty()) {
|
||||
auto event = std::move(pool.event_pool_.back());
|
||||
pool.event_pool_.pop_back();
|
||||
return CUDAEventPtr(event.release(), destructor);
|
||||
}
|
||||
}
|
||||
|
||||
return CUDAEventPtr(
|
||||
std::make_unique<CUDAEvent>(cudaEventDisableTiming).release(),
|
||||
destructor);
|
||||
}
|
||||
|
||||
void empty_cache() {
|
||||
for (auto& pool : pools_) {
|
||||
std::lock_guard<std::mutex> lock(pool.mutex_);
|
||||
pool.event_pool_.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void init_num_events(const size_t num_events) {
|
||||
for (DeviceIndex device_idx = 0; device_idx < at::cuda::device_count(); ++device_idx) {
|
||||
CUDAGuard device_guard(device_idx);
|
||||
std::vector<CUDAEventPtr> temp_events;
|
||||
temp_events.reserve(num_events);
|
||||
for (size_t i = 0; i < num_events; ++i) {
|
||||
auto event = get(device_idx);
|
||||
// Record the event to ensure it's properly initialized
|
||||
event->record();
|
||||
temp_events.emplace_back(std::move(event));
|
||||
}
|
||||
// Events will be returned to pool when temp_events is destroyed
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
struct alignas(64) PerDevicePool {
|
||||
alignas(64) std::mutex mutex_;
|
||||
std::vector<std::unique_ptr<CUDAEvent>> event_pool_;
|
||||
};
|
||||
|
||||
std::vector<PerDevicePool> pools_;
|
||||
};
|
||||
|
||||
} // namespace at::cuda
|
||||
|
||||
@ -1,78 +1,88 @@
|
||||
#include <ATen/cuda/CUDAGreenContext.h>
|
||||
|
||||
namespace at::cuda {
|
||||
GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
int driver_version;
|
||||
C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
|
||||
TORCH_CHECK(
|
||||
driver_version >= 12080, "cuda driver too old to use green context!");
|
||||
CUcontext pctx = nullptr;
|
||||
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
|
||||
if (C10_UNLIKELY(!pctx)) {
|
||||
TORCH_WARN(
|
||||
"Attempted to create a green context but"
|
||||
" there was no primary context! Creating a primary context...");
|
||||
|
||||
cudaFree(0);
|
||||
}
|
||||
|
||||
CUdevice device;
|
||||
device_id_ = device_id;
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
|
||||
|
||||
// Get device resources
|
||||
CUdevResource device_resource;
|
||||
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
|
||||
device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
|
||||
|
||||
// Split resources
|
||||
std::vector<CUdevResource> result(1);
|
||||
auto result_data = result.data();
|
||||
unsigned int nb_groups = 1;
|
||||
CUdevResource remaining;
|
||||
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
|
||||
result_data,
|
||||
&nb_groups,
|
||||
&device_resource,
|
||||
&remaining,
|
||||
0, // default flags
|
||||
num_sms));
|
||||
|
||||
TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
|
||||
|
||||
// Generate resource descriptor
|
||||
CUdevResourceDesc desc;
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
|
||||
&desc, result_data, 1));
|
||||
|
||||
// Create green context
|
||||
// CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
|
||||
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
|
||||
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
|
||||
&green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
|
||||
|
||||
// Convert to regular context
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
|
||||
TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
|
||||
#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
|
||||
#include <c10/cuda/driver_api.h>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#define HAS_CUDA_GREEN_CONTEXT() 1
|
||||
#else
|
||||
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
|
||||
#define HAS_CUDA_GREEN_CONTEXT() 0
|
||||
#endif
|
||||
|
||||
namespace at::cuda {
|
||||
|
||||
GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
int driver_version;
|
||||
C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
|
||||
TORCH_CHECK(
|
||||
driver_version >= 12080, "cuda driver too old to use green context!");
|
||||
CUcontext pctx = nullptr;
|
||||
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
|
||||
if (C10_UNLIKELY(!pctx)) {
|
||||
TORCH_WARN(
|
||||
"Attempted to create a green context but"
|
||||
" there was no primary context! Creating a primary context...");
|
||||
|
||||
cudaFree(0);
|
||||
}
|
||||
|
||||
CUdevice device;
|
||||
device_id_ = device_id;
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
|
||||
|
||||
// Get device resources
|
||||
CUdevResource device_resource;
|
||||
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
|
||||
device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
|
||||
|
||||
// Split resources
|
||||
std::vector<CUdevResource> result(1);
|
||||
auto result_data = result.data();
|
||||
unsigned int nb_groups = 1;
|
||||
CUdevResource remaining;
|
||||
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
|
||||
result_data,
|
||||
&nb_groups,
|
||||
&device_resource,
|
||||
&remaining,
|
||||
0, // default flags
|
||||
num_sms));
|
||||
|
||||
TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
|
||||
|
||||
// Generate resource descriptor
|
||||
CUdevResourceDesc desc;
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
|
||||
&desc, result_data, 1));
|
||||
|
||||
// Create green context
|
||||
// CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
|
||||
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
|
||||
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
|
||||
&green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
|
||||
|
||||
// Convert to regular context
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
|
||||
TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
|
||||
#else
|
||||
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
|
||||
#endif
|
||||
}
|
||||
|
||||
std::unique_ptr<GreenContext> GreenContext::create(
|
||||
uint32_t num_sms,
|
||||
std::optional<uint32_t> device_id) {
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
if (!device_id.has_value()) {
|
||||
device_id = at::cuda::current_device();
|
||||
}
|
||||
return std::make_unique<GreenContext>(device_id.value(), num_sms);
|
||||
return std::unique_ptr<GreenContext>(new GreenContext(device_id.value(), num_sms));
|
||||
#else
|
||||
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
|
||||
#endif
|
||||
@ -80,7 +90,7 @@ namespace at::cuda {
|
||||
|
||||
// Implement move operations
|
||||
GreenContext::GreenContext(GreenContext&& other) noexcept{
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
device_id_ = std::exchange(other.device_id_, -1);
|
||||
green_ctx_ = std::exchange(other.green_ctx_, nullptr);
|
||||
context_ = std::exchange(other.context_, nullptr);
|
||||
@ -91,7 +101,7 @@ namespace at::cuda {
|
||||
}
|
||||
|
||||
GreenContext& GreenContext::operator=(GreenContext&& other) noexcept{
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
if (this != &other) {
|
||||
// Clean up current resources
|
||||
if (green_ctx_) {
|
||||
@ -120,7 +130,7 @@ namespace at::cuda {
|
||||
}
|
||||
|
||||
GreenContext::~GreenContext() noexcept{
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));
|
||||
#else
|
||||
@ -128,25 +138,9 @@ namespace at::cuda {
|
||||
#endif
|
||||
}
|
||||
|
||||
// Get the underlying CUDA context
|
||||
CUcontext GreenContext::getContext() const {
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
return context_;
|
||||
#else
|
||||
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
|
||||
#endif
|
||||
}
|
||||
|
||||
// Get the underlying green context
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
CUgreenCtx GreenContext::getGreenContext() const {
|
||||
return green_ctx_;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Make this context current
|
||||
void GreenContext::setContext() {
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
auto current_stream = c10::cuda::getCurrentCUDAStream();
|
||||
parent_stream_ = current_stream.stream();
|
||||
|
||||
@ -175,7 +169,7 @@ namespace at::cuda {
|
||||
}
|
||||
|
||||
void GreenContext::popContext() {
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
// see above note about stream being hardcoded to the default stream
|
||||
at::cuda::CUDAEvent ev;
|
||||
ev.record(c10::cuda::getCurrentCUDAStream());
|
||||
|
||||
@ -1,53 +1,38 @@
|
||||
#pragma once
|
||||
#include <ATen/cuda/CUDAEvent.h>
|
||||
|
||||
#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
|
||||
#include <c10/cuda/driver_api.h>
|
||||
#include <cuda.h>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#define CUDA_HAS_GREEN_CONTEXT 1
|
||||
#else
|
||||
#define CUDA_HAS_GREEN_CONTEXT 0
|
||||
#endif
|
||||
|
||||
// Forward declare green context as opaque ptr
|
||||
typedef struct CUgreenCtx_st* CUgreenCtx;
|
||||
|
||||
namespace at::cuda {
|
||||
|
||||
class TORCH_CUDA_CPP_API GreenContext {
|
||||
public:
|
||||
GreenContext(uint32_t device_id, uint32_t num_sms);
|
||||
|
||||
static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id);
|
||||
// Green context creation
|
||||
static std::unique_ptr<GreenContext> create(
|
||||
uint32_t num_sms,
|
||||
std::optional<uint32_t> device_id);
|
||||
~GreenContext() noexcept;
|
||||
|
||||
// Delete copy constructor and assignment
|
||||
GreenContext(const GreenContext&) = delete;
|
||||
GreenContext& operator=(const GreenContext&) = delete;
|
||||
|
||||
// Implement move operations
|
||||
GreenContext(GreenContext&& other) noexcept;
|
||||
GreenContext& operator=(GreenContext&& other) noexcept;
|
||||
~GreenContext() noexcept;
|
||||
|
||||
// Get the underlying CUDA context
|
||||
CUcontext getContext() const;
|
||||
|
||||
// Get the underlying green context
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
CUgreenCtx getGreenContext() const;
|
||||
#endif
|
||||
|
||||
// Make this context current
|
||||
void setContext();
|
||||
|
||||
void popContext();
|
||||
|
||||
private:
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
GreenContext(uint32_t device_id, uint32_t num_sms);
|
||||
// Implement move operations
|
||||
GreenContext(GreenContext&& other) noexcept;
|
||||
GreenContext& operator=(GreenContext&& other) noexcept;
|
||||
|
||||
int32_t device_id_ = -1;
|
||||
CUgreenCtx green_ctx_ = nullptr;
|
||||
CUcontext context_ = nullptr;
|
||||
cudaStream_t parent_stream_ = nullptr;
|
||||
#endif
|
||||
};
|
||||
} // namespace at::cuda
|
||||
|
||||
@ -580,7 +580,7 @@ std::ofstream& TuningContext::GetUntunedFile(){
|
||||
filename.append(device);
|
||||
}
|
||||
|
||||
untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::trunc);
|
||||
untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::app);
|
||||
}
|
||||
return untuned_file_;
|
||||
}
|
||||
|
||||
@ -689,6 +689,10 @@ static void check_shape_forward(const at::Tensor& input,
|
||||
", but got bias of size ", at::symint::sizes<T>(bias), " instead");
|
||||
|
||||
for (const auto i : c10::irange(2, k)) {
|
||||
// T could be int64_t or SymInt, Specialized numeric_limts<SymInt> in c10/core/SymInt.h
|
||||
TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
|
||||
"Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
|
||||
(std::numeric_limits<T>::max() / 2));
|
||||
input_shape.push_back(at::symint::size<T>(input, i) + 2 * padding[i-2]);
|
||||
// log new kernel size considering dilation
|
||||
kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1);
|
||||
@ -715,6 +719,11 @@ static void check_shape_forward(const at::Tensor& input,
|
||||
"Kernel size: (", kernel_ss.str(), "). Kernel size can't be greater than actual input size");
|
||||
}
|
||||
} else { // transposed
|
||||
for (const auto i : c10::irange(2, k)) {
|
||||
TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
|
||||
"Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
|
||||
(std::numeric_limits<T>::max() / 2));
|
||||
}
|
||||
TORCH_CHECK(at::symint::size<T>(input, 1) == weight_sizes[0],
|
||||
"Given transposed=", transposed, ", weight of size ", weight_sizes,
|
||||
", expected input", at::symint::sizes<T>(input), " to have ", weight_sizes[0],
|
||||
|
||||
@ -52,8 +52,7 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in
|
||||
for (const auto k : c10::irange(kw)) {
|
||||
int iShift = std::max(0, static_cast<int>(k - real_pad));
|
||||
int oShift = std::max(0, static_cast<int>(real_pad - k));
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
int t = std::min(ilen + real_pad - k, olen) - oShift;
|
||||
long t = std::min(ilen + real_pad - k, olen) - oShift;
|
||||
// Note: gemm assumes column-major matrices
|
||||
// input is l*m (row-major)
|
||||
// weight is m*r (row-major)
|
||||
|
||||
@ -16,8 +16,7 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
|
||||
auto linearId = elements - 1;
|
||||
|
||||
// NOTE: Assumes all strides are positive, which is true for now
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
for (int i = t.dim() - 1; i >= 0; --i) {
|
||||
for (auto i = t.dim() - 1; i >= 0; --i) {
|
||||
auto curDimIndex = linearId % t.sym_size(i);
|
||||
auto curDimOffset = curDimIndex * t.sym_stride(i);
|
||||
offset += curDimOffset;
|
||||
|
||||
@ -68,7 +68,6 @@ Tensor fbgemm_linear_int8_weight_fp32_activation(
|
||||
const float* input_ptr = input_contig.const_data_ptr<float>();
|
||||
|
||||
TORCH_CHECK(input.dim() >= 2);
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
|
||||
const int64_t K = input.size(input.dim() - 1);
|
||||
TORCH_CHECK(weight.dim() == 2);
|
||||
|
||||
@ -160,10 +160,9 @@ struct Dist {
|
||||
// value of k.
|
||||
parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) {
|
||||
const Vec pvec(p);
|
||||
double n2 = n - .5;
|
||||
double n2 = static_cast<double>(n) - .5;
|
||||
// The -1 accounts for floating point truncation issues
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
|
||||
int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2.0 * static_cast<double>(k) - 1.0)));
|
||||
int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
|
||||
|
||||
const scalar_t * self_i = self_start + i * m;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,11 +1,11 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/Context.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/cuda/CachingHostAllocator.h>
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <ATen/cuda/CUDAEvent.h>
|
||||
#include <ATen/cuda/CachingHostAllocator.h>
|
||||
#include <ATen/cuda/PeerToPeerAccess.h>
|
||||
#include <ATen/native/Copy.h>
|
||||
#include <ATen/native/TensorIterator.h>
|
||||
@ -27,6 +27,24 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
namespace {
|
||||
|
||||
// Initial pool size for CUDA events per device.
|
||||
constexpr size_t kInitialEventPoolSize = 8;
|
||||
|
||||
at::cuda::CUDAEventPtr getEventFromPool(const at::DeviceIndex device_idx) {
|
||||
static auto* event_pool = []() {
|
||||
auto* pool = new at::cuda::EventPool();
|
||||
// Pre-populate the pool with events to avoid stalls in creating events
|
||||
pool->init_num_events(kInitialEventPoolSize);
|
||||
return pool;
|
||||
}();
|
||||
|
||||
return event_pool->get(device_idx);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void neg_kernel_cuda(TensorIteratorBase &iter);
|
||||
void conj_kernel_cuda(TensorIteratorBase &iter);
|
||||
|
||||
@ -263,12 +281,14 @@ void copy_device_to_device(TensorIterator& iter,
|
||||
// write-after-read dependencies on the destination side are handled, so
|
||||
// that no one is operating on the dst memory when we perform the copy.
|
||||
// src waits on dst barrier (src already waits on src)
|
||||
CUDAEvent dst_ready;
|
||||
|
||||
// Use event pool for better performance instead of creating new events
|
||||
auto dst_ready = getEventFromPool(dst_device.index());
|
||||
device_guard.set_device(dst_device);
|
||||
dst_ready.record(getCurrentCUDAStream(dst_device.index()));
|
||||
dst_ready->record(getCurrentCUDAStream(dst_device.index()));
|
||||
|
||||
device_guard.set_device(src_device);
|
||||
dst_ready.block(copy_stream);
|
||||
dst_ready->block(copy_stream);
|
||||
}
|
||||
|
||||
if (memcpy_eligible) {
|
||||
@ -307,11 +327,11 @@ void copy_device_to_device(TensorIterator& iter,
|
||||
// operate on dst's copy until the copy is complete.
|
||||
|
||||
// Still on src_device, record stream event
|
||||
CUDAEvent src_ready;
|
||||
src_ready.record(copy_stream);
|
||||
auto src_ready = getEventFromPool(src_device.index());
|
||||
src_ready->record(copy_stream);
|
||||
|
||||
device_guard.set_device(dst_device);
|
||||
src_ready.block(getCurrentCUDAStream(dst_device.index()));
|
||||
src_ready->block(getCurrentCUDAStream(dst_device.index()));
|
||||
}
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
@ -22,6 +22,9 @@
|
||||
#include <ATen/native/cuda/RowwiseScaledMM.h>
|
||||
#include <ATen/native/cuda/ScaledGroupMM.h>
|
||||
#include <ATen/native/cuda/GroupMM.h>
|
||||
#ifdef USE_ROCM
|
||||
#include <ATen/native/hip/ck_group_gemm.h>
|
||||
#endif
|
||||
#include <ATen/ceil_div.h>
|
||||
|
||||
#ifdef USE_FBGEMM_GENAI
|
||||
@ -208,6 +211,48 @@ _f8_f8_bf16_rowwise_grouped_mm(
|
||||
#endif
|
||||
}
|
||||
|
||||
Tensor&
|
||||
_f4_f4_bf16_grouped_mm_fbgemm(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& global_scale_a,
|
||||
const Tensor& scale_b,
|
||||
const Tensor& global_scale_b,
|
||||
const std::optional<Tensor>& offs,
|
||||
const std::optional<Tensor>& bias,
|
||||
Tensor& out) {
|
||||
#if !defined(USE_ROCM) && defined(USE_FBGEMM_GENAI)
|
||||
// Typing checks
|
||||
TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2,
|
||||
"mat_a must be Float4_e2n1fn_2, got: ", mat_a.scalar_type());
|
||||
TORCH_CHECK_VALUE(mat_b.scalar_type() == at::kFloat4_e2m1fn_x2,
|
||||
"mat_b must be Float4_e2n1fn_2, got: ", mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,
|
||||
"scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());
|
||||
TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,
|
||||
"scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(global_scale_a.scalar_type() == at::kFloat,
|
||||
"global_scale_a must be Float, got: ", global_scale_a.scalar_type());
|
||||
TORCH_CHECK_VALUE(global_scale_b.scalar_type() == at::kFloat,
|
||||
"global_scale_b must be Float, got: ", global_scale_b.scalar_type());
|
||||
|
||||
auto o = fbgemm_gpu::f4f4bf16_grouped_mm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a,
|
||||
scale_b,
|
||||
offs.value(),
|
||||
out,
|
||||
global_scale_a.mul(global_scale_b)
|
||||
);
|
||||
#else
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "nvfp4 grouped gemm is not supported without USE_FBGEMM_GENAI, and only for CUDA")
|
||||
#endif
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
|
||||
// Checks scales for 2d or 3d target tensors (`mat`).
|
||||
if (mat.dim() == 2) {
|
||||
@ -245,7 +290,15 @@ void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int
|
||||
}
|
||||
}
|
||||
|
||||
void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
|
||||
void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
|
||||
// if {mx,nv}fp4, will need to modify K later
|
||||
bool is_fp4 = (mat.scalar_type() == kFloat4_e2m1fn_x2);
|
||||
int blocksize = 32;
|
||||
// check for nvfp4 vs. mxfp4 to fix blocksize
|
||||
if (is_fp4 && scale.scalar_type() == kFloat8_e4m3fn) {
|
||||
blocksize = 16;
|
||||
}
|
||||
|
||||
// Checks scales for 2d or 3d target tensors (`mat`).
|
||||
if (mat.dim() == 2) {
|
||||
// For MXFP8, 2d tensors have variable size groups represented as subtensors,
|
||||
@ -253,17 +306,19 @@ void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim,
|
||||
// so we can't check the scale sizes without doing a d2h sync to get the group sizes here.
|
||||
TORCH_CHECK(
|
||||
scale.dim() == mat.dim(),
|
||||
"for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx);
|
||||
"for block-scaled, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(),
|
||||
" and scale.dim() = ", scale.dim(), " for arg ", arg_idx
|
||||
);
|
||||
|
||||
// LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4))
|
||||
// RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4))
|
||||
// LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/blocksize, 4))
|
||||
// RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/blocksize, 4))
|
||||
// * weight is transposed prior to the call, scale stays non-transposed.
|
||||
bool LHS = arg_idx == 0;
|
||||
int scale_dim_to_check = 0;
|
||||
int mat_dim_to_check = LHS ? 0 : 1;
|
||||
TORCH_CHECK(
|
||||
scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check),
|
||||
"for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
|
||||
"for block-scaled, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
|
||||
"must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")");
|
||||
} else {
|
||||
// For MXFP8, 3d tensors have static group sizes (stack of 2d tensors),
|
||||
@ -273,32 +328,40 @@ void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim,
|
||||
};
|
||||
|
||||
// TODO: this is for 3d tensor in 2d-3d case specifically.
|
||||
// We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them.
|
||||
// We'll need to support 3d-3d and 3d-2d cases once mxfp8/nvfp4 grouped gemm supports them.
|
||||
int64_t G = mat.size(0);
|
||||
int64_t K = mat.size(1);
|
||||
if (is_fp4) {
|
||||
// FP4 packs 2 values into a single 8b word - the "real" K is 2x the
|
||||
// reported K. Reverse that adjustment.
|
||||
const int fp4_elems_per_byte = 2;
|
||||
K *= fp4_elems_per_byte;
|
||||
}
|
||||
int64_t N = mat.size(2);
|
||||
int64_t blocked_scale_K = round_up(K/32, 4);
|
||||
int64_t blocked_scale_K = round_up(K/blocksize, 4);
|
||||
int64_t blocked_scale_N = round_up(N, 128);
|
||||
|
||||
// fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N).
|
||||
TORCH_CHECK(
|
||||
scale.dim() == mat.dim() - 1,
|
||||
"for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx
|
||||
"for block-scaled 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N),",
|
||||
"but scale is ", scale.dim(), "D for arg ", arg_idx
|
||||
);
|
||||
TORCH_CHECK(
|
||||
scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N,
|
||||
"for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx
|
||||
"for block-scaled grouped GEMM, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ")",
|
||||
" for arg ", arg_idx, ", got: ", scale.size(0), ", ", scale.size(1)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
|
||||
bool using_fp8_rowwise = scale.scalar_type() == kFloat;
|
||||
bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu;
|
||||
bool using_mx = scale.scalar_type() == at::kFloat8_e8m0fnu;
|
||||
if (using_fp8_rowwise) {
|
||||
_check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier);
|
||||
} else if (using_mxfp8) {
|
||||
_check_scales_mxfp8(mat, scale, dim, arg_idx);
|
||||
} else if (using_mx) {
|
||||
_check_scales_blocked(mat, scale, dim, arg_idx);
|
||||
} else {
|
||||
TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype());
|
||||
}
|
||||
@ -411,9 +474,10 @@ namespace {
|
||||
|
||||
using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;
|
||||
|
||||
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2> scale_grouped_kernel_dispatch = {{
|
||||
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 3> scale_grouped_kernel_dispatch = {{
|
||||
{ "rowwise_rowwise", scaled_blas::check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},
|
||||
{ "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}};
|
||||
{ "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8},
|
||||
{ "nvfp4_nvfp4", scaled_blas::check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4}}};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
@ -525,8 +589,9 @@ _scaled_grouped_mm_cuda_v2(
|
||||
out);
|
||||
}
|
||||
case ScaledGemmImplementation::MXFP8_MXFP8: {
|
||||
_check_scales_mxfp8(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
|
||||
_check_scales_mxfp8(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
|
||||
// scale shape checks
|
||||
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
|
||||
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
|
||||
return _mx8_mx8_bf16_grouped_mm_fbgemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
@ -537,6 +602,21 @@ _scaled_grouped_mm_cuda_v2(
|
||||
offs.value(),
|
||||
out);
|
||||
}
|
||||
case ScaledGemmImplementation::NVFP4_NVFP4: {
|
||||
// scale shape checks
|
||||
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
|
||||
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
|
||||
return _f4_f4_bf16_grouped_mm_fbgemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a[0], /* block-scale A */
|
||||
scale_a[1], /* global-scale A */
|
||||
scale_b[0], /* block-scale B */
|
||||
scale_b[1], /* global-scale B */
|
||||
offs.value(),
|
||||
std::nullopt, /* bias */
|
||||
out);
|
||||
}
|
||||
default:
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false,
|
||||
"_scaled_grouped_mm_cuda_v2 is in an inconsistent state - should never reach here");
|
||||
@ -559,12 +639,19 @@ std::optional<c10::ScalarType> out_dtype) {
|
||||
// _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used.
|
||||
// the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm
|
||||
bool use_fast_path = false;
|
||||
if (at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950"})) {
|
||||
use_fast_path = true;
|
||||
}
|
||||
#endif
|
||||
const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
|
||||
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
|
||||
if (use_fast_path) {
|
||||
// fast path, no d2h sync needed
|
||||
#ifndef USE_ROCM
|
||||
at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out);
|
||||
#else
|
||||
at::hip::detail::group_gemm_ck(mat_a, mat_b, offs, bias, out);
|
||||
#endif
|
||||
} else {
|
||||
_grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
|
||||
}
|
||||
|
||||
1284
aten/src/ATen/native/cuda/ScaledBlas.cpp
Normal file
1284
aten/src/ATen/native/cuda/ScaledBlas.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -12,14 +12,15 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
#if AT_USE_JITERATOR()
|
||||
#if 0 && AT_USE_JITERATOR()
|
||||
constexpr char tan_name[] = "tan_impl";
|
||||
#endif
|
||||
|
||||
void tan_kernel_cuda(TensorIteratorBase& iter) {
|
||||
auto common_dtype = iter.common_dtype();
|
||||
if (at::isComplexType(common_dtype)) {
|
||||
#if AT_USE_JITERATOR()
|
||||
// Disabled due to accuracy issues
|
||||
#if 0 && AT_USE_JITERATOR()
|
||||
static const auto tan_string = jiterator_stringify(
|
||||
template <typename T> T tan_impl(T a) { return std::tan(a); });
|
||||
AT_DISPATCH_COMPLEX_TYPES_AND(
|
||||
|
||||
@ -12,14 +12,15 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
#if AT_USE_JITERATOR()
|
||||
#if 0 && AT_USE_JITERATOR()
|
||||
constexpr char tanh_name[] = "tanh_impl";
|
||||
#endif
|
||||
|
||||
void tanh_kernel_cuda(TensorIteratorBase& iter) {
|
||||
auto common_dtype = iter.common_dtype();
|
||||
if (at::isComplexType(common_dtype)) {
|
||||
#if AT_USE_JITERATOR()
|
||||
// Disabled due to accuracy issues
|
||||
#if 0 && AT_USE_JITERATOR()
|
||||
static const auto tanh_string = jiterator_stringify(
|
||||
template <typename T> T tanh_impl(T a) { return std::tanh(a); });
|
||||
AT_DISPATCH_COMPLEX_TYPES_AND(
|
||||
|
||||
171
aten/src/ATen/native/cuda/cuBlasCommonArgs.h
Normal file
171
aten/src/ATen/native/cuda/cuBlasCommonArgs.h
Normal file
@ -0,0 +1,171 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/core/Tensor.h>
|
||||
|
||||
namespace at::native {
|
||||
|
||||
using at::blas::ScalingType;
|
||||
using at::blas::SwizzleType;
|
||||
|
||||
namespace {
|
||||
|
||||
// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492
|
||||
c10::MaybeOwned<Tensor> inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) {
|
||||
if (resolve_conj && tensor.is_conj()) {
|
||||
return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj());
|
||||
} else {
|
||||
return c10::MaybeOwned<Tensor>::borrowed(tensor);
|
||||
}
|
||||
}
|
||||
|
||||
c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) {
|
||||
if (tensor.is_non_overlapping_and_dense()) { // common case
|
||||
transpose_tensor = tensor.is_contiguous();
|
||||
return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor);
|
||||
}
|
||||
IntArrayRef tensor_strides = tensor.strides();
|
||||
IntArrayRef tensor_sizes = tensor.sizes();
|
||||
if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
|
||||
transpose_tensor = false;
|
||||
return resolve_conj_if_indicated(tensor, !transpose_result);
|
||||
} else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
|
||||
transpose_tensor = true;
|
||||
return resolve_conj_if_indicated(tensor, transpose_result);
|
||||
} else {
|
||||
transpose_tensor = true;
|
||||
return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
|
||||
}
|
||||
}
|
||||
|
||||
c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) {
|
||||
if (tensor.is_non_overlapping_and_dense()) { // common case
|
||||
transpose_tensor = tensor.is_contiguous();
|
||||
return resolve_conj_if_indicated(tensor, true);
|
||||
}
|
||||
|
||||
IntArrayRef tensor_strides = tensor.strides();
|
||||
IntArrayRef tensor_sizes = tensor.sizes();
|
||||
if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
|
||||
transpose_tensor = false;
|
||||
return resolve_conj_if_indicated(tensor, true);
|
||||
} else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
|
||||
transpose_tensor = true;
|
||||
return resolve_conj_if_indicated(tensor, true);
|
||||
} else {
|
||||
transpose_tensor = true;
|
||||
return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
|
||||
* @brief Prepares matrices for CUBLAS operation
|
||||
*
|
||||
* This constructor prepares tensors for CUBLAS
|
||||
* The main difference is that PyTorch uses row-major as the default and
|
||||
* CUBLAS expects column-major.
|
||||
*
|
||||
* @details
|
||||
* To enable row-major output while using CUBLAS,
|
||||
* we use the mathematical identity that (A × B)^T = B^T × A^T.
|
||||
*
|
||||
* Transpose in this context refers to Cublas's(Fortran) definition of transpose (row-major)
|
||||
* T = row-major, N = col-major
|
||||
*
|
||||
* Example:
|
||||
* For matrices A (M×K)(row-major) and B (K×N)(row-major):
|
||||
* - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major)
|
||||
* - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N)
|
||||
* - However, since the output form cublas is column-major this is
|
||||
* - equivalent to an output of size MxN row-major as expected
|
||||
*
|
||||
* The transpose flags are derived from the layouts of the passed in tensors
|
||||
*
|
||||
* If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted
|
||||
* to their unpacked values to match what cuBLAS expects.
|
||||
*
|
||||
* @param mat1 First input matrix
|
||||
* @param mat2 Second input matrix
|
||||
* @param c Output matrix (result)
|
||||
* @param scale_a Optional scaling factor for first matrix
|
||||
* @param scale_b Optional scaling factor for second matrix
|
||||
* @param scale_result Optional scaling factor for result
|
||||
*/
|
||||
struct cublasCommonArgs {
|
||||
cublasCommonArgs(
|
||||
const Tensor& mat1,
|
||||
const Tensor& mat2,
|
||||
Tensor& c,
|
||||
const std::optional<Tensor>& scale_a = std::nullopt,
|
||||
const std::optional<Tensor>& scale_b = std::nullopt,
|
||||
const std::optional<Tensor>& scale_result = std::nullopt,
|
||||
const std::optional<ScalingType>& scaling_choice_a = std::nullopt,
|
||||
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
|
||||
bool transpose_result = false, transpose_a = false, transpose_b = false;
|
||||
result = prepare_matrix_for_cublas(c, transpose_result);
|
||||
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
|
||||
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
|
||||
|
||||
// Handle scale tensors if provided
|
||||
if (scale_a && scale_b) {
|
||||
// By default since we return in row-major we run the gemm
|
||||
// as B.T @ A.T, check transpose_result to determine if we flip the scales
|
||||
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
|
||||
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
|
||||
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
|
||||
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
|
||||
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
|
||||
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
|
||||
}
|
||||
|
||||
if (scale_result) {
|
||||
scale_result_ptr = scale_result->data_ptr();
|
||||
scale_result_dtype = scale_result->scalar_type();
|
||||
}
|
||||
|
||||
// Update transpose flags
|
||||
if (transpose_result) {
|
||||
transpose_a = !transpose_a;
|
||||
transpose_b = !transpose_b;
|
||||
}
|
||||
|
||||
auto sizes_a = mata->sizes();
|
||||
auto sizes_b = matb->sizes();
|
||||
|
||||
m = sizes_a[transpose_result ? 1 : 0];
|
||||
k = sizes_a[transpose_result ? 0 : 1];
|
||||
n = sizes_b[transpose_result ? 0 : 1];
|
||||
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
|
||||
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
|
||||
result_ld = result->stride(transpose_result ? 0 : 1);
|
||||
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
|
||||
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
|
||||
|
||||
// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
|
||||
// if the gemm operands are in packed float4
|
||||
if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) {
|
||||
k = k * 2;
|
||||
lda = lda * 2;
|
||||
ldb = ldb * 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Matrix members
|
||||
char transa, transb;
|
||||
int64_t m, n, k;
|
||||
int64_t lda, ldb, result_ld;
|
||||
c10::MaybeOwned<Tensor> mata, matb, result;
|
||||
|
||||
// Scale members
|
||||
void* scale_mata_ptr = nullptr;
|
||||
void* scale_matb_ptr = nullptr;
|
||||
void* scale_result_ptr = nullptr;
|
||||
std::optional<c10::ScalarType> scale_mata_dtype;
|
||||
std::optional<ScalingType> scaling_mata_type;
|
||||
std::optional<c10::ScalarType> scale_matb_dtype;
|
||||
std::optional<ScalingType> scaling_matb_type;
|
||||
std::optional<c10::ScalarType> scale_result_dtype;
|
||||
};
|
||||
|
||||
} // namespace at::native
|
||||
19
aten/src/ATen/native/hip/ck_group_gemm.h
Normal file
19
aten/src/ATen/native/hip/ck_group_gemm.h
Normal file
@ -0,0 +1,19 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/Tensor.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <optional>
|
||||
|
||||
namespace at {
|
||||
namespace hip {
|
||||
namespace detail {
|
||||
void group_gemm_ck(
|
||||
const at::Tensor& mat_a,
|
||||
const at::Tensor& mat_b,
|
||||
const std::optional<at::Tensor>& offs,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
at::Tensor& out);
|
||||
|
||||
} // namespace detail
|
||||
} // namespace hip
|
||||
} // namespace at
|
||||
458
aten/src/ATen/native/hip/ck_group_gemm.hip
Normal file
458
aten/src/ATen/native/hip/ck_group_gemm.hip
Normal file
@ -0,0 +1,458 @@
|
||||
#undef __HIP_NO_HALF_CONVERSIONS__
|
||||
#include <ATen/hip/HIPContext.h>
|
||||
#include <ATen/Tensor.h>
|
||||
#include <ATen/TensorAccessor.h>
|
||||
#include <c10/hip/HIPStream.h>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <optional>
|
||||
#include <type_traits>
|
||||
|
||||
#include <ck/ck.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/tensor_layout.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp>
|
||||
#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
|
||||
#include <ck/utility/tuple.hpp>
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
namespace at {
|
||||
namespace hip {
|
||||
namespace detail {
|
||||
|
||||
namespace CkTypes {
|
||||
using BF16 = ck::bhalf_t;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
}
|
||||
|
||||
template <typename ALayout, typename BLayout, typename DataType>
|
||||
using GroupedGemmKernel = ck::tensor_operation::device::DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage<
|
||||
ALayout, BLayout, ck::Tuple<>, ck::tensor_layout::gemm::RowMajor,
|
||||
DataType, DataType, CkTypes::F32, DataType, ck::Tuple<>, DataType,
|
||||
CkTypes::PassThrough, CkTypes::PassThrough, CkTypes::PassThrough,
|
||||
ck::tensor_operation::device::GemmSpecialization::MNKPadding,
|
||||
1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2,
|
||||
S<1,4,64,1>, S<0,2,1,3>, S<0,2,1,3>,
|
||||
3, 8, 8, 1,
|
||||
S<1,4,64,1>, S<0,2,1,3>, S<0,2,1,3>,
|
||||
3, 8, 8, 1,
|
||||
1, 1,
|
||||
S<1,32,1,8>, 4
|
||||
>;
|
||||
|
||||
template <typename ALayout, typename BLayout, typename DataType>
|
||||
void launch_grouped_bgemm_ck_impl_dispatch(
|
||||
const at::Tensor& mat_a,
|
||||
const at::Tensor& mat_b,
|
||||
const std::optional<at::Tensor>& offs,
|
||||
at::Tensor& out)
|
||||
{
|
||||
using DeviceOp = GroupedGemmKernel<ALayout, BLayout, DataType>;
|
||||
using PassThrough = CkTypes::PassThrough;
|
||||
|
||||
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
|
||||
std::vector<const void*> p_a_ptrs, p_b_ptrs;
|
||||
std::vector<void*> p_e_ptrs;
|
||||
// Note: d_ptrs will be resized after we populate the other vectors
|
||||
|
||||
const int mat_a_dim = mat_a.dim();
|
||||
const int mat_b_dim = mat_b.dim();
|
||||
|
||||
const char* a_ptr_base = reinterpret_cast<const char*>(mat_a.data_ptr());
|
||||
const char* b_ptr_base = reinterpret_cast<const char*>(mat_b.data_ptr());
|
||||
char* out_ptr_base = reinterpret_cast<char*>(out.data_ptr());
|
||||
const size_t a_element_size = mat_a.element_size();
|
||||
const size_t b_element_size = mat_b.element_size();
|
||||
const size_t out_element_size = out.element_size();
|
||||
|
||||
// for each group, calculate m,n,k,lda,ldb,ldc and A,B,out pointer base addresses.
|
||||
if (mat_a_dim == 2 && mat_b_dim == 2) {
|
||||
// 2D*2D case requires offset tensor
|
||||
auto offs_accessor = offs->accessor<int, 1>();
|
||||
int num_groups = offs_accessor.size(0);
|
||||
const int M = mat_a.size(0); // number of rows in A
|
||||
const int N = mat_b.size(1); // number of columns in B
|
||||
const int K = mat_a.size(1); // columns in A == rows in B
|
||||
// for 2d*2d input, output is 3d.
|
||||
// for each group, A columns (K) are sliced. M and N dimensions are not sliced.
|
||||
for (int i = 0; i < num_groups; ++i) {
|
||||
int start_k = (i == 0) ? 0 : offs_accessor[i-1];
|
||||
int end_k = offs_accessor[i];
|
||||
int k = end_k - start_k;
|
||||
|
||||
//K dimension are sliced, hence select stride(1) always.
|
||||
//K dimension is always dimension 1, regardless of memory layout (row/column major)
|
||||
const void* group_a_ptr = a_ptr_base + start_k * mat_a.stride(1) * a_element_size;
|
||||
const void* group_b_ptr;
|
||||
int ldb;
|
||||
|
||||
if (std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major B [K,N]: K values are horizontally adjacent, use stride(1) for K offset
|
||||
group_b_ptr = b_ptr_base + start_k * mat_b.stride(1) * b_element_size;
|
||||
// Leading dimension = distance between rows = stride(0)
|
||||
ldb = mat_b.stride(0);
|
||||
} else {
|
||||
// Column-major B [K,N]: K values are vertically adjacent, use stride(0) for K offset
|
||||
group_b_ptr = b_ptr_base + start_k * mat_b.stride(0) * b_element_size;
|
||||
// Leading dimension = distance between columns = stride(1)
|
||||
ldb = mat_b.stride(1);
|
||||
}
|
||||
|
||||
// Calculate output pointer for group i in 3D tensor [num_groups, M, N]
|
||||
// stride(0) = M*N elements between groups, so skip i*stride(0) elements to reach group i
|
||||
void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size;
|
||||
int lda, ldc;
|
||||
if (std::is_same<ALayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major A [M,K]: leading dimension = distance between rows = stride(0)
|
||||
lda = mat_a.stride(0);
|
||||
} else {
|
||||
// Column-major A [M,K]: leading dimension = distance between columns = stride(1)
|
||||
lda = mat_a.stride(1);
|
||||
}
|
||||
// Output is always row-major in 3D tensor [num_groups, M, N]
|
||||
// Leading dimension for each group's [M,N] slice = stride(1) = N
|
||||
ldc = out.stride(1);
|
||||
size_t output_group_bytes = M * N * out_element_size;
|
||||
void* group_e_ptr_end = (char*)group_e_ptr + output_group_bytes;
|
||||
|
||||
gemm_descs.push_back({
|
||||
static_cast<ck::index_t>(M),
|
||||
static_cast<ck::index_t>(N),
|
||||
static_cast<ck::index_t>(k),
|
||||
static_cast<ck::index_t>(lda),
|
||||
static_cast<ck::index_t>(ldb),
|
||||
static_cast<ck::index_t>(ldc)
|
||||
});
|
||||
p_a_ptrs.push_back(group_a_ptr);
|
||||
p_b_ptrs.push_back(group_b_ptr);
|
||||
p_e_ptrs.push_back(group_e_ptr);
|
||||
}
|
||||
} else if (mat_a_dim == 2 && mat_b_dim == 3) {
|
||||
// 2D*3D case requires offset tensor
|
||||
auto offs_accessor = offs->accessor<int, 1>();
|
||||
int num_groups = offs_accessor.size(0);
|
||||
|
||||
// 2d*3d input, output is 2d.
|
||||
// A: [m * n_groups, k], B: [n_groups, n, k] or [n_groups, k, n], Output: [m * n_groups, n]
|
||||
// Offset divides M dimension (rows of A), each group gets different rows of A and different batch of B
|
||||
const int K = mat_a.size(1); // columns in A
|
||||
// For 2D-3D case: The output determines N (result width)
|
||||
const int N = out.size(1); // N is the width of the output tensor
|
||||
|
||||
for (int i = 0; i < num_groups; ++i) {
|
||||
int start_m = (i == 0) ? 0 : offs_accessor[i - 1];
|
||||
int end_m = offs_accessor[i];
|
||||
int m = end_m - start_m;
|
||||
|
||||
// Skip zero-sized groups but continue processing subsequent groups
|
||||
if (m <= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Select A rows for group i: skip start_m rows
|
||||
const void* group_a_ptr;
|
||||
int lda;
|
||||
if (std::is_same<ALayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major A [total_m, K]: skip start_m rows, each row is stride(0) elements apart
|
||||
group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size;
|
||||
lda = mat_a.stride(0); // distance between rows
|
||||
} else {
|
||||
// Column-major A [total_m, K]: skip start_m elements in the first dimension (stride(0) is between rows)
|
||||
group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size;
|
||||
|
||||
// Detect stride pattern for A tensor to determine appropriate lda calculation
|
||||
bool a_is_strided_tensor = (mat_a.stride(0) > mat_a.size(0));
|
||||
|
||||
if (a_is_strided_tensor) {
|
||||
// For strided A tensors: stride(0) gives the actual leading dimension
|
||||
lda = mat_a.stride(0);
|
||||
} else {
|
||||
// For non-strided A tensors: use the M dimension (total rows)
|
||||
lda = mat_a.size(0); // Total M dimension for column-major layout
|
||||
}
|
||||
}
|
||||
|
||||
// Select B batch for group i: B[i, :, :]
|
||||
const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size;
|
||||
int ldb;
|
||||
|
||||
if (std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major GEMM: expecting B as [K, N] but we have [N, K], so transpose needed
|
||||
ldb = mat_b.stride(2); // Leading dimension for accessing as [K, N]
|
||||
} else {
|
||||
// Detect stride pattern to determine appropriate ldb calculation
|
||||
bool is_strided_tensor = (mat_b.stride(2) > mat_b.size(2));
|
||||
|
||||
if (is_strided_tensor) {
|
||||
// For strided tensors: stride(2) gives the actual leading dimension
|
||||
ldb = mat_b.stride(2);
|
||||
} else {
|
||||
// For non-strided tensors: use the N dimension
|
||||
ldb = mat_b.size(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Output for this group: rows [start_m:end_m, :] in 2D output [total_m, N]
|
||||
void* group_e_ptr = out_ptr_base + start_m * out.stride(0) * out_element_size;
|
||||
int ldc = out.stride(0); // distance between rows in output (should be N for 2D case)
|
||||
|
||||
gemm_descs.push_back({
|
||||
static_cast<ck::index_t>(m),
|
||||
static_cast<ck::index_t>(N),
|
||||
static_cast<ck::index_t>(K),
|
||||
static_cast<ck::index_t>(lda),
|
||||
static_cast<ck::index_t>(ldb),
|
||||
static_cast<ck::index_t>(ldc)
|
||||
});
|
||||
p_a_ptrs.push_back(group_a_ptr);
|
||||
p_b_ptrs.push_back(group_b_ptr);
|
||||
p_e_ptrs.push_back(group_e_ptr);
|
||||
}
|
||||
} else if (mat_a_dim == 3 && mat_b_dim == 3) {
|
||||
// 3d*3d input, output is 3d - batched matrix multiplication
|
||||
// A: [batch, m, k], B: [batch, k, n] or [batch, n, k] (depending on transpose), Output: [batch, m, n]
|
||||
// Each batch is processed as a separate GEMM operation
|
||||
const int batch_size = mat_a.size(0);
|
||||
const int M = mat_a.size(1); // rows in each A matrix
|
||||
const int K = mat_a.size(2); // columns in A == rows in B (or columns if B is transposed)
|
||||
|
||||
// Determine N from B tensor - it could be B.size(1) or B.size(2) depending on layout
|
||||
int N;
|
||||
if (mat_b.size(1) == K) {
|
||||
// B is [batch, k, n] - normal layout
|
||||
N = mat_b.size(2);
|
||||
} else if (mat_b.size(2) == K) {
|
||||
// B is [batch, n, k] - transposed layout
|
||||
N = mat_b.size(1);
|
||||
} else {
|
||||
TORCH_CHECK(false, "CK Group GEMM 3D-3D: B tensor dimensions incompatible with A. A=[",
|
||||
batch_size, ",", M, ",", K, "], B=[", mat_b.size(0), ",", mat_b.size(1), ",", mat_b.size(2), "]");
|
||||
}
|
||||
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
// Select A batch for group i: A[i, :, :]
|
||||
const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size;
|
||||
|
||||
// Select B batch for group i: B[i, :, :]
|
||||
const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size;
|
||||
|
||||
// Select output batch for group i: Output[i, :, :]
|
||||
void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size;
|
||||
|
||||
int lda, ldb, ldc;
|
||||
|
||||
if (std::is_same<ALayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major A: leading dimension = distance between rows = stride(1)
|
||||
lda = mat_a.stride(1);
|
||||
} else {
|
||||
// Column-major A: leading dimension = distance between columns = stride(2)
|
||||
lda = mat_a.stride(2);
|
||||
}
|
||||
|
||||
if (std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major B: leading dimension = distance between rows
|
||||
if (mat_b.size(1) == K) {
|
||||
// B is [batch, k, n] - normal layout
|
||||
ldb = mat_b.stride(1); // stride between K rows
|
||||
} else {
|
||||
// B is [batch, n, k] - transposed layout, treat as [k, n] for GEMM
|
||||
ldb = mat_b.stride(2); // stride between N rows (since we're accessing as [k,n])
|
||||
}
|
||||
} else {
|
||||
// Column-major B: leading dimension = distance between columns
|
||||
if (mat_b.size(1) == K) {
|
||||
// B is [batch, k, n] - normal layout
|
||||
ldb = mat_b.stride(2); // stride between N columns
|
||||
} else {
|
||||
// B is [batch, n, k] - transposed layout
|
||||
ldb = mat_b.stride(1); // stride between K columns (since we're accessing as [n,k]→[k,n])
|
||||
}
|
||||
}
|
||||
|
||||
// Output is typically row-major: leading dimension = distance between rows = stride(1)
|
||||
ldc = out.stride(1);
|
||||
|
||||
gemm_descs.push_back({
|
||||
static_cast<ck::index_t>(M),
|
||||
static_cast<ck::index_t>(N),
|
||||
static_cast<ck::index_t>(K),
|
||||
static_cast<ck::index_t>(lda),
|
||||
static_cast<ck::index_t>(ldb),
|
||||
static_cast<ck::index_t>(ldc)
|
||||
});
|
||||
p_a_ptrs.push_back(group_a_ptr);
|
||||
p_b_ptrs.push_back(group_b_ptr);
|
||||
p_e_ptrs.push_back(group_e_ptr);
|
||||
}
|
||||
} else if (mat_a_dim == 3 && mat_b_dim == 2) {
|
||||
// 3D*2D case requires offset tensor
|
||||
auto offs_accessor = offs->accessor<int, 1>();
|
||||
int num_groups = offs_accessor.size(0);
|
||||
// 3d*2d input, output is 3d.
|
||||
// A: [n_groups, m, k], B: [k, total_n] (assuming row-major for both)
|
||||
// Offset divides N dimension of B, each group gets different slice of B and different batch of A
|
||||
const int batch_size = mat_a.size(0); // n_groups
|
||||
const int M = mat_a.size(1); // rows in each A matrix
|
||||
const int K = mat_a.size(2); // columns in A
|
||||
|
||||
// For row-major A and B case: B should be [K, total_N]
|
||||
const int total_N = mat_b.size(1); // B is [K, total_N] for row-major
|
||||
|
||||
for (int i = 0; i < num_groups; ++i) {
|
||||
int start_n = (i == 0) ? 0 : offs_accessor[i - 1];
|
||||
int end_n = offs_accessor[i];
|
||||
int n = end_n - start_n;
|
||||
|
||||
// Skip zero-sized groups but continue processing subsequent groups
|
||||
if (n <= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Select A batch for group i: A[i, :, :]
|
||||
const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size;
|
||||
|
||||
// Select B slice for group i: B[:, start_n:end_n] (B[K, total_N])
|
||||
const void* group_b_ptr;
|
||||
int ldb;
|
||||
|
||||
// Check if B is row-major or column-major
|
||||
if (std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major B [K, total_N]: slice columns [start_n:end_n]
|
||||
group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size;
|
||||
ldb = mat_b.stride(0); // distance between rows (should be total_N)
|
||||
} else {
|
||||
// Column-major B [K, total_N]: slice columns [start_n:end_n]
|
||||
group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size;
|
||||
ldb = mat_b.stride(1); // distance between columns (should be K)
|
||||
}
|
||||
|
||||
// Select output slice for group i: Output[:, start_n:end_n]
|
||||
void* group_e_ptr = out_ptr_base + start_n * out.stride(1) * out_element_size;
|
||||
|
||||
int lda, ldc;
|
||||
|
||||
// Row-major A: leading dimension = distance between rows = stride(1)
|
||||
lda = mat_a.stride(1);
|
||||
// Output is row-major: leading dimension = distance between rows = stride(0)
|
||||
ldc = out.stride(0);
|
||||
|
||||
gemm_descs.push_back({
|
||||
static_cast<ck::index_t>(M),
|
||||
static_cast<ck::index_t>(n),
|
||||
static_cast<ck::index_t>(K),
|
||||
static_cast<ck::index_t>(lda),
|
||||
static_cast<ck::index_t>(ldb),
|
||||
static_cast<ck::index_t>(ldc)
|
||||
});
|
||||
p_a_ptrs.push_back(group_a_ptr);
|
||||
p_b_ptrs.push_back(group_b_ptr);
|
||||
p_e_ptrs.push_back(group_e_ptr);
|
||||
}
|
||||
} else {
|
||||
TORCH_CHECK(false, "CK Group GEMM: Unsupported dimensions, mat A dim is ", mat_a_dim, ", mat B dim is ", mat_b_dim);
|
||||
}
|
||||
|
||||
TORCH_INTERNAL_ASSERT(p_a_ptrs.size() > 0, "CK Group GEMM: No valid groups");
|
||||
|
||||
// Initialize d_ptrs with the correct size
|
||||
std::vector<std::array<const void*, 0>> d_ptrs(p_a_ptrs.size());
|
||||
|
||||
static DeviceOp gemm_instance;
|
||||
auto argument = gemm_instance.MakeArgument(
|
||||
p_a_ptrs, p_b_ptrs, d_ptrs, p_e_ptrs,
|
||||
gemm_descs, PassThrough{}, PassThrough{}, PassThrough{}
|
||||
);
|
||||
TORCH_INTERNAL_ASSERT(gemm_instance.IsSupportedArgument(argument),
|
||||
"CK Group GEMM: argument unsupported (shape/strides/type config)");
|
||||
size_t arg_buf_size = gemm_instance.GetDeviceKernelArgSize(&argument);
|
||||
size_t ws_size = gemm_instance.GetWorkSpaceSize(&argument);
|
||||
|
||||
void* gemm_arg_buf = nullptr;
|
||||
void* ws_buf = nullptr;
|
||||
|
||||
hipMalloc(&gemm_arg_buf, arg_buf_size);
|
||||
hipMalloc(&ws_buf, ws_size);
|
||||
|
||||
gemm_instance.SetDeviceKernelArgs(&argument, gemm_arg_buf);
|
||||
gemm_instance.SetWorkSpacePointer(&argument, ws_buf);
|
||||
|
||||
auto invoker = gemm_instance.MakeInvoker();
|
||||
hipStream_t stream = c10::hip::getCurrentHIPStream();
|
||||
invoker.Run(argument, {stream});
|
||||
hipFree(gemm_arg_buf);
|
||||
hipFree(ws_buf);
|
||||
}
|
||||
|
||||
void group_gemm_ck(
|
||||
const at::Tensor& input_a,
|
||||
const at::Tensor& input_b_colmajor,
|
||||
const std::optional<at::Tensor>& offs,
|
||||
const std::optional<at::Tensor>& /*bias*/,
|
||||
at::Tensor& out)
|
||||
{
|
||||
// Detect if input_a is row-major based on stride pattern
|
||||
bool a_row_major = (input_a.dim() == 3) ? (input_a.stride(2) == 1) : (input_a.stride(1) == 1);
|
||||
bool b_col_major = (input_b_colmajor.dim() == 3) ? (input_b_colmajor.stride(1) == 1) : (input_b_colmajor.stride(0) == 1);
|
||||
// Ensure tensor A is row-major and contiguous if not already
|
||||
at::Tensor mat_a = input_a;
|
||||
if (!a_row_major) {
|
||||
// If A is not row-major, make it contiguous (row-major)
|
||||
mat_a = input_a.contiguous();
|
||||
}
|
||||
// Force tensor B to be column-major using double transpose trick
|
||||
// This guarantees stride(0) == 1 and stride(1) == K for [K, N] shape
|
||||
at::Tensor mat_b = input_b_colmajor;
|
||||
if (!b_col_major) {
|
||||
mat_b = input_b_colmajor.transpose(-2, -1).contiguous().transpose(-2, -1);
|
||||
}
|
||||
|
||||
// For 3D tensors, check the last dimension stride for row-major detection
|
||||
a_row_major = (mat_a.dim() == 3) ? (mat_a.stride(2) == 1) : (mat_a.stride(1) == 1);
|
||||
bool b_row_major = (mat_b.dim() == 3) ? (mat_b.stride(2) == 1) : (mat_b.stride(1) == 1);
|
||||
|
||||
if (mat_a.dtype() == at::kBFloat16) {
|
||||
// bf16 path
|
||||
if (a_row_major && b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::BF16>(mat_a, mat_b, offs, out);
|
||||
} else if (a_row_major && !b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::BF16>(mat_a, mat_b, offs, out);
|
||||
} else if (!a_row_major && b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::BF16>(mat_a, mat_b, offs, out);
|
||||
} else {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::BF16>(mat_a, mat_b, offs, out);
|
||||
}
|
||||
} else if (mat_a.dtype() == at::kHalf) {
|
||||
// fp16 path
|
||||
if (a_row_major && b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::F16>(mat_a, mat_b, offs, out);
|
||||
} else if (a_row_major && !b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::F16>(mat_a, mat_b, offs, out);
|
||||
} else if (!a_row_major && b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::F16>(mat_a, mat_b, offs, out);
|
||||
} else {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::F16>(mat_a, mat_b, offs, out);
|
||||
}
|
||||
} else if (mat_a.dtype() == at::kFloat) {
|
||||
// fp32 path
|
||||
if (a_row_major && b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::F32>(mat_a, mat_b, offs, out);
|
||||
} else if (a_row_major && !b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::F32>(mat_a, mat_b, offs, out);
|
||||
} else if (!a_row_major && b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::F32>(mat_a, mat_b, offs, out);
|
||||
} else {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::F32>(mat_a, mat_b, offs, out);
|
||||
}
|
||||
} else {
|
||||
TORCH_CHECK(false, "CK Group GEMM: Unsupported mat_a dtype");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
} // namespace hip
|
||||
} // namespace at
|
||||
@ -1,4 +1,4 @@
|
||||
#pragma onces
|
||||
#pragma once
|
||||
#include <c10/metal/common.h>
|
||||
|
||||
template <unsigned N = c10::metal::max_ndim>
|
||||
|
||||
@ -57,6 +57,7 @@ Tensor& random_mps_impl(Tensor& self,
|
||||
if (self.numel() == 0) {
|
||||
return self;
|
||||
}
|
||||
at::assert_no_internal_overlap(self);
|
||||
// MPS random is broken for 5D+ tensors, see https://github.com/pytorch/pytorch/issues/147624
|
||||
const auto need_reshape = self.ndimension() > 4;
|
||||
auto mps_gen = get_generator_or_default<MPSGeneratorImpl>(gen, at::mps::detail::getDefaultMPSGenerator());
|
||||
@ -153,8 +154,16 @@ Tensor& random_mps_impl(Tensor& self,
|
||||
feeds[meanPlaceholder.getMPSGraphTensor()] = meanPlaceholder.getMPSGraphTensorData();
|
||||
}
|
||||
|
||||
Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self);
|
||||
// Handle non-contiguous output tensors by creating a contiguous temporary
|
||||
const auto needs_gather = needsGather(self);
|
||||
Tensor self_ = needs_gather ? at::empty_like(self, MemoryFormat::Contiguous) : self;
|
||||
Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self_);
|
||||
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
|
||||
|
||||
// Copy results back to original non-contiguous output
|
||||
if (needs_gather) {
|
||||
self.copy_(self_);
|
||||
}
|
||||
}
|
||||
|
||||
return self;
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <ATen/native/SpectralOpsUtils.h>
|
||||
#include <ATen/native/mps/OperationUtils.h>
|
||||
|
||||
@ -37,25 +39,12 @@ NSArray<NSNumber*>* IntArrayToNSArray(IntArrayRef arr) {
|
||||
} // anonymous namespace
|
||||
|
||||
Tensor _fft_c2r_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
|
||||
TORCH_CHECK(self.is_complex());
|
||||
auto in_sizes = self.sizes();
|
||||
DimVector out_sizes(in_sizes.begin(), in_sizes.end());
|
||||
out_sizes[dim.back()] = last_dim_size;
|
||||
auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type())));
|
||||
auto out = at::empty({}, self.options().dtype(c10::toRealValueType(self.scalar_type())));
|
||||
return _fft_c2r_mps_out(self, dim, normalization, last_dim_size, out);
|
||||
}
|
||||
|
||||
Tensor _fft_r2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) {
|
||||
TORCH_CHECK(self.is_floating_point());
|
||||
auto input_sizes = self.sizes();
|
||||
DimVector out_sizes(input_sizes.begin(), input_sizes.end());
|
||||
auto last_dim = dim.back();
|
||||
auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1;
|
||||
if (onesided) {
|
||||
out_sizes[last_dim] = last_dim_halfsize;
|
||||
}
|
||||
|
||||
auto out = at::empty(out_sizes, self.options().dtype(c10::toComplexType(self.scalar_type())));
|
||||
auto out = at::empty({}, self.options().dtype(c10::toComplexType(self.scalar_type())));
|
||||
return _fft_r2c_mps_out(self, dim, normalization, onesided, out);
|
||||
}
|
||||
|
||||
@ -72,6 +61,17 @@ using namespace mps;
|
||||
|
||||
// TODO: Investigate numerical discrepancies see https://github.com/pytorch/pytorch/issues/120237
|
||||
Tensor& _fft_r2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided, Tensor& out) {
|
||||
TORCH_CHECK(self.scalar_type() == kFloat || self.scalar_type() == kHalf, "Only float and half dtypes are supported");
|
||||
TORCH_CHECK(out.scalar_type() == c10::toComplexType(self.scalar_type()));
|
||||
const auto input_sizes = self.sym_sizes();
|
||||
SymDimVector out_sizes(input_sizes.begin(), input_sizes.end());
|
||||
auto last_dim = dim.back();
|
||||
auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1;
|
||||
if (onesided) {
|
||||
out_sizes[last_dim] = last_dim_halfsize;
|
||||
}
|
||||
at::native::resize_output_symint(out, out_sizes);
|
||||
|
||||
auto key = __func__ + getTensorsStringKey({self, out}) + ":" + getArrayRefString(dim) + ":" +
|
||||
std::to_string(normalization) + ":" + std::to_string(onesided);
|
||||
@autoreleasepool {
|
||||
@ -112,6 +112,12 @@ Tensor& _fft_c2r_mps_out(const Tensor& self,
|
||||
int64_t normalization,
|
||||
int64_t last_dim_size,
|
||||
Tensor& out) {
|
||||
TORCH_CHECK(self.is_complex(), "Input must be complex");
|
||||
TORCH_CHECK(out.scalar_type() == c10::toRealValueType(self.scalar_type()), "Unexpected output type");
|
||||
const auto in_sizes = self.sym_sizes();
|
||||
SymDimVector out_sizes(in_sizes.begin(), in_sizes.end());
|
||||
out_sizes[dim.back()] = last_dim_size;
|
||||
at::native::resize_output_symint(out, out_sizes);
|
||||
auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" +
|
||||
std::to_string(normalization) + ":" + std::to_string(last_dim_size);
|
||||
@autoreleasepool {
|
||||
|
||||
@ -617,6 +617,9 @@ Tensor& index_select_out_mps(const Tensor& self, int64_t dim, const Tensor& inde
|
||||
TORCH_CHECK(self.scalar_type() == output.scalar_type(),
|
||||
"index_select(): self and output must have the same scalar type");
|
||||
TORCH_CHECK(dim == 0 || dim < self.dim(), "index_select(): Indexing dim ", dim, " is out of bounds of tensor");
|
||||
at::assert_no_internal_overlap(output);
|
||||
at::assert_no_overlap(output, self);
|
||||
at::assert_no_overlap(output, index);
|
||||
auto output_size = self.sizes().vec();
|
||||
if (self.dim() > 0) {
|
||||
output_size[dim] = num_indices;
|
||||
|
||||
@ -73,8 +73,7 @@ void upsample_bilinear2d_out_frame(
|
||||
const auto rwidth = area_pixel_compute_scale<float>(
|
||||
input_width, output_width, align_corners, scales_w);
|
||||
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
|
||||
float output_scale = output.q_scale() / input.q_scale();
|
||||
float output_scale = static_cast<float>(output.q_scale() / input.q_scale());
|
||||
|
||||
const int64_t input_q_zero_point = input.q_zero_point();
|
||||
const int64_t output_q_zero_point = output.q_zero_point();
|
||||
|
||||
@ -148,7 +148,7 @@ Tensor qcat_nhwc_kernel(
|
||||
// Vectorized loop
|
||||
if (c + VLEN <= curr_C) {
|
||||
auto curr_scale_vec = Vectorized<float>(curr_scale);
|
||||
auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
|
||||
auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
|
||||
auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
|
||||
for (; c + VLEN <= curr_C; c += VLEN) {
|
||||
auto inp_vec = Vec::loadu(iptr + c);
|
||||
@ -174,7 +174,7 @@ Tensor qcat_nhwc_kernel(
|
||||
int64_t elem_size = curr_C - c;
|
||||
if ((VLEN == 4 * kVLEN) && elem_size >= kVLEN) {
|
||||
auto curr_scale_vec = Vectorized<float>(curr_scale);
|
||||
auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
|
||||
auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
|
||||
auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
|
||||
int64_t vec_num = elem_size / kVLEN;
|
||||
std::array<typename scalar_t::underlying, VLEN> buf_in{};
|
||||
@ -611,12 +611,10 @@ void qrelu_kernel(const Tensor& qx, Tensor& qy) {
|
||||
void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
|
||||
const Scalar& negval_) {
|
||||
int64_t i_zp = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float i_scale = qx.q_scale();
|
||||
float i_scale = static_cast<float>(qx.q_scale());
|
||||
|
||||
int64_t o_zp = out.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float o_scale = out.q_scale();
|
||||
float o_scale = static_cast<float>(out.q_scale());
|
||||
float o_inv_scale = 1.0f / o_scale;
|
||||
|
||||
float negval = negval_.to<float>();
|
||||
@ -627,8 +625,8 @@ void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
|
||||
Vec zero_vec = Vec(0.0f);
|
||||
Vec one_vec = Vec(1.0f);
|
||||
|
||||
Vec i_scale_vec = Vec((float)i_scale);
|
||||
Vec i_zp_vec = Vec((float)i_zp);
|
||||
Vec i_scale_vec = Vec(i_scale);
|
||||
Vec i_zp_vec = Vec(i_zp);
|
||||
Vec i_scale_zp_neg_premul_vec = i_scale_vec * i_zp_vec.neg();
|
||||
|
||||
Vec negval_vec = Vec(negval);
|
||||
@ -738,10 +736,9 @@ void qprelu_out_kernel(Tensor& out,
|
||||
|
||||
void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
|
||||
int64_t zero_point = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = qx.q_scale();
|
||||
float scale = static_cast<float>(qx.q_scale());
|
||||
auto scale_vec = Vectorized<float>(scale);
|
||||
auto zero_point_vec = Vectorized<float>((float)zero_point);
|
||||
auto zero_point_vec = Vectorized<float>(zero_point);
|
||||
auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
|
||||
int64_t output_zero_point = zero_point;
|
||||
float output_scale = scale;
|
||||
@ -828,10 +825,9 @@ void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
|
||||
void qsigmoid_kernel(
|
||||
const Tensor& qx, Tensor& qy, double output_scale, int64_t output_zero_point ) {
|
||||
int64_t zero_point = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = qx.q_scale();
|
||||
float scale = static_cast<float>(qx.q_scale());
|
||||
auto scale_vec = Vectorized<float>(scale);
|
||||
auto zero_point_vec = Vectorized<float>((float)zero_point);
|
||||
auto zero_point_vec = Vectorized<float>(zero_point);
|
||||
|
||||
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qsigmoid", [&]() {
|
||||
float inv_output_scale = 1.0 / output_scale;
|
||||
@ -870,10 +866,9 @@ void qsigmoid_kernel(
|
||||
|
||||
void qhardsigmoid_kernel(const Tensor& qx, Tensor& qy) {
|
||||
int64_t zero_point = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = qx.q_scale();
|
||||
float scale = static_cast<float>(qx.q_scale());
|
||||
auto scale_vec = Vectorized<float>(scale);
|
||||
auto zero_point_vec = Vectorized<float>((float)zero_point);
|
||||
auto zero_point_vec = Vectorized<float>(zero_point);
|
||||
auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
|
||||
|
||||
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qhardsigmoid", [&]() {
|
||||
@ -1029,13 +1024,10 @@ void qthreshold_kernel(
|
||||
|
||||
// defines input and output scales and zero_points
|
||||
int64_t input_zero_point = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float input_scale = qx.q_scale();
|
||||
float input_scale = static_cast<float>(qx.q_scale());
|
||||
int64_t output_zero_point = qy.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float output_scale = qy.q_scale();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float inv_output_scale = 1.0 / output_scale;
|
||||
float output_scale = static_cast<float>(qy.q_scale());
|
||||
float inv_output_scale = static_cast<float>(1.0 / output_scale);
|
||||
|
||||
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qthreshold", [&]() {
|
||||
qy = at::_empty_affine_quantized(
|
||||
@ -1096,8 +1088,7 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {
|
||||
|
||||
const auto o_scale = qy.q_scale();
|
||||
const auto o_zero_point = qy.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
const float o_inv_scale = 1.0 / o_scale;
|
||||
const float o_inv_scale = static_cast<float>(1.0 / o_scale);
|
||||
|
||||
using fVec = Vectorized<float>;
|
||||
fVec i_scale_vec(i_scale);
|
||||
@ -1135,10 +1126,9 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {
|
||||
|
||||
void qtanh_kernel(const Tensor& qx, Tensor& qy) {
|
||||
int64_t zero_point = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = qx.q_scale();
|
||||
float scale = static_cast<float>(qx.q_scale());
|
||||
auto scale_vec = Vectorized<float>(scale);
|
||||
auto zero_point_vec = Vectorized<float>((float)zero_point);
|
||||
auto zero_point_vec = Vectorized<float>(zero_point);
|
||||
auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
|
||||
|
||||
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qtanh", [&]() {
|
||||
@ -1198,16 +1188,13 @@ void qelu_kernel(
|
||||
// they are NOT related to the quantization scale term
|
||||
|
||||
int64_t i_zp = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float i_scale = qx.q_scale();
|
||||
float i_scale = static_cast<float>(qx.q_scale());
|
||||
|
||||
// In a future PR, we can improve on output scale and zero_point
|
||||
// selection.
|
||||
int64_t o_zp = qy.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float o_scale = qy.q_scale();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float inv_o_scale = 1.0 / o_scale;
|
||||
float o_scale = static_cast<float>(qy.q_scale());
|
||||
float inv_o_scale = static_cast<float>(1.0 / o_scale);
|
||||
|
||||
float alpha_float = alpha.to<float>();
|
||||
float scale_coef = scale.to<float>();
|
||||
@ -1227,7 +1214,7 @@ void qelu_kernel(
|
||||
Vec scale_coef_vec = Vec(scale_coef);
|
||||
Vec input_scale_coef_vec = Vec(input_scale_coef);
|
||||
Vec i_scale_vec = Vec(i_scale);
|
||||
Vec i_zero_point_vec = Vec((float)i_zp);
|
||||
Vec i_zero_point_vec = Vec(i_zp);
|
||||
Vec i_scale_neg_zp_premul_vec = i_scale_vec * i_zero_point_vec.neg();
|
||||
|
||||
cpu_kernel_vec(
|
||||
@ -1326,23 +1313,20 @@ void qadd_scalar_kernel(Tensor& out, const Tensor& self, const Scalar& other) {
|
||||
template <bool ReLUFused = false>
|
||||
void qadd_kernel(Tensor& out, const Tensor& self, const Tensor& other) {
|
||||
int64_t zero_point = out.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = out.q_scale();
|
||||
float scale = static_cast<float>(out.q_scale());
|
||||
float inv_scale = 1.0f / scale;
|
||||
int64_t self_zero_point = self.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float self_scale = self.q_scale();
|
||||
float self_scale = static_cast<float>(self.q_scale());
|
||||
int64_t other_zero_point = other.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float other_scale = other.q_scale();
|
||||
float other_scale = static_cast<float>(other.q_scale());
|
||||
|
||||
// Broadcast out the parameters here to amortize out that cost across
|
||||
// loop iterations.
|
||||
// TODO: we can optimize dequantization by doing a premultiplication
|
||||
// of the zero point by scale and doing FMA on scale*x_q - (scale*zero_point)
|
||||
auto self_zero_point_vec = Vectorized<float>((float)self_zero_point);
|
||||
auto self_zero_point_vec = Vectorized<float>(self_zero_point);
|
||||
auto self_scale_vec = Vectorized<float>(self_scale);
|
||||
auto other_zero_point_vec = Vectorized<float>((float)other_zero_point);
|
||||
auto other_zero_point_vec = Vectorized<float>(other_zero_point);
|
||||
auto other_scale_vec = Vectorized<float>(other_scale);
|
||||
|
||||
auto self_scale_neg_zp_premul_vec = self_scale_vec * self_zero_point_vec.neg();
|
||||
@ -2965,7 +2949,7 @@ void quantized_normalize_kernel(
|
||||
const bool beta_null = beta_data == nullptr;
|
||||
int64_t x_zp = X.q_zero_point();
|
||||
float x_scale = X.q_scale();
|
||||
fVec x_zp_vec((float)x_zp);
|
||||
fVec x_zp_vec(x_zp);
|
||||
fVec one_vec(1.0f);
|
||||
fVec zero_vec(0.0f);
|
||||
float x_fake_scale = 1.0f;
|
||||
@ -3253,7 +3237,7 @@ void quantized_groupnorm_nhwc_kernel(
|
||||
const bool beta_null = beta_data == nullptr;
|
||||
int64_t x_zp = X.q_zero_point();
|
||||
float x_scale = X.q_scale();
|
||||
fVec x_zp_vec((float)x_zp);
|
||||
fVec x_zp_vec(x_zp);
|
||||
fVec one_vec(1.0f);
|
||||
fVec zero_vec(0.0f);
|
||||
float x_fake_scale = 1.0f;
|
||||
|
||||
@ -414,7 +414,6 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl(
|
||||
TORCH_CHECK(input.size(input.dim() - 1) == packed_weight_fp16.numRows())
|
||||
TORCH_CHECK(input.dim() >= 2);
|
||||
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
|
||||
const int64_t N = packed_weight_fp16.numCols();
|
||||
std::vector<int64_t> output_sizes = input.sizes().vec();
|
||||
|
||||
@ -22,6 +22,7 @@
|
||||
#else
|
||||
#include <ATen/ops/empty.h>
|
||||
#include <ATen/ops/empty_like.h>
|
||||
#include <ATen/ops/zeros_like.h>
|
||||
#include <ATen/ops/reshape.h>
|
||||
#include <ATen/ops/scalar_tensor.h>
|
||||
#include <ATen/ops/sum.h>
|
||||
@ -42,7 +43,6 @@ C10_DIAGNOSTIC_POP()
|
||||
#include <static_switch.h>
|
||||
#include <ATen/native/transformers/cuda/flash_attn/flash_api.h>
|
||||
|
||||
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
namespace FLASH_NAMESPACE {
|
||||
@ -417,6 +417,26 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
|
||||
const int head_size_og = sizes[3];
|
||||
const int seqlen_k = k.size(1);
|
||||
const int num_heads_k = k.size(2);
|
||||
|
||||
if (batch_size == 0) {
|
||||
auto opts = q.options();
|
||||
at::Tensor out = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
|
||||
at::Tensor q_padded = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
|
||||
at::Tensor k_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
|
||||
at::Tensor v_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
|
||||
at::Tensor softmax_lse = at::empty({0, num_heads, seqlen_q}, opts.dtype(at::kFloat));
|
||||
at::Tensor rng_state = at::empty({2}, at::dtype(c10::kUInt64).device(at::kCUDA));
|
||||
at::Tensor _unused = at::empty({}, at::dtype(c10::kUInt64).device(at::kCUDA));
|
||||
at::Tensor p = at::empty({0}, opts);
|
||||
if (return_softmax) {
|
||||
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
|
||||
const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
|
||||
const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
|
||||
p = at::empty({0, num_heads, seqlen_q_rounded, seqlen_k_rounded}, opts);
|
||||
}
|
||||
return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), _unused, std::move(p)};
|
||||
}
|
||||
|
||||
TORCH_CHECK(batch_size > 0, "batch size must be positive");
|
||||
TORCH_CHECK(head_size_og % 8 == 0, "head_size must be a multiple of 8, this is ensured by padding!");
|
||||
TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
|
||||
@ -547,7 +567,7 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
|
||||
q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og});
|
||||
softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
|
||||
}
|
||||
return {out, q_padded, k_padded, v_padded, softmax_lse, rng_state, _unused, p};
|
||||
return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), std::move(_unused), std::move(p)};
|
||||
}
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
|
||||
@ -852,7 +872,6 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
|
||||
TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
|
||||
TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
|
||||
TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension");
|
||||
TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
|
||||
|
||||
const auto sizes = q.sizes();
|
||||
|
||||
@ -863,6 +882,20 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
|
||||
const int head_size = sizes[3];
|
||||
const int seqlen_k = k.size(1);
|
||||
const int num_heads_k = k.size(2);
|
||||
|
||||
if (batch_size == 0) {
|
||||
auto opts = q.options();
|
||||
at::Tensor dq = at::empty_like(q);
|
||||
at::Tensor dk = at::empty_like(k);
|
||||
at::Tensor dv = at::empty_like(v);
|
||||
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
|
||||
const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
|
||||
at::Tensor softmax_d = at::empty({0, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat));
|
||||
return {dq, dk, dv, softmax_d};
|
||||
}
|
||||
|
||||
TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
|
||||
|
||||
TORCH_CHECK(batch_size > 0, "batch size must be positive");
|
||||
TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
|
||||
TORCH_CHECK(head_size_og % 8 == 0, "head_size_og should be a multiple of 8, this is ensured by padding!");
|
||||
|
||||
@ -11,11 +11,6 @@ def remove_cuda(config_list):
|
||||
return [config for config in config_list if cuda_config not in config]
|
||||
|
||||
|
||||
def remove_cpu(config_list):
|
||||
cpu_config = {"device": "cpu"}
|
||||
return [config for config in config_list if cpu_config not in config]
|
||||
|
||||
|
||||
# Configs for conv-1d ops
|
||||
conv_1d_configs_short = op_bench.config_list(
|
||||
attr_names=["IC", "OC", "kernel", "stride", "N", "L"],
|
||||
@ -132,20 +127,6 @@ conv_3d_configs_short = op_bench.config_list(
|
||||
},
|
||||
tags=["short"],
|
||||
)
|
||||
conv_3d_configs_long = op_bench.cross_product_configs(
|
||||
IC=[16, 32],
|
||||
OC=[32, 64],
|
||||
kernel=[3, 5],
|
||||
stride=[1, 2],
|
||||
N=[1],
|
||||
D=[128],
|
||||
H=[128],
|
||||
W=[128],
|
||||
G=[1],
|
||||
pad=[0],
|
||||
device=["cpu", "cuda"],
|
||||
tags=["long"],
|
||||
)
|
||||
|
||||
linear_configs_short = op_bench.config_list(
|
||||
attr_names=["N", "IN", "OUT"],
|
||||
|
||||
@ -38,10 +38,6 @@ class ConvTranspose1dBenchmark(op_bench.TorchBenchmarkBase):
|
||||
op_bench.generate_pt_test(
|
||||
configs.conv_1d_configs_short + configs.conv_1d_configs_long, Conv1dBenchmark
|
||||
)
|
||||
op_bench.generate_pt_gradient_test(
|
||||
configs.remove_cpu(configs.conv_1d_configs_short + configs.conv_1d_configs_long),
|
||||
Conv1dBenchmark,
|
||||
)
|
||||
|
||||
|
||||
if not torch.backends.mkldnn.is_acl_available():
|
||||
@ -107,20 +103,6 @@ op_bench.generate_pt_test(
|
||||
configs.conv_2d_pw_configs_short + configs.conv_2d_pw_configs_long,
|
||||
Conv2dPointwiseBenchmark,
|
||||
)
|
||||
op_bench.generate_pt_gradient_test(
|
||||
configs.remove_cpu(configs.conv_2d_configs_short + configs.conv_2d_configs_long),
|
||||
Conv2dBenchmark,
|
||||
)
|
||||
op_bench.generate_pt_gradient_test(
|
||||
configs.remove_cpu(configs.conv_2d_configs_short + configs.conv_2d_configs_long),
|
||||
ConvTranspose2dBenchmark,
|
||||
)
|
||||
op_bench.generate_pt_gradient_test(
|
||||
configs.remove_cpu(
|
||||
configs.conv_2d_pw_configs_short + configs.conv_2d_pw_configs_long
|
||||
),
|
||||
Conv2dPointwiseBenchmark,
|
||||
)
|
||||
|
||||
|
||||
"""
|
||||
@ -152,12 +134,6 @@ class ConvTranspose3dBenchmark(op_bench.TorchBenchmarkBase):
|
||||
|
||||
op_bench.generate_pt_test(configs.conv_3d_configs_short, Conv3dBenchmark)
|
||||
op_bench.generate_pt_test(configs.conv_3d_configs_short, ConvTranspose3dBenchmark)
|
||||
op_bench.generate_pt_gradient_test(
|
||||
configs.remove_cpu(configs.conv_3d_configs_long), Conv3dBenchmark
|
||||
)
|
||||
op_bench.generate_pt_gradient_test(
|
||||
configs.remove_cpu(configs.conv_3d_configs_long), ConvTranspose3dBenchmark
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
157
benchmarks/transformer/config_utils.py
Normal file
157
benchmarks/transformer/config_utils.py
Normal file
@ -0,0 +1,157 @@
|
||||
"""Configuration utilities for parsing JSON and YAML config files."""
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
def heads_input_type(s: str) -> tuple[int, int]:
|
||||
"""Convert string format 'Hq,Hkv' to tuple (Hq, Hkv)."""
|
||||
try:
|
||||
hq, hkv = map(int, s.split(","))
|
||||
return hq, hkv
|
||||
except Exception as e:
|
||||
raise ValueError("Heads must be Hq,Hkv") from e
|
||||
|
||||
|
||||
default_config = {
|
||||
"dynamic": False,
|
||||
"calculate_bwd": False,
|
||||
"dtype": "bfloat16",
|
||||
"b": [2, 8, 16],
|
||||
"nh": ["16,16", "16,2"],
|
||||
"s": [512, 1024, 4096],
|
||||
"d": [64, 128],
|
||||
"mods": ["noop", "causal", "alibi", "sliding_window"],
|
||||
"backend": ["efficient"],
|
||||
"max_autotune": False,
|
||||
"decoding": False,
|
||||
"kv_size": None,
|
||||
"throughput": True,
|
||||
"save_path": None,
|
||||
"output_json_for_dashboard": None,
|
||||
"benchmark_name": "PyTorch operator microbenchmark",
|
||||
}
|
||||
|
||||
|
||||
def load_config_file(config_path: str) -> dict:
|
||||
"""Load configuration from JSON or YAML file.
|
||||
|
||||
Automatically converts 'nh' field from strings to tuples.
|
||||
|
||||
Args:
|
||||
config_path: Path to the configuration file
|
||||
|
||||
Returns:
|
||||
Dictionary containing the configuration
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If config file doesn't exist
|
||||
ValueError: If config file format is invalid
|
||||
"""
|
||||
with open(config_path) as f:
|
||||
config_str = f.read()
|
||||
|
||||
# Try to load as JSON first
|
||||
try:
|
||||
config = json.loads(config_str)
|
||||
except json.JSONDecodeError:
|
||||
# Fall back to YAML parsing
|
||||
config = _parse_simple_yaml(config_str)
|
||||
|
||||
# Apply automatic conversions for 'nh' field
|
||||
if "nh" in config and isinstance(config["nh"], list):
|
||||
config["nh"] = [
|
||||
heads_input_type(h) if isinstance(h, str) else h for h in config["nh"]
|
||||
]
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _parse_simple_yaml(yaml_str: str) -> dict:
|
||||
"""Simple YAML parser for basic configs (without external dependencies).
|
||||
|
||||
Supports:
|
||||
- key: value pairs
|
||||
- booleans (true/false)
|
||||
- null values
|
||||
- integers and floats
|
||||
- strings (quoted and unquoted)
|
||||
- lists in JSON format [item1, item2, ...]
|
||||
- comments (lines starting with # or after #)
|
||||
|
||||
Args:
|
||||
yaml_str: YAML content as string
|
||||
|
||||
Returns:
|
||||
Dictionary containing parsed YAML content
|
||||
"""
|
||||
config = {}
|
||||
|
||||
for line in yaml_str.split("\n"):
|
||||
# Remove comments
|
||||
line = line.split("#")[0].strip()
|
||||
|
||||
if not line or ":" not in line:
|
||||
continue
|
||||
|
||||
key, value = line.split(":", 1)
|
||||
key = key.strip()
|
||||
value = value.strip()
|
||||
|
||||
# Parse value based on type
|
||||
if value.lower() == "true":
|
||||
config[key] = True
|
||||
elif value.lower() == "false":
|
||||
config[key] = False
|
||||
elif value.lower() in ("null", "none", ""):
|
||||
config[key] = None
|
||||
elif value.startswith("[") and value.endswith("]"):
|
||||
# Parse list - handle quoted strings properly
|
||||
pattern = r'"([^"]+)"|\'([^\']+)\'|([^,\[\]\s]+)'
|
||||
matches = re.findall(pattern, value[1:-1]) # Remove [ ]
|
||||
parsed_items = []
|
||||
for match in matches:
|
||||
# match is a tuple of (double_quoted, single_quoted, unquoted)
|
||||
item = match[0] or match[1] or match[2]
|
||||
item = item.strip()
|
||||
if item:
|
||||
try:
|
||||
parsed_items.append(int(item))
|
||||
except ValueError:
|
||||
parsed_items.append(item)
|
||||
config[key] = parsed_items
|
||||
elif value.startswith(('"', "'")):
|
||||
config[key] = value.strip("\"'")
|
||||
else:
|
||||
# Try to parse as number
|
||||
try:
|
||||
config[key] = int(value)
|
||||
except ValueError:
|
||||
try:
|
||||
config[key] = float(value)
|
||||
except ValueError:
|
||||
config[key] = value
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def print_default_config(output_format: str) -> None:
|
||||
"""Print a default configuration template in JSON or YAML format.
|
||||
|
||||
Args:
|
||||
output_format: Either "json" or "yaml"
|
||||
"""
|
||||
if output_format == "json":
|
||||
print(json.dumps(default_config, indent=2))
|
||||
else: # yaml
|
||||
for key, value in default_config.items():
|
||||
if value is None:
|
||||
print(f"{key}: null")
|
||||
elif isinstance(value, bool):
|
||||
print(f"{key}: {str(value).lower()}")
|
||||
elif isinstance(value, str):
|
||||
print(f'{key}: "{value}"')
|
||||
elif isinstance(value, list):
|
||||
print(f"{key}: {json.dumps(value)}")
|
||||
else:
|
||||
print(f"{key}: {value}")
|
||||
29
benchmarks/transformer/configs/config_basic.yaml
Normal file
29
benchmarks/transformer/configs/config_basic.yaml
Normal file
@ -0,0 +1,29 @@
|
||||
# Basic benchmark configuration for PyTorch transformer benchmarks
|
||||
# Usage: python score_mod.py --config config_basic.yaml
|
||||
|
||||
# Core parameters
|
||||
dynamic: false
|
||||
calculate_bwd: true
|
||||
dtype: "bfloat16"
|
||||
|
||||
# Shape parameters - larger sweep
|
||||
b: [1, 2, 4, 8, 16] # batch sizes
|
||||
nh: ["16,16", "16,2", "32,32", "32,4"] # [query_heads,key_value_heads]
|
||||
s: [512, 1024, 2048, 4096, 8192] # sequence lengths
|
||||
d: [64, 128] # head dimensions (limited to 128 for Flash Attention/cuDNN compatibility)
|
||||
|
||||
# All attention types
|
||||
mods: ["noop", "causal", "rel", "head_bias", "alibi", "sliding_window", "prefix_lm", "softcap"]
|
||||
|
||||
# Multiple backends for comparison (SDPA + Flash Attention) - flex is always included internally
|
||||
backend: ["efficient", "math", "cudnn", "fav2"]
|
||||
max_autotune: true # Enable torch.compile with max-autotune for optimal performance
|
||||
|
||||
# Decoding and cache settings
|
||||
decoding: false
|
||||
kv_size: null
|
||||
|
||||
# Metrics and output
|
||||
throughput: true # Calculate memory bandwidth & TFLOPS
|
||||
save_path: "comprehensive_results.csv" # Save to CSV
|
||||
output_json_for_dashboard: "attn_bench_basic.json"
|
||||
@ -1,15 +1,19 @@
|
||||
import argparse
|
||||
import csv
|
||||
import gc
|
||||
import itertools
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import asdict, dataclass
|
||||
from functools import partial
|
||||
from typing import Optional, Union
|
||||
from functools import partial, wraps
|
||||
from typing import Literal, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
from config_utils import heads_input_type, load_config_file, print_default_config
|
||||
from tabulate import tabulate
|
||||
from tqdm import tqdm
|
||||
|
||||
@ -33,6 +37,96 @@ torch._dynamo.config.recompile_limit = 1000
|
||||
from torch._inductor.runtime.benchmarking import benchmarker
|
||||
|
||||
|
||||
def cleanup_memory():
|
||||
"""Aggressively free GPU memory"""
|
||||
torch.cuda.empty_cache()
|
||||
gc.collect()
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.synchronize()
|
||||
|
||||
|
||||
def safe_backend(backend_name=None, return_dict=False):
|
||||
"""Decorator that wraps backend functions with error handling
|
||||
|
||||
Args:
|
||||
backend_name: Name of the backend for error messages
|
||||
return_dict: If True, returns dict of results for all backends (for run_single_experiment)
|
||||
If False, returns single ExperimentResults (for individual backend functions)
|
||||
"""
|
||||
|
||||
def decorator(func):
|
||||
@wraps(func)
|
||||
def wrapper(config, *args, **kwargs):
|
||||
try:
|
||||
return func(config, *args, **kwargs)
|
||||
except torch.OutOfMemoryError:
|
||||
print(
|
||||
f"[SKIP] OOM for {backend_name or func.__name__} with shape {config.shape}"
|
||||
)
|
||||
cleanup_memory()
|
||||
except RuntimeError as e:
|
||||
error_msg = str(e)
|
||||
if "out of resource" in error_msg or "OutOfMemoryError" in error_msg:
|
||||
print(
|
||||
f"[SKIP] Triton OOM for {backend_name or func.__name__} with shape {config.shape}"
|
||||
)
|
||||
cleanup_memory()
|
||||
elif "No valid triton configs" in error_msg:
|
||||
print(
|
||||
f"[SKIP] No valid Triton config for {backend_name or func.__name__} with shape {config.shape}"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f"[SKIP] Runtime error for {backend_name or func.__name__} with shape {config.shape}: {str(e)[:100]}"
|
||||
)
|
||||
except Exception as e:
|
||||
print(
|
||||
f"[SKIP] Error for {backend_name or func.__name__} with shape {config.shape}: {str(e)[:100]}"
|
||||
)
|
||||
|
||||
# Return appropriate NaN result based on function type
|
||||
if return_dict:
|
||||
# For run_single_experiment: return dict with NaN for all backends
|
||||
nan_result = ExperimentResults(
|
||||
fwd_time=float("nan"),
|
||||
bwd_time=float("nan") if config.calculate_bwd_time else None,
|
||||
)
|
||||
results = dict.fromkeys(config.backends, nan_result)
|
||||
results["flex"] = ExperimentResults(
|
||||
fwd_time=float("nan"),
|
||||
bwd_time=float("nan") if config.calculate_bwd_time else None,
|
||||
sparsity=None,
|
||||
)
|
||||
return results
|
||||
else:
|
||||
# For individual backend functions: return single ExperimentResults
|
||||
return ExperimentResults(
|
||||
fwd_time=float("nan"),
|
||||
bwd_time=float("nan") if config.calculate_bwd_time else None,
|
||||
)
|
||||
|
||||
return wrapper
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
# Type definitions
|
||||
Backend = Literal["math", "efficient", "cudnn", "fav2", "fav3", "fakv", "og-eager"]
|
||||
AttentionType = Literal[
|
||||
"noop",
|
||||
"causal",
|
||||
"rel",
|
||||
"head_bias",
|
||||
"alibi",
|
||||
"sliding_window",
|
||||
"document_mask",
|
||||
"prefix_lm",
|
||||
"softcap",
|
||||
]
|
||||
DtypeString = Literal["bfloat16", "float16", "float32"]
|
||||
SpeedupType = Literal["fwd", "bwd"]
|
||||
|
||||
|
||||
def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float:
|
||||
# warmup
|
||||
for _ in range(5):
|
||||
@ -48,6 +142,7 @@ class ExperimentConfig:
|
||||
calculate_bwd_time: bool
|
||||
cal_bandwidth: bool
|
||||
backends: list[str]
|
||||
max_autotune: bool
|
||||
|
||||
def __post_init__(self):
|
||||
assert len(self.shape) == 6, (
|
||||
@ -62,6 +157,7 @@ class ExperimentConfig:
|
||||
d.pop("cal_bandwidth", None)
|
||||
d["shape(B,Hq,M,Hkv,N,D)"] = d.pop("shape")
|
||||
d.pop("backends", None)
|
||||
d.pop("max_autotune", False)
|
||||
return d
|
||||
|
||||
|
||||
@ -209,6 +305,7 @@ def query_key_value_clones(
|
||||
return query_ref, key_ref, value_ref
|
||||
|
||||
|
||||
@safe_backend("SDPA")
|
||||
def run_single_backend_sdpa(
|
||||
config: ExperimentConfig,
|
||||
query: torch.Tensor,
|
||||
@ -223,6 +320,7 @@ def run_single_backend_sdpa(
|
||||
backend_context = get_backend_context(backend)
|
||||
with backend_context:
|
||||
_device = torch.device("cuda")
|
||||
|
||||
eager_sdpa = generate_eager_sdpa(
|
||||
config.attn_type, config.shape, config.dtype, block_mask, score_mod
|
||||
)
|
||||
@ -290,6 +388,7 @@ def run_single_backend_sdpa(
|
||||
)
|
||||
|
||||
|
||||
@safe_backend("FlashAttention")
|
||||
def run_single_backend_FA(
|
||||
config: ExperimentConfig,
|
||||
query: torch.Tensor,
|
||||
@ -301,9 +400,9 @@ def run_single_backend_FA(
|
||||
mask_kwargs,
|
||||
backend: str,
|
||||
) -> ExperimentResults:
|
||||
assert backend in ["fav2", "fav3", "fakv"]
|
||||
assert backend in ["fav3", "fakv"]
|
||||
# Generate callable for specific backend.
|
||||
if backend in ["fav2", "fav3"]:
|
||||
if backend in ["fav3"]:
|
||||
FA = generate_FA_callable(
|
||||
config.attn_type, config.shape, config.dtype, backend, **mask_kwargs
|
||||
)
|
||||
@ -354,10 +453,10 @@ def run_single_backend_FA(
|
||||
)
|
||||
|
||||
|
||||
@safe_backend("flex_attention", return_dict=True)
|
||||
def run_single_experiment(
|
||||
config: ExperimentConfig,
|
||||
dynamic=False,
|
||||
max_autotune=False,
|
||||
) -> dict[str, ExperimentResults]:
|
||||
device = torch.device("cuda")
|
||||
batch_size, q_heads, q_seq_len, kv_heads, kv_seq_len, head_dim = config.shape
|
||||
@ -377,7 +476,7 @@ def run_single_experiment(
|
||||
block_mask, mask_kwargs = generate_block_mask(config.attn_type, config.shape)
|
||||
kernel_options = get_kernel_options(config.attn_type, config.shape)
|
||||
|
||||
if max_autotune:
|
||||
if config.max_autotune:
|
||||
compiled_sdpa = torch.compile(
|
||||
flex_attention, dynamic=dynamic, mode="max-autotune-no-cudagraphs"
|
||||
)
|
||||
@ -407,7 +506,7 @@ def run_single_experiment(
|
||||
|
||||
results = {}
|
||||
for backend in config.backends:
|
||||
if backend in ["fav2", "fav3", "fakv"]:
|
||||
if backend in ["fav3", "fakv"]:
|
||||
results[backend] = run_single_backend_FA(
|
||||
config,
|
||||
query,
|
||||
@ -419,7 +518,7 @@ def run_single_experiment(
|
||||
mask_kwargs,
|
||||
backend,
|
||||
)
|
||||
else: # sdpa
|
||||
else: # sdpa (also supports fav2)
|
||||
results[backend] = run_single_backend_sdpa(
|
||||
config,
|
||||
query,
|
||||
@ -440,7 +539,7 @@ def run_single_experiment(
|
||||
sparsity = block_mask.sparsity() / 100.0 if block_mask is not None else 0.0
|
||||
sparsity = sparsity if config.attn_type != "document_mask" else 0.5
|
||||
|
||||
results["compiled"] = ExperimentResults(
|
||||
results["flex"] = ExperimentResults(
|
||||
fwd_time=forward_compiled_time,
|
||||
bwd_time=backward_compile_time if config.calculate_bwd_time else None,
|
||||
sparsity=sparsity,
|
||||
@ -501,15 +600,15 @@ def calculate_tflops(config: ExperimentConfig, results: ExperimentResults) -> fl
|
||||
softmax_flops = M * N * 2 # Not counting online softmax overhead
|
||||
o_flops = M * D * N * 2
|
||||
# Not counting split k overhead
|
||||
total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - results.sparsity)
|
||||
sparsity = results.sparsity if results.sparsity is not None else 0.0
|
||||
total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - sparsity)
|
||||
return total_flops / results.fwd_time / 1e6 # in TFLOPs/
|
||||
|
||||
|
||||
def get_average_speedups(results: list[Experiment], type: str, backend: str):
|
||||
# Calculate speedups
|
||||
speedups = [
|
||||
calculate_speedup(r.results["compiled"], r.results[backend], type)
|
||||
for r in results
|
||||
calculate_speedup(r.results["flex"], r.results[backend], type) for r in results
|
||||
]
|
||||
|
||||
# Find indices of max and min speedups
|
||||
@ -537,7 +636,7 @@ def get_average_speedups(results: list[Experiment], type: str, backend: str):
|
||||
def print_results(results: list[Experiment], save_path: Optional[str] = None):
|
||||
table_data = defaultdict(list)
|
||||
for experiment in results:
|
||||
backends = experiment.config.backends + ["compiled"]
|
||||
backends = experiment.config.backends + ["flex"]
|
||||
for key, value in experiment.asdict().items():
|
||||
if key in backends:
|
||||
if value.fwd_time:
|
||||
@ -550,45 +649,43 @@ def print_results(results: list[Experiment], save_path: Optional[str] = None):
|
||||
# Calculate speedups
|
||||
for backend in results[0].config.backends:
|
||||
fwd_speedups = [
|
||||
calculate_speedup(r.results["compiled"], r.results[backend], type="fwd")
|
||||
calculate_speedup(r.results["flex"], r.results[backend], type="fwd")
|
||||
for r in results
|
||||
]
|
||||
table_data[f"fwd_{backend}_speedup"] = fwd_speedups
|
||||
table_data[f"fwd_speedup_flex_over_{backend}"] = fwd_speedups
|
||||
|
||||
if results[0].config.calculate_bwd_time:
|
||||
for backend in results[0].config.backends:
|
||||
bwd_speedups = [
|
||||
calculate_speedup(r.results["compiled"], r.results[backend], type="bwd")
|
||||
calculate_speedup(r.results["flex"], r.results[backend], type="bwd")
|
||||
for r in results
|
||||
]
|
||||
table_data[f"bwd_{backend}_speedup"] = bwd_speedups
|
||||
table_data[f"bwd_speedup_flex_over_{backend}"] = bwd_speedups
|
||||
|
||||
# Calculate mem + computational throughput
|
||||
if results[0].config.cal_bandwidth:
|
||||
fwd_bandwidth = [
|
||||
calculate_bandwidth(r.config, r.results["compiled"], type="fwd")
|
||||
calculate_bandwidth(r.config, r.results["flex"], type="fwd")
|
||||
for r in results
|
||||
]
|
||||
table_data["fwd_mem_bw (TB/s)"] = fwd_bandwidth
|
||||
fwd_tflops = [
|
||||
calculate_tflops(r.config, r.results["compiled"]) for r in results
|
||||
]
|
||||
fwd_tflops = [calculate_tflops(r.config, r.results["flex"]) for r in results]
|
||||
table_data["TFlops/s"] = fwd_tflops
|
||||
|
||||
print(tabulate(table_data, headers="keys", tablefmt="github", floatfmt=".3f"))
|
||||
|
||||
for backend in results[0].config.backends:
|
||||
if np.isnan(table_data[f"fwd_{backend}_speedup"]).all():
|
||||
if np.isnan(table_data[f"fwd_speedup_flex_over_{backend}"]).all():
|
||||
continue
|
||||
print("\n")
|
||||
print(f"FWD Speedups vs. {backend}".center(125, "="))
|
||||
print(f"FWD Speedup of Flex over {backend}".center(125, "="))
|
||||
print("\n")
|
||||
average_data = get_average_speedups(results, type="fwd", backend=backend)
|
||||
print(tabulate(average_data, headers="keys", tablefmt="github", floatfmt=".3f"))
|
||||
|
||||
if results[0].config.calculate_bwd_time:
|
||||
print("\n")
|
||||
print(f"BWD Speedups vs. {backend}".center(125, "="))
|
||||
print(f"BWD Speedup of Flex over {backend}".center(125, "="))
|
||||
print("\n")
|
||||
average_data = get_average_speedups(results, type="bwd", backend=backend)
|
||||
print(
|
||||
@ -791,14 +888,14 @@ def get_backend_context(backend: str):
|
||||
Returns a context manager for the specified backend.
|
||||
Args:
|
||||
backend (str): The name of the backend to use.
|
||||
Valid options are 'fav2', 'cudnn', 'math', 'efficient', 'fav3', 'fakv', 'og-eager'.
|
||||
Valid options are 'math', 'efficient', 'cudnn', 'fav2', 'fav3', 'fakv', 'og-eager'.
|
||||
Returns:
|
||||
A context manager for the specified backend.
|
||||
Raises:
|
||||
ValueError: If an invalid backend is specified.
|
||||
"""
|
||||
backends = {
|
||||
"fav2": nullcontext(),
|
||||
"fav2": sdpa_kernel(SDPBackend.FLASH_ATTENTION),
|
||||
"cudnn": sdpa_kernel(SDPBackend.CUDNN_ATTENTION),
|
||||
"math": sdpa_kernel(SDPBackend.MATH),
|
||||
"efficient": sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION),
|
||||
@ -820,15 +917,7 @@ def generate_FA_callable(
|
||||
) -> Callable | None:
|
||||
if dtype not in [torch.float16, torch.bfloat16]:
|
||||
return None
|
||||
if backend == "fav2":
|
||||
try:
|
||||
from flash_attn import flash_attn_func, flash_attn_varlen_func
|
||||
except ImportError:
|
||||
print(
|
||||
"Flash attention 2 is not installed. Please install it to run fav2 backend. "
|
||||
)
|
||||
raise
|
||||
elif backend == "fav3":
|
||||
if backend == "fav3":
|
||||
try:
|
||||
from flash_attn.flash_attn_interface import (
|
||||
flash_attn_func,
|
||||
@ -1034,6 +1123,7 @@ def generate_experiment_configs(
|
||||
kv_cache_size: list[int],
|
||||
cal_bandwidth: bool,
|
||||
backends: list[str],
|
||||
max_autotune: bool,
|
||||
) -> list[ExperimentConfig]:
|
||||
assert not (calculate_bwd and decoding), "Decoding does not support backward"
|
||||
|
||||
@ -1077,52 +1167,333 @@ def generate_experiment_configs(
|
||||
calculate_bwd_time=calculate_bwd,
|
||||
cal_bandwidth=cal_bandwidth,
|
||||
backends=backends,
|
||||
max_autotune=max_autotune,
|
||||
)
|
||||
)
|
||||
|
||||
return all_configs
|
||||
|
||||
|
||||
def main(args):
|
||||
def _output_json_for_dashboard(
|
||||
experiments,
|
||||
output_file,
|
||||
benchmark_name="PyTorch operator microbenchmark",
|
||||
):
|
||||
"""
|
||||
Write the result into JSON format for PyTorch OSS dashboard.
|
||||
The JSON format is defined at
|
||||
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
|
||||
|
||||
Args:
|
||||
experiments: List of experiment results
|
||||
output_file: Path to output JSON file
|
||||
benchmark_name: Name of the benchmark
|
||||
"""
|
||||
if not experiments:
|
||||
return
|
||||
|
||||
import math
|
||||
import platform
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import Any, Optional
|
||||
|
||||
# Prepare headers and records for JSON output
|
||||
records = []
|
||||
for experiment in experiments:
|
||||
config = experiment.config
|
||||
results_dict = (
|
||||
experiment.results
|
||||
) # This is a dict: backend -> ExperimentResults
|
||||
|
||||
# Process each backend result
|
||||
for backend, results in results_dict.items():
|
||||
# Skip backends that were not run (NaN results)
|
||||
if math.isnan(results.fwd_time):
|
||||
continue
|
||||
|
||||
# Extract data from experiment
|
||||
test_name = f"{backend}_{config.attn_type}_"
|
||||
input_config = f"shape: {config.shape}, dtype: {config.dtype}"
|
||||
|
||||
# Determine mode based on backward pass
|
||||
mode = "training" if config.calculate_bwd_time else "inference"
|
||||
|
||||
# Extract dtype
|
||||
dtype = (
|
||||
str(config.dtype).split(".")[1]
|
||||
if "." in str(config.dtype)
|
||||
else str(config.dtype)
|
||||
)
|
||||
|
||||
# Determine device
|
||||
device = "cuda"
|
||||
|
||||
# Get device architecture
|
||||
device_arch = (
|
||||
torch.cuda.get_device_name(0)
|
||||
if device == "cuda"
|
||||
else platform.processor()
|
||||
if device == "cpu"
|
||||
else "unknown"
|
||||
)
|
||||
|
||||
# Create dataclasses for JSON structure
|
||||
@dataclass
|
||||
class BenchmarkInfo:
|
||||
name: str
|
||||
mode: Optional[str]
|
||||
dtype: str
|
||||
extra_info: dict[str, Any]
|
||||
|
||||
@dataclass
|
||||
class ModelInfo:
|
||||
name: str
|
||||
type: str
|
||||
origins: list[str]
|
||||
extra_info: dict[str, Any]
|
||||
|
||||
@dataclass
|
||||
class MetricInfo:
|
||||
name: str
|
||||
unit: str
|
||||
benchmark_values: list[float]
|
||||
target_value: Optional[float]
|
||||
|
||||
@dataclass
|
||||
class BenchmarkRecord:
|
||||
benchmark: BenchmarkInfo
|
||||
model: ModelInfo
|
||||
metric: MetricInfo
|
||||
|
||||
# Benchmark extra info
|
||||
benchmark_extra_info = {
|
||||
"input_config": input_config,
|
||||
"device": device,
|
||||
"arch": device_arch,
|
||||
"operator_name": backend,
|
||||
"attn_type": config.attn_type,
|
||||
"shape": str(config.shape),
|
||||
"max_autotune": config.max_autotune,
|
||||
}
|
||||
# Add record for forward latency
|
||||
record_fwd_latency = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info=benchmark_extra_info,
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name + str(config.shape),
|
||||
type="attention-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={
|
||||
"operator_name": backend,
|
||||
"attn_type": config.attn_type,
|
||||
},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="forward latency",
|
||||
unit="us",
|
||||
benchmark_values=[results.fwd_time],
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_fwd_latency))
|
||||
|
||||
# Add record for forward memory bandwidth (if available)
|
||||
if config.cal_bandwidth:
|
||||
record_fwd_bandwidth = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info=benchmark_extra_info,
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name + str(config.shape),
|
||||
type="attention-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={
|
||||
"operator_name": backend,
|
||||
},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="memory bandwidth",
|
||||
unit="TB/s",
|
||||
benchmark_values=[calculate_bandwidth(config, results, "fwd")],
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_fwd_bandwidth))
|
||||
|
||||
# Add record for forward TFLOPS (if available)
|
||||
if config.cal_bandwidth:
|
||||
record_fwd_tflops = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info=benchmark_extra_info,
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name + str(config.shape),
|
||||
type="attention-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={
|
||||
"operator_name": backend,
|
||||
},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="tflops",
|
||||
unit="TFLOPS/s",
|
||||
benchmark_values=[calculate_tflops(config, results)],
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_fwd_tflops))
|
||||
|
||||
# Add record for backward latency (if available and not NaN)
|
||||
if (
|
||||
config.calculate_bwd_time
|
||||
and results.bwd_time is not None
|
||||
and not math.isnan(results.bwd_time)
|
||||
):
|
||||
record_bwd_latency = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info=benchmark_extra_info,
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name + str(config.shape),
|
||||
type="attention-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={
|
||||
"operator_name": backend,
|
||||
},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="backward latency",
|
||||
unit="us",
|
||||
benchmark_values=[results.bwd_time],
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_bwd_latency))
|
||||
|
||||
# Write all records to the output file
|
||||
with open(output_file, "w", encoding="utf-8") as f:
|
||||
json.dump(records, f, indent=2)
|
||||
|
||||
|
||||
def main(
|
||||
dynamic: bool = False,
|
||||
calculate_bwd: bool = False,
|
||||
dtype: DtypeString = "bfloat16",
|
||||
b: list[int] | None = None,
|
||||
nh: list[str] | None = None,
|
||||
s: list[int] | None = None,
|
||||
d: list[int] | None = None,
|
||||
mods: list[AttentionType] | None = None,
|
||||
backend: list[Backend] | None = None,
|
||||
max_autotune: bool = False,
|
||||
decoding: bool = False,
|
||||
kv_size: Optional[list[int]] = None,
|
||||
throughput: bool = True,
|
||||
save_path: Optional[str] = None,
|
||||
output_json_for_dashboard: Optional[str] = None,
|
||||
benchmark_name: str = "PyTorch operator microbenchmark",
|
||||
) -> None:
|
||||
"""Run sweep over sizes and score mods for flex attention.
|
||||
|
||||
Usage Examples:
|
||||
# Use a yml config file
|
||||
python score_mod.py --config basic_config.yaml
|
||||
|
||||
# Use a json config file
|
||||
python score_mod.py --config my_config.json
|
||||
|
||||
# Generate a config template
|
||||
python score_mod.py --print-config json > my_config.json # For a json config
|
||||
python score_mod.py --print-config yaml > my_config.yaml # For a yaml config
|
||||
|
||||
# Override config with CLI args
|
||||
python score_mod.py --config my_config.json -dtype float16 --max-autotune
|
||||
|
||||
# Pure CLI usage
|
||||
python score_mod.py -b 4 8 -s 1024 2048 -mods causal alibi --backend efficient
|
||||
|
||||
Args:
|
||||
dynamic: Runs a dynamic shapes version of compiled flex attention
|
||||
calculate_bwd: Calculate backward pass times
|
||||
dtype: Data type for tensors (bfloat16, float16, float32)
|
||||
b: Batch sizes to benchmark
|
||||
nh: Number of query and key/value heads in format "Hq,Hkv"
|
||||
s: Sequence lengths to benchmark
|
||||
d: Head dimensions to benchmark
|
||||
mods: Score modifications: noop, causal, rel, head_bias, alibi, sliding_window, document_mask, prefix_lm, softcap
|
||||
backend: Backends for attention computation: math, efficient, cudnn, fav2, fav3, fakv, og-eager
|
||||
max_autotune: Turn on max-autotune optimization
|
||||
decoding: Benchmark decoding mode (query sequence length = 1)
|
||||
kv_size: Key/value cache size in MiB (ignores batch size if specified)
|
||||
throughput: Calculate kernel memory bandwidth & computational throughput (always True)
|
||||
save_path: Path to save the results CSV file
|
||||
output_json_for_dashboard: Path to save results in JSON format for PyTorch OSS dashboard
|
||||
benchmark_name: Name of the benchmark for dashboard output
|
||||
"""
|
||||
# Convert dtype string to torch dtype (if not already converted)
|
||||
import torch
|
||||
|
||||
if isinstance(dtype, str):
|
||||
dtype = getattr(torch, dtype)
|
||||
|
||||
# Always calculate throughput
|
||||
throughput = True
|
||||
print("Backend: ", backend)
|
||||
seed = 123
|
||||
np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
results = []
|
||||
for config in tqdm(
|
||||
generate_experiment_configs(
|
||||
args.calculate_bwd,
|
||||
args.dtype,
|
||||
args.b,
|
||||
args.nh,
|
||||
args.s,
|
||||
args.d,
|
||||
args.mods,
|
||||
args.decoding,
|
||||
args.kv_size,
|
||||
args.throughput,
|
||||
args.backend,
|
||||
)
|
||||
for experiment_count, config in enumerate(
|
||||
tqdm(
|
||||
generate_experiment_configs(
|
||||
calculate_bwd,
|
||||
dtype,
|
||||
b,
|
||||
nh,
|
||||
s,
|
||||
d,
|
||||
mods,
|
||||
decoding,
|
||||
kv_size,
|
||||
throughput,
|
||||
backend,
|
||||
max_autotune,
|
||||
)
|
||||
),
|
||||
start=1,
|
||||
):
|
||||
results.append(
|
||||
Experiment(
|
||||
config,
|
||||
run_single_experiment(
|
||||
config,
|
||||
dynamic=args.dynamic,
|
||||
max_autotune=args.max_autotune,
|
||||
dynamic=dynamic,
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
print_results(results, args.save_path)
|
||||
# Periodic memory cleanup every 50 experiments
|
||||
if experiment_count % 50 == 0:
|
||||
cleanup_memory()
|
||||
|
||||
print_results(results, save_path)
|
||||
|
||||
def heads_input_type(s):
|
||||
try:
|
||||
hq, hkv = map(int, s.split(","))
|
||||
return hq, hkv
|
||||
except Exception as e:
|
||||
raise argparse.ArgumentTypeError("Heads must be Hq,Hkv") from e
|
||||
# Output JSON for dashboard if requested
|
||||
if output_json_for_dashboard:
|
||||
_output_json_for_dashboard(results, output_json_for_dashboard, benchmark_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -1130,6 +1501,12 @@ if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run sweep over sizes and score mods for flex attention"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
help="Path to JSON config file. CLI args override config file values.",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dynamic",
|
||||
action="store_true",
|
||||
@ -1199,8 +1576,49 @@ Ignores -b batch size and calculate batch size from kv size instead when specifi
|
||||
default=["efficient"],
|
||||
help="Backend to use for attention computation",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-json-for-dashboard",
|
||||
type=str,
|
||||
help="Path to save results in JSON format for PyTorch OSS dashboard",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--benchmark-name",
|
||||
type=str,
|
||||
help="Name of the benchmark for dashboard output",
|
||||
default="PyTorch operator microbenchmark",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--print-config",
|
||||
type=str,
|
||||
choices=["json", "yaml"],
|
||||
help="Print a default config template in JSON or YAML format and exit",
|
||||
default=None,
|
||||
)
|
||||
# Parse arguments
|
||||
args = parser.parse_args()
|
||||
args.dtype = getattr(torch, args.dtype)
|
||||
|
||||
main(args)
|
||||
# Handle --print-config
|
||||
if args.print_config:
|
||||
print_default_config(args.print_config)
|
||||
sys.exit(0)
|
||||
|
||||
# Load and merge config if provided
|
||||
if args.config:
|
||||
config = load_config_file(args.config)
|
||||
|
||||
# Merge config with CLI args (CLI args take precedence)
|
||||
json_args = argparse.Namespace()
|
||||
json_args.__dict__ = config
|
||||
args = parser.parse_args(namespace=json_args)
|
||||
|
||||
# Convert dtype string to torch dtype (only if it's still a string)
|
||||
if isinstance(args.dtype, str):
|
||||
args.dtype = getattr(torch, args.dtype)
|
||||
|
||||
# Remove config and print_config from args before passing to main
|
||||
args_dict = vars(args)
|
||||
args_dict.pop("config", None)
|
||||
args_dict.pop("print_config", None)
|
||||
|
||||
main(**args_dict)
|
||||
|
||||
@ -916,6 +916,7 @@ libtorch_python_core_sources = [
|
||||
"torch/csrc/autograd/python_torch_functions_manual.cpp",
|
||||
"torch/csrc/autograd/python_variable.cpp",
|
||||
"torch/csrc/autograd/python_variable_indexing.cpp",
|
||||
"torch/csrc/distributed/python_placement.cpp",
|
||||
"torch/csrc/dynamo/python_compiled_autograd.cpp",
|
||||
"torch/csrc/dynamo/cache_entry.cpp",
|
||||
"torch/csrc/dynamo/cpp_shim.cpp",
|
||||
@ -1073,6 +1074,7 @@ aten_cpu_non_globed_sources = [
|
||||
"aten/src/ATen/detail/MPSHooksInterface.cpp",
|
||||
"aten/src/ATen/detail/MAIAHooksInterface.cpp",
|
||||
"aten/src/ATen/detail/PrivateUse1HooksInterface.cpp",
|
||||
"aten/src/ATen/detail/XLAHooksInterface.cpp",
|
||||
"aten/src/ATen/detail/XPUHooksInterface.cpp",
|
||||
"aten/src/ATen/detail/MTIAHooksInterface.cpp",
|
||||
"aten/src/ATen/detail/IPUHooksInterface.cpp",
|
||||
@ -1091,6 +1093,7 @@ aten_cpu_non_globed_headers = [
|
||||
"aten/src/ATen/detail/HPUHooksInterface.h",
|
||||
"aten/src/ATen/detail/MAIAHooksInterface.h",
|
||||
"aten/src/ATen/detail/PrivateUse1HooksInterface.h",
|
||||
"aten/src/ATen/detail/XLAHooksInterface.h",
|
||||
"aten/src/ATen/detail/XPUHooksInterface.h",
|
||||
"aten/src/ATen/detail/MTIAHooksInterface.h",
|
||||
"aten/src/ATen/detail/IPUHooksInterface.h",
|
||||
|
||||
@ -556,3 +556,26 @@ inline SymBool sym_ge(const SymInt& a, const SymInt& b) {
|
||||
}
|
||||
|
||||
} // namespace c10
|
||||
|
||||
#include <limits>
|
||||
|
||||
namespace std {
|
||||
|
||||
template <>
|
||||
class numeric_limits<c10::SymInt> {
|
||||
public:
|
||||
static constexpr bool is_specialized = true;
|
||||
|
||||
static constexpr int64_t max() noexcept {
|
||||
return std::numeric_limits<int64_t>::max();
|
||||
}
|
||||
|
||||
static constexpr int64_t min() noexcept {
|
||||
return std::numeric_limits<int64_t>::min();
|
||||
}
|
||||
|
||||
static constexpr bool is_signed = true;
|
||||
static constexpr bool is_integer = true;
|
||||
};
|
||||
|
||||
} // namespace std
|
||||
|
||||
@ -329,17 +329,17 @@ struct pair {
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
static T conj(T a) {
|
||||
inline T conj(T a) {
|
||||
return a;
|
||||
}
|
||||
|
||||
template <>
|
||||
half2 conj(half2 a) {
|
||||
inline half2 conj(half2 a) {
|
||||
return half2(a.x, -a.y);
|
||||
}
|
||||
|
||||
template <>
|
||||
float2 conj(float2 a) {
|
||||
inline float2 conj(float2 a) {
|
||||
return float2(a.x, -a.y);
|
||||
}
|
||||
|
||||
|
||||
@ -123,6 +123,8 @@ class DeviceCachingAllocator {
|
||||
ska::flat_hash_map<xpu::XPUStream, std::deque<std::pair<sycl::event, Block*>>>
|
||||
xpu_events;
|
||||
DeviceIndex device_index;
|
||||
size_t allowed_memory_maximum = 0;
|
||||
bool set_fraction = false;
|
||||
|
||||
size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
|
||||
if (!src || src->allocated || src->event_count > 0 ||
|
||||
@ -245,6 +247,12 @@ class DeviceCachingAllocator {
|
||||
if (isRetry) {
|
||||
stats.num_alloc_retries += 1;
|
||||
}
|
||||
if (set_fraction &&
|
||||
stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current +
|
||||
size >
|
||||
allowed_memory_maximum) {
|
||||
return false;
|
||||
}
|
||||
void* ptr = sycl::aligned_alloc_device(
|
||||
kDeviceAlignment,
|
||||
size,
|
||||
@ -435,6 +443,11 @@ class DeviceCachingAllocator {
|
||||
device_free =
|
||||
raw_device.get_info<sycl::ext::intel::info::device::free_memory>();
|
||||
}
|
||||
std::string allowed_info;
|
||||
if (set_fraction) {
|
||||
allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
|
||||
}
|
||||
|
||||
auto allocated_bytes =
|
||||
stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
|
||||
.current;
|
||||
@ -459,7 +472,9 @@ class DeviceCachingAllocator {
|
||||
format_size(device_total),
|
||||
" of which ",
|
||||
format_size(device_free),
|
||||
" is free. Of the allocated memory ",
|
||||
" is free. ",
|
||||
allowed_info,
|
||||
"Of the allocated memory ",
|
||||
format_size(allocated_bytes),
|
||||
" is allocated by PyTorch, and ",
|
||||
format_size(reserved_bytes - allocated_bytes),
|
||||
@ -538,6 +553,14 @@ class DeviceCachingAllocator {
|
||||
stats.requested_bytes[statType].reset_peak();
|
||||
}
|
||||
}
|
||||
|
||||
void setMemoryFraction(double fraction) {
|
||||
c10::xpu::DeviceProp device_prop;
|
||||
c10::xpu::get_device_properties(&device_prop, device_index);
|
||||
auto device_total = device_prop.global_mem_size;
|
||||
allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
|
||||
set_fraction = true;
|
||||
}
|
||||
};
|
||||
|
||||
static void local_raw_delete(void* ptr);
|
||||
@ -700,6 +723,16 @@ class XPUAllocator : public DeviceAllocator {
|
||||
assertValidDevice(device);
|
||||
device_allocators[device]->resetAccumulatedStats();
|
||||
}
|
||||
|
||||
void setMemoryFraction(double fraction, DeviceIndex device) {
|
||||
assertValidDevice(device);
|
||||
TORCH_CHECK_VALUE(
|
||||
0 < fraction && fraction <= 1,
|
||||
"invalid fraction:",
|
||||
fraction,
|
||||
". Please set within (0, 1].");
|
||||
device_allocators[device]->setMemoryFraction(fraction);
|
||||
}
|
||||
};
|
||||
|
||||
static XPUAllocator allocator;
|
||||
@ -744,6 +777,10 @@ void recordStream(const DataPtr& dataPtr, XPUStream stream) {
|
||||
return allocator.recordStream(dataPtr, stream);
|
||||
}
|
||||
|
||||
void setMemoryFraction(double fraction, DeviceIndex device) {
|
||||
return allocator.setMemoryFraction(fraction, device);
|
||||
}
|
||||
|
||||
REGISTER_ALLOCATOR(kXPU, &allocator)
|
||||
|
||||
} // namespace c10::xpu::XPUCachingAllocator
|
||||
|
||||
@ -25,4 +25,6 @@ C10_XPU_API void raw_delete(void* ptr);
|
||||
|
||||
C10_XPU_API void recordStream(const DataPtr& dataPtr, XPUStream stream);
|
||||
|
||||
C10_XPU_API void setMemoryFraction(double fraction, DeviceIndex device);
|
||||
|
||||
} // namespace c10::xpu::XPUCachingAllocator
|
||||
|
||||
@ -1358,9 +1358,15 @@ if(BUILD_TEST)
|
||||
)
|
||||
else()
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/lazy ${CMAKE_BINARY_DIR}/test_lazy)
|
||||
# NativeRT is disabled
|
||||
# add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
|
||||
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_abi_check ${CMAKE_BINARY_DIR}/test_aoti_abi_check)
|
||||
if(BUILD_AOT_INDUCTOR_TEST)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_inference ${CMAKE_BINARY_DIR}/test_aoti_inference)
|
||||
endif()
|
||||
|
||||
if(USE_DISTRIBUTED)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
|
||||
if(NOT WIN32)
|
||||
@ -1378,16 +1384,6 @@ if(BUILD_TEST)
|
||||
${CMAKE_BINARY_DIR}/test_mobile_nnc
|
||||
)
|
||||
endif()
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/lazy
|
||||
${CMAKE_BINARY_DIR}/test_lazy)
|
||||
endif()
|
||||
if(BUILD_AOT_INDUCTOR_TEST)
|
||||
add_subdirectory(
|
||||
${TORCH_ROOT}/test/cpp/aoti_abi_check
|
||||
${CMAKE_BINARY_DIR}/test_aoti_abi_check)
|
||||
add_subdirectory(
|
||||
${TORCH_ROOT}/test/cpp/aoti_inference
|
||||
${CMAKE_BINARY_DIR}/test_aoti_inference)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
@ -29,10 +29,15 @@ SET(Open_BLAS_LIB_SEARCH_PATHS
|
||||
$ENV{OpenBLAS}/lib
|
||||
$ENV{OpenBLAS_HOME}
|
||||
$ENV{OpenBLAS_HOME}/lib
|
||||
)
|
||||
)
|
||||
|
||||
SET(Open_BLAS_LIB_NAME openblas)
|
||||
IF(DEFINED ENV{OpenBLAS_LIB_NAME})
|
||||
SET(Open_BLAS_LIB_NAME $ENV{OpenBLAS_LIB_NAME})
|
||||
ENDIF()
|
||||
|
||||
FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS})
|
||||
FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS})
|
||||
FIND_LIBRARY(OpenBLAS_LIB NAMES ${Open_BLAS_LIB_NAME} PATHS ${Open_BLAS_LIB_SEARCH_PATHS})
|
||||
|
||||
SET(OpenBLAS_FOUND ON)
|
||||
|
||||
|
||||
@ -383,7 +383,7 @@ function(torch_compile_options libname)
|
||||
-Wno-strict-aliasing
|
||||
)
|
||||
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
|
||||
list(APPEND private_compile_options -Wredundant-move)
|
||||
list(APPEND private_compile_options -Wredundant-move -Wno-interference-size)
|
||||
endif()
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
list(APPEND private_compile_options -Wextra-semi -Wmove)
|
||||
|
||||
@ -14,7 +14,7 @@ Combining, these building blocks form a research and
|
||||
production ready C++ library for tensor computation and dynamic neural
|
||||
networks with strong emphasis on GPU acceleration as well as fast CPU
|
||||
performance. It is currently in use at Facebook in research and
|
||||
production; we are looking forward to welcome more users of the PyTorch C++ API.
|
||||
production; we are looking forward to welcoming more users of the PyTorch C++ API.
|
||||
|
||||
.. warning::
|
||||
|
||||
|
||||
@ -64,7 +64,7 @@ users should pay additional attention to:
|
||||
|
||||
- Both guards affects tensor execution process to skip work not related to inference, but ``InferenceMode``
|
||||
also affects tensor creation while ``AutoNonVariableTypeMode`` doesn't. In other words, tensors created
|
||||
inside ``InferenceMode`` are marked as inference tensors so that certain limitation can be applied after
|
||||
inside ``InferenceMode`` are marked as inference tensors so that certain limitations can be applied after
|
||||
exiting ``InferenceMode``.
|
||||
- Enabled/disabled ``InferenceMode`` states can be nested while ``AutoNonVariableTypeMode`` only allows enabled state.
|
||||
|
||||
|
||||
@ -17,7 +17,7 @@ restoring the RNG state during each checkpoint.
|
||||
The stashing logic saves and restores the RNG state for CPU and another
|
||||
device type (infer the device type from Tensor arguments excluding CPU
|
||||
tensors by `_infer_device_type`) to the `run_fn`. If there are multiple
|
||||
device, device state will only be saved for devices of a single device type,
|
||||
devices, device state will only be saved for devices of a single device type,
|
||||
and the remaining devices will be ignored. Consequently, if any checkpointed
|
||||
functions involve randomness, this may result in incorrect gradients. (Note
|
||||
that if CUDA devices are among the devices detected, it will be prioritized;
|
||||
|
||||
@ -1066,6 +1066,8 @@ coverage_ignore_functions = [
|
||||
"set_current_meta",
|
||||
"set_grad_fn_seq_nr",
|
||||
"set_stack_trace",
|
||||
"set_current_replay_node",
|
||||
"get_current_replay_node",
|
||||
# torch.jit.annotations
|
||||
"ann_to_type",
|
||||
"check_fn",
|
||||
|
||||
@ -59,14 +59,14 @@ MPI supports CUDA only if the implementation used to build PyTorch supports it.
|
||||
|
||||
### Backends that come with PyTorch
|
||||
|
||||
PyTorch distributed package supports Linux (stable), MacOS (stable), and Windows (prototype).
|
||||
PyTorch distributed package supports Linux (stable), macOS (stable), and Windows (prototype).
|
||||
By default for Linux, the Gloo and NCCL backends are built and included in PyTorch
|
||||
distributed (NCCL only when building with CUDA). MPI is an optional backend that can only be
|
||||
included if you build PyTorch from source. (e.g. building PyTorch on a host that has MPI
|
||||
installed.)
|
||||
|
||||
:::{note}
|
||||
As of PyTorch v1.8, Windows supports all collective communications backend but NCCL,
|
||||
As of PyTorch v1.8, Windows supports all collective communications backends but NCCL,
|
||||
If the `init_method` argument of {func}`init_process_group` points to a file it must adhere
|
||||
to the following schema:
|
||||
|
||||
|
||||
@ -99,6 +99,12 @@ DTensor supports the following types of {class}`Placement` on each {class}`Devic
|
||||
:undoc-members:
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: MaskPartial
|
||||
:members:
|
||||
:undoc-members:
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: Placement
|
||||
:members:
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# torch.mtia
|
||||
|
||||
The MTIA backend is implemented out of the tree, only interfaces are be defined here.
|
||||
The MTIA backend is implemented out of the tree, only interfaces are defined here.
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: torch.mtia
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# torch.mtia.memory
|
||||
|
||||
The MTIA backend is implemented out of the tree, only interfaces are be defined here.
|
||||
The MTIA backend is implemented out of the tree, only interfaces are defined here.
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: torch.mtia.memory
|
||||
|
||||
@ -263,12 +263,31 @@ offers a comprehensive example of using these features to manipulate a checkpoin
|
||||
Starting in version 2.6, ``torch.load`` will use ``weights_only=True`` if the ``pickle_module``
|
||||
argument is not passed.
|
||||
|
||||
.. _weights-only-security:
|
||||
|
||||
weights_only security
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
As discussed in the documentation for :func:`torch.load`, ``weights_only=True`` restricts
|
||||
the unpickler used in ``torch.load`` to only executing functions/building classes required for
|
||||
``state_dicts`` of plain ``torch.Tensors`` as well as some other primitive types. Further,
|
||||
unlike the default ``Unpickler`` provided by the ``pickle`` module, the ``weights_only`` Unpickler
|
||||
is not allowed to dynamically import anything during unpickling.
|
||||
|
||||
``weights_only=True`` narrows the surface of remote code execution attacks but has the following limitations:
|
||||
|
||||
1. ``weights_only=True`` does not guard against denial of service attacks.
|
||||
2. We try to prevent memory corruptions during ``torch.load(weights_only=True)`` but they might still be possible.
|
||||
|
||||
Note that even if memory corruption does not occur during ``torch.load`` itself, loading CAN create
|
||||
unexpected objects for the downstream code that can also lead to memory corruption (e.g. a Tensor of
|
||||
indices and values made to a sparse Tensor in user code might write/read out of bounds).
|
||||
|
||||
.. _weights-only-allowlist:
|
||||
|
||||
weights_only allowlist
|
||||
^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
As mentioned above, saving a module's ``state_dict`` is a best practice when using ``torch.save``. If loading an old
|
||||
checkpoint that contains an ``nn.Module``, we recommend ``weights_only=False``. When loading a checkpoint that contains
|
||||
tensor subclasses, there will likely be functions/classes that need to be allowlisted, see below for further details.
|
||||
|
||||
@ -85,6 +85,7 @@
|
||||
memory_stats_as_nested_dict
|
||||
reset_accumulated_memory_stats
|
||||
reset_peak_memory_stats
|
||||
set_per_process_memory_fraction
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
|
||||
@ -238,7 +238,7 @@ def pytest_pycollect_makemodule(module_path, path, parent) -> Module:
|
||||
|
||||
@pytest.hookimpl(hookwrapper=True)
|
||||
def pytest_report_teststatus(report, config):
|
||||
# Add the test time to the verbose output, unforunately I don't think this
|
||||
# Add the test time to the verbose output, unfortunately I don't think this
|
||||
# includes setup or teardown
|
||||
pluggy_result = yield
|
||||
if not isinstance(report, pytest.TestReport):
|
||||
|
||||
@ -1,3 +1,8 @@
|
||||
# Skip on windows
|
||||
if(WIN32)
|
||||
return()
|
||||
endif()
|
||||
|
||||
set(AOTI_ABI_CHECK_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_abi_check)
|
||||
|
||||
# Build the cpp gtest binary containing the cpp-only tests.
|
||||
@ -30,8 +35,15 @@ target_compile_definitions(test_aoti_abi_check PRIVATE USE_GTEST)
|
||||
|
||||
# WARNING: DO NOT LINK torch!!!
|
||||
# The purpose is to check if the used aten/c10 headers are written in a header-only way
|
||||
target_link_libraries(test_aoti_abi_check PRIVATE gtest_main)
|
||||
target_link_libraries(test_aoti_abi_check PRIVATE gtest_main sleef)
|
||||
target_include_directories(test_aoti_abi_check PRIVATE ${ATen_CPU_INCLUDE})
|
||||
if(NOT USE_SYSTEM_SLEEF)
|
||||
target_include_directories(test_aoti_abi_check PRIVATE ${CMAKE_BINARY_DIR}/include)
|
||||
endif()
|
||||
|
||||
# Disable unused-variable warnings for variables that are only used to test compilation
|
||||
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-variable)
|
||||
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-but-set-variable)
|
||||
|
||||
foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
|
||||
foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
|
||||
@ -41,12 +53,17 @@ foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
|
||||
separate_arguments(FLAGS UNIX_COMMAND "${FLAGS}")
|
||||
add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")
|
||||
|
||||
target_link_libraries(${test_name}_${CPU_CAPABILITY} PRIVATE gtest_main)
|
||||
target_link_libraries(${test_name}_${CPU_CAPABILITY} PRIVATE gtest_main sleef)
|
||||
target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE ${ATen_CPU_INCLUDE})
|
||||
if(NOT USE_SYSTEM_SLEEF)
|
||||
target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE ${CMAKE_BINARY_DIR}/include)
|
||||
endif()
|
||||
|
||||
# Define CPU_CAPABILITY and CPU_CAPABILITY_XXX macros for conditional compilation
|
||||
target_compile_definitions(${test_name}_${CPU_CAPABILITY} PRIVATE CPU_CAPABILITY=${CPU_CAPABILITY} CPU_CAPABILITY_${CPU_CAPABILITY})
|
||||
target_compile_options(${test_name}_${CPU_CAPABILITY} PRIVATE ${FLAGS})
|
||||
target_compile_options_if_supported(${test_name}_${CPU_CAPABILITY} -Wno-unused-variable)
|
||||
target_compile_options_if_supported(${test_name}_${CPU_CAPABILITY} -Wno-unused-but-set-variable)
|
||||
endforeach()
|
||||
endforeach()
|
||||
|
||||
|
||||
@ -2,10 +2,27 @@
|
||||
|
||||
#include <ATen/cpu/vec/vec.h>
|
||||
|
||||
#include <iostream>
|
||||
namespace torch {
|
||||
namespace aot_inductor {
|
||||
|
||||
template <typename T>
|
||||
void ExpectVecEqual(
|
||||
const at::vec::Vectorized<T>& expected,
|
||||
const at::vec::Vectorized<T>& actual) {
|
||||
using Vec = at::vec::Vectorized<T>;
|
||||
// Have to use std::vector for comparison because at::vec::Vectorized doesn't
|
||||
// support operator[] on aarch64
|
||||
std::vector<T> expected_data(Vec::size());
|
||||
std::vector<T> actual_data(Vec::size());
|
||||
|
||||
expected.store(expected_data.data());
|
||||
actual.store(actual_data.data());
|
||||
|
||||
for (int i = 0; i < Vec::size(); i++) {
|
||||
EXPECT_EQ(expected_data[i], actual_data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TestVec, TestAdd) {
|
||||
using Vec = at::vec::Vectorized<int>;
|
||||
std::vector<int> a(1024, 1);
|
||||
@ -16,9 +33,7 @@ TEST(TestVec, TestAdd) {
|
||||
std::vector<int> expected(1024, 3);
|
||||
Vec expected_vec = Vec::loadu(expected.data());
|
||||
|
||||
for (int i = 0; i < Vec::size(); i++) {
|
||||
EXPECT_EQ(expected_vec[i], actual_vec[i]);
|
||||
}
|
||||
ExpectVecEqual(expected_vec, actual_vec);
|
||||
}
|
||||
|
||||
TEST(TestVec, TestMax) {
|
||||
@ -30,9 +45,7 @@ TEST(TestVec, TestMax) {
|
||||
Vec actual_vec = at::vec::maximum(a_vec, b_vec);
|
||||
Vec expected_vec = b_vec;
|
||||
|
||||
for (int i = 0; i < Vec::size(); i++) {
|
||||
EXPECT_EQ(expected_vec[i], actual_vec[i]);
|
||||
}
|
||||
ExpectVecEqual(expected_vec, actual_vec);
|
||||
}
|
||||
|
||||
TEST(TestVec, TestMin) {
|
||||
@ -44,9 +57,7 @@ TEST(TestVec, TestMin) {
|
||||
Vec actual_vec = at::vec::minimum(a_vec, b_vec);
|
||||
Vec expected_vec = a_vec;
|
||||
|
||||
for (int i = 0; i < Vec::size(); i++) {
|
||||
EXPECT_EQ(expected_vec[i], actual_vec[i]);
|
||||
}
|
||||
ExpectVecEqual(expected_vec, actual_vec);
|
||||
}
|
||||
|
||||
TEST(TestVec, TestConvert) {
|
||||
@ -58,9 +69,7 @@ TEST(TestVec, TestConvert) {
|
||||
auto actual_vec = at::vec::convert<float>(a_vec);
|
||||
auto expected_vec = b_vec;
|
||||
|
||||
for (int i = 0; i < at::vec::Vectorized<int>::size(); i++) {
|
||||
EXPECT_EQ(expected_vec[i], actual_vec[i]);
|
||||
}
|
||||
ExpectVecEqual(expected_vec, actual_vec);
|
||||
}
|
||||
|
||||
TEST(TestVec, TestClampMin) {
|
||||
@ -72,9 +81,7 @@ TEST(TestVec, TestClampMin) {
|
||||
Vec actual_vec = at::vec::clamp_min(a_vec, min_vec);
|
||||
Vec expected_vec = min_vec;
|
||||
|
||||
for (int i = 0; i < Vec::size(); i++) {
|
||||
EXPECT_EQ(expected_vec[i], actual_vec[i]);
|
||||
}
|
||||
ExpectVecEqual(expected_vec, actual_vec);
|
||||
}
|
||||
|
||||
} // namespace aot_inductor
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
|
||||
set(AOT_INDUCTOR_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_inference)
|
||||
|
||||
# Build custom TorchScript op for AOTInductor
|
||||
@ -8,27 +7,12 @@ set_target_properties(aoti_custom_class PROPERTIES
|
||||
if(USE_CUDA)
|
||||
target_compile_definitions(aoti_custom_class PRIVATE USE_CUDA)
|
||||
elseif(USE_ROCM)
|
||||
target_compile_definitions(aoti_custom_class PRIVATE USE_ROCM)
|
||||
target_compile_definitions(aoti_custom_class PRIVATE USE_ROCM)
|
||||
endif()
|
||||
|
||||
# Link against LibTorch
|
||||
target_link_libraries(aoti_custom_class torch)
|
||||
|
||||
# the custom command that generates the TorchScript module
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/script_data.pt
|
||||
${CMAKE_CURRENT_BINARY_DIR}/script_model_cpu.pt
|
||||
${CMAKE_CURRENT_BINARY_DIR}/script_model_cuda.pt
|
||||
# This script requires the torch package to be installed.
|
||||
COMMAND python ${AOT_INDUCTOR_TEST_ROOT}/compile_model.py
|
||||
DEPENDS torch torch_python aoti_custom_class ${AOT_INDUCTOR_TEST_ROOT}/compile_model.py
|
||||
)
|
||||
add_custom_target(aoti_script_model ALL
|
||||
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_data.pt
|
||||
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_model_cpu.pt
|
||||
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_model_cuda.pt
|
||||
)
|
||||
add_dependencies(aoti_script_model aoti_custom_class)
|
||||
|
||||
# Build the cpp gtest binary containing the cpp-only tests.
|
||||
set(INDUCTOR_TEST_SRCS
|
||||
${AOT_INDUCTOR_TEST_ROOT}/test.cpp
|
||||
@ -37,23 +21,12 @@ set(INDUCTOR_TEST_SRCS
|
||||
add_executable(test_aoti_inference
|
||||
${TORCH_ROOT}/test/cpp/common/main.cpp
|
||||
${INDUCTOR_TEST_SRCS}
|
||||
data.pt
|
||||
script_data.pt
|
||||
script_model_cpu.pt
|
||||
script_model_cuda.pt
|
||||
)
|
||||
add_dependencies(test_aoti_inference aoti_custom_class aoti_script_model)
|
||||
add_dependencies(test_aoti_inference aoti_custom_class)
|
||||
|
||||
# TODO temporary until we can delete the old gtest polyfills.
|
||||
target_compile_definitions(test_aoti_inference PRIVATE USE_GTEST)
|
||||
|
||||
# Define a custom command to generate the library
|
||||
add_custom_command(
|
||||
OUTPUT data.pt
|
||||
COMMAND python ${AOT_INDUCTOR_TEST_ROOT}/test.py
|
||||
DEPENDS ${AOT_INDUCTOR_TEST_ROOT}/test.py
|
||||
)
|
||||
|
||||
target_link_libraries(test_aoti_inference PRIVATE
|
||||
torch
|
||||
gtest_main
|
||||
@ -71,6 +44,10 @@ target_compile_definitions(test_aoti_inference PRIVATE
|
||||
CMAKE_CURRENT_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}
|
||||
)
|
||||
|
||||
target_compile_options_if_supported(test_aoti_inference -Wno-unused-variable)
|
||||
target_compile_options_if_supported(test_aoti_inference -Wno-unused-but-set-variable)
|
||||
target_compile_options_if_supported(test_aoti_inference -Wno-unused-function)
|
||||
|
||||
if(INSTALL_TEST)
|
||||
install(TARGETS test_aoti_inference DESTINATION bin)
|
||||
# Install PDB files for MSVC builds
|
||||
|
||||
@ -2,7 +2,9 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <cstdlib>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <mutex>
|
||||
#include <queue>
|
||||
@ -28,6 +30,64 @@
|
||||
|
||||
namespace {
|
||||
|
||||
// Function to check if test data files exist and are valid
|
||||
bool testDataFilesExist() {
|
||||
std::string bindir = STRINGIZE(CMAKE_CURRENT_BINARY_DIR);
|
||||
std::array<std::string, 4> required_files = {
|
||||
"data.pt",
|
||||
"script_data.pt",
|
||||
"script_model_cpu.pt",
|
||||
"script_model_cuda.pt"};
|
||||
|
||||
for (const auto& filename : required_files) {
|
||||
std::string filepath = bindir + "/" + filename;
|
||||
std::ifstream file(filepath);
|
||||
if (!file.good()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Function to ensure test data files are generated at runtime
|
||||
void ensureTestDataGenerated() {
|
||||
static std::once_flag generated_flag;
|
||||
std::call_once(generated_flag, []() {
|
||||
// Only generate if files don't exist or are placeholders
|
||||
if (testDataFilesExist()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::string bindir = STRINGIZE(CMAKE_CURRENT_BINARY_DIR);
|
||||
|
||||
// Calculate path to source directory: build/test_aoti_inference -> build ->
|
||||
// pytorch
|
||||
std::string pytorch_root = bindir.substr(0, bindir.find_last_of("/"));
|
||||
pytorch_root = pytorch_root.substr(0, pytorch_root.find_last_of("/"));
|
||||
std::string source_dir = pytorch_root + "/test/cpp/aoti_inference";
|
||||
|
||||
// Generate test data files (data.pt, etc.) by running test.py directly
|
||||
std::string test_script = source_dir + "/test.py";
|
||||
std::string test_data_cmd = "cd " + bindir + " && python " + test_script;
|
||||
std::cout << "Generating test data: " << test_data_cmd << std::endl;
|
||||
int result1 = std::system(test_data_cmd.c_str());
|
||||
if (result1 != 0) {
|
||||
std::cerr << "Warning: Test data generation failed with code " << result1
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
// Generate model files (script_*.pt) by running compile_model.py directly
|
||||
std::string compile_script = source_dir + "/compile_model.py";
|
||||
std::string models_cmd = "cd " + bindir + " && python " + compile_script;
|
||||
std::cout << "Generating model files: " << models_cmd << std::endl;
|
||||
int result2 = std::system(models_cmd.c_str());
|
||||
if (result2 != 0) {
|
||||
std::cerr << "Warning: Model generation failed with code " << result2
|
||||
<< std::endl;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
const std::unordered_map<std::string, at::Tensor> derefTensorConstantMap(
|
||||
torch::inductor::TensorConstantMap tensor_constant_map) {
|
||||
std::unordered_map<std::string, at::Tensor> ret;
|
||||
@ -855,7 +915,6 @@ void test_aoti_free_buffer(bool use_runtime_constant_folding) {
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(USE_CUDA) || defined(USE_ROCM)
|
||||
void test_cuda_alloc_test() {
|
||||
torch::NoGradGuard no_grad;
|
||||
|
||||
@ -895,8 +954,8 @@ void test_cuda_alloc_test() {
|
||||
runner->run(data_loader.attr(inputs_attr.c_str()).toTensorList().vec());
|
||||
ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0]));
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_CUDA
|
||||
class ThreadPool {
|
||||
private:
|
||||
struct Task {
|
||||
@ -1037,86 +1096,96 @@ void test_multi_cuda_streams(const std::string& device) {
|
||||
ASSERT_TRUE(torch::allclose(ref_output_tensors[0], all_outputs[i][0]));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif // USE_CUDA
|
||||
#endif // USE_CUDA || USE_ROCM
|
||||
} // namespace
|
||||
|
||||
namespace torch::aot_inductor {
|
||||
|
||||
TEST(AotInductorTest, BasicTestCpu) {
|
||||
// Test fixture that ensures test data is generated once for all tests
|
||||
class AotInductorTest : public ::testing::Test {
|
||||
public:
|
||||
// This runs once before all tests in this test suite
|
||||
static void SetUpTestSuite() {
|
||||
ensureTestDataGenerated();
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(AotInductorTest, BasicTestCpu) {
|
||||
test_aoti("cpu", false);
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, BasicScriptTestCpu) {
|
||||
TEST_F(AotInductorTest, BasicScriptTestCpu) {
|
||||
test_aoti_script("cpu");
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, BasicPackageLoaderTestCpu) {
|
||||
TEST_F(AotInductorTest, BasicPackageLoaderTestCpu) {
|
||||
test_aoti_package_loader("cpu", false);
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, ExtractConstantsMapCpu) {
|
||||
TEST_F(AotInductorTest, ExtractConstantsMapCpu) {
|
||||
test_aoti_extract_constants_map("cpu");
|
||||
}
|
||||
|
||||
#ifdef USE_CUDA
|
||||
TEST(AotInductorTest, BasicTestCuda) {
|
||||
TEST_F(AotInductorTest, BasicTestCuda) {
|
||||
test_aoti("cuda", true);
|
||||
test_aoti("cuda", false);
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, BasicScriptTestCuda) {
|
||||
TEST_F(AotInductorTest, BasicScriptTestCuda) {
|
||||
test_aoti_script("cuda");
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, BasicPackageLoaderTestCuda) {
|
||||
TEST_F(AotInductorTest, BasicPackageLoaderTestCuda) {
|
||||
test_aoti_package_loader("cuda", false);
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) {
|
||||
TEST_F(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) {
|
||||
test_aoti_package_loader_multi_gpu("cuda", false);
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, UpdateUserManagedConstantsCuda) {
|
||||
TEST_F(AotInductorTest, UpdateUserManagedConstantsCuda) {
|
||||
test_aoti_user_managed_buffer();
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, RuntimeUpdateConstantsCuda) {
|
||||
TEST_F(AotInductorTest, RuntimeUpdateConstantsCuda) {
|
||||
test_aoti_constants_update("cuda", true);
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, UpdateConstantsCuda) {
|
||||
TEST_F(AotInductorTest, UpdateConstantsCuda) {
|
||||
test_aoti_constants_update("cuda", false);
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, ExtractConstantsMapCuda) {
|
||||
TEST_F(AotInductorTest, ExtractConstantsMapCuda) {
|
||||
test_aoti_extract_constants_map("cuda");
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) {
|
||||
TEST_F(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) {
|
||||
test_aoti_double_buffering("cuda", true);
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, UpdateInactiveConstantsCuda) {
|
||||
TEST_F(AotInductorTest, UpdateInactiveConstantsCuda) {
|
||||
test_aoti_double_buffering("cuda", false);
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
|
||||
TEST_F(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
|
||||
test_aoti_double_buffering_with_tensor_constants();
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, FreeInactiveConstantBufferCuda) {
|
||||
TEST_F(AotInductorTest, FreeInactiveConstantBufferCuda) {
|
||||
test_aoti_free_buffer(false);
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) {
|
||||
TEST_F(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) {
|
||||
test_aoti_free_buffer(true);
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, MultiStreamTestCuda) {
|
||||
TEST_F(AotInductorTest, MultiStreamTestCuda) {
|
||||
test_multi_cuda_streams("cuda");
|
||||
}
|
||||
|
||||
TEST(AotInductorTest, CudaAllocTestCuda) {
|
||||
TEST_F(AotInductorTest, CudaAllocTestCuda) {
|
||||
test_cuda_alloc_test();
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -584,7 +584,7 @@ TEST(CustomAutogradTest, MarkDirty) {
|
||||
}
|
||||
};
|
||||
|
||||
// Clone here because modifying leafs inplace is not allowed
|
||||
// Clone here because modifying leaves inplace is not allowed
|
||||
auto x = torch::randn({5, 5}, torch::requires_grad()).clone();
|
||||
auto version_before = x._version();
|
||||
auto out = MyFunction::apply(x);
|
||||
|
||||
@ -264,7 +264,7 @@ TEST_F(ParallelTest, DataParallelNumericalEquivalence_MultiCUDA) {
|
||||
input += i;
|
||||
input_dp += i;
|
||||
|
||||
// non-prallel training
|
||||
// non-parallel training
|
||||
torch::optim::SGD optim(model->parameters(), torch::optim::SGDOptions(0.1));
|
||||
auto output = model->forward(input);
|
||||
auto loss = torch::mse_loss(output, torch::zeros_like(output));
|
||||
|
||||
@ -149,8 +149,8 @@ When `import torch`, installed accelerators (such as `torch_openreg`) will be au
|
||||
### Installation
|
||||
|
||||
```python
|
||||
pip3 install --no-build-isolation -e . # for develop
|
||||
pip3 install --no-build-isolation . # for install
|
||||
python -m pip install --no-build-isolation -e . # for develop
|
||||
python -m pip install --no-build-isolation . # for install
|
||||
```
|
||||
|
||||
### Usage Example
|
||||
@ -188,7 +188,7 @@ Please refer to [this](https://docs.pytorch.org/docs/main/accelerator/index.html
|
||||
- Device-agnostic APIs
|
||||
- Memory Management
|
||||
- Generator
|
||||
- Distrubuted
|
||||
- Distributed
|
||||
- Custom Tensor&Storage
|
||||
- ...
|
||||
- **Improve Tests**: Add more test cases related to the integration mechanism.
|
||||
|
||||
@ -8,7 +8,8 @@ class TestAutocast(TestCase):
|
||||
def test_autocast_with_unsupported_type(self):
|
||||
with self.assertWarnsRegex(
|
||||
UserWarning,
|
||||
"In openreg autocast, but the target dtype torch.float32 is not supported.",
|
||||
"In openreg autocast, but the target dtype is not supported. Disabling autocast.\n"
|
||||
"openreg Autocast only supports dtypes of torch.float16, torch.bfloat16 currently.",
|
||||
):
|
||||
with torch.autocast(device_type="openreg", dtype=torch.float32):
|
||||
_ = torch.ones(10)
|
||||
|
||||
@ -5,6 +5,7 @@ from torch.testing._internal.common_utils import run_tests, skipIfTorchDynamo, T
|
||||
|
||||
|
||||
class TestStream(TestCase):
|
||||
@skipIfTorchDynamo()
|
||||
def test_stream_create(self):
|
||||
stream = torch.Stream(device="openreg")
|
||||
self.assertEqual(stream.device_index, torch.openreg.current_device())
|
||||
@ -24,6 +25,7 @@ class TestStream(TestCase):
|
||||
)
|
||||
self.assertEqual(stream, stream1)
|
||||
|
||||
@skipIfTorchDynamo()
|
||||
def test_stream_context(self):
|
||||
with torch.Stream(device="openreg:1") as stream:
|
||||
self.assertEqual(torch.accelerator.current_stream(), stream)
|
||||
@ -40,6 +42,7 @@ class TestStream(TestCase):
|
||||
current_stream = torch.accelerator.current_stream()
|
||||
self.assertEqual(current_stream, stream2)
|
||||
|
||||
@skipIfTorchDynamo()
|
||||
def test_stream_synchronize(self):
|
||||
stream = torch.Stream(device="openreg:1")
|
||||
self.assertEqual(True, stream.query())
|
||||
@ -49,12 +52,14 @@ class TestStream(TestCase):
|
||||
stream.synchronize()
|
||||
self.assertEqual(True, stream.query())
|
||||
|
||||
@skipIfTorchDynamo()
|
||||
def test_stream_repr(self):
|
||||
stream = torch.Stream(device="openreg:1")
|
||||
self.assertTrue(
|
||||
"torch.Stream device_type=openreg, device_index=1" in repr(stream)
|
||||
)
|
||||
|
||||
@skipIfTorchDynamo()
|
||||
def test_stream_wait_stream(self):
|
||||
stream_1 = torch.Stream(device="openreg:0")
|
||||
stream_2 = torch.Stream(device="openreg:1")
|
||||
|
||||
@ -218,7 +218,7 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
|
||||
torch.manual_seed(42 + global_mesh.get_local_rank("dp"))
|
||||
inp = torch.randint(0, model_args.vocab_size, (2, 16), device=device_type)
|
||||
for iter_idx in range(5):
|
||||
for _ in range(5):
|
||||
ref_loss = ref_model(inp).sum()
|
||||
loss = model(inp).sum()
|
||||
self.assertEqual(ref_loss, loss)
|
||||
@ -238,9 +238,7 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
# runs its reduce-scatter
|
||||
self.assertIsInstance(model.pos_embeddings.weight.placements[1], Shard)
|
||||
self.assertIsInstance(model.pos_embeddings.weight.grad.placements[1], Shard)
|
||||
for ref_param, (param_name, param) in zip(
|
||||
ref_model.parameters(), model.named_parameters()
|
||||
):
|
||||
for ref_param, param in zip(ref_model.parameters(), model.parameters()):
|
||||
full_grad = param.grad.full_tensor()
|
||||
self.assertEqual(ref_param.grad, full_grad)
|
||||
|
||||
|
||||
@ -101,14 +101,14 @@ class ComposabilityTest(MultiProcessTestCase):
|
||||
|
||||
@property
|
||||
def world_size(self):
|
||||
return 4
|
||||
return 8
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return self.rank
|
||||
|
||||
@requires_accelerator_dist_backend(["nccl", "xccl"])
|
||||
@skip_if_lt_x_gpu(4)
|
||||
@skip_if_lt_x_gpu(8)
|
||||
@skip_but_pass_in_sandcastle_if(
|
||||
not TEST_MULTIGPU and not TEST_XPU, "Test requires 4+ GPUs"
|
||||
)
|
||||
@ -169,8 +169,8 @@ class ComposabilityTest(MultiProcessTestCase):
|
||||
{f"{i}": MLPModule(dim) for i in range(total_layers)}
|
||||
)
|
||||
# Calculate start and end indices based on rank
|
||||
start_index = self.rank * 2
|
||||
end_index = start_index + 2
|
||||
start_index = self.rank
|
||||
end_index = start_index + 1
|
||||
pp_model = PPModelChunk(full_model, start_index, end_index)
|
||||
|
||||
pp_model.to(self.device)
|
||||
@ -224,7 +224,6 @@ class ComposabilityTest(MultiProcessTestCase):
|
||||
],
|
||||
)
|
||||
def test_3d_with_tp_dp_pp(self, ScheduleClass, MixedPrecisionParam):
|
||||
_device_raii = torch.device(device_type, self.device)
|
||||
torch.accelerator.set_device_index(self.device)
|
||||
store = torch.distributed.FileStore(self.file_name, self.world_size)
|
||||
torch.distributed.init_process_group(
|
||||
@ -286,56 +285,44 @@ class ComposabilityTest(MultiProcessTestCase):
|
||||
parallelize_module(layer, tp_mesh, parallelize_plan)
|
||||
return model
|
||||
|
||||
# Attach to a schedule
|
||||
if issubclass(ScheduleClass, PipelineScheduleSingle):
|
||||
stage_idx = pp_group.rank()
|
||||
partial_model = nn.Sequential(
|
||||
*full_model[stage_idx * 2 : stage_idx * 2 + 2]
|
||||
)
|
||||
partial_model.to(self.device)
|
||||
n_virtual = 1
|
||||
else:
|
||||
n_virtual = 2
|
||||
|
||||
num_stages = pp_group.size() * n_virtual
|
||||
layers_per_stage = total_layers // num_stages
|
||||
stages = []
|
||||
for i in range(n_virtual):
|
||||
stage_idx = pp_group.rank() + pp_group.size() * i
|
||||
start_layer = stage_idx * layers_per_stage
|
||||
end_layer = start_layer + layers_per_stage
|
||||
# divide the model layers by the number of stages
|
||||
partial_model = nn.Sequential(*full_model[start_layer:end_layer])
|
||||
partial_model.to(self.device)
|
||||
tp_model = apply_tp(partial_model, tp_mesh)
|
||||
dp_model = apply_fsdp(tp_model)
|
||||
pipeline_stage = PipelineStage(
|
||||
|
||||
stage = PipelineStage(
|
||||
dp_model,
|
||||
stage_idx,
|
||||
pp_group.size(),
|
||||
num_stages,
|
||||
self.device,
|
||||
group=pp_group,
|
||||
)
|
||||
partial_models = [pipeline_stage.submod]
|
||||
pipeline_schedule = ScheduleClass(
|
||||
pipeline_stage,
|
||||
n_microbatches=num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
)
|
||||
else:
|
||||
n_virtual = 2
|
||||
num_stages = pp_group.size() * n_virtual
|
||||
stages = []
|
||||
for i in range(n_virtual):
|
||||
stage_idx = pp_group.rank() + n_virtual * i
|
||||
# divide the model layers by the number of stages
|
||||
partial_model = nn.Sequential(*full_model[stage_idx : stage_idx + 1])
|
||||
partial_model.to(self.device)
|
||||
|
||||
tp_model = apply_tp(partial_model, tp_mesh)
|
||||
dp_model = apply_fsdp(tp_model)
|
||||
stage = PipelineStage(
|
||||
dp_model,
|
||||
stage_idx,
|
||||
num_stages,
|
||||
self.device,
|
||||
group=pp_group,
|
||||
)
|
||||
stages.append(stage)
|
||||
partial_models = [pipeline_stage.submod for pipeline_stage in stages]
|
||||
|
||||
stages.append(stage)
|
||||
partial_models = [pipeline_stage.submod for pipeline_stage in stages]
|
||||
pipeline_schedule = ScheduleClass(
|
||||
stages,
|
||||
n_microbatches=num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
)
|
||||
if issubclass(ScheduleClass, PipelineScheduleSingle):
|
||||
stages = stages[0]
|
||||
|
||||
pipeline_schedule = ScheduleClass(
|
||||
stages,
|
||||
n_microbatches=num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
scale_grads=False,
|
||||
)
|
||||
|
||||
optimizer_kwargs = {
|
||||
"lr": 0.01,
|
||||
@ -349,7 +336,7 @@ class ComposabilityTest(MultiProcessTestCase):
|
||||
for model in partial_models
|
||||
]
|
||||
|
||||
for train_step in range(5):
|
||||
for _train_step in range(5):
|
||||
for optimizer in optimizers:
|
||||
optimizer.zero_grad()
|
||||
inputs = torch.rand((num_microbatches, dim), device=self.device)
|
||||
@ -369,7 +356,7 @@ class ComposabilityTest(MultiProcessTestCase):
|
||||
torch.distributed.destroy_process_group()
|
||||
|
||||
@requires_accelerator_dist_backend(["nccl", "xccl"])
|
||||
@skip_if_lt_x_gpu(4)
|
||||
@skip_if_lt_x_gpu(8)
|
||||
@skip_but_pass_in_sandcastle_if(
|
||||
not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
|
||||
)
|
||||
@ -447,109 +434,71 @@ class ComposabilityTest(MultiProcessTestCase):
|
||||
partial_model = partial_model.to(dtype=MixedPrecisionParam)
|
||||
return partial_model
|
||||
|
||||
# Attach to a schedule
|
||||
if issubclass(ScheduleClass, PipelineScheduleSingle):
|
||||
stage_idx = pp_group.rank()
|
||||
partial_model = nn.Sequential(
|
||||
*full_model[stage_idx * 2 : stage_idx * 2 + 2]
|
||||
)
|
||||
partial_model.to(self.device)
|
||||
|
||||
dp_model = apply_replicate(partial_model)
|
||||
pipeline_stage = PipelineStage(
|
||||
dp_model,
|
||||
stage_idx,
|
||||
pp_group.size(),
|
||||
self.device,
|
||||
group=pp_group,
|
||||
)
|
||||
partial_models = [pipeline_stage.submod]
|
||||
pipeline_schedule = ScheduleClass(
|
||||
pipeline_stage,
|
||||
n_microbatches=num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
scale_grads=False,
|
||||
)
|
||||
|
||||
ref_partial_model = nn.Sequential(
|
||||
*ref_full_model[stage_idx * 2 : stage_idx * 2 + 2]
|
||||
)
|
||||
ref_partial_model.to(self.device)
|
||||
ref_partial_model = apply_same_precision(
|
||||
ref_partial_model
|
||||
) # Apply same precision
|
||||
|
||||
ref_pipeline_stage = PipelineStage(
|
||||
ref_partial_model,
|
||||
stage_idx,
|
||||
pp_group.size(),
|
||||
self.device,
|
||||
group=pp_group,
|
||||
)
|
||||
ref_partial_models = [ref_pipeline_stage.submod]
|
||||
ref_pipeline_schedule = ScheduleClass(
|
||||
ref_pipeline_stage,
|
||||
n_microbatches=num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
scale_grads=False,
|
||||
)
|
||||
n_virtual = 1
|
||||
else:
|
||||
n_virtual = 2
|
||||
num_stages = pp_group.size() * n_virtual
|
||||
stages = []
|
||||
ref_stages = []
|
||||
for i in range(n_virtual):
|
||||
stage_idx = pp_group.rank() + n_virtual * i
|
||||
# divide the model layers by the number of stages
|
||||
partial_model = nn.Sequential(*full_model[stage_idx : stage_idx + 1])
|
||||
partial_model.to(self.device)
|
||||
|
||||
dp_model = apply_replicate(partial_model)
|
||||
stage = PipelineStage(
|
||||
dp_model,
|
||||
stage_idx,
|
||||
num_stages,
|
||||
self.device,
|
||||
group=pp_group,
|
||||
)
|
||||
num_stages = pp_group.size() * n_virtual
|
||||
layers_per_stage = total_layers // num_stages
|
||||
stages = []
|
||||
ref_stages = []
|
||||
for i in range(n_virtual):
|
||||
stage_idx = pp_group.rank() + pp_group.size() * i
|
||||
start_layer = stage_idx * layers_per_stage
|
||||
end_layer = start_layer + layers_per_stage
|
||||
# divide the model layers by the number of stages
|
||||
partial_model = nn.Sequential(*full_model[start_layer:end_layer])
|
||||
partial_model.to(self.device)
|
||||
|
||||
stages.append(stage)
|
||||
partial_models = [pipeline_stage.submod for pipeline_stage in stages]
|
||||
ref_partial_model = nn.Sequential(*ref_full_model[start_layer:end_layer])
|
||||
ref_partial_model.to(self.device)
|
||||
|
||||
ref_partial_model = nn.Sequential(
|
||||
*ref_full_model[stage_idx : stage_idx + 1]
|
||||
)
|
||||
ref_partial_model.to(self.device)
|
||||
ref_partial_model = apply_same_precision(
|
||||
ref_partial_model
|
||||
) # Apply same precision
|
||||
dp_model = apply_replicate(partial_model)
|
||||
ref_dp_model = apply_same_precision(ref_partial_model)
|
||||
|
||||
ref_stage = PipelineStage(
|
||||
ref_partial_model,
|
||||
stage_idx,
|
||||
num_stages,
|
||||
self.device,
|
||||
group=pp_group,
|
||||
)
|
||||
|
||||
ref_stages.append(ref_stage)
|
||||
ref_partial_models = [
|
||||
pipeline_stage.submod for pipeline_stage in ref_stages
|
||||
]
|
||||
pipeline_schedule = ScheduleClass(
|
||||
stages,
|
||||
n_microbatches=num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
scale_grads=False,
|
||||
stage = PipelineStage(
|
||||
dp_model,
|
||||
stage_idx,
|
||||
num_stages,
|
||||
self.device,
|
||||
group=pp_group,
|
||||
)
|
||||
|
||||
ref_pipeline_schedule = ScheduleClass(
|
||||
ref_stages,
|
||||
n_microbatches=num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
scale_grads=False,
|
||||
ref_stage = PipelineStage(
|
||||
ref_dp_model,
|
||||
stage_idx,
|
||||
num_stages,
|
||||
self.device,
|
||||
group=pp_group,
|
||||
)
|
||||
|
||||
stages.append(stage)
|
||||
ref_stages.append(ref_stage)
|
||||
|
||||
partial_models = [pipeline_stage.submod for pipeline_stage in stages]
|
||||
ref_partial_models = [
|
||||
pipeline_stage.submod for pipeline_stage in ref_stages
|
||||
]
|
||||
|
||||
if issubclass(ScheduleClass, PipelineScheduleSingle):
|
||||
stages = stages[0]
|
||||
ref_stages = ref_stages[0]
|
||||
|
||||
pipeline_schedule = ScheduleClass(
|
||||
stages,
|
||||
n_microbatches=num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
scale_grads=False,
|
||||
)
|
||||
|
||||
ref_pipeline_schedule = ScheduleClass(
|
||||
ref_stages,
|
||||
n_microbatches=num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
scale_grads=False,
|
||||
)
|
||||
|
||||
optimizer_kwargs = {
|
||||
"lr": 0.01,
|
||||
"betas": (0.9, 0.95),
|
||||
@ -568,7 +517,7 @@ class ComposabilityTest(MultiProcessTestCase):
|
||||
for model in ref_partial_models
|
||||
]
|
||||
|
||||
for train_step in range(5):
|
||||
for _train_step in range(5):
|
||||
for optimizer in optimizers:
|
||||
optimizer.zero_grad()
|
||||
for ref_optimizer in ref_optimizers:
|
||||
@ -604,7 +553,7 @@ class ComposabilityTest(MultiProcessTestCase):
|
||||
torch.distributed.destroy_process_group()
|
||||
|
||||
@requires_accelerator_dist_backend(["nccl", "xccl"])
|
||||
@skip_if_lt_x_gpu(4)
|
||||
@skip_if_lt_x_gpu(8)
|
||||
@skip_but_pass_in_sandcastle_if(
|
||||
not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
|
||||
)
|
||||
@ -736,67 +685,44 @@ class ComposabilityTest(MultiProcessTestCase):
|
||||
|
||||
pipeline_model_parameter_dict = {}
|
||||
|
||||
# Attach to a schedule
|
||||
if issubclass(ScheduleClass, PipelineScheduleSingle):
|
||||
stage_idx = pp_group.rank()
|
||||
# Calculate layers per stage correctly
|
||||
layers_per_stage = total_layers // pp_group.size() # 8 // 2 = 4
|
||||
n_virtual = 1
|
||||
else:
|
||||
n_virtual = 2
|
||||
|
||||
num_stages = pp_group.size() * n_virtual
|
||||
layers_per_stage = total_layers // num_stages
|
||||
stages = []
|
||||
for i in range(n_virtual):
|
||||
stage_idx = pp_group.rank() + pp_group.size() * i
|
||||
start_layer = stage_idx * layers_per_stage
|
||||
end_layer = start_layer + layers_per_stage
|
||||
|
||||
# divide the model layers by the number of stages
|
||||
partial_model = nn.Sequential(*full_model[start_layer:end_layer])
|
||||
partial_model.to(self.device)
|
||||
|
||||
dp_model = apply_replicate(partial_model)
|
||||
pipelined_models_parameters(start_layer, dp_model)
|
||||
|
||||
pipeline_stage = PipelineStage(
|
||||
stage = PipelineStage(
|
||||
dp_model,
|
||||
stage_idx,
|
||||
pp_group.size(),
|
||||
num_stages,
|
||||
self.device,
|
||||
group=pp_group,
|
||||
)
|
||||
partial_models = [pipeline_stage.submod]
|
||||
pipeline_schedule = ScheduleClass(
|
||||
pipeline_stage,
|
||||
n_microbatches=num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
scale_grads=False,
|
||||
)
|
||||
|
||||
else:
|
||||
n_virtual = 2
|
||||
num_stages = pp_group.size() * n_virtual
|
||||
layers_per_stage = total_layers // num_stages
|
||||
stages = []
|
||||
for i in range(n_virtual):
|
||||
stage_idx = pp_group.rank() + pp_group.size() * i
|
||||
start_layer = stage_idx * layers_per_stage
|
||||
end_layer = start_layer + layers_per_stage
|
||||
# divide the model layers by the number of stages
|
||||
partial_model = nn.Sequential(*full_model[start_layer:end_layer])
|
||||
partial_model.to(self.device)
|
||||
stages.append(stage)
|
||||
partial_models = [pipeline_stage.submod for pipeline_stage in stages]
|
||||
|
||||
dp_model = apply_replicate(partial_model)
|
||||
pipelined_models_parameters(start_layer, dp_model)
|
||||
stage = PipelineStage(
|
||||
dp_model,
|
||||
stage_idx,
|
||||
num_stages,
|
||||
self.device,
|
||||
group=pp_group,
|
||||
)
|
||||
if issubclass(ScheduleClass, PipelineScheduleSingle):
|
||||
stages = stages[0]
|
||||
|
||||
stages.append(stage)
|
||||
partial_models = [pipeline_stage.submod for pipeline_stage in stages]
|
||||
|
||||
pipeline_schedule = ScheduleClass(
|
||||
stages,
|
||||
n_microbatches=num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
scale_grads=False,
|
||||
)
|
||||
pipeline_schedule = ScheduleClass(
|
||||
stages,
|
||||
n_microbatches=num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
scale_grads=False,
|
||||
)
|
||||
|
||||
optimizer_kwargs = {
|
||||
"lr": 0.01,
|
||||
|
||||
@ -216,7 +216,7 @@ class TestSavePlan(TestCase):
|
||||
# Number of plans should remain unchanged
|
||||
self.assertEqual(len(all_plans), len(deduped_plans))
|
||||
|
||||
# Numer of items in the deduped plans should be less than the original plans
|
||||
# Number of items in the deduped plans should be less than the original plans
|
||||
for new_plan, old_plan in zip(deduped_plans, all_plans):
|
||||
self.assertFalse(_compare_save_plans(new_plan, old_plan))
|
||||
self.assertTrue(len(new_plan.items) < len(old_plan.items))
|
||||
|
||||
@ -4,7 +4,7 @@ import copy
|
||||
import functools
|
||||
import sys
|
||||
from collections.abc import Callable
|
||||
from itertools import chain
|
||||
from itertools import chain, product
|
||||
from typing import Union
|
||||
|
||||
import torch
|
||||
@ -708,29 +708,43 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
@with_comms
|
||||
@skip_if_lt_x_gpu(2)
|
||||
def test_flattened_osd(self) -> None:
|
||||
device_mesh = init_device_mesh(device_type, (self.world_size,))
|
||||
model = CompositeParamModel(device=torch.device(device_type))
|
||||
fsdp_model = fully_shard(copy.deepcopy(model), mesh=device_mesh)
|
||||
fsdp_optim = torch.optim.AdamW(fsdp_model.parameters())
|
||||
batch = torch.rand(8, 100, device=device_type)
|
||||
fsdp_model(batch).sum().backward()
|
||||
fsdp_optim.step()
|
||||
fsdp_optim.zero_grad()
|
||||
osd1 = get_optimizer_state_dict(fsdp_model, fsdp_optim)
|
||||
osd2 = get_optimizer_state_dict(
|
||||
fsdp_model,
|
||||
fsdp_optim,
|
||||
options=StateDictOptions(flatten_optimizer_state_dict=True),
|
||||
)
|
||||
fsdp_optim2 = torch.optim.AdamW(fsdp_model.parameters())
|
||||
set_optimizer_state_dict(
|
||||
fsdp_model, optimizers=fsdp_optim2, optim_state_dict=osd2
|
||||
)
|
||||
self.assertEqual(fsdp_optim.state_dict(), fsdp_optim2.state_dict())
|
||||
set_optimizer_state_dict(
|
||||
fsdp_model, optimizers=fsdp_optim2, optim_state_dict=osd1
|
||||
)
|
||||
self.assertEqual(fsdp_optim.state_dict(), fsdp_optim2.state_dict())
|
||||
"""
|
||||
Test flattened optimizer state dictionaries with different combinations of
|
||||
flatten_optimizer_state_dict flag for saving and loading.
|
||||
|
||||
This test verifies that:
|
||||
1. We can save optimizer state dict with/without flattening
|
||||
2. We can load optimizer state dict with/without flattening
|
||||
3. The resulting optimizer state is equivalent regardless of flattening options
|
||||
"""
|
||||
for flatten_to_save, flatten_to_load in product([True, False], repeat=2):
|
||||
device_mesh = init_device_mesh(device_type, (self.world_size,))
|
||||
model = CompositeParamModel(device=torch.device(device_type))
|
||||
fsdp_model = fully_shard(copy.deepcopy(model), mesh=device_mesh)
|
||||
fsdp_optim = torch.optim.AdamW(fsdp_model.parameters())
|
||||
batch = torch.rand(8, 100, device=device_type)
|
||||
fsdp_model(batch).sum().backward()
|
||||
fsdp_optim.step()
|
||||
fsdp_optim.zero_grad()
|
||||
|
||||
# Get optimizer state dict with/without flattening option
|
||||
osd = get_optimizer_state_dict(
|
||||
fsdp_model,
|
||||
fsdp_optim,
|
||||
options=StateDictOptions(flatten_optimizer_state_dict=flatten_to_save),
|
||||
)
|
||||
|
||||
# Create a new optimizer and load the state from osd
|
||||
fsdp_optim2 = torch.optim.AdamW(fsdp_model.parameters())
|
||||
set_optimizer_state_dict(
|
||||
fsdp_model,
|
||||
optimizers=fsdp_optim2,
|
||||
optim_state_dict=osd,
|
||||
options=StateDictOptions(flatten_optimizer_state_dict=flatten_to_load),
|
||||
)
|
||||
|
||||
# Verify the loaded optimizer state matches the original
|
||||
self.assertEqual(fsdp_optim.state_dict(), fsdp_optim2.state_dict())
|
||||
|
||||
def _test_deprecate_partial(self) -> None:
|
||||
model = CompositeParamModel(device=torch.device(device_type))
|
||||
|
||||
@ -18,7 +18,9 @@ from torch.distributed.tensor import (
|
||||
from torch.distributed.tensor.debug import CommDebugMode
|
||||
from torch.testing._internal.common_utils import run_tests
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
create_local_tensor_test_class,
|
||||
DTensorTestBase,
|
||||
map_local_tensor_for_rank,
|
||||
with_comms,
|
||||
)
|
||||
|
||||
@ -78,17 +80,21 @@ class DTensorAPITest(DTensorTestBase):
|
||||
self.assertEqual(dist_tensor.placements[0].dim, 1)
|
||||
|
||||
placement_combs = [[Shard(0)], [Shard(1)], [Replicate()]]
|
||||
# test src_data_rank == 1
|
||||
# set seed differently for each rank
|
||||
torch.manual_seed(self.rank)
|
||||
for placement in placement_combs:
|
||||
tensor_to_distribute = torch.randn(3 * self.world_size, 3 * self.world_size)
|
||||
dtensor = distribute_tensor(
|
||||
tensor_to_distribute, device_mesh, placement, src_data_rank=1
|
||||
)
|
||||
full_dtensor = dtensor.full_tensor()
|
||||
if self.rank == 1:
|
||||
self.assertEqual(full_dtensor, tensor_to_distribute)
|
||||
|
||||
if not self.is_local_tensor_enabled:
|
||||
# test src_data_rank == 1
|
||||
# set seed differently for each rank
|
||||
self.init_manual_seed_for_rank()
|
||||
for placement in placement_combs:
|
||||
tensor_to_distribute = torch.randn(
|
||||
3 * self.world_size, 3 * self.world_size
|
||||
)
|
||||
dtensor = distribute_tensor(
|
||||
tensor_to_distribute, device_mesh, placement, src_data_rank=1
|
||||
)
|
||||
full_dtensor = dtensor.full_tensor()
|
||||
if self.rank == 1:
|
||||
self.assertEqual(full_dtensor, tensor_to_distribute)
|
||||
|
||||
# test src_data_rank = None, make sure it does not have communication
|
||||
with comm_mode:
|
||||
@ -156,7 +162,12 @@ class DTensorAPITest(DTensorTestBase):
|
||||
dist_tensor = distribute_tensor(tensor_to_shard, device_mesh, shard_spec)
|
||||
self.assertEqual(dist_tensor.size(), torch.Size(input_size))
|
||||
local_tensor = dist_tensor.to_local()
|
||||
self.assertEqual(local_tensor, splitted_tensor_list[self.rank])
|
||||
self.assertEqual(
|
||||
local_tensor,
|
||||
map_local_tensor_for_rank(
|
||||
splitted_tensor_list, self.rank, lambda tl, r: tl[r]
|
||||
),
|
||||
)
|
||||
|
||||
@with_comms
|
||||
def test_distribute_module(self):
|
||||
@ -388,5 +399,9 @@ class DTensorAPITest(DTensorTestBase):
|
||||
dcp.save({"fqn": dtensor}, checkpoint_id=tempfile.mkdtemp())
|
||||
|
||||
|
||||
DTensorAPITestWithLocalTensor = create_local_tensor_test_class(
|
||||
DTensorAPITest, skipped_tests=["test_checkpoint_apis_check_partial_placement"]
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -158,7 +158,7 @@ class RingAttentionTest(DTensorTestBase):
|
||||
# parameters because when require_grad is True, resize_ is not
|
||||
# allowed. But requires_grad of cp_q, cp_k, and cp_v are False
|
||||
# now. So we can just use context_parallel() to shard q, k, v.
|
||||
# In reality, context_paralle() should be used to shard the input.
|
||||
# In reality, context_parallel() should be used to shard the input.
|
||||
# In reality, context_parallel() should only be used to shard
|
||||
# the model inputs (batch).
|
||||
|
||||
@ -701,7 +701,7 @@ class CPFlexAttentionTest(DTensorTestBase):
|
||||
)
|
||||
|
||||
# TODO: change this for-loop to run_subtests
|
||||
# Use a for-loop instead of run_subtests because we need to intialize the mask
|
||||
# Use a for-loop instead of run_subtests because we need to initialize the mask
|
||||
# for each subtest. This can be baked into self._test_cp_flex_attention as
|
||||
# a str argument denoting mask type.
|
||||
for batch_size, max_seq_len, lb_type in itertools.product(
|
||||
|
||||
@ -464,6 +464,25 @@ def forward(self, b_parametrizations_buffer_original0, x):
|
||||
run(g, 64, 8)
|
||||
self.assertEqual(cnt.frame_count, 2)
|
||||
|
||||
def test_dtensor_requires_grad_recompile(self):
|
||||
cnt = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
|
||||
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
|
||||
|
||||
@torch.compile(backend=cnt, fullgraph=True)
|
||||
def f(x):
|
||||
y = x * x
|
||||
return y.to_local()
|
||||
|
||||
full_x = torch.randn(8, 8, requires_grad=False)
|
||||
x = distribute_tensor(full_x, mesh, [Shard(0)])
|
||||
f(x)
|
||||
|
||||
full_x = torch.randn(8, 8, requires_grad=True)
|
||||
x = distribute_tensor(full_x, mesh, [Shard(0)])
|
||||
f(x)
|
||||
|
||||
self.assertEqual(cnt.frame_count, 2)
|
||||
|
||||
def test_dtensor_attribute_access_on_intermediate(self):
|
||||
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
|
||||
|
||||
|
||||
@ -22,6 +22,11 @@ from torch.distributed.tensor.parallel import (
|
||||
parallelize_module,
|
||||
RowwiseParallel,
|
||||
)
|
||||
from torch.nn.attention.flex_attention import (
|
||||
BlockMask,
|
||||
create_block_mask,
|
||||
flex_attention,
|
||||
)
|
||||
from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
parametrize,
|
||||
@ -31,6 +36,7 @@ from torch.testing._internal.common_utils import (
|
||||
)
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import MLPModule
|
||||
from torch.testing._internal.distributed.fake_pg import FakeStore
|
||||
from torch.utils._pytree import register_pytree_node
|
||||
|
||||
|
||||
class SimpleModel(torch.nn.Module):
|
||||
@ -82,7 +88,46 @@ class SimpleModelAnnotated(torch.nn.Module):
|
||||
return self.mlp_1(x)
|
||||
|
||||
|
||||
def strict_export_and_aot_export_joint_with_descriptors(model, inputs):
|
||||
class FlexAttentionModel(torch.nn.Module):
|
||||
def __init__(self, device):
|
||||
super().__init__()
|
||||
self.proj_q = torch.nn.Linear(16, 128, device=device)
|
||||
self.proj_k = torch.nn.Linear(16, 128, device=device)
|
||||
self.proj_v = torch.nn.Linear(16, 128, device=device)
|
||||
self.proj_out = torch.nn.Linear(128, 16, device=device)
|
||||
self.num_heads = 8
|
||||
self.head_dim = 16
|
||||
|
||||
def forward(self, x, *, block_mask=None):
|
||||
batch_size, seq_len, embed_dim = x.shape
|
||||
# Project to Q, K, V
|
||||
q = self.proj_q(x)
|
||||
k = self.proj_k(x)
|
||||
v = self.proj_v(x)
|
||||
# After colwise parallel, q/k/v are sharded on the last dimension
|
||||
# Get the actual size after sharding
|
||||
hidden_size = q.shape[-1]
|
||||
num_heads_local = hidden_size // self.head_dim
|
||||
# Reshape to (batch, num_heads, seq_len, head_dim)
|
||||
q = q.view(batch_size, seq_len, num_heads_local, self.head_dim).transpose(1, 2)
|
||||
k = k.view(batch_size, seq_len, num_heads_local, self.head_dim).transpose(1, 2)
|
||||
v = v.view(batch_size, seq_len, num_heads_local, self.head_dim).transpose(1, 2)
|
||||
# Apply flex_attention
|
||||
attn_output_raw = flex_attention(q, k, v, block_mask=block_mask)
|
||||
# Reshape back to (batch, seq_len, hidden_size)
|
||||
attn_output = (
|
||||
attn_output_raw.transpose(1, 2)
|
||||
.contiguous()
|
||||
.view(batch_size, seq_len, hidden_size)
|
||||
)
|
||||
# Output projection
|
||||
output = self.proj_out(attn_output)
|
||||
return output
|
||||
|
||||
|
||||
def strict_export_and_aot_export_joint_with_descriptors(model, args, kwargs=None):
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
# needed for stric export
|
||||
torch.utils._pytree.register_constant(DTensorSpec)
|
||||
|
||||
@ -91,36 +136,43 @@ def strict_export_and_aot_export_joint_with_descriptors(model, inputs):
|
||||
install_free_tensors=True, inline_inbuilt_nn_modules=True
|
||||
):
|
||||
with torch._export.utils._disable_aten_to_metadata_assertions():
|
||||
ep = torch.export.export(model, (inputs,), strict=True)
|
||||
ep = torch.export.export(model, args, kwargs, strict=True)
|
||||
|
||||
# joint_gm produced here is missing the backward region, due to incompatiblility
|
||||
# between ep.module() and aot_export_joint_with_descriptors.
|
||||
# Keeping this here to show the issue.
|
||||
return aot_export_joint_with_descriptors_alone(ep.module(), inputs)
|
||||
return aot_export_joint_with_descriptors_alone(ep.module(), args, kwargs)
|
||||
|
||||
|
||||
def graph_capture_and_aot_export_joint_with_descriptors_v2(model, inputs):
|
||||
gm = dynamo_graph_capture_for_export(model)(inputs)
|
||||
def graph_capture_and_aot_export_joint_with_descriptors_v2(model, args, kwargs=None):
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
gm = dynamo_graph_capture_for_export(model)(*args, **kwargs)
|
||||
fake_mode = gm.meta.get("fake_mode", None)
|
||||
with tracing(TracingContext(fake_mode)):
|
||||
return aot_export_joint_with_descriptors_alone(gm, inputs)
|
||||
return aot_export_joint_with_descriptors_alone(gm, args, kwargs)
|
||||
|
||||
|
||||
def graph_capture_and_aot_export_joint_with_descriptors(model, inputs):
|
||||
def graph_capture_and_aot_export_joint_with_descriptors(model, args, kwargs=None):
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
with torch._dynamo.config.patch(install_free_tensors=True):
|
||||
# TODO: switch to use the official graph_capture API once it is ready
|
||||
gm = _dynamo_graph_capture_for_export(model)(inputs)
|
||||
gm = _dynamo_graph_capture_for_export(model)(*args, **kwargs)
|
||||
fake_mode = gm.meta.get("fake_mode", None)
|
||||
with tracing(TracingContext(fake_mode)):
|
||||
return aot_export_joint_with_descriptors_alone(gm, inputs)
|
||||
return aot_export_joint_with_descriptors_alone(gm, args, kwargs)
|
||||
|
||||
|
||||
def aot_export_joint_with_descriptors_alone(model, inputs):
|
||||
def aot_export_joint_with_descriptors_alone(model, args, kwargs=None):
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
with contextlib.ExitStack() as stack:
|
||||
joint_with_descriptors = aot_export_joint_with_descriptors(
|
||||
stack,
|
||||
model,
|
||||
(inputs,),
|
||||
args,
|
||||
kwargs,
|
||||
)
|
||||
return joint_with_descriptors.graph_module
|
||||
|
||||
@ -129,6 +181,15 @@ def _count_op(gm, target):
|
||||
return sum(1 for node in gm.graph.nodes if node.target == target)
|
||||
|
||||
|
||||
register_pytree_node(
|
||||
BlockMask,
|
||||
BlockMask._flatten,
|
||||
BlockMask._unflatten,
|
||||
flatten_with_keys_fn=BlockMask._flatten_with_keys,
|
||||
serialized_type_name="torch.nn.attention.flex_attention.BlockMask",
|
||||
)
|
||||
|
||||
|
||||
@requires_cuda
|
||||
class DTensorExportTest(TestCase):
|
||||
def tearDown(self):
|
||||
@ -168,8 +229,8 @@ class DTensorExportTest(TestCase):
|
||||
}
|
||||
tp_model = parallelize_module(model, mesh_2d["tp"], parallelize_plan)
|
||||
|
||||
inputs = torch.rand(20, 10, device=self.device_type)
|
||||
inputs = distribute_tensor(inputs, mesh_2d["tp"], placements=[Replicate()])
|
||||
inp = torch.rand(20, 10, device=self.device_type)
|
||||
inputs = (distribute_tensor(inp, mesh_2d["tp"], placements=[Replicate()]),)
|
||||
|
||||
joint_gm = export_fn(tp_model, inputs)
|
||||
fw_gm, bw_gm = min_cut_rematerialization_partition(
|
||||
@ -352,9 +413,10 @@ class DTensorExportTest(TestCase):
|
||||
}
|
||||
tp_model = parallelize_module(model, mesh_2d["tp"], parallelize_plan)
|
||||
|
||||
inputs = torch.rand(20, 10, device=self.device_type)
|
||||
inputs = distribute_tensor(inputs, mesh_2d["tp"], placements=[Replicate()])
|
||||
torch._dynamo.mark_dynamic(inputs, 0, min=5, max=100)
|
||||
inp = torch.rand(20, 10, device=self.device_type)
|
||||
inp_dtensor = distribute_tensor(inp, mesh_2d["tp"], placements=[Replicate()])
|
||||
torch._dynamo.mark_dynamic(inp_dtensor, 0, min=5, max=100)
|
||||
inputs = (inp_dtensor,)
|
||||
|
||||
joint_gm = export_fn(tp_model, inputs)
|
||||
|
||||
@ -390,15 +452,74 @@ class DTensorExportTest(TestCase):
|
||||
z = torch.randn(16, 16)
|
||||
y_dtensor = distribute_tensor(y, device_mesh, placements=[Replicate()])
|
||||
z_dtensor = DTensor.from_local(z, device_mesh, placements=[Partial()])
|
||||
inputs = (x_dtensor, y_dtensor, z_dtensor)
|
||||
|
||||
# Run model to verify it works
|
||||
output = model(x_dtensor, y_dtensor, z_dtensor)
|
||||
with torch._dynamo.config.patch(install_free_tensors=True):
|
||||
output = model(*inputs)
|
||||
with torch._dynamo.config.patch(
|
||||
install_free_tensors=(export_fn is _dynamo_graph_capture_for_export)
|
||||
):
|
||||
# TODO: switch to use the official graph_capture API once it is ready
|
||||
gm = export_fn(model)(x_dtensor, y_dtensor, z_dtensor)
|
||||
output_gm = gm(x_dtensor, y_dtensor, z_dtensor)
|
||||
gm = export_fn(model)(*inputs)
|
||||
output_gm = gm(*inputs)
|
||||
self.assertEqual(output, output_gm)
|
||||
|
||||
@parametrize(
|
||||
"export_fn",
|
||||
[
|
||||
graph_capture_and_aot_export_joint_with_descriptors_v2,
|
||||
graph_capture_and_aot_export_joint_with_descriptors,
|
||||
],
|
||||
)
|
||||
def test_flex_attention_dtensor_export(self, export_fn):
|
||||
device_mesh = init_device_mesh(self.device_type, mesh_shape=(self.world_size,))
|
||||
model = FlexAttentionModel(self.device_type)
|
||||
|
||||
# Parallelize the model: shard on head dimension
|
||||
# proj_q, proj_k, proj_v are colwise parallel (output is sharded on head dimension)
|
||||
# proj_out is rowwise parallel (input is sharded, output needs reduction)
|
||||
parallelize_plan = {
|
||||
"proj_q": ColwiseParallel(),
|
||||
"proj_k": ColwiseParallel(),
|
||||
"proj_v": ColwiseParallel(),
|
||||
"proj_out": RowwiseParallel(),
|
||||
}
|
||||
tp_model = parallelize_module(model, device_mesh, parallelize_plan)
|
||||
batch_size = 4
|
||||
seq_len = 64
|
||||
embed_dim = 16
|
||||
num_heads = 8
|
||||
|
||||
# Input tensor replicated across all devices
|
||||
inp = torch.randn(batch_size, seq_len, embed_dim, device=self.device_type)
|
||||
inputs = (distribute_tensor(inp, device_mesh, placements=[Replicate()]),)
|
||||
|
||||
def causal_mask(b, h, q_idx, kv_idx):
|
||||
return q_idx >= kv_idx
|
||||
|
||||
block_mask = create_block_mask(
|
||||
causal_mask,
|
||||
batch_size,
|
||||
num_heads,
|
||||
seq_len,
|
||||
seq_len,
|
||||
device=self.device_type,
|
||||
)
|
||||
|
||||
flex_kwargs = {"block_mask": block_mask}
|
||||
|
||||
joint_gm = export_fn(tp_model, inputs, flex_kwargs)
|
||||
|
||||
self.assertTrue(
|
||||
_count_op(joint_gm, torch.ops.higher_order.flex_attention),
|
||||
1,
|
||||
)
|
||||
|
||||
self.assertTrue(
|
||||
_count_op(joint_gm, torch.ops.higher_order.flex_attention_backward),
|
||||
2,
|
||||
)
|
||||
|
||||
|
||||
instantiate_parametrized_tests(DTensorExportTest)
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@ from torch.testing._internal.common_utils import (
|
||||
run_tests,
|
||||
)
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
create_local_tensor_test_class,
|
||||
DTensorTestBase,
|
||||
with_comms,
|
||||
)
|
||||
@ -60,6 +61,9 @@ class TestDynamic(DTensorTestBase):
|
||||
|
||||
instantiate_parametrized_tests(TestDynamic)
|
||||
|
||||
TestDynamicWithLocalTensor = create_local_tensor_test_class(
|
||||
TestDynamic,
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -13,6 +13,7 @@ from torch.distributed.tensor import (
|
||||
from torch.distributed.tensor.debug import CommDebugMode
|
||||
from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
create_local_tensor_test_class,
|
||||
DTensorTestBase,
|
||||
with_comms,
|
||||
)
|
||||
@ -167,7 +168,7 @@ class TestEmbeddingOp(DTensorTestBase):
|
||||
self._run_embedding_op_test(mesh, 0, [6, 7, 6], 13, 22)
|
||||
self._run_embedding_op_test(mesh, 0, [34], 15, 14, padding_idx=10)
|
||||
|
||||
from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
|
||||
from torch.distributed.tensor.placement_types import MaskPartial
|
||||
|
||||
# test collectives
|
||||
embedding_mod = torch.nn.Embedding(10, 20, device=self.device_type)
|
||||
@ -175,7 +176,7 @@ class TestEmbeddingOp(DTensorTestBase):
|
||||
inp = torch.randint(0, 10, (8, 8), device=self.device_type)
|
||||
replicated_inp = DTensor.from_local(inp, mesh, [Replicate()], run_check=False)
|
||||
output = sharded_embedding(replicated_inp)
|
||||
self.assertIsInstance(output.placements[0], _MaskPartial)
|
||||
self.assertIsInstance(output.placements[0], MaskPartial)
|
||||
|
||||
comm_mode = CommDebugMode()
|
||||
|
||||
@ -191,9 +192,9 @@ class TestEmbeddingOp(DTensorTestBase):
|
||||
inp = torch.randint(0, 10, (4, 4), device=self.device_type)
|
||||
replicated_inp = DTensor.from_local(inp, mesh, [Replicate()], run_check=False)
|
||||
|
||||
from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
|
||||
from torch.distributed.tensor.placement_types import MaskPartial
|
||||
|
||||
# case 1: two embeddings with the same shape, thus sharing the underlying _MaskPartial
|
||||
# case 1: two embeddings with the same shape, thus sharing the underlying MaskPartial
|
||||
# and MaskBuffer, because of cache hit from sharding propagation
|
||||
|
||||
emb1 = torch.nn.Embedding(10, 23, device=self.device_type)
|
||||
@ -205,28 +206,32 @@ class TestEmbeddingOp(DTensorTestBase):
|
||||
output2 = sharded_emb2(replicated_inp)
|
||||
|
||||
partial_placement1 = output1.placements[0]
|
||||
self.assertIsInstance(partial_placement1, _MaskPartial)
|
||||
self.assertIsInstance(partial_placement1, MaskPartial)
|
||||
output1.full_tensor()
|
||||
|
||||
partial_placement2 = output2.placements[0]
|
||||
self.assertIsInstance(partial_placement2, _MaskPartial)
|
||||
self.assertIsInstance(partial_placement2, MaskPartial)
|
||||
output2.full_tensor()
|
||||
|
||||
self.assertTrue(id(partial_placement1), id(partial_placement2))
|
||||
|
||||
# case 2: two embeddings with the same logical_dim_size, but different logical_shape
|
||||
# thus they will have different _MaskPartial placements (with no cache hit)
|
||||
# thus they will have different MaskPartial placements (with no cache hit)
|
||||
|
||||
emb3 = torch.nn.Embedding(10, 29, device=self.device_type)
|
||||
sharded_emb3 = self._apply_sharding(emb3, 0, mesh)
|
||||
output3 = sharded_emb3(replicated_inp)
|
||||
partial_placement3 = output3.placements[0]
|
||||
self.assertIsInstance(partial_placement3, _MaskPartial)
|
||||
self.assertIsInstance(partial_placement3, MaskPartial)
|
||||
output2.full_tensor()
|
||||
|
||||
# not equal because of different logical_shape, despite of same logical_dim_size
|
||||
self.assertNotEqual(partial_placement1, partial_placement3)
|
||||
|
||||
|
||||
TestEmbeddingOpWithLocalTensor = create_local_tensor_test_class(
|
||||
TestEmbeddingOp,
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -7,6 +7,7 @@ import torch.distributed as dist
|
||||
from torch.distributed.tensor import distribute_tensor, Replicate
|
||||
from torch.testing._internal.common_utils import run_tests
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
create_local_tensor_test_class,
|
||||
DTensorTestBase,
|
||||
with_comms,
|
||||
)
|
||||
@ -188,5 +189,11 @@ class DistOtherOpsTest(DTensorTestBase):
|
||||
)
|
||||
|
||||
|
||||
DistOtherOpsTestWithLocalTensor = create_local_tensor_test_class(
|
||||
DistOtherOpsTest,
|
||||
# Send / recv ops are not supported
|
||||
skipped_tests=["test_bernoulli"],
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -2,9 +2,11 @@
|
||||
# Owner(s): ["oncall: distributed"]
|
||||
|
||||
import torch
|
||||
from torch.distributed._local_tensor import maybe_run_for_local_tensor
|
||||
from torch.distributed.tensor import DeviceMesh, DTensor, Replicate, Shard, zeros
|
||||
from torch.testing._internal.common_utils import run_tests
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
create_local_tensor_test_class,
|
||||
DTensorTestBase,
|
||||
with_comms,
|
||||
)
|
||||
@ -77,8 +79,13 @@ class DTensorConstructorTest(DTensorTestBase):
|
||||
dim=shard_dim,
|
||||
)
|
||||
)
|
||||
if self.rank < len(exp_tensor_list):
|
||||
eq_op(exp_tensor_list[self.rank], dist_tensor.to_local())
|
||||
|
||||
@maybe_run_for_local_tensor
|
||||
def check_per_rank_chunk(rank, local_tensor):
|
||||
if rank < len(exp_tensor_list):
|
||||
eq_op(exp_tensor_list[rank], local_tensor)
|
||||
|
||||
check_per_rank_chunk(self.rank, dist_tensor.to_local())
|
||||
else:
|
||||
exp_tensor = init_op(tensor_size, *args, **kwargs)
|
||||
eq_op(exp_tensor, dist_tensor.to_local())
|
||||
@ -150,12 +157,17 @@ class DTensorConstructorTest(DTensorTestBase):
|
||||
dist_tensor = zeros(size, device_mesh=mesh, placements=placements)
|
||||
self.assertEqual(dist_tensor.size(), torch.Size(size))
|
||||
local_tensor = dist_tensor.to_local()
|
||||
if self.rank <= 2:
|
||||
self.assertEqual(local_tensor.size(), torch.Size([8, 3]))
|
||||
self.assertEqual(torch.zeros(8, 3), local_tensor)
|
||||
else:
|
||||
self.assertEqual(local_tensor.size(), torch.Size([7, 3]))
|
||||
self.assertEqual(torch.zeros(7, 3), local_tensor)
|
||||
|
||||
@maybe_run_for_local_tensor
|
||||
def check_per_rank_tensors(rank, local_tensor):
|
||||
if rank <= 2:
|
||||
self.assertEqual(local_tensor.size(), torch.Size([8, 3]))
|
||||
self.assertEqual(torch.zeros(8, 3), local_tensor)
|
||||
else:
|
||||
self.assertEqual(local_tensor.size(), torch.Size([7, 3]))
|
||||
self.assertEqual(torch.zeros(7, 3), local_tensor)
|
||||
|
||||
check_per_rank_tensors(self.rank, local_tensor)
|
||||
|
||||
# construct a gpu device mesh with 2d: shard, replicate
|
||||
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size).reshape(2, 2))
|
||||
@ -250,5 +262,13 @@ class DTensorConstructorTest(DTensorTestBase):
|
||||
self.assertEqual(local_tensor, torch.tensor([]))
|
||||
|
||||
|
||||
DTensorConstructorTestWithLocalTensor = create_local_tensor_test_class(
|
||||
DTensorConstructorTest,
|
||||
skipped_tests=[
|
||||
# Non-contigous sub-meshes are not supported
|
||||
"test_zeros_submesh",
|
||||
],
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -7,6 +7,7 @@ from pprint import pformat
|
||||
from typing import NamedTuple
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from torch.distributed.device_mesh import init_device_mesh
|
||||
from torch.distributed.tensor import (
|
||||
DeviceMesh,
|
||||
@ -27,7 +28,9 @@ from torch.distributed.tensor.parallel import (
|
||||
)
|
||||
from torch.testing._internal.common_utils import run_tests
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
create_local_tensor_test_class,
|
||||
DTensorTestBase,
|
||||
map_local_for_rank,
|
||||
skip_unless_torch_gpu,
|
||||
with_comms,
|
||||
)
|
||||
@ -471,11 +474,10 @@ class DistMathOpsTest(DTensorTestBase):
|
||||
out_req_grad: bool
|
||||
|
||||
subtest_fails = {}
|
||||
valid_filter = ( # noqa: E731
|
||||
lambda cfg: (
|
||||
not (cfg.ln_req_grad and not cfg.elementwise_affine) and any(cfg[3:])
|
||||
)
|
||||
)
|
||||
|
||||
def valid_filter(cfg):
|
||||
return not (cfg.ln_req_grad and not cfg.elementwise_affine) and any(cfg[3:])
|
||||
|
||||
subtest_cfgs = list(
|
||||
filter(
|
||||
valid_filter,
|
||||
@ -670,10 +672,11 @@ class DistMathOpsTest(DTensorTestBase):
|
||||
def test_vector_norm_partial(self):
|
||||
device_mesh = self.build_device_mesh()
|
||||
|
||||
rank = device_mesh.get_local_rank()
|
||||
all_ranks = list(range(self.world_size))
|
||||
|
||||
local_grad = torch.tensor([rank, 1], dtype=torch.float32)
|
||||
local_grad = map_local_for_rank(
|
||||
self.rank, lambda rank: torch.tensor([rank, 1], dtype=torch.float32)
|
||||
)
|
||||
full_grad = torch.tensor([sum(all_ranks), self.world_size], dtype=torch.float32)
|
||||
|
||||
partial_grad = DTensor.from_local(local_grad, device_mesh, [Partial()])
|
||||
@ -708,11 +711,14 @@ class DistMathOpsTest(DTensorTestBase):
|
||||
def test_foreach_norm_partial(self):
|
||||
device_mesh = self.build_device_mesh()
|
||||
|
||||
rank = device_mesh.get_local_rank()
|
||||
all_ranks = list(range(self.world_size))
|
||||
|
||||
local_grad0 = torch.tensor([rank, 1], dtype=torch.float32)
|
||||
local_grad1 = torch.tensor([rank + 1, 2], dtype=torch.float32)
|
||||
local_grad0 = map_local_for_rank(
|
||||
self.rank, lambda rank: torch.tensor([rank, 1], dtype=torch.float32)
|
||||
)
|
||||
local_grad1 = map_local_for_rank(
|
||||
self.rank, lambda rank: torch.tensor([rank + 1, 2], dtype=torch.float32)
|
||||
)
|
||||
|
||||
grad0 = torch.tensor([sum(all_ranks), self.world_size], dtype=torch.float32)
|
||||
grad1 = torch.tensor(
|
||||
@ -971,6 +977,68 @@ class DistMathOpsTest(DTensorTestBase):
|
||||
self.assertTrue(output_dtensor.placements[0].is_shard(shard_dim))
|
||||
self.assertEqual(output_dtensor.full_tensor(), output)
|
||||
|
||||
@with_comms
|
||||
def test_partial_reduction_ops(self):
|
||||
mesh = self.build_device_mesh()
|
||||
rank = dist.get_rank()
|
||||
|
||||
torch.manual_seed(rank)
|
||||
local_tensor = torch.rand(3, dtype=torch.float32, device=self.device_type)
|
||||
dt = DTensor.from_local(
|
||||
local_tensor, device_mesh=mesh, placements=[Partial("sum")]
|
||||
)
|
||||
out_without_redistribute = torch.norm(dt)
|
||||
|
||||
dt = dt.redistribute(dt.device_mesh, placements=[Replicate()])
|
||||
out_with_redistribute = torch.norm(dt)
|
||||
|
||||
self.assertEqual(out_without_redistribute, out_with_redistribute)
|
||||
|
||||
local_tensor = torch.rand(3, dtype=torch.float32, device=self.device_type)
|
||||
dt = DTensor.from_local(
|
||||
local_tensor, device_mesh=mesh, placements=[Partial("sum")]
|
||||
)
|
||||
out_without_redistribute = torch.max(dt)
|
||||
|
||||
dt = dt.redistribute(dt.device_mesh, placements=[Replicate()])
|
||||
out_with_redistribute = torch.max(dt)
|
||||
|
||||
self.assertEqual(out_without_redistribute, out_with_redistribute)
|
||||
|
||||
local_tensor = torch.rand(3, dtype=torch.float32, device=self.device_type)
|
||||
dt = DTensor.from_local(
|
||||
local_tensor, device_mesh=mesh, placements=[Partial("sum")]
|
||||
)
|
||||
out_without_redistribute = torch.min(dt)
|
||||
|
||||
dt = dt.redistribute(dt.device_mesh, placements=[Replicate()])
|
||||
out_with_redistribute = torch.min(dt)
|
||||
|
||||
self.assertEqual(out_without_redistribute, out_with_redistribute)
|
||||
|
||||
@with_comms
|
||||
def test_matching_partial_reduction_ops(self):
|
||||
mesh = self.build_device_mesh()
|
||||
rank = dist.get_rank()
|
||||
|
||||
torch.manual_seed(rank)
|
||||
local_tensor = torch.rand(3, dtype=torch.float32, device=self.device_type)
|
||||
dt = DTensor.from_local(
|
||||
local_tensor, device_mesh=mesh, placements=[Partial("max")]
|
||||
)
|
||||
out_without_redistribute = torch.max(dt)
|
||||
|
||||
dt = dt.redistribute(dt.device_mesh, placements=[Replicate()])
|
||||
out_with_redistribute = torch.max(dt)
|
||||
|
||||
self.assertTrue(out_without_redistribute.placements[0].is_partial())
|
||||
self.assertTrue(out_with_redistribute.placements[0].is_replicate())
|
||||
self.assertEqual(out_without_redistribute, out_with_redistribute)
|
||||
|
||||
|
||||
DistMathOpsTestWithLocalTensor = create_local_tensor_test_class(
|
||||
DistMathOpsTest,
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -26,6 +26,7 @@ from torch.testing._internal.common_utils import (
|
||||
TEST_WITH_ROCM,
|
||||
)
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
create_local_tensor_test_class,
|
||||
DTensorTestBase,
|
||||
skip_unless_torch_gpu,
|
||||
with_comms,
|
||||
@ -614,5 +615,9 @@ class DistMatrixOpsTest(DTensorTestBase):
|
||||
|
||||
instantiate_parametrized_tests(DistMatrixOpsTest)
|
||||
|
||||
DistMatrixOpsTestWithLocalTensor = create_local_tensor_test_class(
|
||||
DistMatrixOpsTest,
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -37,6 +37,7 @@ from torch.distributed.tensor._ops.utils import (
|
||||
from torch.distributed.tensor.debug import CommDebugMode
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
create_local_tensor_test_class,
|
||||
DTensorOpTestBase,
|
||||
DTensorTestBase,
|
||||
with_comms,
|
||||
@ -644,5 +645,15 @@ class TestStrategyHashing(DTensorTestBase):
|
||||
self.assertEqual(out1.full_tensor(), out2.full_tensor())
|
||||
|
||||
|
||||
DistTensorReplicateStrategyRegistrationTestWithLocalTensor = (
|
||||
create_local_tensor_test_class(
|
||||
DistTensorReplicateStrategyRegistrationTest,
|
||||
)
|
||||
)
|
||||
|
||||
TestStrategyHashingWithLocalTensor = create_local_tensor_test_class(
|
||||
TestStrategyHashing,
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -14,6 +14,7 @@ from torch.distributed.tensor import (
|
||||
)
|
||||
from torch.testing._internal.common_utils import run_tests
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
create_local_tensor_test_class,
|
||||
DTensorTestBase,
|
||||
MLPModule,
|
||||
with_comms,
|
||||
@ -716,5 +717,9 @@ class TestDTensorOptimizer(DTensorTestBase):
|
||||
self._assert_optimizer(None, mod, opt, mod_copy, dist_opt, inp)
|
||||
|
||||
|
||||
TestDTensorOptimizerWithLocalTensor = create_local_tensor_test_class(
|
||||
TestDTensorOptimizer,
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
88
test/distributed/tensor/test_placement_types.py
Normal file
88
test/distributed/tensor/test_placement_types.py
Normal file
@ -0,0 +1,88 @@
|
||||
# Owner(s): ["oncall: distributed"]
|
||||
import copy
|
||||
import itertools
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
from torch._dynamo.variables.distributed import PlacementClassVariable
|
||||
from torch.distributed.tensor.placement_types import (
|
||||
_StridedShard,
|
||||
Partial,
|
||||
Replicate,
|
||||
Shard,
|
||||
)
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
|
||||
|
||||
# Basic functionality test for Placement types.
|
||||
class PlacementTypesTestCase(TestCase):
|
||||
def test_type_identification(self):
|
||||
shard = Shard(3)
|
||||
strided_shard = _StridedShard(dim=3, split_factor=7)
|
||||
partial_sum = Partial("sum")
|
||||
partial_max = Partial("max")
|
||||
replicate = Replicate()
|
||||
|
||||
ident_tests = (
|
||||
(shard, True, False, False),
|
||||
(strided_shard, True, False, False),
|
||||
(partial_sum, False, True, False),
|
||||
(partial_max, False, True, False),
|
||||
(replicate, False, False, True),
|
||||
)
|
||||
for do_deepcopy in (False, True):
|
||||
for placement, is_shard, is_partial, is_replicate in ident_tests:
|
||||
if do_deepcopy:
|
||||
placement = copy.deepcopy(placement)
|
||||
self.assertEqual(placement.is_shard(), is_shard)
|
||||
self.assertEqual(placement.is_partial(), is_partial)
|
||||
self.assertEqual(placement.is_replicate(), is_replicate)
|
||||
|
||||
def test_equality(self):
|
||||
equivalence_classes = (
|
||||
(Shard(3), _StridedShard(dim=3, split_factor=7)),
|
||||
(Shard(4), _StridedShard(dim=4, split_factor=9)),
|
||||
(Replicate(),),
|
||||
(Partial("sum"),),
|
||||
(Partial("max"),),
|
||||
)
|
||||
for eq_class in equivalence_classes:
|
||||
# Each item in the equivalence class should be equal to every other item in
|
||||
# its class.
|
||||
for lhs, rhs in itertools.product(eq_class, eq_class):
|
||||
self.assertEqual(lhs, rhs)
|
||||
|
||||
# Each item in the equivalence class should not be equal to any item in any
|
||||
# other class.
|
||||
for other_class in equivalence_classes:
|
||||
if other_class is eq_class:
|
||||
continue
|
||||
for lhs, rhs in itertools.product(eq_class, other_class):
|
||||
self.assertNotEqual(lhs, rhs)
|
||||
|
||||
# Testing this case doesn't seem to fit neatly into the above equivalence class
|
||||
# framework.
|
||||
self.assertNotEqual(
|
||||
_StridedShard(dim=3, split_factor=1), _StridedShard(dim=3, split_factor=2)
|
||||
)
|
||||
|
||||
@unittest.skipIf(
|
||||
sys.version_info < (3, 10), "kw_only is only available in python >= 3.10"
|
||||
)
|
||||
def test_strided_shard_kwonly_argument(self):
|
||||
with self.assertRaises(TypeError):
|
||||
_StridedShard(3, 4)
|
||||
_StridedShard(3, split_factor=4)
|
||||
|
||||
def test_strided_shard_isinstance_shard(self):
|
||||
assert isinstance(_StridedShard(dim=3, split_factor=7), Shard)
|
||||
|
||||
def test_dynamo_can_identify_placement_classes(self):
|
||||
for cls in (Replicate, Shard, _StridedShard, Partial):
|
||||
self.assertTrue(
|
||||
PlacementClassVariable.is_placement_type(cls), msg=f"failed on {cls}"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
@ -511,7 +511,7 @@ class DistTensorOpsTest(DTensorTestBase):
|
||||
# case 2 input sharding: input sharded, index replicated, output mask partial
|
||||
# only works when index has size 1 on the gather dimension and
|
||||
# input is sharded on the gather dimension
|
||||
from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
|
||||
from torch.distributed.tensor.placement_types import MaskPartial
|
||||
|
||||
gather_dim = 1
|
||||
global_input = torch.randn(12, 8, 16)
|
||||
@ -522,7 +522,7 @@ class DistTensorOpsTest(DTensorTestBase):
|
||||
with comm_mode:
|
||||
output_dt = torch.gather(input_dt, gather_dim, index_dt)
|
||||
self.assertEqual(comm_mode.get_total_counts(), 0)
|
||||
self.assertIsInstance(output_dt.placements[0], _MaskPartial)
|
||||
self.assertIsInstance(output_dt.placements[0], MaskPartial)
|
||||
self.assertEqual(output_dt.full_tensor(), global_output)
|
||||
|
||||
# case 3 index sharding: input replicated, index sharded, output sharded
|
||||
|
||||
@ -4901,7 +4901,7 @@ class NCCLTraceTest(NCCLTraceTestBase):
|
||||
for p2p_op_idx, input_sizes in zip(
|
||||
range(first_op, coalesced_op, 1), op_sizes_per_coalesce
|
||||
):
|
||||
# the indivudal ops inside the coalescing group the individual op metadata,
|
||||
# the individual ops inside the coalescing group the individual op metadata,
|
||||
# but not the timing info coming from the actual coalesced kernel
|
||||
profiling_name = (
|
||||
"nccl:recv 0<-1" if self.rank == 0 else "nccl:send 1->0"
|
||||
|
||||
@ -106,6 +106,30 @@ class NVSHMEMSymmetricMemoryTest(MultiProcContinuousTest):
|
||||
torch.ops.symm_mem.nvshmem_broadcast(tensor, src_rank, group_name)
|
||||
self.assertEqual(tensor, torch.arange(numel, dtype=dtype, device=self.device))
|
||||
|
||||
@skipIfRocm
|
||||
def test_mempool_tensor_w_collective(self) -> None:
|
||||
"""
|
||||
Test the effectiveness of MemPool on tensor factory ops.
|
||||
"""
|
||||
self._init_device()
|
||||
group_name = dist.group.WORLD.group_name
|
||||
symm_mem.enable_symm_mem_for_group(group_name)
|
||||
|
||||
dtype = torch.float
|
||||
numel = 1024
|
||||
|
||||
allocator = symm_mem.get_mempool_allocator(self.device)
|
||||
mempool = torch.cuda.MemPool(allocator)
|
||||
|
||||
with torch.cuda.use_mem_pool(mempool):
|
||||
tensor = torch.ones(numel, dtype=dtype, device=self.device)
|
||||
|
||||
symm_mem.rendezvous(tensor, group=group_name)
|
||||
dist.all_reduce(tensor)
|
||||
self.assertEqual(
|
||||
tensor, torch.ones(numel, dtype=dtype, device=self.device) * self.world_size
|
||||
)
|
||||
|
||||
@skipIfRocm
|
||||
def test_mempool_compute_ops(self) -> None:
|
||||
"""
|
||||
@ -374,7 +398,7 @@ class NVSHMEMAll2AllTest(MultiProcContinuousTest):
|
||||
nsplits, dtype=torch.int64, device=self.device
|
||||
).copy_(inp_splits)
|
||||
# 2 rows: output splits, output offsets
|
||||
# Initiallizing all values to -1 to check if they are updated
|
||||
# Initializing all values to -1 to check if they are updated
|
||||
out_splits_offsets = symm_mem.empty(
|
||||
(2, nsplits), dtype=torch.int64, device=self.device
|
||||
).fill_(-1)
|
||||
@ -479,7 +503,7 @@ class NVSHMEMAll2AllTest(MultiProcContinuousTest):
|
||||
(2, nsplits), dtype=torch.int64, device=self.device
|
||||
)
|
||||
# 2 rows: output splits, output offsets
|
||||
# Initiallizing all values to -1 to check if they are updated
|
||||
# Initializing all values to -1 to check if they are updated
|
||||
out_splits_offsets = symm_mem.empty(
|
||||
(2, nsplits), dtype=torch.int64, device=self.device
|
||||
).fill_(-1)
|
||||
@ -593,7 +617,7 @@ def dispatch_then_combine(device, align: int, group) -> None:
|
||||
inp_splits
|
||||
)
|
||||
# 2 rows: output splits, output offsets
|
||||
# Initiallizing all values to -1 to check if they are updated
|
||||
# Initializing all values to -1 to check if they are updated
|
||||
out_splits_offsets = symm_mem.empty(
|
||||
(2, nsplits), dtype=torch.int64, device=device
|
||||
).fill_(-1)
|
||||
@ -601,7 +625,7 @@ def dispatch_then_combine(device, align: int, group) -> None:
|
||||
# Buffers for combine
|
||||
combine_out = symm_mem.empty(max_out_numel, dtype=dtype, device=device).fill_(-1)
|
||||
# 2 rows: output splits, output offsets
|
||||
# Initiallizing all values to -1 to check if they are updated
|
||||
# Initializing all values to -1 to check if they are updated
|
||||
combine_out_splits_offsets = symm_mem.empty(
|
||||
(2, nsplits), dtype=torch.int64, device=device
|
||||
).fill_(-1)
|
||||
|
||||
@ -274,11 +274,12 @@ class SymmetricMemoryTest(MultiProcContinuousTest):
|
||||
self.assertTrue(buf.eq(peer_rank + world.size() // 2).all())
|
||||
|
||||
|
||||
# We move AsyncTP tests to a seperate test suite because 1) Async TP ops are not
|
||||
# We move AsyncTP tests to a separate test suite because 1) Async TP ops are not
|
||||
# the core symmetric memory APIs, they are more like applications, 2)
|
||||
# MultiProcContinuousTest will skip all the following tests if a test fails (
|
||||
# we should fix this too). We still want to get the test signals for the core
|
||||
# symmetric memory APIs when Async TP ops fail.
|
||||
@skip_if_rocm_multiprocess # AsyncTP is not yet supported on ROCm
|
||||
@instantiate_parametrized_tests
|
||||
@requires_cuda_p2p_access()
|
||||
class AsyncTPTest(MultiProcContinuousTest):
|
||||
@ -620,7 +621,7 @@ class AsyncTPTest(MultiProcContinuousTest):
|
||||
|
||||
# [READ ME FIRST]
|
||||
# The `SymmMemEmptySetDeviceTest` suite parameterizes whether user sets the
|
||||
# device before calling symm_mem.emtpy. Either way should work.
|
||||
# device before calling symm_mem.empty. Either way should work.
|
||||
# However, since `set_device` is persistent, we cannot use the
|
||||
# `MultiProcContinuousTest` template because the next function will be
|
||||
# "contaminated", leading to flaky tests (e.g. hang). Therefore, we use
|
||||
|
||||
@ -51,7 +51,7 @@ nan
|
||||
>>> INF / INF
|
||||
nan
|
||||
|
||||
However unambigous operations with inf return inf:
|
||||
However unambiguous operations with inf return inf:
|
||||
>>> INF * INF
|
||||
inf
|
||||
>>> 1.5 * INF
|
||||
|
||||
@ -1711,7 +1711,7 @@ class TestBasicOps(__TestCase):
|
||||
t3 = tnew(t1)
|
||||
self.assertTrue(list(t1) == list(t2) == list(t3) == list('abc'))
|
||||
|
||||
# test that tee objects are weak referencable
|
||||
# test that tee objects are weak referenceable
|
||||
a, b = tee(range(10))
|
||||
p = weakref.proxy(a)
|
||||
self.assertEqual(getattr(p, '__class__'), type(b))
|
||||
@ -2243,7 +2243,7 @@ class TestPurePythonRoughEquivalents(__TestCase):
|
||||
t3 = tnew(t1)
|
||||
self.assertTrue(list(t1) == list(t2) == list(t3) == list('abc'))
|
||||
|
||||
# test that tee objects are weak referencable
|
||||
# test that tee objects are weak referenceable
|
||||
a, b = tee(range(10))
|
||||
p = weakref.proxy(a)
|
||||
self.assertEqual(getattr(p, '__class__'), type(b))
|
||||
|
||||
@ -153,7 +153,9 @@ def _get_custom_policy(no_recompute_list=None, must_recompute_list=None):
|
||||
return _custom_policy
|
||||
|
||||
|
||||
class ActivationCheckpointingViaTagsTests(torch._dynamo.test_case.TestCase):
|
||||
class ActivationCheckpointingViaTagsTests(
|
||||
torch._dynamo.test_case.TestCaseWithNestedGraphBreaks
|
||||
):
|
||||
def _validate(
|
||||
self,
|
||||
fn,
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user