Compare commits


2 Commits

45715eb46e  debugging cudnn numerics  (2025-10-29 14:10:53 -07:00)
    ghstack-source-id: 460fd38569b797bdd607f6672aa16f35177aa5c8
    Pull Request resolved: https://github.com/pytorch/pytorch/pull/164950

22c7937326  bwd pass  (2025-10-29 14:10:52 -07:00)
    ghstack-source-id: 563ff6899659ecced546e3723410732f5fc2878f
    Pull Request resolved: https://github.com/pytorch/pytorch/pull/164504
38 changed files with 2063 additions and 2511 deletions

View File

@ -129,7 +129,7 @@ function install_129 {
}
function install_128 {
CUDNN_VERSION=9.8.0.87
CUDNN_VERSION=9.10.2.21
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 12.8.1 in the same container
install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux
@ -150,7 +150,7 @@ function install_130 {
CUDNN_VERSION=9.13.0.50
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 13.0 in the same container
install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
install_cudnn 13 $CUDNN_VERSION

View File

@ -100,8 +100,6 @@ COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
# Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True
ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS}
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

View File

@ -272,6 +272,18 @@ def smoke_test_cuda(
torch_cudnn_version = cudnn_to_version_str(torch.backends.cudnn.version())
print(f"Torch cuDNN version: {torch_cudnn_version}")
torch_cudnn_compile_version = torch._C._cudnn.getCompileVersion()
print(f"Torch cuDNN compile-time version: {torch_cudnn_compile_version}")
torch_cudnn_runtime_version = tuple(
[int(x) for x in torch_cudnn_version.split(".")]
)
if torch_cudnn_runtime_version != torch_cudnn_compile_version:
raise RuntimeError(
"cuDNN runtime version doesn't match comple version. "
f"Loaded: {torch_cudnn_runtime_version} "
f"Expected: {torch_cudnn_compile_version}"
)
if sys.platform in ["linux", "linux2"]:
torch_nccl_version = ".".join(str(v) for v in torch.cuda.nccl.version())
print(f"Torch nccl; version: {torch_nccl_version}")

View File

@ -460,18 +460,28 @@ test_inductor_shard() {
--verbose
}
test_inductor_aoti_cpp() {
test_inductor_aoti() {
# docker build uses bdist_wheel which does not work with test_aot_inductor
# TODO: need a faster way to build
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
# We need to hipify before building again
python3 tools/amd_build/build_amd.py
fi
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
else
BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
fi
# aoti cmake custom command requires `torch` to be installed
# initialize the cmake build cache and install torch
/usr/bin/env "${BUILD_COMMAND[@]}"
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
}
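For readers who want the sequence without the shell plumbing, the new function boils down to a two-phase build followed by the C++ test run. A rough, illustration-only Python rendering of that sequence (not the CI code itself; the command and environment variable names are taken from the diff above):

```python
# Rough sketch of what test_inductor_aoti does, expressed as subprocess calls.
# Illustrative only; the CI shell function above is the source of truth.
import os
import subprocess

build_cmd = ["python", "-m", "pip", "install", "--no-build-isolation", "-v", "-e", "."]

# Phase 1: plain editable install, so the AOTI cmake custom command can import torch.
subprocess.run(build_cmd, check=True)

# Phase 2: rebuild with a fresh CMake cache and the AOTI test targets enabled.
env = dict(os.environ, CMAKE_FRESH="1", BUILD_AOT_INDUCTOR_TEST="1")
subprocess.run(build_cmd, check=True, env=env)

# Phase 3: run the C++ AOTI tests through run_test.py.
subprocess.run(
    [
        "python", "test/run_test.py", "--cpp", "--verbose",
        "-i", "cpp/test_aoti_abi_check", "cpp/test_aoti_inference",
        "cpp/test_vec_half_AVX2", "-dist=loadfile",
    ],
    check=True,
)
```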
@ -1766,7 +1776,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti_cpp
test_inductor_aoti
fi
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
install_torchvision

View File

@ -7,9 +7,12 @@ if "%DESIRED_PYTHON%" == "3.13t" (
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
set PYTHON_EXEC="python3.13t"
) else if "%DESIRED_PYTHON%"=="3.14" (
echo Python version is set to 3.14 or 3.14t
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
) else if "%DESIRED_PYTHON%"=="3.14t" (
echo Python version is set to 3.14 or 3.14t
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe"
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
set PYTHON_EXEC="python3.14t"
) else (

View File

@ -1,8 +1,3 @@
---
name: docstring
description: Write docstrings for PyTorch functions and methods following PyTorch conventions. Use when writing or updating docstrings in PyTorch code.
---
# PyTorch Docstring Writing Guide
This skill describes how to write docstrings for functions and methods in the PyTorch project, following the conventions in `torch/_tensor_docs.py` and `torch/nn/functional.py`.

View File

@ -1,385 +0,0 @@
---
name: skill-writer
description: Guide users through creating Agent Skills for Claude Code. Use when the user wants to create, write, author, or design a new Skill, or needs help with SKILL.md files, frontmatter, or skill structure.
---
# Skill Writer
This Skill helps you create well-structured Agent Skills for Claude Code that follow best practices and validation requirements.
## When to use this Skill
Use this Skill when:
- Creating a new Agent Skill
- Writing or updating SKILL.md files
- Designing skill structure and frontmatter
- Troubleshooting skill discovery issues
- Converting existing prompts or workflows into Skills
## Instructions
### Step 1: Determine Skill scope
First, understand what the Skill should do:
1. **Ask clarifying questions**:
- What specific capability should this Skill provide?
- When should Claude use this Skill?
- What tools or resources does it need?
- Is this for personal use or team sharing?
2. **Keep it focused**: One Skill = one capability
- Good: "PDF form filling", "Excel data analysis"
- Too broad: "Document processing", "Data tools"
### Step 2: Choose Skill location
Determine where to create the Skill:
**Personal Skills** (`~/.claude/skills/`):
- Individual workflows and preferences
- Experimental Skills
- Personal productivity tools
**Project Skills** (`.claude/skills/`):
- Team workflows and conventions
- Project-specific expertise
- Shared utilities (committed to git)
### Step 3: Create Skill structure
Create the directory and files:
```bash
# Personal
mkdir -p ~/.claude/skills/skill-name
# Project
mkdir -p .claude/skills/skill-name
```
For multi-file Skills:
```
skill-name/
├── SKILL.md (required)
├── reference.md (optional)
├── examples.md (optional)
├── scripts/
│ └── helper.py (optional)
└── templates/
└── template.txt (optional)
```
### Step 4: Write SKILL.md frontmatter
Create YAML frontmatter with required fields:
```yaml
---
name: skill-name
description: Brief description of what this does and when to use it
---
```
**Field requirements**:
- **name**:
- Lowercase letters, numbers, hyphens only
- Max 64 characters
- Must match directory name
- Good: `pdf-processor`, `git-commit-helper`
- Bad: `PDF_Processor`, `Git Commits!`
- **description**:
- Max 1024 characters
- Include BOTH what it does AND when to use it
- Use specific trigger words users would say
- Mention file types, operations, and context
**Optional frontmatter fields**:
- **allowed-tools**: Restrict tool access (comma-separated list)
```yaml
allowed-tools: Read, Grep, Glob
```
Use for:
- Read-only Skills
- Security-sensitive workflows
- Limited-scope operations
### Step 5: Write effective descriptions
The description is critical for Claude to discover your Skill.
**Formula**: `[What it does] + [When to use it] + [Key triggers]`
**Examples**:
✅ **Good**:
```yaml
description: Extract text and tables from PDF files, fill forms, merge documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction.
```
✅ **Good**:
```yaml
description: Analyze Excel spreadsheets, create pivot tables, and generate charts. Use when working with Excel files, spreadsheets, or analyzing tabular data in .xlsx format.
```
❌ **Too vague**:
```yaml
description: Helps with documents
description: For data analysis
```
**Tips**:
- Include specific file extensions (.pdf, .xlsx, .json)
- Mention common user phrases ("analyze", "extract", "generate")
- List concrete operations (not generic verbs)
- Add context clues ("Use when...", "For...")
### Step 6: Structure the Skill content
Use clear Markdown sections:
```markdown
# Skill Name
Brief overview of what this Skill does.
## Quick start
Provide a simple example to get started immediately.
## Instructions
Step-by-step guidance for Claude:
1. First step with clear action
2. Second step with expected outcome
3. Handle edge cases
## Examples
Show concrete usage examples with code or commands.
## Best practices
- Key conventions to follow
- Common pitfalls to avoid
- When to use vs. not use
## Requirements
List any dependencies or prerequisites:
```bash
pip install package-name
```
## Advanced usage
For complex scenarios, see [reference.md](reference.md).
```
### Step 7: Add supporting files (optional)
Create additional files for progressive disclosure:
**reference.md**: Detailed API docs, advanced options
**examples.md**: Extended examples and use cases
**scripts/**: Helper scripts and utilities
**templates/**: File templates or boilerplate
Reference them from SKILL.md:
```markdown
For advanced usage, see [reference.md](reference.md).
Run the helper script:
\`\`\`bash
python scripts/helper.py input.txt
\`\`\`
```
### Step 8: Validate the Skill
Check these requirements:
✅ **File structure**:
- [ ] SKILL.md exists in correct location
- [ ] Directory name matches frontmatter `name`
✅ **YAML frontmatter**:
- [ ] Opening `---` on line 1
- [ ] Closing `---` before content
- [ ] Valid YAML (no tabs, correct indentation)
- [ ] `name` follows naming rules
- [ ] `description` is specific and < 1024 chars
✅ **Content quality**:
- [ ] Clear instructions for Claude
- [ ] Concrete examples provided
- [ ] Edge cases handled
- [ ] Dependencies listed (if any)
✅ **Testing**:
- [ ] Description matches user questions
- [ ] Skill activates on relevant queries
- [ ] Instructions are clear and actionable
### Step 9: Test the Skill
1. **Restart Claude Code** (if running) to load the Skill
2. **Ask relevant questions** that match the description:
```
Can you help me extract text from this PDF?
```
3. **Verify activation**: Claude should use the Skill automatically
4. **Check behavior**: Confirm Claude follows the instructions correctly
### Step 10: Debug if needed
If Claude doesn't use the Skill:
1. **Make description more specific**:
- Add trigger words
- Include file types
- Mention common user phrases
2. **Check file location**:
```bash
ls ~/.claude/skills/skill-name/SKILL.md
ls .claude/skills/skill-name/SKILL.md
```
3. **Validate YAML**:
```bash
cat SKILL.md | head -n 10
```
4. **Run debug mode**:
```bash
claude --debug
```
## Common patterns
### Read-only Skill
```yaml
---
name: code-reader
description: Read and analyze code without making changes. Use for code review, understanding codebases, or documentation.
allowed-tools: Read, Grep, Glob
---
```
### Script-based Skill
```yaml
---
name: data-processor
description: Process CSV and JSON data files with Python scripts. Use when analyzing data files or transforming datasets.
---
# Data Processor
## Instructions
1. Use the processing script:
\`\`\`bash
python scripts/process.py input.csv --output results.json
\`\`\`
2. Validate output with:
\`\`\`bash
python scripts/validate.py results.json
\`\`\`
```
### Multi-file Skill with progressive disclosure
```yaml
---
name: api-designer
description: Design REST APIs following best practices. Use when creating API endpoints, designing routes, or planning API architecture.
---
# API Designer
Quick start: See [examples.md](examples.md)
Detailed reference: See [reference.md](reference.md)
## Instructions
1. Gather requirements
2. Design endpoints (see examples.md)
3. Document with OpenAPI spec
4. Review against best practices (see reference.md)
```
## Best practices for Skill authors
1. **One Skill, one purpose**: Don't create mega-Skills
2. **Specific descriptions**: Include trigger words users will say
3. **Clear instructions**: Write for Claude, not humans
4. **Concrete examples**: Show real code, not pseudocode
5. **List dependencies**: Mention required packages in description
6. **Test with teammates**: Verify activation and clarity
7. **Version your Skills**: Document changes in content
8. **Use progressive disclosure**: Put advanced details in separate files
## Validation checklist
Before finalizing a Skill, verify:
- [ ] Name is lowercase, hyphens only, max 64 chars
- [ ] Description is specific and < 1024 chars
- [ ] Description includes "what" and "when"
- [ ] YAML frontmatter is valid
- [ ] Instructions are step-by-step
- [ ] Examples are concrete and realistic
- [ ] Dependencies are documented
- [ ] File paths use forward slashes
- [ ] Skill activates on relevant queries
- [ ] Claude follows instructions correctly
## Troubleshooting
**Skill doesn't activate**:
- Make description more specific with trigger words
- Include file types and operations in description
- Add "Use when..." clause with user phrases
**Multiple Skills conflict**:
- Make descriptions more distinct
- Use different trigger words
- Narrow the scope of each Skill
**Skill has errors**:
- Check YAML syntax (no tabs, proper indentation)
- Verify file paths (use forward slashes)
- Ensure scripts have execute permissions
- List all dependencies
## Examples
See the documentation for complete examples:
- Simple single-file Skill (commit-helper)
- Skill with tool permissions (code-reviewer)
- Multi-file Skill (pdf-processing)
## Output format
When creating a Skill, I will:
1. Ask clarifying questions about scope and requirements
2. Suggest a Skill name and location
3. Create the SKILL.md file with proper frontmatter
4. Include clear instructions and examples
5. Add supporting files if needed
6. Provide testing instructions
7. Validate against all requirements
The result will be a complete, working Skill that follows all best practices and validation rules.

View File

@ -22,7 +22,7 @@ CUDA_ARCHES_FULL_VERSION = {
"12.6": "12.6.3",
"12.8": "12.8.1",
"12.9": "12.9.1",
"13.0": "13.0.2",
"13.0": "13.0.0",
}
CUDA_ARCHES_CUDNN_VERSION = {
"12.6": "9",
@ -96,21 +96,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
),
"13.0": (
"nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | "
"nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | "
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
"nvidia-cublas==13.1.0.3; platform_system == 'Linux' | "
"nvidia-cufft==12.0.0.61; platform_system == 'Linux' | "
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
"nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
"nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | "
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx==13.0.85; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | "
"nvidia-cufile==1.15.1.6; platform_system == 'Linux'"
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
"nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
),
"xpu": (
"intel-cmplr-lib-rt==2025.2.1 | "

View File

@ -270,7 +270,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -519,7 +519,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -768,7 +768,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1017,7 +1017,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1266,7 +1266,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1515,7 +1515,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1764,7 +1764,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -325,7 +325,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda13_0-test: # Testing
@ -991,7 +991,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda13_0-test: # Testing
@ -1657,7 +1657,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing
@ -2323,7 +2323,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda13_0-test: # Testing
@ -2989,7 +2989,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda13_0-test: # Testing
@ -3655,7 +3655,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda13_0-test: # Testing
@ -4321,7 +4321,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda13_0-test: # Testing

View File

@ -1,4 +1,4 @@
![PyTorch Logo](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/pytorch-logo-dark.png)
![PyTorch Logo](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/pytorch-logo-dark.png)
--------------------------------------------------------------------------------
@ -72,7 +72,7 @@ Elaborating Further:
If you use NumPy, then you have used Tensors (a.k.a. ndarray).
![Tensor illustration](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/tensor_illustration.png)
![Tensor illustration](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/tensor_illustration.png)
PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the
computation by a huge amount.
@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc
While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
You get the best of speed and flexibility for your crazy research.
![Dynamic graph](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif)
![Dynamic graph](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/dynamic_graph.gif)
### Python First

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,171 +0,0 @@
#pragma once
#include <ATen/core/Tensor.h>
namespace at::native {
using at::blas::ScalingType;
using at::blas::SwizzleType;
namespace {
// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492
c10::MaybeOwned<Tensor> inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) {
if (resolve_conj && tensor.is_conj()) {
return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj());
} else {
return c10::MaybeOwned<Tensor>::borrowed(tensor);
}
}
c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) {
if (tensor.is_non_overlapping_and_dense()) { // common case
transpose_tensor = tensor.is_contiguous();
return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor);
}
IntArrayRef tensor_strides = tensor.strides();
IntArrayRef tensor_sizes = tensor.sizes();
if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
transpose_tensor = false;
return resolve_conj_if_indicated(tensor, !transpose_result);
} else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
transpose_tensor = true;
return resolve_conj_if_indicated(tensor, transpose_result);
} else {
transpose_tensor = true;
return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
}
}
c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) {
if (tensor.is_non_overlapping_and_dense()) { // common case
transpose_tensor = tensor.is_contiguous();
return resolve_conj_if_indicated(tensor, true);
}
IntArrayRef tensor_strides = tensor.strides();
IntArrayRef tensor_sizes = tensor.sizes();
if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
transpose_tensor = false;
return resolve_conj_if_indicated(tensor, true);
} else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
transpose_tensor = true;
return resolve_conj_if_indicated(tensor, true);
} else {
transpose_tensor = true;
return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
}
}
} // namespace
/**
* @brief Prepares matrices for CUBLAS operation
*
* This constructor prepares tensors for CUBLAS
* The main difference is that PyTorch uses row-major as the default and
* CUBLAS expects column-major.
*
* @details
* To enable row-major output while using CUBLAS,
* we use the mathematical identity that (A × B)^T = B^T × A^T.
*
* Transpose in this context refers to cuBLAS's (Fortran) definition of transpose (row-major)
* T = row-major, N = col-major
*
* Example:
* For matrices A (M×K)(row-major) and B (K×N)(row-major):
* - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major)
* - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N)
* - However, since the output from cuBLAS is column-major, this is
* - equivalent to an output of size MxN row-major as expected
*
* The transpose flags are derived from the layouts of the passed in tensors
*
* If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted
* to their unpacked values to match what cuBLAS expects.
*
* @param mat1 First input matrix
* @param mat2 Second input matrix
* @param c Output matrix (result)
* @param scale_a Optional scaling factor for first matrix
* @param scale_b Optional scaling factor for second matrix
* @param scale_result Optional scaling factor for result
*/
struct cublasCommonArgs {
cublasCommonArgs(
const Tensor& mat1,
const Tensor& mat2,
Tensor& c,
const std::optional<Tensor>& scale_a = std::nullopt,
const std::optional<Tensor>& scale_b = std::nullopt,
const std::optional<Tensor>& scale_result = std::nullopt,
const std::optional<ScalingType>& scaling_choice_a = std::nullopt,
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
bool transpose_result = false, transpose_a = false, transpose_b = false;
result = prepare_matrix_for_cublas(c, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
// Handle scale tensors if provided
if (scale_a && scale_b) {
// By default since we return in row-major we run the gemm
// as B.T @ A.T, check transpose_result to determine if we flip the scales
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
}
if (scale_result) {
scale_result_ptr = scale_result->data_ptr();
scale_result_dtype = scale_result->scalar_type();
}
// Update transpose flags
if (transpose_result) {
transpose_a = !transpose_a;
transpose_b = !transpose_b;
}
auto sizes_a = mata->sizes();
auto sizes_b = matb->sizes();
m = sizes_a[transpose_result ? 1 : 0];
k = sizes_a[transpose_result ? 0 : 1];
n = sizes_b[transpose_result ? 0 : 1];
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
result_ld = result->stride(transpose_result ? 0 : 1);
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
// if the gemm operands are in packed float4
if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) {
k = k * 2;
lda = lda * 2;
ldb = ldb * 2;
}
}
// Matrix members
char transa, transb;
int64_t m, n, k;
int64_t lda, ldb, result_ld;
c10::MaybeOwned<Tensor> mata, matb, result;
// Scale members
void* scale_mata_ptr = nullptr;
void* scale_matb_ptr = nullptr;
void* scale_result_ptr = nullptr;
std::optional<c10::ScalarType> scale_mata_dtype;
std::optional<ScalingType> scaling_mata_type;
std::optional<c10::ScalarType> scale_matb_dtype;
std::optional<ScalingType> scaling_matb_type;
std::optional<c10::ScalarType> scale_result_dtype;
};
} // namespace at::native
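The layout identity that the removed header's comment relies on can be demonstrated concretely. The following is an illustration-only sketch; NumPy is used here purely as a stand-in for a column-major GEMM and is an assumption of this example, not something the header uses:

```python
# Demonstration of the trick described in the comment above: computing
# B^T @ A^T with a column-major GEMM and reading the N x M column-major result
# buffer back as a row-major M x N matrix yields A @ B.
# NumPy is only a stand-in for cuBLAS; this is not PyTorch code.
import numpy as np

M, K, N = 3, 4, 5
A = np.arange(M * K, dtype=np.float32).reshape(M, K)  # row-major M x K
B = np.arange(K * N, dtype=np.float32).reshape(K, N)  # row-major K x N

reference = A @ B  # the row-major M x N result we want

# "Column-major GEMM" output: B^T @ A^T is N x M; serialize it column by column.
col_major_bytes = (B.T @ A.T).tobytes(order="F")

# Reinterpreting that buffer as a row-major M x N matrix gives exactly A @ B.
reinterpreted = np.frombuffer(col_major_bytes, dtype=np.float32).reshape(M, N)

assert np.allclose(reference, reinterpreted)
```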

View File

@ -57,7 +57,6 @@ Tensor& random_mps_impl(Tensor& self,
if (self.numel() == 0) {
return self;
}
at::assert_no_internal_overlap(self);
// MPS random is broken for 5D+ tensors, see https://github.com/pytorch/pytorch/issues/147624
const auto need_reshape = self.ndimension() > 4;
auto mps_gen = get_generator_or_default<MPSGeneratorImpl>(gen, at::mps::detail::getDefaultMPSGenerator());
@ -154,16 +153,8 @@ Tensor& random_mps_impl(Tensor& self,
feeds[meanPlaceholder.getMPSGraphTensor()] = meanPlaceholder.getMPSGraphTensorData();
}
// Handle non-contiguous output tensors by creating a contiguous temporary
const auto needs_gather = needsGather(self);
Tensor self_ = needs_gather ? at::empty_like(self, MemoryFormat::Contiguous) : self;
Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self_);
Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self);
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
// Copy results back to original non-contiguous output
if (needs_gather) {
self.copy_(self_);
}
}
return self;

View File

@ -1358,15 +1358,9 @@ if(BUILD_TEST)
)
else()
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(${TORCH_ROOT}/test/cpp/lazy ${CMAKE_BINARY_DIR}/test_lazy)
# NativeRT is disabled
# add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_abi_check ${CMAKE_BINARY_DIR}/test_aoti_abi_check)
if(BUILD_AOT_INDUCTOR_TEST)
add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_inference ${CMAKE_BINARY_DIR}/test_aoti_inference)
endif()
if(USE_DISTRIBUTED)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
@ -1384,6 +1378,16 @@ if(BUILD_TEST)
${CMAKE_BINARY_DIR}/test_mobile_nnc
)
endif()
add_subdirectory(${TORCH_ROOT}/test/cpp/lazy
${CMAKE_BINARY_DIR}/test_lazy)
endif()
if(BUILD_AOT_INDUCTOR_TEST)
add_subdirectory(
${TORCH_ROOT}/test/cpp/aoti_abi_check
${CMAKE_BINARY_DIR}/test_aoti_abi_check)
add_subdirectory(
${TORCH_ROOT}/test/cpp/aoti_inference
${CMAKE_BINARY_DIR}/test_aoti_inference)
endif()
endif()

View File

@ -1,8 +1,3 @@
# Skip on windows
if(WIN32)
return()
endif()
set(AOTI_ABI_CHECK_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_abi_check)
# Build the cpp gtest binary containing the cpp-only tests.
@ -35,15 +30,8 @@ target_compile_definitions(test_aoti_abi_check PRIVATE USE_GTEST)
# WARNING: DO NOT LINK torch!!!
# The purpose is to check if the used aten/c10 headers are written in a header-only way
target_link_libraries(test_aoti_abi_check PRIVATE gtest_main sleef)
target_link_libraries(test_aoti_abi_check PRIVATE gtest_main)
target_include_directories(test_aoti_abi_check PRIVATE ${ATen_CPU_INCLUDE})
if(NOT USE_SYSTEM_SLEEF)
target_include_directories(test_aoti_abi_check PRIVATE ${CMAKE_BINARY_DIR}/include)
endif()
# Disable unused-variable warnings for variables that are only used to test compilation
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-variable)
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-but-set-variable)
foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
@ -53,17 +41,12 @@ foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
separate_arguments(FLAGS UNIX_COMMAND "${FLAGS}")
add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")
target_link_libraries(${test_name}_${CPU_CAPABILITY} PRIVATE gtest_main sleef)
target_link_libraries(${test_name}_${CPU_CAPABILITY} PRIVATE gtest_main)
target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE ${ATen_CPU_INCLUDE})
if(NOT USE_SYSTEM_SLEEF)
target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE ${CMAKE_BINARY_DIR}/include)
endif()
# Define CPU_CAPABILITY and CPU_CAPABILITY_XXX macros for conditional compilation
target_compile_definitions(${test_name}_${CPU_CAPABILITY} PRIVATE CPU_CAPABILITY=${CPU_CAPABILITY} CPU_CAPABILITY_${CPU_CAPABILITY})
target_compile_options(${test_name}_${CPU_CAPABILITY} PRIVATE ${FLAGS})
target_compile_options_if_supported(${test_name}_${CPU_CAPABILITY} -Wno-unused-variable)
target_compile_options_if_supported(${test_name}_${CPU_CAPABILITY} -Wno-unused-but-set-variable)
endforeach()
endforeach()

View File

@ -2,27 +2,10 @@
#include <ATen/cpu/vec/vec.h>
#include <iostream>
namespace torch {
namespace aot_inductor {
template <typename T>
void ExpectVecEqual(
const at::vec::Vectorized<T>& expected,
const at::vec::Vectorized<T>& actual) {
using Vec = at::vec::Vectorized<T>;
// Have to use std::vector for comparison because at::vec::Vectorized doesn't
// support operator[] on aarch64
std::vector<T> expected_data(Vec::size());
std::vector<T> actual_data(Vec::size());
expected.store(expected_data.data());
actual.store(actual_data.data());
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_data[i], actual_data[i]);
}
}
TEST(TestVec, TestAdd) {
using Vec = at::vec::Vectorized<int>;
std::vector<int> a(1024, 1);
@ -33,7 +16,9 @@ TEST(TestVec, TestAdd) {
std::vector<int> expected(1024, 3);
Vec expected_vec = Vec::loadu(expected.data());
ExpectVecEqual(expected_vec, actual_vec);
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
}
TEST(TestVec, TestMax) {
@ -45,7 +30,9 @@ TEST(TestVec, TestMax) {
Vec actual_vec = at::vec::maximum(a_vec, b_vec);
Vec expected_vec = b_vec;
ExpectVecEqual(expected_vec, actual_vec);
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
}
TEST(TestVec, TestMin) {
@ -57,7 +44,9 @@ TEST(TestVec, TestMin) {
Vec actual_vec = at::vec::minimum(a_vec, b_vec);
Vec expected_vec = a_vec;
ExpectVecEqual(expected_vec, actual_vec);
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
}
TEST(TestVec, TestConvert) {
@ -69,7 +58,9 @@ TEST(TestVec, TestConvert) {
auto actual_vec = at::vec::convert<float>(a_vec);
auto expected_vec = b_vec;
ExpectVecEqual(expected_vec, actual_vec);
for (int i = 0; i < at::vec::Vectorized<int>::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
}
TEST(TestVec, TestClampMin) {
@ -81,7 +72,9 @@ TEST(TestVec, TestClampMin) {
Vec actual_vec = at::vec::clamp_min(a_vec, min_vec);
Vec expected_vec = min_vec;
ExpectVecEqual(expected_vec, actual_vec);
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
}
} // namespace aot_inductor

View File

@ -1,3 +1,4 @@
set(AOT_INDUCTOR_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_inference)
# Build custom TorchScript op for AOTInductor
@ -7,12 +8,27 @@ set_target_properties(aoti_custom_class PROPERTIES
if(USE_CUDA)
target_compile_definitions(aoti_custom_class PRIVATE USE_CUDA)
elseif(USE_ROCM)
target_compile_definitions(aoti_custom_class PRIVATE USE_ROCM)
target_compile_definitions(aoti_custom_class PRIVATE USE_ROCM)
endif()
# Link against LibTorch
target_link_libraries(aoti_custom_class torch)
# the custom command that generates the TorchScript module
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/script_data.pt
${CMAKE_CURRENT_BINARY_DIR}/script_model_cpu.pt
${CMAKE_CURRENT_BINARY_DIR}/script_model_cuda.pt
# This script requires the torch package to be installed.
COMMAND python ${AOT_INDUCTOR_TEST_ROOT}/compile_model.py
DEPENDS torch torch_python aoti_custom_class ${AOT_INDUCTOR_TEST_ROOT}/compile_model.py
)
add_custom_target(aoti_script_model ALL
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_data.pt
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_model_cpu.pt
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_model_cuda.pt
)
add_dependencies(aoti_script_model aoti_custom_class)
# Build the cpp gtest binary containing the cpp-only tests.
set(INDUCTOR_TEST_SRCS
${AOT_INDUCTOR_TEST_ROOT}/test.cpp
@ -21,12 +37,23 @@ set(INDUCTOR_TEST_SRCS
add_executable(test_aoti_inference
${TORCH_ROOT}/test/cpp/common/main.cpp
${INDUCTOR_TEST_SRCS}
data.pt
script_data.pt
script_model_cpu.pt
script_model_cuda.pt
)
add_dependencies(test_aoti_inference aoti_custom_class)
add_dependencies(test_aoti_inference aoti_custom_class aoti_script_model)
# TODO temporary until we can delete the old gtest polyfills.
target_compile_definitions(test_aoti_inference PRIVATE USE_GTEST)
# Define a custom command to generate the library
add_custom_command(
OUTPUT data.pt
COMMAND python ${AOT_INDUCTOR_TEST_ROOT}/test.py
DEPENDS ${AOT_INDUCTOR_TEST_ROOT}/test.py
)
target_link_libraries(test_aoti_inference PRIVATE
torch
gtest_main
@ -44,10 +71,6 @@ target_compile_definitions(test_aoti_inference PRIVATE
CMAKE_CURRENT_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}
)
target_compile_options_if_supported(test_aoti_inference -Wno-unused-variable)
target_compile_options_if_supported(test_aoti_inference -Wno-unused-but-set-variable)
target_compile_options_if_supported(test_aoti_inference -Wno-unused-function)
if(INSTALL_TEST)
install(TARGETS test_aoti_inference DESTINATION bin)
# Install PDB files for MSVC builds

View File

@ -2,9 +2,7 @@
#include <gtest/gtest.h>
#include <atomic>
#include <condition_variable>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <functional>
#include <mutex>
#include <queue>
@ -30,64 +28,6 @@
namespace {
// Function to check if test data files exist and are valid
bool testDataFilesExist() {
std::string bindir = STRINGIZE(CMAKE_CURRENT_BINARY_DIR);
std::array<std::string, 4> required_files = {
"data.pt",
"script_data.pt",
"script_model_cpu.pt",
"script_model_cuda.pt"};
for (const auto& filename : required_files) {
std::string filepath = bindir + "/" + filename;
std::ifstream file(filepath);
if (!file.good()) {
return false;
}
}
return true;
}
// Function to ensure test data files are generated at runtime
void ensureTestDataGenerated() {
static std::once_flag generated_flag;
std::call_once(generated_flag, []() {
// Only generate if files don't exist or are placeholders
if (testDataFilesExist()) {
return;
}
std::string bindir = STRINGIZE(CMAKE_CURRENT_BINARY_DIR);
// Calculate path to source directory: build/test_aoti_inference -> build ->
// pytorch
std::string pytorch_root = bindir.substr(0, bindir.find_last_of("/"));
pytorch_root = pytorch_root.substr(0, pytorch_root.find_last_of("/"));
std::string source_dir = pytorch_root + "/test/cpp/aoti_inference";
// Generate test data files (data.pt, etc.) by running test.py directly
std::string test_script = source_dir + "/test.py";
std::string test_data_cmd = "cd " + bindir + " && python " + test_script;
std::cout << "Generating test data: " << test_data_cmd << std::endl;
int result1 = std::system(test_data_cmd.c_str());
if (result1 != 0) {
std::cerr << "Warning: Test data generation failed with code " << result1
<< std::endl;
}
// Generate model files (script_*.pt) by running compile_model.py directly
std::string compile_script = source_dir + "/compile_model.py";
std::string models_cmd = "cd " + bindir + " && python " + compile_script;
std::cout << "Generating model files: " << models_cmd << std::endl;
int result2 = std::system(models_cmd.c_str());
if (result2 != 0) {
std::cerr << "Warning: Model generation failed with code " << result2
<< std::endl;
}
});
}
const std::unordered_map<std::string, at::Tensor> derefTensorConstantMap(
torch::inductor::TensorConstantMap tensor_constant_map) {
std::unordered_map<std::string, at::Tensor> ret;
@ -915,6 +855,7 @@ void test_aoti_free_buffer(bool use_runtime_constant_folding) {
}
}
#if defined(USE_CUDA) || defined(USE_ROCM)
void test_cuda_alloc_test() {
torch::NoGradGuard no_grad;
@ -954,8 +895,8 @@ void test_cuda_alloc_test() {
runner->run(data_loader.attr(inputs_attr.c_str()).toTensorList().vec());
ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0]));
}
#endif
#ifdef USE_CUDA
class ThreadPool {
private:
struct Task {
@ -1096,96 +1037,86 @@ void test_multi_cuda_streams(const std::string& device) {
ASSERT_TRUE(torch::allclose(ref_output_tensors[0], all_outputs[i][0]));
}
}
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
#endif
} // namespace
namespace torch::aot_inductor {
// Test fixture that ensures test data is generated once for all tests
class AotInductorTest : public ::testing::Test {
public:
// This runs once before all tests in this test suite
static void SetUpTestSuite() {
ensureTestDataGenerated();
}
};
TEST_F(AotInductorTest, BasicTestCpu) {
TEST(AotInductorTest, BasicTestCpu) {
test_aoti("cpu", false);
}
TEST_F(AotInductorTest, BasicScriptTestCpu) {
TEST(AotInductorTest, BasicScriptTestCpu) {
test_aoti_script("cpu");
}
TEST_F(AotInductorTest, BasicPackageLoaderTestCpu) {
TEST(AotInductorTest, BasicPackageLoaderTestCpu) {
test_aoti_package_loader("cpu", false);
}
TEST_F(AotInductorTest, ExtractConstantsMapCpu) {
TEST(AotInductorTest, ExtractConstantsMapCpu) {
test_aoti_extract_constants_map("cpu");
}
#ifdef USE_CUDA
TEST_F(AotInductorTest, BasicTestCuda) {
TEST(AotInductorTest, BasicTestCuda) {
test_aoti("cuda", true);
test_aoti("cuda", false);
}
TEST_F(AotInductorTest, BasicScriptTestCuda) {
TEST(AotInductorTest, BasicScriptTestCuda) {
test_aoti_script("cuda");
}
TEST_F(AotInductorTest, BasicPackageLoaderTestCuda) {
TEST(AotInductorTest, BasicPackageLoaderTestCuda) {
test_aoti_package_loader("cuda", false);
}
TEST_F(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) {
TEST(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) {
test_aoti_package_loader_multi_gpu("cuda", false);
}
TEST_F(AotInductorTest, UpdateUserManagedConstantsCuda) {
TEST(AotInductorTest, UpdateUserManagedConstantsCuda) {
test_aoti_user_managed_buffer();
}
TEST_F(AotInductorTest, RuntimeUpdateConstantsCuda) {
TEST(AotInductorTest, RuntimeUpdateConstantsCuda) {
test_aoti_constants_update("cuda", true);
}
TEST_F(AotInductorTest, UpdateConstantsCuda) {
TEST(AotInductorTest, UpdateConstantsCuda) {
test_aoti_constants_update("cuda", false);
}
TEST_F(AotInductorTest, ExtractConstantsMapCuda) {
TEST(AotInductorTest, ExtractConstantsMapCuda) {
test_aoti_extract_constants_map("cuda");
}
TEST_F(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) {
TEST(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) {
test_aoti_double_buffering("cuda", true);
}
TEST_F(AotInductorTest, UpdateInactiveConstantsCuda) {
TEST(AotInductorTest, UpdateInactiveConstantsCuda) {
test_aoti_double_buffering("cuda", false);
}
TEST_F(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
TEST(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
test_aoti_double_buffering_with_tensor_constants();
}
TEST_F(AotInductorTest, FreeInactiveConstantBufferCuda) {
TEST(AotInductorTest, FreeInactiveConstantBufferCuda) {
test_aoti_free_buffer(false);
}
TEST_F(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) {
TEST(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) {
test_aoti_free_buffer(true);
}
TEST_F(AotInductorTest, MultiStreamTestCuda) {
TEST(AotInductorTest, MultiStreamTestCuda) {
test_multi_cuda_streams("cuda");
}
TEST_F(AotInductorTest, CudaAllocTestCuda) {
TEST(AotInductorTest, CudaAllocTestCuda) {
test_cuda_alloc_test();
}
#endif

View File

@ -279,7 +279,6 @@ class SymmetricMemoryTest(MultiProcContinuousTest):
# MultiProcContinuousTest will skip all the following tests if a test fails (
# we should fix this too). We still want to get the test signals for the core
# symmetric memory APIs when Async TP ops fail.
@skip_if_rocm_multiprocess # AsyncTP is not yet supported on ROCm
@instantiate_parametrized_tests
@requires_cuda_p2p_access()
class AsyncTPTest(MultiProcContinuousTest):

View File

@ -892,16 +892,10 @@ fn(torch.randn(5))
os.remove(
file_path
) # Delete temp file manually, due to setup NamedTemporaryFile as delete=False.
orig_maxDiff = unittest.TestCase.maxDiff
unittest.TestCase.maxDiff = None
try:
self.assertEqual( # process wrap difference: /r/n on Windows, /n on posix.
empty_line_normalizer(lines),
empty_line_normalizer(stderr.decode("utf-8")),
)
except Exception:
unittest.TestCase.maxDiff = orig_maxDiff
raise
self.assertEqual( # process wrap difference: /r/n on Windows, /n on posix.
empty_line_normalizer(lines),
empty_line_normalizer(stderr.decode("utf-8")),
)
@make_settings_test("torch._dynamo.eval_frame")
def test_log_traced_frames(self, records):

View File

@ -1000,18 +1000,6 @@ class ReproTests(torch._dynamo.test_case.TestCase):
self.exit_stack.close()
super().tearDown()
def test_compiled_module_truthiness(self):
# Test with empty ModuleList
original_empty = nn.ModuleList()
compiled_empty = torch.compile(original_empty)
self.assertEqual(bool(original_empty), bool(compiled_empty))
self.assertFalse(bool(compiled_empty))
# Test with non-empty ModuleList
original_filled = nn.ModuleList([nn.Linear(10, 5)])
compiled_filled = torch.compile(original_filled)
self.assertEqual(bool(original_filled), bool(compiled_filled))
self.assertTrue(bool(compiled_filled))
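The removed test depended on OptimizedModule proxying __len__, which this PR also deletes (see the torch/_dynamo/eval_frame.py hunk later in this diff). A minimal sketch of the resulting behavior, assuming Python's default truthiness for objects without __len__ or __bool__:

import torch
import torch.nn as nn

print(bool(nn.ModuleList()))                 # False: ModuleList defines __len__
print(bool(torch.compile(nn.ModuleList())))  # True: no __len__ proxy anymore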
def guard_manager_clone_hook_fn(self, guard_manager_wrapper, f_locals, builder):
root = guard_manager_wrapper.root
cloned_root = root.clone_manager(lambda x: True)

View File

@ -2,11 +2,8 @@
# flake8: noqa: B950
import functools
import json
import os
import random
import string
import tempfile
import unittest
import warnings
from collections import namedtuple
@ -7048,120 +7045,6 @@ class TestLearnableBiases(InductorTestCase):
def test_flex_attention_with_dynamic_max_autotune_graph_partition(self, device):
self._test_flex_attention_with_dynamic_max_autotune(device)
@skip_on_cpu
def test_flex_attention_logging(self, device):
with tempfile.TemporaryDirectory() as tmpdir:
log_file = os.path.join(tmpdir, "flex_attention_configs")
with patch.dict(
os.environ, {"TORCHINDUCTOR_FLEX_ATTENTION_LOGGING_FILE": log_file}
):
query = torch.randn(
1,
2,
128,
64,
device=device,
dtype=torch.float16,
requires_grad=True,
)
key = torch.randn(
1,
2,
128,
64,
device=device,
dtype=torch.float16,
requires_grad=True,
)
value = torch.randn(
1,
2,
128,
64,
device=device,
dtype=torch.float16,
requires_grad=True,
)
def score_mod(score, b, h, q_idx, kv_idx):
return score * 2
def causal_mask(b, h, q_idx, kv_idx):
return q_idx >= kv_idx
block_mask = torch.compile(create_block_mask)(
causal_mask, 1, 1, 128, 128, device=device
)
compiled_flex = torch.compile(
flex_attention, mode="max-autotune-no-cudagraphs"
)
out = compiled_flex(
query=query,
key=key,
value=value,
score_mod=score_mod,
block_mask=block_mask,
)
out.sum().backward()
json_file = log_file + ".json"
self.assertTrue(
os.path.exists(json_file), f"Log file {json_file} was not created"
)
with open(json_file) as f:
log_data = json.load(f)
self.assertIsInstance(log_data, list)
self.assertEqual(len(log_data), 2)
keys_seen = [next(iter(entry.keys())) for entry in log_data]
expected_fwd_key = "('forward', 1, 2, 2, 128, 128, 64, 64)"
expected_bwd_key = "('backward', 1, 2, 2, 128, 128, 64, 64)"
self.assertIn(expected_fwd_key, keys_seen)
self.assertIn(expected_bwd_key, keys_seen)
for entry in log_data:
self.assertIsInstance(entry, dict)
self.assertEqual(len(entry), 1)
dims_key = next(iter(entry.keys()))
choices = entry[dims_key]
kernel_type = eval(dims_key)[0]
self.assertIsInstance(choices, list)
self.assertGreater(len(choices), 0)
for i, choice in enumerate(choices):
self.assertIn("type", choice)
self.assertIn("time", choice)
if choice["type"] == "triton":
self.assertIn("num_warps", choice)
self.assertIn("num_stages", choice)
if kernel_type == "forward":
self.assertIn("BLOCK_M", choice)
self.assertIn("BLOCK_N", choice)
self.assertNotIn("BLOCK_M1", choice)
elif kernel_type == "backward":
self.assertIn("BLOCK_M1", choice)
self.assertIn("BLOCK_N1", choice)
self.assertIn("BLOCK_M2", choice)
self.assertIn("BLOCK_N2", choice)
self.assertNotIn("BLOCK_M", choice)
self.assertNotIn("BLOCK_N", choice)
if i > 0:
self.assertLessEqual(choices[0]["time"], choice["time"])
@skip_on_cpu
def test_inspect_bug(self, device):
# https://github.com/pytorch/pytorch/issues/139374

View File

@ -12,6 +12,7 @@ from torch.testing._internal.common_device_type import (
dtypes,
dtypesIfMPS,
expectedFailureMPS,
expectedFailureMPSPre15,
expectedFailureXLA,
instantiate_device_type_tests,
)
@ -172,6 +173,7 @@ class TestDropoutNNDeviceType(NNTestCase):
else:
self.assertNotEqual(permuted_inp, out)
@expectedFailureMPSPre15
def test_Dropout(self, device):
input = torch.empty(1000)
self._test_dropout(nn.Dropout, device, input)

View File

@ -529,7 +529,7 @@ class TestProfiler(TestCase):
found_mm = True
if "gemm" in e.name.lower() or "Cijk" in e.name:
found_gemm = True
if "memcpy" in e.name.lower() or "__amd_rocclr_copyBuffer" in e.name:
if "memcpy" in e.name.lower():
found_memcpy = True
if use_cuda:
self.assertTrue(found_gemm)

View File

@ -27,7 +27,6 @@ import torch
import torch.distributed as dist
from torch.multiprocessing import current_process, get_context
from torch.testing._internal.common_utils import (
get_report_dir,
get_report_path,
IS_CI,
IS_MACOS,
@ -35,6 +34,7 @@ from torch.testing._internal.common_utils import (
set_cwd,
shell,
TEST_CUDA,
TEST_SAVE_XML,
TEST_WITH_ASAN,
TEST_WITH_ROCM,
TEST_WITH_SLOW_GRADCHECK,
@ -529,14 +529,6 @@ def run_test(
replacement = {"-f": "-x", "-dist=loadfile": "--dist=loadfile"}
unittest_args = [replacement.get(arg, arg) for arg in unittest_args]
xml_report_dir = get_report_dir(test_file, None, options.pytest)
if is_cpp_test:
unittest_args.append(
f"--junit-xml-reruns={get_report_path(xml_report_dir, test_file)}"
)
else:
unittest_args.append(f"--save-xml={xml_report_dir}")
if options.showlocals:
if options.pytest:
unittest_args.extend(["--showlocals", "--tb=long", "--color=yes"])
@ -1234,6 +1226,12 @@ def get_pytest_args(options, is_cpp_test=False, is_distributed_test=False):
# is much slower than running them directly
pytest_args.extend(["-n", str(NUM_PROCS)])
if TEST_SAVE_XML:
# Add the option to generate XML test report here as C++ tests
# won't go into common_utils
test_report_path = get_report_path(pytest=True)
pytest_args.extend(["--junit-xml-reruns", test_report_path])
if options.pytest_k_expr:
pytest_args.extend(["-k", options.pytest_k_expr])

View File

@ -7846,45 +7846,6 @@ class TestMPS(TestCaseMPS):
y = torch.normal(torch.zeros(shape, device="mps"), torch.ones(shape, device="mps"))
self.assertNotEqual(y[0], y[1])
def test_random_ops_noncontiguous(self):
"""Test random in-place operations on non-contiguous tensors.
All random in-place operations should work on non-contiguous tensors.
See issues #165257 and #124029.
"""
# Test each random in-place operation
ops = [
("normal_", lambda t: t.normal_(0, 1)),
("uniform_", lambda t: t.uniform_(0, 1)),
("exponential_", lambda t: t.exponential_(1.0)),
("bernoulli_", lambda t: t.bernoulli_(0.5)),
("random_", lambda t: t.random_()),
("random_with_to", lambda t: t.random_(10)),
("random_with_range", lambda t: t.random_(0, 10)),
]
for name, op_func in ops:
with self.subTest(operation=name):
# Create non-contiguous tensor via transpose
t_mps = torch.zeros(50, 50, device='mps').T.clone()
self.assertFalse(t_mps.is_contiguous(),
f"{name}: tensor should be non-contiguous")
# Apply operation
op_func(t_mps)
# Verify tensor was modified (not all zeros)
max_val = t_mps.max().item()
self.assertNotEqual(max_val, 0.0,
f"{name}: operation failed to modify non-contiguous tensor")
# Test rand_like specifically (issue #124029)
t = torch.ones((3, 2, 2), device='mps').permute(2, 0, 1)
self.assertFalse(t.is_contiguous(), "rand_like input should be non-contiguous")
result = torch.rand_like(t)
self.assertFalse(result.is_contiguous(), "rand_like result should be non-contiguous")
self.assertNotEqual(result.max().item(), 0.0, "rand_like should generate non-zero values")
# Test exponential
@unittest.skip("This does not test anything")
def test_exponential(self):

View File

@ -5,22 +5,29 @@ from collections import namedtuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.attention import varlen_attn
from torch.nn.attention.varlen import varlen_attn
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_nn import NNTestCase
from torch.testing._internal.common_utils import parametrize, run_tests
from torch.testing._internal.common_utils import parametrize, run_tests, skipIfRocm
from torch.utils._python_dispatch import TorchDispatchMode
VarlenShape = namedtuple(
"VarlenShape", ["batch_size", "max_seq_len", "embed_dim", "num_heads"]
)
default_tolerances = {
torch.float16: {"atol": 1e-1, "rtol": 1e-1},
torch.bfloat16: {"atol": 9e-2, "rtol": 5e-2},
torch.float32: {"atol": 1e-5, "rtol": 1.3e-6},
}
class OpLoggingMode(TorchDispatchMode):
"""Logging mode that captures all dispatched operations"""
def __init__(self):
self.called_ops = []
def __torch_dispatch__(self, func, types, args=(), kwargs=None):
op_name = str(func)
self.called_ops.append(op_name)
return func(*args, **(kwargs or {}))
class AttentionBlock(nn.Module):
@ -39,12 +46,9 @@ class AttentionBlock(nn.Module):
embed_dim, embed_dim, bias=False, device=device, dtype=dtype
)
def forward_varlen(
def get_varlen_qkv(
self,
x_packed: torch.Tensor,
cu_seq: torch.Tensor,
max_len: int,
is_causal: bool = False,
):
qkv = self.qkv_proj(x_packed)
q, k, v = qkv.chunk(3, dim=-1)
@ -53,24 +57,56 @@ class AttentionBlock(nn.Module):
k = k.view(-1, self.num_heads, self.head_dim)
v = v.view(-1, self.num_heads, self.head_dim)
attn_out = varlen_attn(
q, k, v, cu_seq, cu_seq, max_len, max_len, is_causal=is_causal
)
return q, k, v
def forward_varlen(
self,
x_packed: torch.Tensor,
cu_seq: torch.Tensor,
max_len: int,
is_causal: bool = False,
):
q, k, v = self.get_varlen_qkv(x_packed)
attn_out = varlen_attn(q, k, v, cu_seq, cu_seq, max_len, max_len, is_causal)
attn_out = attn_out.view(-1, self.embed_dim)
return self.out_proj(attn_out)
def forward_sdpa(self, x_padded: torch.Tensor, is_causal: bool = False):
def forward_sdpa(
self,
x_padded: torch.Tensor,
seq_lengths: torch.Tensor,
is_causal: bool = False,
):
batch_size, seq_len, _ = x_padded.shape
qkv = self.qkv_proj(x_padded)
q, k, v = qkv.chunk(3, dim=-1)
mask = (
torch.arange(seq_len, device=x_padded.device)[None, :]
< seq_lengths[:, None]
)
attn_mask = mask[:, None, None, :].expand(
batch_size, self.num_heads, seq_len, seq_len
)
q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
attn_out = F.scaled_dot_product_attention(q, k, v, is_causal=is_causal)
if is_causal:
causal_mask = torch.triu(
torch.ones(seq_len, seq_len, device=x_padded.device, dtype=torch.bool),
diagonal=1,
)
combined_mask = causal_mask[None, None, :, :] | ~attn_mask
attn_out = F.scaled_dot_product_attention(q, k, v, attn_mask=~combined_mask)
else:
attn_out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
attn_out = (
attn_out.transpose(1, 2)
.contiguous()
@ -91,7 +127,9 @@ def create_variable_length_batch(
seq_lengths = torch.tensor(seq_lengths, device=device)
total_tokens = seq_lengths.sum().item()
x_packed = torch.randn(total_tokens, shape.embed_dim, device=device, dtype=dtype)
x_packed = torch.randn(
total_tokens, shape.embed_dim, device=device, dtype=dtype, requires_grad=True
)
cu_seq = torch.zeros(shape.batch_size + 1, device=device, dtype=torch.int32)
cu_seq[1:] = seq_lengths.cumsum(0)
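For reference, this is the usual varlen convention: cu_seq[i] is the packed offset where sequence i starts, so the tokens of sequence i live at packed indices cu_seq[i]:cu_seq[i + 1]. A standalone sketch:

import torch

seq_lengths = torch.tensor([3, 5, 2])
cu_seq = torch.zeros(len(seq_lengths) + 1, dtype=torch.int32)
cu_seq[1:] = seq_lengths.cumsum(0)
print(cu_seq)  # tensor([ 0,  3,  8, 10], dtype=torch.int32)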
@ -106,6 +144,7 @@ def create_variable_length_batch(
end_idx = start_idx + seq_len
x_padded[i, :seq_len] = x_packed[start_idx:end_idx]
start_idx = end_idx
x_padded = x_padded.clone().detach().requires_grad_()
return {
"seq_lengths": seq_lengths,
@ -118,6 +157,7 @@ def create_variable_length_batch(
class TestVarlenAttention(NNTestCase):
@skipIfRocm(msg="ROCM does not support variable length attention")
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention not supported"
)
@ -133,7 +173,11 @@ class TestVarlenAttention(NNTestCase):
total_tokens = shape.batch_size * shape.max_seq_len
x_packed = torch.randn(
total_tokens, shape.embed_dim, device=device, dtype=dtype
total_tokens,
shape.embed_dim,
device=device,
dtype=dtype,
requires_grad=True,
)
cu_seq = torch.tensor(
[0, shape.max_seq_len, total_tokens], device=device, dtype=torch.int32
@ -147,6 +191,131 @@ class TestVarlenAttention(NNTestCase):
self.assertEqual(output.device, torch.device(device))
self.assertEqual(output.dtype, dtype)
varlen_grad_out = torch.ones_like(output)
varlen_grad = torch.autograd.grad(
outputs=output,
inputs=x_packed,
grad_outputs=varlen_grad_out,
retain_graph=True,
create_graph=False,
allow_unused=False,
)[0]
self.assertIsNotNone(varlen_grad)
self.assertEqual(varlen_grad.shape, x_packed.shape)
self.assertEqual(varlen_grad.dtype, x_packed.dtype)
@skipIfRocm(msg="ROCM does not support variable length attention")
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention not supported"
)
@parametrize("dtype", [torch.bfloat16, torch.float16])
def test_custom_op_compliance(self, device, dtype):
torch.manual_seed(42)
shape = VarlenShape(batch_size=2, max_seq_len=512, embed_dim=1024, num_heads=16)
attention_block = AttentionBlock(
shape.embed_dim, shape.num_heads, device, dtype
)
total_tokens = shape.batch_size * shape.max_seq_len
x_packed = torch.randn(
total_tokens,
shape.embed_dim,
device=device,
dtype=dtype,
)
cu_seq = torch.tensor(
[0, shape.max_seq_len, total_tokens], device=device, dtype=torch.int32
)
q, k, v = attention_block.get_varlen_qkv(x_packed)
torch.library.opcheck(
torch.ops.torch_attn._varlen_attn,
(q, k, v, cu_seq, cu_seq, shape.max_seq_len, shape.max_seq_len, False),
)
out, lse, rng_state = torch.ops.torch_attn._varlen_attn(
q, k, v, cu_seq, cu_seq, shape.max_seq_len, shape.max_seq_len, False
)
grad_out = torch.randn_like(out)
# we don't support double backward
# skipping test_autograd_registration, test_aot_dispatch_dynamic, test_aot_dispatch_static
torch.library.opcheck(
torch.ops.torch_attn._varlen_attn_backward,
(
grad_out,
q,
k,
v,
out,
lse,
cu_seq,
cu_seq,
shape.max_seq_len,
shape.max_seq_len,
False,
rng_state,
),
test_utils=["test_schema", "test_faketensor"],
)
@skipIfRocm(msg="ROCM does not support variable length attention")
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention not supported"
)
@parametrize("dtype", [torch.bfloat16, torch.float16])
def test_custom_op_registration(self, device, dtype):
torch.manual_seed(42)
shape = VarlenShape(batch_size=2, max_seq_len=512, embed_dim=1024, num_heads=16)
attention_block = AttentionBlock(
shape.embed_dim, shape.num_heads, device, dtype
)
total_tokens = shape.batch_size * shape.max_seq_len
x_packed = torch.randn(
total_tokens,
shape.embed_dim,
device=device,
dtype=dtype,
requires_grad=True,
)
cu_seq = torch.tensor(
[0, shape.max_seq_len, total_tokens], device=device, dtype=torch.int32
)
compiled_forward = torch.compile(
attention_block.forward_varlen, backend="eager", fullgraph=True
)
with OpLoggingMode() as mode:
output = compiled_forward(
x_packed, cu_seq, shape.max_seq_len, is_causal=False
)
varlen_grad_out = torch.ones_like(output)
_ = torch.autograd.grad(
outputs=output,
inputs=x_packed,
grad_outputs=varlen_grad_out,
retain_graph=True,
create_graph=False,
allow_unused=False,
)[0]
called_ops = mode.called_ops
custom_ops_called = any(
"torch_attn._varlen_attn" in op for op in called_ops
) and any("torch_attn._varlen_attn_backward" in op for op in called_ops)
assert custom_ops_called
@skipIfRocm(msg="ROCM does not support variable length attention")
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Flash Attention not supported"
)
@ -156,14 +325,21 @@ class TestVarlenAttention(NNTestCase):
torch.manual_seed(42)
shape = VarlenShape(
batch_size=8, max_seq_len=2048, embed_dim=1024, num_heads=16
batch_size=2, max_seq_len=128, embed_dim=32, num_heads=4
)
attention_block = AttentionBlock(
shape.embed_dim, shape.num_heads, device, dtype
)
golden_attention_block = AttentionBlock(
shape.embed_dim, shape.num_heads, device, torch.float64
)
variable_length_batch_data = create_variable_length_batch(shape, device, dtype)
golden_variable_length_batch_data = create_variable_length_batch(
shape, device, torch.float64
)
varlen_output = attention_block.forward_varlen(
variable_length_batch_data["x_packed"],
@ -172,18 +348,89 @@ class TestVarlenAttention(NNTestCase):
is_causal=is_causal,
)
sdpa_output = attention_block.forward_sdpa(
variable_length_batch_data["x_padded"], is_causal=is_causal
variable_length_batch_data["x_padded"],
variable_length_batch_data["seq_lengths"],
is_causal=is_causal,
)
golden_sdpa_output = golden_attention_block.forward_sdpa(
golden_variable_length_batch_data["x_padded"],
golden_variable_length_batch_data["seq_lengths"],
is_causal=is_causal,
)
tolerances = default_tolerances[dtype]
start_idx = 0
for i, seq_len in enumerate(variable_length_batch_data["seq_lengths"]):
end_idx = start_idx + seq_len
varlen_seq = varlen_output[start_idx:end_idx]
sdpa_seq = sdpa_output[i, :seq_len]
golden_sdpa_seq = golden_sdpa_output[i, :seq_len]
fwd_atol = (
2 * (golden_sdpa_seq + 0.3 - 0.3 - golden_sdpa_seq).abs().max().item()
)
varlen_error = (varlen_seq - golden_sdpa_seq).abs().max().item()
sdpa_error = (sdpa_seq - golden_sdpa_seq).abs().max().item()
assert varlen_error <= sdpa_error + fwd_atol
start_idx = end_idx
varlen_grad_out = torch.ones_like(varlen_output)
sdpa_grad_out = torch.ones_like(sdpa_output)
golden_sdpa_grad_out = torch.ones_like(golden_sdpa_output)
start_idx = 0
for i, seq_len in enumerate(variable_length_batch_data["seq_lengths"]):
end_idx = start_idx + seq_len
sdpa_grad_out[i, :seq_len] = varlen_grad_out[start_idx:end_idx]
start_idx = end_idx
varlen_grad = torch.autograd.grad(
outputs=varlen_output,
inputs=variable_length_batch_data["x_packed"],
grad_outputs=varlen_grad_out,
retain_graph=True,
create_graph=False,
allow_unused=False,
)[0]
sdpa_grad = torch.autograd.grad(
outputs=sdpa_output,
inputs=variable_length_batch_data["x_padded"],
grad_outputs=sdpa_grad_out,
retain_graph=True,
create_graph=False,
allow_unused=False,
)[0]
golden_sdpa_grad = torch.autograd.grad(
outputs=golden_sdpa_output,
inputs=golden_variable_length_batch_data["x_padded"],
grad_outputs=golden_sdpa_grad_out,
retain_graph=True,
create_graph=False,
allow_unused=False,
)[0]
start_idx = 0
for i, seq_len in enumerate(variable_length_batch_data["seq_lengths"]):
end_idx = start_idx + seq_len
varlen_grad_seq = varlen_grad[start_idx:end_idx]
sdpa_grad_seq = sdpa_grad[i, :seq_len]
golden_sdpa_seq = golden_sdpa_grad[i, :seq_len]
fwd_atol = (
2 * (golden_sdpa_seq + 0.3 - 0.3 - golden_sdpa_seq).abs().max().item()
)
varlen_error = (varlen_grad_seq - golden_sdpa_seq).abs().max().item()
sdpa_error = (sdpa_grad_seq - golden_sdpa_seq).abs().max().item()
assert varlen_error <= sdpa_error + fwd_atol
torch.testing.assert_close(varlen_grad_seq, sdpa_grad_seq, **tolerances)
start_idx = end_idx

View File

@ -42,7 +42,7 @@ import weakref
from dataclasses import dataclass
from enum import Enum
from os.path import dirname, join
from typing import Any, NamedTuple, Optional, Sized, TYPE_CHECKING, Union
from typing import Any, NamedTuple, Optional, TYPE_CHECKING, Union
from unittest.mock import patch
import sympy
@ -395,13 +395,6 @@ class OptimizedModule(torch.nn.Module):
self._initialize()
self.training = self._orig_mod.training
def __len__(self) -> int:
# Proxy the len call to the original module
if isinstance(self._orig_mod, Sized):
return len(self._orig_mod)
# Mimic python's default behavior for objects without a length
raise TypeError(f"{type(self._orig_mod).__name__} does not support len()")
def _initialize(self) -> None:
# Do this stuff in constructor to lower overhead slightly
if isinstance(self.dynamo_ctx, DisableContext):

View File

@ -1793,6 +1793,14 @@ def _aot_stage2b_bw_compile(
# tensor which is wrong.
ph_size = ph_arg.size()
# pyrefly: ignore # bad-argument-type
if len(ph_size) == 0 and len(real_stride) > 0:
# Fix for 0-dimensional tensors: When a tensor becomes 0-d
# (e.g., via squeeze), its stride should be () not (1,).
# This mismatch can occur when dynamic shape operations produce
# tensors that are later squeezed to 0-d. The stride metadata
# may get preserved, causing a dimension mismatch (#164814)
real_stride = ()
# pyrefly: ignore # bad-argument-type
placeholder_list[i] = ph_arg.as_strided(ph_size, real_stride)
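A quick illustration of the invariant this guard restores: a 0-d tensor carries an empty stride tuple, so as_strided with a stale 1-d stride would fail.

import torch

y = torch.randn(1).squeeze()  # squeeze down to a 0-d tensor
print(y.dim(), y.stride())    # 0 ()
y.as_strided((), ())          # fine: size and stride lengths match
# y.as_strided((), (1,)) would raise: size has 0 dims but stride has 1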

View File

@ -409,10 +409,9 @@ class SchedulerDonatedBuffer(SchedulerBuffer):
class BaseSchedulerNode:
ancestors: OrderedSet[str]
debug_device_str: Callable[[BaseSchedulerNode], list[str]]
group: tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]
last_usage: OrderedSet[str]
read_writes: dependencies.ReadWrites
unmet_dependencies: OrderedSet[Dep]
# .min_order and .max_order are only relevant for "grouped" nodes such as FusedSchedulerNode.
# e.g. if the FusedSchedulerNode includes nodes (op_1, op_2, op_3), and op_X is X-th node
# in `self.scheduler.nodes`, then for this FusedSchedulerNode, .min_order is 1 and .max_order is 3.
@ -421,24 +420,22 @@ class BaseSchedulerNode:
min_order: int
max_order: int
mpi_node: MemoryPlanningInfoForNode
mutation_renames: dict[str, str]
node: Optional[ir.Operation]
outputs: list[SchedulerBuffer]
outputs_by_name: dict[str, SchedulerBuffer]
override_estimated_runtime: Optional[float] = None
read_writes: dependencies.ReadWrites
unmet_dependencies: OrderedSet[Dep]
def __init__(self, scheduler: Scheduler) -> None:
self.scheduler = scheduler
self.debug_device_str = lambda *args, **kwargs: []
self.scheduler: Scheduler = scheduler
self.debug_device_str: Callable[[BaseSchedulerNode], list[str]] = (
lambda *args, **kwargs: []
)
def _init_from_node(self, node: ir.Operation) -> None:
self.node = node
self.ancestors = OrderedSet()
self.last_usage = OrderedSet() # buffers that won't be used after this kernel
self.node: Optional[ir.Operation] = node
self.ancestors: OrderedSet[str] = OrderedSet()
self.last_usage = OrderedSet[
str
]() # buffers that won't be used after this kernel
self.written = False
self.outputs = [
self.outputs: list[SchedulerBuffer] = [
SchedulerBuffer(
scheduler=self.scheduler,
node=output,
@ -446,14 +443,16 @@ class BaseSchedulerNode:
)
for output in node.get_outputs()
]
self.outputs_by_name = {buf.get_name(): buf for buf in self.outputs}
self.outputs_by_name: dict[str, SchedulerBuffer] = {
buf.get_name(): buf for buf in self.outputs
}
# mutation_renames for the current node. Due to potential
# more mutations happening later, this can be different
# to Scheduler.mutation_renames. Also this dict should be small
# since only mutation information relevant to the deps for this
# node is stored here.
self.mutation_renames = {}
self.mutation_renames: dict[str, str] = {}
def __repr__(self) -> str:
return f"{type(self).__name__}(name={self.get_name()!r})"
@ -2436,34 +2435,6 @@ def pick_loop_order(
return order
def _replace_operation_buffer(
orig_node: ir.MultiTemplateBuffer, new_node: ir.OperationBuffer
) -> None:
replaced_buf_name = new_node.get_name()
orig_buf_name = orig_node.get_name()
assert isinstance(orig_buf_name, str) and isinstance(replaced_buf_name, str)
replaced_op_name = new_node.get_operation_name()
orig_op_name = orig_node.get_operation_name()
assert isinstance(orig_op_name, str) and isinstance(replaced_op_name, str)
del V.graph.name_to_buffer[replaced_buf_name]
new_node.name = orig_buf_name
del V.graph.name_to_op[replaced_op_name]
new_node.operation_name = orig_op_name
orig = V.graph.buffers.index(orig_node)
V.graph.buffers.remove(new_node)
V.graph.buffers[orig] = new_node
V.graph.name_to_buffer[orig_buf_name] = new_node
orig = V.graph.operations.index(orig_node)
V.graph.operations.remove(new_node)
V.graph.operations[orig] = new_node
V.graph.name_to_op[orig_op_name] = new_node
@dataclasses.dataclass
class NodeUser:
node: Union[BaseSchedulerNode, OutputNode]
@ -3365,6 +3336,33 @@ class Scheduler:
will force completion of compilation and benchmarking.
"""
def replace_operation_buffer(
orig_node: ir.MultiTemplateBuffer, new_node: ir.OperationBuffer
) -> None:
replaced_buf_name = new_node.get_name()
orig_buf_name = orig_node.get_name()
assert isinstance(orig_buf_name, str) and isinstance(replaced_buf_name, str)
replaced_op_name = new_node.get_operation_name()
orig_op_name = orig_node.get_operation_name()
assert isinstance(orig_op_name, str) and isinstance(replaced_op_name, str)
del V.graph.name_to_buffer[replaced_buf_name]
new_node.name = orig_buf_name
del V.graph.name_to_op[replaced_op_name]
new_node.operation_name = orig_op_name
orig = V.graph.buffers.index(orig_node)
V.graph.buffers.remove(new_node)
V.graph.buffers[orig] = new_node
V.graph.name_to_buffer[orig_buf_name] = new_node
orig = V.graph.operations.index(orig_node)
V.graph.operations.remove(new_node)
V.graph.operations[orig] = new_node
V.graph.name_to_op[orig_op_name] = new_node
for i, node in enumerate(self.nodes):
if isinstance(node, SchedulerNode) and isinstance(
node.node, ir.MultiTemplateBuffer
@ -3418,47 +3416,40 @@ class Scheduler:
assign_origin_node(out_tensorbox, multi_node.origin_node)
out_buffer.layout = multi_node.layout
self._replace_node(out_buffer, multi_node, i, node)
replace_operation_buffer(multi_node, out_buffer)
new_scheduler_node = self.create_scheduler_node(out_buffer)
def _replace_node(
self,
out_buffer: ir.OperationBuffer,
multi_node: ir.MultiTemplateBuffer,
i: int,
node: SchedulerNode,
) -> None:
_replace_operation_buffer(multi_node, out_buffer)
new_scheduler_node = self.create_scheduler_node(out_buffer)
self.nodes[i] = new_scheduler_node
self.name_to_node[node.get_name()] = new_scheduler_node
self.name_to_fused_node[node.get_name()] = new_scheduler_node
self.nodes[i] = new_scheduler_node
self.name_to_node[node.get_name()] = new_scheduler_node
self.name_to_fused_node[node.get_name()] = new_scheduler_node
# We need to reflect the mutation renames that were recorded in the original node
mutation_renames = {}
for dep in itertools.chain(
node.read_writes.reads, node.unmet_dependencies
):
if real_name := self.mutation_real_name.get(dep.name, None):
mutation_renames[real_name] = dep.name
# We need to reflect the mutation renames that were recorded in the original node
mutation_renames = {}
for dep in itertools.chain(node.read_writes.reads, node.unmet_dependencies):
if real_name := self.mutation_real_name.get(dep.name, None):
mutation_renames[real_name] = dep.name
def rename_deps(deps: OrderedSet[Dep]) -> OrderedSet[Dep]:
return OrderedSet(dep.rename(mutation_renames) for dep in deps)
def rename_deps(deps: OrderedSet[Dep]) -> OrderedSet[Dep]:
return OrderedSet(dep.rename(mutation_renames) for dep in deps)
new_scheduler_node.unmet_dependencies = rename_deps(
new_scheduler_node.unmet_dependencies
)
new_scheduler_node.read_writes.reads = rename_deps(
new_scheduler_node.read_writes.reads
)
new_scheduler_node.unmet_dependencies = rename_deps(
new_scheduler_node.unmet_dependencies
)
new_scheduler_node.read_writes.reads = rename_deps(
new_scheduler_node.read_writes.reads
)
for new_out, old_out in zip(
new_scheduler_node.get_outputs(), node.get_outputs()
):
self.name_to_buf[old_out.get_name()] = new_out
new_out.users = old_out.users
for new_out, old_out in zip(
new_scheduler_node.get_outputs(), node.get_outputs()
):
self.name_to_buf[old_out.get_name()] = new_out
new_out.users = old_out.users
new_scheduler_node.min_order = node.min_order
new_scheduler_node.max_order = node.max_order
new_scheduler_node.last_usage = node.last_usage
new_scheduler_node.min_order = node.min_order
new_scheduler_node.max_order = node.max_order
new_scheduler_node.last_usage = node.last_usage
def _any_atomic_add(self, node_list: Sequence[BaseSchedulerNode]) -> bool:
return any(

View File

@ -17,7 +17,6 @@ import time
from collections.abc import Sequence
from concurrent.futures import as_completed, ThreadPoolExecutor
from io import StringIO
from pathlib import Path
from types import ModuleType
from typing import Any, Callable, NamedTuple, Optional, TYPE_CHECKING, Union
from typing_extensions import Self
@ -2105,11 +2104,6 @@ class TritonTemplate(KernelTemplate):
"matrix_instr_nonkdim": kwargs.get("matrix_instr_nonkdim", 0),
"waves_per_eu": kwargs.get("waves_per_eu", 0),
"kpack": kwargs.get("kpack", 2),
**{
k: kwargs[k]
for k in AlgorithmSelectorCache.FLEX_ATTENTION_TUNABLE_KEYS
if k in kwargs
},
},
mutated_inputs=mutated_inputs,
workspace_arg=workspace_arg,
@ -2403,17 +2397,6 @@ def get_mm_log_filename() -> Optional[str]:
return mm_file_name
@functools.cache
def get_flex_attention_log_filename() -> Optional[str]:
flex_attention_file_name = os.environ.get(
"TORCHINDUCTOR_FLEX_ATTENTION_LOGGING_FILE", None
)
if not flex_attention_file_name:
return None
return str(Path(flex_attention_file_name).with_suffix(".json"))
def append_to_log(filename, data):
lock_file = filename.replace(".json", ".lock")
lock = FileLock(lock_file)
@ -2624,25 +2607,6 @@ class AlgorithmSelectorCache(PersistentCache):
doesn't depend on the output layout.
"""
FLEX_ATTENTION_TUNABLE_KEYS = tuple(
dict.fromkeys(
[
"num_warps",
"num_stages",
"BLOCK_M",
"BLOCK_N",
"BLOCK_M1",
"BLOCK_N1",
"BLOCK_M2",
"BLOCK_N2",
"USE_TMA",
"kpack",
"matrix_instr_nonkdim",
"waves_per_eu",
]
)
)
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
@ -3576,73 +3540,6 @@ class AlgorithmSelectorCache(PersistentCache):
)
return pruned_choices
@staticmethod
def get_flex_attention_choice_info(
choice: ChoiceCaller, timings: dict[ChoiceCaller, float]
) -> dict[str, Any]:
if isinstance(choice, torch._inductor.select_algorithm.ExternKernelCaller):
return {"type": "extern", "time": timings[choice]}
assert isinstance(choice, torch._inductor.select_algorithm.TritonTemplateCaller)
info = choice.info_dict()
result = {
"type": "triton",
"time": timings[choice],
}
for key in AlgorithmSelectorCache.FLEX_ATTENTION_TUNABLE_KEYS:
if key in info:
result[key] = info[key]
return result
@staticmethod
def maybe_log_flex_attention_results(
name: str, input_nodes: list[ir.IRNode], timings: dict[ChoiceCaller, float]
) -> None:
flex_attention_filename = get_flex_attention_log_filename()
if not flex_attention_filename or "flex_attention" not in name:
return
if len(input_nodes) < 3:
return
query_size = input_nodes[0].get_size()
key_size = input_nodes[1].get_size()
value_size = input_nodes[2].get_size()
B = query_size[0]
Hq = query_size[1]
seq_len_q = query_size[2]
qk_head_dim = query_size[3]
Hkv = key_size[1]
seq_len_kv = key_size[2]
v_head_dim = value_size[3]
kernel_type = "backward" if "backward" in name else "forward"
dims_key = str(
(
kernel_type,
B,
Hq,
Hkv,
seq_len_q,
seq_len_kv,
qk_head_dim,
v_head_dim,
)
)
sorted_choices = sorted(timings, key=timings.__getitem__)
out_dict = {
dims_key: [
AlgorithmSelectorCache.get_flex_attention_choice_info(choice, timings)
for choice in sorted_choices
]
}
append_to_log(flex_attention_filename, out_dict)
@staticmethod
def log_results(
name: str,
@ -3653,7 +3550,6 @@ class AlgorithmSelectorCache(PersistentCache):
prescreening_elapse: Optional[float] = None,
hint_override: Optional[int] = None,
):
"""Log the autotuning results, currently only handles mm and flex"""
V.debug.log_autotuning_results(
name, input_nodes, timings, elapse, precompile_elapse
)
@ -3722,10 +3618,6 @@ class AlgorithmSelectorCache(PersistentCache):
append_to_log(mm_filename, out_dict)
AlgorithmSelectorCache.maybe_log_flex_attention_results(
name, input_nodes, timings
)
best_time = timings[best]
sys.stderr.write(f"AUTOTUNE {name}({sizes})\n")
sys.stderr.write(f"strides: {strides}\n")

View File

@ -14,14 +14,11 @@ from torch.backends.cuda import (
SDPAParams,
)
from .varlen import varlen_attn
__all__: list[str] = [
"SDPBackend",
"sdpa_kernel",
"WARN_FOR_UNFUSED_KERNELS",
"varlen_attn",
]
# Note: [SDPA warnings]

View File

@ -7,7 +7,7 @@ that calls into the optimized Flash Attention kernels.
import logging
from functools import lru_cache
from typing import NamedTuple, Optional, Union
from typing import Any, NamedTuple, Optional, Union
import torch
@ -20,7 +20,7 @@ __all__ = ["varlen_attn", "AuxRequest"]
@lru_cache(maxsize=8)
def _should_use_cudnn(device_index: int) -> bool:
"""Cache device capability check to avoid repeated CUDA calls."""
return False
return True
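The gate is hard-coded for now while cuDNN numerics are being debugged; a capability-based variant (hypothetical, not what this PR ships) might look like:

import torch
from functools import lru_cache

@lru_cache(maxsize=8)
def _should_use_cudnn_by_capability(device_index: int) -> bool:
    # Hypothetical policy: prefer cuDNN only on SM90+ (Hopper) devices.
    major, _ = torch.cuda.get_device_capability(device_index)
    return major >= 9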
class AuxRequest(NamedTuple):
@ -33,8 +33,7 @@ class AuxRequest(NamedTuple):
lse: bool = False
# import failures when I try to register as custom op
# @torch.library.custom_op("torch_nn_attention::_varlen_attn", mutates_args={})
@torch.library.custom_op("torch_attn::_varlen_attn", mutates_args={})
def _varlen_attn(
query: torch.Tensor,
key: torch.Tensor,
@ -44,7 +43,7 @@ def _varlen_attn(
max_q: int,
max_k: int,
is_causal: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Private custom op for variable-length attention.
@ -52,9 +51,9 @@ def _varlen_attn(
"""
use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index)
if use_cudnn:
log.info("Using cuDNN backend for varlen_attn")
result = torch.ops.aten._cudnn_attention_forward(
query,
key,
@ -70,7 +69,7 @@ def _varlen_attn(
False, # return_debug_mask
)
# cuDNN returns: (output, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, philox_seed, philox_offset, debug_attn_mask)
output, softmax_lse = result[0], result[1]
output, softmax_lse, rng_state, philox_offset = result[0], result[1], result[6], result[7]
else:
log.info("Using Flash Attention backend for varlen_attn")
output, softmax_lse, rng_state, _, _ = torch.ops.aten._flash_attention_forward(
@ -85,11 +84,16 @@ def _varlen_attn(
is_causal,
return_debug_mask=False,
)
philox_offset = torch.zeros((), dtype=torch.int64, device=query.device)
return output, softmax_lse
rng_state_ = torch.zeros(
(2,), dtype=torch.uint64, device=query.device
) # hardcoded since dropout is hardcoded to 0
return output, softmax_lse, rng_state_, philox_offset
# @_varlen_attn.register_fake
@_varlen_attn.register_fake
def _varlen_attn_fake(
query: torch.Tensor,
key: torch.Tensor,
@ -99,7 +103,7 @@ def _varlen_attn_fake(
max_q: int,
max_k: int,
is_causal: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Fake implementation for meta tensor computation and tracing.
@ -110,14 +114,24 @@ def _varlen_attn_fake(
# Output has same shape as query
output = torch.empty_like(query)
# For varlen path: logsumexp shape is (num_heads, total_q)
# For varlen path with cuDNN: logsumexp shape is (total_q, num_heads, 1)
total_q = query.size(0)
num_heads = query.size(1)
logsumexp = torch.empty(
(num_heads, total_q), dtype=torch.float, device=query.device
)
return output, logsumexp
use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index)
if use_cudnn:
logsumexp = torch.empty(
(total_q, num_heads, 1), dtype=torch.float, device=query.device
)
else:
logsumexp = torch.empty(
(num_heads, total_q), dtype=torch.float, device=query.device
)
rng_state = torch.empty((2,), dtype=torch.uint64, device=query.device)
philox_offset = torch.zeros((), dtype=torch.int64, device=query.device)
return output, logsumexp, rng_state, philox_offset
def varlen_attn(
@ -191,9 +205,142 @@ def varlen_attn(
... query, key, value, cu_seq, cu_seq, max_len, max_len, is_causal=False
... )
"""
out, lse = _varlen_attn(
out, lse, _, _ = torch.ops.torch_attn._varlen_attn(
query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, is_causal
)
if return_aux is not None and return_aux.lse:
return out, lse
return out
def _setup_context(ctx: Any, inputs: tuple[Any, ...], output: Any) -> None:
query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, is_causal = inputs
out, lse, rng_state, philox_offset = output
ctx.save_for_backward(query, key, value, cu_seq_q, cu_seq_k, out, lse, rng_state, philox_offset)
ctx.max_q = max_q
ctx.max_k = max_k
ctx.is_causal = is_causal
@torch.library.custom_op("torch_attn::_varlen_attn_backward", mutates_args={})
def _varlen_attn_backward(
grad_out: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
out: torch.Tensor,
lse: torch.Tensor,
cu_seq_q: torch.Tensor,
cu_seq_k: torch.Tensor,
max_q: int,
max_k: int,
is_causal: bool,
rng_state: torch.Tensor,
philox_offset: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
unused = torch.empty(0, device=query.device)
use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index)
if use_cudnn:
log.info("Using cuDNN backend for varlen_attn")
head_dim = query.size(-1)
scale = 1.0 / (head_dim ** 0.5)
dq, dk, dv = torch.ops.aten._cudnn_attention_backward(
grad_out=grad_out,
query=query,
key=key,
value=value,
out=out,
logsumexp=lse,
philox_seed=rng_state,
philox_offset=philox_offset,
attn_bias=None,
cum_seq_q=cu_seq_q,
cum_seq_k=cu_seq_k,
max_q=max_q,
max_k=max_k,
dropout_p=0.0,
is_causal=is_causal,
# passing in scale doesn't change the value of the gradients
# scale=scale
)
else:
log.info("Using Flash Attention backend for varlen_attn")
dq, dk, dv = torch.ops.aten._flash_attention_backward(
grad_out,
query,
key,
value,
out,
lse,
cu_seq_q,
cu_seq_k,
max_q,
max_k,
0.0,
is_causal,
rng_state,
unused,
)
return dq, dk, dv
@_varlen_attn_backward.register_fake
def _varlen_attn_backward_fake(
grad_out: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
out: torch.Tensor,
lse: torch.Tensor,
cu_seq_q: torch.Tensor,
cu_seq_k: torch.Tensor,
max_q: int,
max_k: int,
is_causal: bool,
rng_state: torch.Tensor,
philox_offset: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Fake implementation for meta tensor computation and tracing.
"""
grad_query = torch.empty_like(query)
grad_key = torch.empty_like(key)
grad_value = torch.empty_like(value)
return grad_query, grad_key, grad_value
def _backward(
ctx: Any, grad_out: torch.Tensor, grad_lse: torch.Tensor, grad_rng: torch.Tensor, grad_philox_offset: torch.Tensor
) -> tuple[Optional[torch.Tensor], ...]:
query, key, value, cu_seq_q, cu_seq_k, out, lse, rng_state, philox_offset = ctx.saved_tensors
max_q = ctx.max_q
max_k = ctx.max_k
is_causal = ctx.is_causal
dq, dk, dv = torch.ops.torch_attn._varlen_attn_backward(
grad_out,
query,
key,
value,
out,
lse,
cu_seq_q,
cu_seq_k,
max_q,
max_k,
is_causal,
rng_state,
philox_offset
)
# one gradient per forward input: q, k, v get grads, the other five get None
return dq, dk, dv, None, None, None, None, None
_varlen_attn.register_autograd(_backward, setup_context=_setup_context)
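With the fake and autograd registrations above, the op differentiates through the normal autograd API. A minimal usage sketch (assumes a CUDA build with a varlen-capable Flash/cuDNN backend):

import torch
from torch.nn.attention.varlen import varlen_attn

# two sequences of 128 tokens packed into one (256, heads, head_dim) buffer
q = torch.randn(256, 8, 64, device="cuda", dtype=torch.float16, requires_grad=True)
k = torch.randn(256, 8, 64, device="cuda", dtype=torch.float16, requires_grad=True)
v = torch.randn(256, 8, 64, device="cuda", dtype=torch.float16, requires_grad=True)
cu_seq = torch.tensor([0, 128, 256], device="cuda", dtype=torch.int32)

out = varlen_attn(q, k, v, cu_seq, cu_seq, 128, 128, is_causal=True)
out.sum().backward()  # dispatches torch_attn._varlen_attn_backward
print(q.grad.shape)   # torch.Size([256, 8, 64])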

View File

@ -74,17 +74,6 @@ def export_compat(
if opset_version is None:
opset_version = onnx_constants.ONNX_DEFAULT_OPSET
if isinstance(model, torch.nn.Module):
if model.training:
warnings.warn(
"Exporting a model while it is in training mode. "
"Please ensure that this is intended, as it may lead to "
"different behavior during inference. "
"Calling model.eval() before export is recommended.",
UserWarning,
stacklevel=2,
)
if isinstance(model, torch.export.ExportedProgram):
# We know the model is already exported program, so the args, kwargs, and dynamic_shapes
# are not used

View File

@ -812,6 +812,7 @@ if torch.backends.mps.is_available():
"__rmod__",
"__rsub__",
"__rpow__",
"bernoulli",
"clamp_max",
"clamp_min",
"masked_scatter",

View File

@ -950,6 +950,13 @@ def prof_meth_call(*args, **kwargs):
torch._C.ScriptFunction.__call__ = prof_func_call # type: ignore[method-assign]
torch._C.ScriptMethod.__call__ = prof_meth_call # type: ignore[method-assign]
def _get_test_report_path():
# allow users to override the test file location. We need this
# because the distributed tests run the same test file multiple
# times with different configurations.
override = os.environ.get('TEST_REPORT_SOURCE_OVERRIDE')
test_source = override if override is not None else 'python-unittest'
return os.path.join('test-reports', test_source)
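For instance, distributed runs can redirect their reports via the override (the value below is illustrative), using the helper just defined:

import os

os.environ["TEST_REPORT_SOURCE_OVERRIDE"] = "dist-gloo"  # hypothetical value
print(_get_test_report_path())  # test-reports/dist-gloo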
def parse_cmd_line_args():
global CI_FUNCTORCH_ROOT
@ -980,7 +987,9 @@ def parse_cmd_line_args():
parser.add_argument('--repeat', type=int, default=1)
parser.add_argument('--test-bailouts', '--test_bailouts', action='store_true')
parser.add_argument('--use-pytest', action='store_true')
parser.add_argument('--save-xml', type=str)
parser.add_argument('--save-xml', nargs='?', type=str,
const=_get_test_report_path(),
default=_get_test_report_path() if IS_CI else None)
parser.add_argument('--discover-tests', action='store_true')
parser.add_argument('--log-suffix', type=str, default="")
parser.add_argument('--run-parallel', type=int, default=1)
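The nargs='?' form gives --save-xml three distinct behaviors; a standalone sketch of the semantics (the const path below is illustrative):

import argparse

p = argparse.ArgumentParser()
p.add_argument("--save-xml", nargs="?", type=str,
               const="test-reports/python-unittest",  # flag given with no value
               default=None)                          # flag absent

print(p.parse_args([]).save_xml)                     # None
print(p.parse_args(["--save-xml"]).save_xml)         # test-reports/python-unittest
print(p.parse_args(["--save-xml", "out"]).save_xml)  # out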
@ -1010,9 +1019,6 @@ def parse_cmd_line_args():
# infer flags based on the default settings
GRAPH_EXECUTOR = cppProfilingFlagsToProfilingMode()
if args.save_xml is None and IS_CI:
args.xml_dir = get_report_dir(sys.argv[0], args.log_suffix, args.use_pytest)
RERUN_DISABLED_TESTS = args.rerun_disabled_tests
SLOW_TESTS_FILE = args.import_slow_tests
@ -1185,37 +1191,19 @@ def lint_test_case_extension(suite):
return succeed
def get_report_dir(test_name: str, log_suffix: Optional[str], is_pytest: bool) -> str:
"""Generates a test report directory path. Test name does not need to be
sanitized."""
# total path = test-reports/test_source+log_suffix/test_filename
# Base path
test_source = "python-unittest"
if is_pytest:
test_source = "python-pytest"
# allow users to override the test file location. We need this
# because the distributed tests run the same test file multiple
# times with different configurations.
override = os.environ.get('TEST_REPORT_SOURCE_OVERRIDE')
if override is not None:
test_source = override
# Add log suffix to if provided
if log_suffix and log_suffix != "":
test_source = test_source + log_suffix
test_report_dir = os.path.join('test-reports', test_source)
# Add test file name to path
test_filename = sanitize_test_filename(test_name)
test_report_dir = os.path.join(test_report_dir, test_filename)
os.makedirs(test_report_dir, exist_ok=True)
return test_report_dir
def get_report_path(report_dir: str, test_filename: str) -> str:
return os.path.join(report_dir, f"{sanitize_test_filename(test_filename)}-{os.urandom(8).hex()}.xml")
def get_report_path(argv=None, pytest=False):
if argv is None:
argv = UNITTEST_ARGS
test_filename = sanitize_test_filename(argv[0])
test_report_path = TEST_SAVE_XML + LOG_SUFFIX
test_report_path = os.path.join(test_report_path, test_filename)
if pytest:
test_report_path = test_report_path.replace('python-unittest', 'python-pytest')
os.makedirs(test_report_path, exist_ok=True)
test_report_path = os.path.join(test_report_path, f"{test_filename}-{os.urandom(8).hex()}.xml")
return test_report_path
os.makedirs(test_report_path, exist_ok=True)
return test_report_path
def sanitize_pytest_xml(xml_file: str):
@ -1358,7 +1346,7 @@ def run_tests(argv=None):
pytest_args = argv + ["--use-main-module"]
test_report_path = ""
if TEST_SAVE_XML:
test_report_path = get_report_path(TEST_SAVE_XML, argv[0])
test_report_path = get_report_path(pytest=True)
print(f'Test results will be stored in {test_report_path}')
pytest_args.append(f'--junit-xml-reruns={test_report_path}')
if PYTEST_SINGLE_TEST:
@ -1402,7 +1390,7 @@ def run_tests(argv=None):
def printErrors(self) -> None:
super().printErrors()
self.printErrorList("XPASS", self.unexpectedSuccesses)
test_report_path = get_report_path(TEST_SAVE_XML, argv[0])
test_report_path = get_report_path()
verbose = '--verbose' in argv or '-v' in argv
if verbose:
print(f'Test results will be stored in {test_report_path}')