mirror of https://github.com/pytorch/pytorch.git
synced 2025-11-01 22:14:53 +08:00

Compare commits: 23 commits, codex/add-... ... gh/laithsa...

| SHA1 |
|---|
| dba9537ec5 |
| b8ce62add6 |
| b994e3dfe2 |
| a4ca86145d |
| a0be597ea4 |
| 13593796cc |
| 21bb1455e5 |
| 2e7752e6ce |
| 4b87543b65 |
| bb526733ef |
| 19050dd509 |
| 16686fda12 |
| 1d79ebb3da |
| a788092fb8 |
| a8cfab1e46 |
| 3161e2b61f |
| 15530d81d3 |
| f3c4db3804 |
| cc47c109a1 |
| 476de516f3 |
| 3b3eece835 |
| a9776fa0cc |
| b225bcaa97 |
@ -83,6 +83,10 @@ function build_cpython {
|
||||
py_suffix=${py_ver::-1}
|
||||
py_folder=$py_suffix
|
||||
fi
|
||||
# Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4
|
||||
if [ "$py_suffix" == "3.14.0" ]; then
|
||||
py_suffix="3.14.0rc2"
|
||||
fi
|
||||
wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
|
||||
do_cpython_build $py_ver Python-$py_suffix
|
||||
|
||||
|
||||
@ -57,8 +57,8 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
|
||||
logger.info("Successfully cloned %s", target)
|
||||
return r, commit
|
||||
|
||||
except GitCommandError:
|
||||
logger.exception("Git operation failed")
|
||||
except GitCommandError as e:
|
||||
logger.error("Git operation failed: %s", e)
|
||||
raise
|
||||
|
||||
|
||||
|
||||
4 .flake8
@ -7,12 +7,14 @@ max-line-length = 120
|
||||
# C408 ignored because we like the dict keyword argument syntax
|
||||
# E501 is not flexible enough, we're using B950 instead
|
||||
ignore =
|
||||
E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
|
||||
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
|
||||
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
|
||||
# to line this up with executable bit
|
||||
EXE001,
|
||||
# these ignores are from flake8-bugbear; please fix!
|
||||
B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910
|
||||
# these ignores are from flake8-logging-format; please fix!
|
||||
G100,G101,G200
|
||||
# these ignores are from flake8-simplify. please fix or ignore with commented reason
|
||||
SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
|
||||
# SIM104 is already covered by pyupgrade ruff
|
||||
|
||||
2 .github/ci_commit_pins/audio.txt (vendored)
@ -1 +1 @@
|
||||
69bbe7363897764f9e758d851cd0340147d27f94
|
||||
1b013f5b5a87a1882eb143c26d79d091150d6a37
|
||||
|
||||
1 .github/pytorch-probot.yml (vendored)
@ -33,7 +33,6 @@ ciflow_push_tags:
|
||||
- ciflow/rocm
|
||||
- ciflow/rocm-mi300
|
||||
- ciflow/rocm-mi355
|
||||
- ciflow/rocm-navi31
|
||||
- ciflow/s390
|
||||
- ciflow/slow
|
||||
- ciflow/torchbench
|
||||
|
||||
@ -26,8 +26,9 @@ name: !{{ build_environment }}
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "!{{ py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0') }}"
|
||||
python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
|
||||
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
|
||||
{%- endmacro %}
|
||||
|
||||
|
||||
1 .github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml (generated, vendored)
@ -63,6 +63,7 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.10.4"
|
||||
freethreaded: false
|
||||
|
||||
11 .github/workflows/generated-macos-arm64-binary-wheel-nightly.yml (generated, vendored)
@ -59,6 +59,7 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.10.4"
|
||||
freethreaded: false
|
||||
@ -168,6 +169,7 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.11.4"
|
||||
freethreaded: false
|
||||
@ -277,6 +279,7 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.12.4"
|
||||
freethreaded: false
|
||||
@ -386,6 +389,7 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.13.4"
|
||||
freethreaded: false
|
||||
@ -495,6 +499,7 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.13.4"
|
||||
freethreaded: true
|
||||
@ -604,8 +609,9 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.14.0"
|
||||
python-version: "3.14.0-rc.2"
|
||||
freethreaded: false
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
@ -713,8 +719,9 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.14.0"
|
||||
python-version: "3.14.0-rc.2"
|
||||
freethreaded: true
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
|
||||
63 .github/workflows/rocm-navi31.yml (vendored)
@ -1,63 +0,0 @@
|
||||
name: rocm-navi31
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/rocm-navi31/*
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
|
||||
# Also run less frequently on weekends.
|
||||
- cron: 45 */2 * * 1-5
|
||||
- cron: 45 4,12 * * 0,6
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
target-determination:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: before-test
|
||||
uses: ./.github/workflows/target_determination.yml
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
linux-jammy-rocm-py3_10-build:
|
||||
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3_10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
tests-to-include: >-
|
||||
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
|
||||
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
|
||||
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
|
||||
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
|
||||
inductor/test_flex_attention inductor/test_max_autotune' || '' }}
|
||||
secrets: inherit
|
||||
26 .github/workflows/rocm.yml (vendored)
@ -59,3 +59,29 @@ jobs:
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-gfx1100-test:
|
||||
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3_10-gfx1100
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
]}
|
||||
tests-to-include: >
|
||||
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
|
||||
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
|
||||
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
|
||||
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
|
||||
inductor/test_flex_attention inductor/test_max_autotune
|
||||
secrets: inherit
|
||||
|
||||
34 .github/workflows/trunk.yml (vendored)
@ -190,40 +190,6 @@ jobs:
|
||||
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-build:
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
|
||||
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-test:
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
|
||||
secrets: inherit
|
||||
|
||||
inductor-build:
|
||||
name: inductor-build
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
|
||||
306 AUTOGRAD_GRADFN_EXPLAINED.md (new file)
@ -0,0 +1,306 @@
|
||||
# Understanding PyTorch Autograd and grad_fn
|
||||
|
||||
## What is `grad_fn`?
|
||||
|
||||
`grad_fn` is a pointer to the **backward function** that created a tensor. It's PyTorch's way of building a **computation graph** for automatic differentiation.
|
||||
|
||||
## Simple Example
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
x = torch.tensor([2.0], requires_grad=True)
|
||||
y = x ** 2
|
||||
z = y + 3
|
||||
|
||||
print(f"x.grad_fn: {x.grad_fn}") # None (leaf node)
|
||||
print(f"y.grad_fn: {y.grad_fn}") # <PowBackward0>
|
||||
print(f"z.grad_fn: {z.grad_fn}") # <AddBackward0>
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
x.grad_fn: None
|
||||
y.grad_fn: <PowBackward0 object at 0x...>
|
||||
z.grad_fn: <AddBackward0 object at 0x...>
|
||||
```
|
||||
|
||||
## The Computation Graph
|
||||
|
||||
Each operation creates a node in the computation graph:
|
||||
|
||||
```
|
||||
Forward:
|
||||
x (leaf) → [PowBackward0] → y → [AddBackward0] → z
|
||||
|
||||
Backward (when z.backward() is called):
|
||||
z ← [AddBackward0] ← y ← [PowBackward0] ← x
|
||||
grad = 1 grad = 1 grad = 2*x
|
||||
```
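
Running the same chain end to end ties the two halves of the diagram together (a minimal sketch, continuing the example above):

```python
import torch

x = torch.tensor([2.0], requires_grad=True)
y = x ** 2      # forward: PowBackward0 node recorded
z = y + 3       # forward: AddBackward0 node recorded

z.backward()    # backward: AddBackward0 -> PowBackward0 -> AccumulateGrad
print(x.grad)   # tensor([4.]) because dz/dx = 2*x = 4 at x = 2
```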
|
||||
|
||||
## Key Concepts
|
||||
|
||||
### 1. **Leaf Tensors**
|
||||
- Created directly by user (not from operations)
|
||||
- `grad_fn` is `None`
|
||||
- Only leaf tensors accumulate gradients in `.grad`
|
||||
|
||||
```python
|
||||
x = torch.tensor([2.0], requires_grad=True) # Leaf tensor
|
||||
print(x.is_leaf) # True
|
||||
print(x.grad_fn) # None
|
||||
```
|
||||
|
||||
### 2. **Non-Leaf Tensors**
|
||||
- Result of operations
|
||||
- Have `grad_fn` pointing to their creator
|
||||
- Don't accumulate gradients by default
|
||||
|
||||
```python
|
||||
y = x * 2 # Non-leaf tensor
|
||||
print(y.is_leaf) # False
|
||||
print(y.grad_fn) # <MulBackward0>
|
||||
```
|
||||
|
||||
### 3. **grad_fn Chains**
|
||||
Each `grad_fn` has `next_functions` pointing to previous operations:
|
||||
|
||||
```python
|
||||
x = torch.tensor([2.0], requires_grad=True)
|
||||
y = x * 2
|
||||
z = y + 3
|
||||
|
||||
print(z.grad_fn) # <AddBackward0>
|
||||
print(z.grad_fn.next_functions)
|
||||
# ((<MulBackward0 object at 0x...>, 0), ...)
# ^-- the MulBackward0 that produced y. Leaf inputs that require grad show up
#     as AccumulateGrad nodes; inputs that don't require grad (such as the
#     constant 3) appear as None or are omitted.
|
||||
```
|
||||
|
||||
## How `.backward()` Works
|
||||
|
||||
When you call `loss.backward()`:
|
||||
|
||||
1. **Start from loss tensor**
|
||||
```python
|
||||
loss.grad_fn # Starting point
|
||||
```
|
||||
|
||||
2. **Walk the graph backward**
|
||||
- Calls `.backward()` on each `grad_fn`
|
||||
- Each `grad_fn` computes gradients for its inputs
|
||||
- Passes gradients to `next_functions`
|
||||
|
||||
3. **Accumulate at leaf tensors**
|
||||
- When reaching a leaf tensor, gradient is stored in `.grad`
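
Putting the three steps together (a minimal sketch; the warning PyTorch emits when reading `.grad` of a non-leaf tensor is expected):

```python
import torch

w = torch.tensor([3.0], requires_grad=True)   # leaf, tracked
x = torch.tensor([2.0])                       # leaf, not tracked
h = w * x                                     # non-leaf, grad_fn = MulBackward0
loss = (h - 1.0).pow(2).sum()                 # scalar loss

loss.backward()      # 1. start at loss.grad_fn, 2. walk the graph backward
print(w.grad)        # 3. tensor([20.]) accumulated at the leaf: 2*(w*x - 1)*x
print(h.grad)        # None, non-leaf tensors don't keep .grad by default
print(x.grad)        # None, x has requires_grad=False
```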
|
||||
|
||||
## Code Pointers in PyTorch
|
||||
|
||||
### C++ Implementation
|
||||
|
||||
The core autograd engine is in C++:
|
||||
|
||||
**Main autograd engine:**
|
||||
- `torch/csrc/autograd/engine.cpp` - Executes backward pass
|
||||
- `torch/csrc/autograd/function.cpp` - Base class for all `grad_fn`
|
||||
- `torch/csrc/autograd/functions/` - Specific backward functions
|
||||
|
||||
**Key functions:**
|
||||
```cpp
|
||||
// torch/csrc/autograd/engine.cpp
|
||||
auto Engine::execute(...) {
|
||||
// Main backward execution loop
|
||||
// Walks the graph and calls grad_fn->apply()
|
||||
}
|
||||
```
|
||||
|
||||
### Python Side
|
||||
|
||||
**Tensor class:**
|
||||
- `torch/_tensor.py` - Defines `Tensor.backward()`
|
||||
- `torch/autograd/__init__.py` - Main autograd interface
|
||||
|
||||
**Custom autograd functions:**
|
||||
- `torch/autograd/function.py` - `torch.autograd.Function` base class
|
||||
- This is what `CompiledFunction` inherits from!
|
||||
|
||||
## Creating Custom grad_fn
|
||||
|
||||
You can create custom backward functions:
|
||||
|
||||
```python
|
||||
class MySquare(torch.autograd.Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x):
|
||||
ctx.save_for_backward(x)
|
||||
return x ** 2
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
x, = ctx.saved_tensors
|
||||
return grad_output * 2 * x
|
||||
|
||||
# Usage
|
||||
x = torch.tensor([2.0], requires_grad=True)
|
||||
y = MySquare.apply(x)
|
||||
print(y.grad_fn) # <MySquareBackward object>
|
||||
|
||||
y.backward()
|
||||
print(x.grad) # tensor([4.0]) = 2 * 2
|
||||
```
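
To sanity-check a custom backward like this, `torch.autograd.gradcheck` compares it against numerical finite differences (it expects double-precision inputs):

```python
import torch
from torch.autograd import gradcheck

inp = torch.randn(5, dtype=torch.double, requires_grad=True)
# Raises (or returns False with raise_exception=False) if MySquare.backward
# disagrees with the numerical Jacobian.
print(gradcheck(MySquare.apply, (inp,)))   # True
```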
|
||||
|
||||
## Visualizing the Graph
|
||||
|
||||
### Using `torchviz`:
|
||||
```python
|
||||
from torchviz import make_dot
|
||||
|
||||
x = torch.tensor([2.0], requires_grad=True)
|
||||
y = x ** 2
|
||||
z = y + 3
|
||||
|
||||
make_dot(z, params={'x': x}).render('computation_graph', format='png')
|
||||
```
|
||||
|
||||
### Manual inspection:
|
||||
```python
|
||||
def print_graph(fn, indent=0):
    # fn is a grad_fn node (torch.autograd.graph.Node), not a tensor
    if fn is None:
        return
    print(' ' * indent + str(fn))
    for next_fn, _ in fn.next_functions:
        print_graph(next_fn, indent + 2)

print_graph(z.grad_fn)
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
<AddBackward0 object at 0x...>
  <PowBackward0 object at 0x...>
    <AccumulateGrad object at 0x...>
|
||||
```
|
||||
|
||||
## Debugging Autograd
|
||||
|
||||
### 1. **Check if gradients are computed**
|
||||
```python
|
||||
x = torch.tensor([2.0], requires_grad=True)
|
||||
y = x * 2
|
||||
z = y + 3
|
||||
|
||||
print(x.requires_grad) # True
|
||||
print(y.requires_grad) # True
|
||||
print(z.requires_grad) # True
|
||||
```
|
||||
|
||||
### 2. **Retain gradients for non-leaf tensors**
|
||||
```python
|
||||
y.retain_grad() # Keep gradient for non-leaf
|
||||
z.backward()
|
||||
print(y.grad) # Now available!
|
||||
```
|
||||
|
||||
### 3. **Detect in-place operations**
|
||||
```python
|
||||
x = torch.tensor([2.0], requires_grad=True)
y = x.exp()     # exp saves its output y for the backward pass
y += 1          # in-place update of a tensor needed by backward
y.backward()    # RuntimeError: ... has been modified by an inplace operation
|
||||
```
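
When the offending line is hard to find in a larger model, anomaly detection makes autograd record where each tensor was created (slow, so use it only while debugging); a small sketch:

```python
import torch

try:
    with torch.autograd.detect_anomaly():
        x = torch.tensor([2.0], requires_grad=True)
        y = x.exp()
        y += 1
        y.backward()
except RuntimeError as err:
    # The error (plus an emitted warning) now points at the forward op
    # whose saved output was modified in place.
    print(err)
```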
|
||||
|
||||
### 4. **Hooks for debugging**
|
||||
```python
|
||||
def hook_fn(grad):
|
||||
print(f"Gradient: {grad}")
|
||||
|
||||
x = torch.tensor([2.0], requires_grad=True)
|
||||
x.register_hook(hook_fn)
|
||||
|
||||
y = x ** 2
|
||||
y.backward()
|
||||
# Prints: Gradient: tensor([4.0])
|
||||
```
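
Hooks also fire on non-leaf tensors, which is handy because their `.grad` is not kept by default (a small sketch):

```python
import torch

x = torch.tensor([2.0], requires_grad=True)
y = x ** 2
y.register_hook(lambda g: print(f"grad flowing through y: {g}"))

z = (3 * y).sum()
z.backward()
# Prints: grad flowing through y: tensor([3.])
# x.grad is tensor([12.]) = 3 * 2 * x
```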
|
||||
|
||||
## Connection to CompiledFunction
|
||||
|
||||
In our earlier example, `CompiledFunction` is just another `grad_fn`:
|
||||
|
||||
```python
|
||||
class CompiledFunction(torch.autograd.Function):
|
||||
@staticmethod
|
||||
def forward(ctx, *args):
|
||||
# Run compiled forward
|
||||
return compiled_fw(args)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, *grad_outputs):
|
||||
# Run compiled backward
|
||||
return compiled_bw(ctx.saved_tensors, grad_outputs)
|
||||
```
|
||||
|
||||
When you use `torch.compile()`:
|
||||
```python
|
||||
output = compiled_model(input)
|
||||
print(output.grad_fn) # <CompiledFunctionBackward>
|
||||
```
|
||||
|
||||
The output tensor's `grad_fn` points to `CompiledFunction.backward()`, which runs the compiled backward graph!
|
||||
|
||||
## Learning Resources
|
||||
|
||||
### Official PyTorch Docs
|
||||
1. **Autograd mechanics**: https://pytorch.org/docs/stable/notes/autograd.html
|
||||
2. **Extending autograd**: https://pytorch.org/docs/stable/notes/extending.html
|
||||
3. **Autograd API**: https://pytorch.org/docs/stable/autograd.html
|
||||
|
||||
### Code to Read
|
||||
1. Start with examples:
|
||||
- `/home/lsakka/pytorch10/pytorch/test/test_autograd.py`
|
||||
- `/home/lsakka/pytorch10/pytorch/test/test_custom_ops.py`
|
||||
|
||||
2. Understand custom functions:
|
||||
- `/home/lsakka/pytorch10/pytorch/torch/autograd/function.py`
|
||||
|
||||
3. See how ops register backward:
|
||||
- `/home/lsakka/pytorch10/pytorch/torch/_ops.py`
|
||||
- `/home/lsakka/pytorch10/pytorch/tools/autograd/derivatives.yaml`
|
||||
|
||||
### Key Files in PyTorch Codebase
|
||||
```
|
||||
torch/csrc/autograd/
|
||||
├── engine.cpp # Main backward execution engine
|
||||
├── function.cpp # Base grad_fn class
|
||||
├── functions/ # Backward implementations
|
||||
│ ├── basic_ops.cpp # Add, Mul, etc.
|
||||
│ └── tensor.cpp # Tensor operations
|
||||
└── variable.cpp # Tensor + autograd integration
|
||||
|
||||
torch/autograd/
|
||||
├── function.py # Python Function base class
|
||||
├── grad_mode.py # no_grad, enable_grad contexts
|
||||
└── profiler.py # Profiling autograd
|
||||
```
|
||||
|
||||
## Quick Reference
|
||||
|
||||
| Concept | Meaning |
|
||||
|---------|---------|
|
||||
| `tensor.grad_fn` | Backward function that created this tensor |
|
||||
| `tensor.is_leaf` | True if tensor is a leaf (user-created) |
|
||||
| `tensor.requires_grad` | True if tensor tracks gradients |
|
||||
| `tensor.grad` | Accumulated gradient (only for leaf tensors) |
|
||||
| `tensor.retain_grad()` | Keep gradient for non-leaf tensor |
|
||||
| `tensor.backward()` | Compute gradients for this tensor |
|
||||
| `next_functions` | Previous operations in the graph |
|
||||
| `AccumulateGrad` | Final node that accumulates gradient to `.grad` |
|
||||
|
||||
## Summary
|
||||
|
||||
**grad_fn** is the backbone of PyTorch's autograd system. It:
|
||||
- Links tensors to their creator operations
|
||||
- Forms the computation graph
|
||||
- Enables automatic differentiation
|
||||
- Gets called automatically during `.backward()`
|
||||
- Is how compiled backward works - `CompiledFunction.backward` is just another `grad_fn`!
|
||||
|
||||
Understanding `grad_fn` is key to understanding how PyTorch computes gradients automatically!
|
||||
150 BACKWARD_COMPILATION_EXPLANATION.md (new file)
@ -0,0 +1,150 @@
|
||||
# How `.backward()` Knows to Use the Compiled Backward Pass
|
||||
|
||||
## The Key Mechanism: Autograd Graph Connection
|
||||
|
||||
When you call `loss.backward()`, PyTorch **doesn't know** that the loss was computed with a compiled forward. Instead, it uses the **autograd graph** that was built during the forward pass.
|
||||
|
||||
## How It Works
|
||||
|
||||
### 1. **Forward Pass Creates Autograd Nodes**
|
||||
|
||||
When you run the compiled forward:
|
||||
|
||||
```python
|
||||
compiled_model = torch.compile(model)
|
||||
output = compiled_model(input_data) # This creates a CompiledFunction in the graph
|
||||
loss = criterion(output, target_data)
|
||||
```
|
||||
|
||||
The key class is `CompiledFunction(torch.autograd.Function)` in `/home/lsakka/pytorch10/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py:2103`:
|
||||
|
||||
```python
|
||||
class CompiledFunction(torch.autograd.Function):
|
||||
compiled_fw = compiled_fw_func
|
||||
compiled_bw = compiled_bw_func
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, *deduped_flat_tensor_args):
|
||||
# Runs the compiled forward graph
|
||||
fw_outs = call_func_at_runtime_with_args(
|
||||
CompiledFunction.compiled_fw,
|
||||
args,
|
||||
disable_amp=disable_amp,
|
||||
)
|
||||
# ... saves tensors for backward ...
|
||||
return tuple(raw_returns)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, *flat_args):
|
||||
# This gets called automatically by loss.backward()
|
||||
out = call_func_at_runtime_with_args(
|
||||
CompiledFunction.compiled_bw,
|
||||
all_args,
|
||||
steal_args=True,
|
||||
disable_amp=disable_amp,
|
||||
)
|
||||
return out
|
||||
```
|
||||
|
||||
### 2. **Tensor's `grad_fn` Points to CompiledFunction**
|
||||
|
||||
When `CompiledFunction.forward()` returns tensors, those tensors have their `grad_fn` attribute set to `CompiledFunctionBackward`:
|
||||
|
||||
```python
|
||||
output = compiled_model(input_data)
|
||||
print(output.grad_fn) # <CompiledFunctionBackward object>
|
||||
```
|
||||
|
||||
This is **automatic** - PyTorch's autograd system automatically creates this connection when you subclass `torch.autograd.Function`.
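
For instance, any `torch.autograd.Function` subclass gets this wiring for free (an illustrative sketch; `Doubler` is a made-up example, not PyTorch API):

```python
import torch

class Doubler(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        return x * 2

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output * 2

t = torch.ones(3, requires_grad=True)
out = Doubler.apply(t)
print(out.grad_fn)   # <DoublerBackward object at 0x...>, set automatically
```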
|
||||
|
||||
### 3. **`loss.backward()` Walks the Autograd Graph**
|
||||
|
||||
When you call `loss.backward()`:
|
||||
|
||||
```python
|
||||
loss = criterion(output, target_data)
|
||||
loss.backward() # Triggers the autograd engine
|
||||
```
|
||||
|
||||
PyTorch's autograd engine:
|
||||
1. Starts from the `loss` tensor
|
||||
2. Walks backward through the computation graph
|
||||
3. For each node, calls its `.backward()` method
|
||||
4. When it reaches `CompiledFunctionBackward`, it calls `CompiledFunction.backward()`
|
||||
5. `CompiledFunction.backward()` calls the **compiled backward graph**
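
A minimal sketch of that flow (exact `grad_fn` names can vary across PyTorch versions):

```python
import torch
import torch.nn.functional as F

model = torch.nn.Linear(4, 2)
compiled_model = torch.compile(model)

inp = torch.randn(8, 4)
target = torch.randn(8, 2)

out = compiled_model(inp)             # runs the compiled forward
print(type(out.grad_fn).__name__)     # e.g. 'CompiledFunctionBackward'

loss = F.mse_loss(out, target)
loss.backward()                       # plain autograd call; when the engine
                                      # reaches the compiled node it executes
                                      # the compiled backward graph
print(model.weight.grad.shape)        # torch.Size([2, 4])
```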
|
||||
|
||||
## Visual Flow
|
||||
|
||||
```
|
||||
User Code:
|
||||
output = compiled_model(input)
|
||||
↓
|
||||
Creates tensors with grad_fn = CompiledFunctionBackward
|
||||
↓
|
||||
loss = criterion(output, target)
|
||||
↓
|
||||
loss.backward() ← User calls this
|
||||
↓
|
||||
Autograd Engine:
|
||||
Walks graph → Finds CompiledFunctionBackward
|
||||
↓
|
||||
Calls CompiledFunction.backward()
|
||||
↓
|
||||
Executes compiled_bw(saved_tensors, grad_outputs)
|
||||
↓
|
||||
Returns gradients tuple (grad_W, grad_b, ...)
|
||||
↓
|
||||
Autograd Engine:
|
||||
Accumulates gradients into .grad attributes
|
||||
```
|
||||
|
||||
## Key Insight
|
||||
|
||||
**`.backward()` is NOT compiled**. What gets compiled is:
|
||||
- ✅ The **forward computation graph** → `compiled_fw`
|
||||
- ✅ The **backward computation graph** → `compiled_bw`
|
||||
|
||||
But the **autograd mechanism itself** (walking the graph, calling backward functions) is **NOT compiled** - it's the normal PyTorch autograd engine.
|
||||
|
||||
The compiled backward is just **one node** in the larger autograd graph. If you have:
|
||||
|
||||
```python
|
||||
output1 = compiled_model(input) # Compiled forward
|
||||
output2 = some_other_op(output1) # Regular eager op
|
||||
loss = output2.sum()
|
||||
loss.backward()
|
||||
```
|
||||
|
||||
The autograd graph will be:
|
||||
```
|
||||
loss → SumBackward0 → <some_other_op's backward> → CompiledFunctionBackward → ...
          [eager]             [eager]                      [COMPILED]
|
||||
```
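
One way to see that mix at runtime is to walk `next_functions` from the loss (a sketch that reuses the placeholder names from the snippet above and follows only the first parent at each step):

```python
fn = loss.grad_fn
while fn is not None:
    print(type(fn).__name__)   # SumBackward0, <eager backward>, ...,
                               # CompiledFunctionBackward, AccumulateGrad
    parents = [f for f, _ in fn.next_functions if f is not None]
    fn = parents[0] if parents else None
```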
|
||||
|
||||
## When is the Backward Compiled?
|
||||
|
||||
The backward graph can be compiled in two modes:
|
||||
|
||||
### 1. **Eager Compilation (Default for AOT Autograd)**
|
||||
- Backward is compiled during the first forward pass
|
||||
- Located at line 1909 in `runtime_wrappers.py`
|
||||
|
||||
### 2. **Lazy Compilation (Used by torch.compile)**
|
||||
- Backward is compiled the first time `.backward()` is called
|
||||
- Located at line 2468 in `runtime_wrappers.py`
|
||||
|
||||
```python
|
||||
if CompiledFunction.compiled_bw is None:
|
||||
# First backward call - compile the backward graph now
|
||||
CompiledFunction.compiled_bw = aot_config.bw_compiler(
|
||||
bw_module, placeholder_list
|
||||
)
|
||||
```
|
||||
|
||||
## Summary
|
||||
|
||||
**Q: How does `loss.backward()` know to use the compiled backward?**
|
||||
|
||||
**A:** It doesn't need to know! The compiled forward creates output tensors whose `grad_fn` points to `CompiledFunction.backward()`. When you call `loss.backward()`, PyTorch's autograd engine walks the graph and automatically calls `CompiledFunction.backward()`, which internally calls the compiled backward graph.
|
||||
|
||||
The magic is in the **autograd graph connection**, not in `.backward()` itself!
|
||||
@ -289,14 +289,15 @@ IF(USE_FBGEMM_GENAI)
|
||||
|
||||
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
set(fbgemm_genai_mx8mx8bf16_grouped
|
||||
set(fbgemm_genai_cuh
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
|
||||
"${FBGEMM_GENAI_SRCS}/"
|
||||
)
|
||||
|
||||
target_include_directories(fbgemm_genai PRIVATE
|
||||
${FBGEMM_THIRD_PARTY}/cutlass/include
|
||||
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
|
||||
${fbgemm_genai_mx8mx8bf16_grouped}
|
||||
${fbgemm_genai_cuh}
|
||||
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
|
||||
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
|
||||
)
|
||||
@ -313,14 +314,13 @@ IF(USE_FBGEMM_GENAI)
|
||||
|
||||
# Add additional HIPCC compiler flags for performance
|
||||
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
|
||||
-mllvm
|
||||
-amdgpu-coerce-illegal-types=1
|
||||
-mllvm
|
||||
-enable-post-misched=0
|
||||
-mllvm
|
||||
-greedy-reverse-local-assignment=1
|
||||
-fhip-new-launch-api)
|
||||
if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0")
|
||||
list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1)
|
||||
endif()
|
||||
|
||||
# Only compile for gfx942 for now.
|
||||
# This is rather hacky, I could not figure out a clean solution :(
|
||||
|
||||
@ -183,6 +183,11 @@ struct CUDACachingHostAllocatorImpl
|
||||
return true;
|
||||
}
|
||||
|
||||
bool pinned_use_background_threads() override {
|
||||
return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
|
||||
pinned_use_background_threads();
|
||||
}
|
||||
|
||||
EventPool::Event create_event_internal(DeviceIndex idx) {
|
||||
// Leak the event pool to avoid shutdown issue.
|
||||
static auto* event_pool = new EventPool();
|
||||
|
||||
@ -177,6 +177,7 @@ inline void segmented_sort_pairs(
|
||||
}
|
||||
}
|
||||
|
||||
#if CUB_SUPPORTS_UNIQUE_BY_KEY()
|
||||
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename NumSelectedIteratorT>
|
||||
inline void unique_by_key(
|
||||
KeysInputIteratorT keys_in, ValuesInputIteratorT values_in,
|
||||
@ -192,6 +193,7 @@ inline void unique_by_key(
|
||||
CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey,
|
||||
keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream());
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace impl {
|
||||
|
||||
@ -577,6 +579,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
|
||||
#endif
|
||||
}
|
||||
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
|
||||
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT>
|
||||
inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) {
|
||||
@ -604,6 +607,7 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT>
|
||||
void unique(InputIteratorT input, OutputIteratorT output,
|
||||
|
||||
@ -28,6 +28,22 @@
|
||||
#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
|
||||
#endif
|
||||
|
||||
// cub support for UniqueByKey is added to cub 1.16 in:
|
||||
// https://github.com/NVIDIA/cub/pull/405
|
||||
#if CUB_VERSION >= 101600
|
||||
#define CUB_SUPPORTS_UNIQUE_BY_KEY() true
|
||||
#else
|
||||
#define CUB_SUPPORTS_UNIQUE_BY_KEY() false
|
||||
#endif
|
||||
|
||||
// cub support for scan by key is added to cub 1.15
|
||||
// in https://github.com/NVIDIA/cub/pull/376
|
||||
#if CUB_VERSION >= 101500
|
||||
#define CUB_SUPPORTS_SCAN_BY_KEY() 1
|
||||
#else
|
||||
#define CUB_SUPPORTS_SCAN_BY_KEY() 0
|
||||
#endif
|
||||
|
||||
// cub support for cub::FutureValue is added to cub 1.15 in:
|
||||
// https://github.com/NVIDIA/cub/pull/305
|
||||
#if CUB_VERSION >= 101500
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
#include <ATen/core/ATen_fwd.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <c10/core/SymInt.h>
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/AccumulateType.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
@ -1710,11 +1711,14 @@ Tensor narrow_symint(
|
||||
"], but got ",
|
||||
start,
|
||||
")")
|
||||
if (start < 0) {
|
||||
start = start + cur_size;
|
||||
}
|
||||
// Bounds check without converting start:
|
||||
// - If start < 0: need (start + cur_size) + length <= cur_size, i.e., start +
|
||||
// length <= 0
|
||||
// - If start >= 0: need start + length <= cur_size
|
||||
auto end = start + length;
|
||||
TORCH_SYM_CHECK(
|
||||
start.sym_le(cur_size - length),
|
||||
(start.sym_lt(0).sym_and((end).sym_le(0)))
|
||||
.sym_or(start.sym_ge(0).sym_and((end).sym_le(cur_size))),
|
||||
"start (",
|
||||
start,
|
||||
") + length (",
|
||||
@ -1722,7 +1726,31 @@ Tensor narrow_symint(
|
||||
") exceeds dimension size (",
|
||||
cur_size,
|
||||
").");
|
||||
return at::slice_symint(self, dim, start, start + length, 1);
|
||||
|
||||
if (TORCH_GUARD_OR_FALSE(start.sym_ge(0).sym_or(end.sym_ne(0)))) {
|
||||
return at::slice_symint(self, dim, start, end, 1);
|
||||
} else if (TORCH_GUARD_OR_FALSE(start.sym_lt(0))) {
|
||||
// Avoid the complex symbolic expressions path for non-unbacked.
|
||||
return at::slice_symint(self, dim, start + cur_size, end + cur_size, 1);
|
||||
} else {
|
||||
// Cannot statically determine the condition due to unbacked.
|
||||
// This is an interesting situation; when start is negative and
|
||||
// start + length == 0, slice and narrow do different things.
|
||||
// i.e., x.narrow(0, -2, 2) != x[-2:0]; in that case, we want to
|
||||
// pass curr_size instead of 0. Otherwise, they would do the same thing.
|
||||
// This says at runtime: if start < 0 and end == 0, then pass curr_size
|
||||
// instead of 0.
|
||||
|
||||
auto use_different = start.sym_lt(0).sym_and(end.sym_eq(0)).toSymInt();
|
||||
auto result =
|
||||
at::slice_symint(self, dim, start, end + use_different * cur_size, 1);
|
||||
|
||||
// Ensure slice allocated unbacked size is specialized to length.
|
||||
SymInt new_size = result.sym_size(dim);
|
||||
TORCH_SYM_CHECK(new_size.sym_eq(length), "")
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// This overload exists purely for XLA, because they wanted to pass in
|
||||
@ -1736,8 +1764,8 @@ Tensor narrow_tensor_symint(
|
||||
start.dim() == 0 &&
|
||||
isIntegralType(start.scalar_type(), /*includeBool=*/false),
|
||||
"start must be an 0-dim integral Tensor.");
|
||||
int64_t st = start.item<int64_t>();
|
||||
return at::narrow_symint(self, dim, c10::SymInt(st), std::move(length));
|
||||
c10::SymInt st = start.item().toSymInt();
|
||||
return at::narrow_symint(self, dim, std::move(st), std::move(length));
|
||||
}
|
||||
|
||||
std::
|
||||
|
||||
@ -141,8 +141,6 @@ void compute_triu_tril(const Tensor& self, int64_t k, const Tensor &result) {
|
||||
return;
|
||||
}
|
||||
|
||||
checkTrilTriuMemoryOverlap(result, self);
|
||||
|
||||
bool inplace_op = self.is_same(result);
|
||||
|
||||
bool inplace_update = false;
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
#include <ATen/MemoryOverlap.h>
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/native/LinearAlgebraUtils.h>
|
||||
|
||||
@ -55,13 +54,4 @@ static inline std::tuple<bool, Tensor> checkTrilTriuBatchContiguous(const Tensor
|
||||
return std::make_tuple(true, tensor);
|
||||
}
|
||||
|
||||
static inline void checkTrilTriuMemoryOverlap(const Tensor& result, const Tensor& self) {
|
||||
if (result.is_same(self)) {
|
||||
at::assert_no_internal_overlap(result);
|
||||
} else {
|
||||
at::assert_no_internal_overlap(result);
|
||||
at::assert_no_overlap(result, self);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
@ -120,7 +120,7 @@ static void pow_tensor_scalar_kernel(
|
||||
} else if (dtype == ScalarType::Half) {
|
||||
[&]() {
|
||||
using scalar_t =
|
||||
decltype(c10::impl::ScalarTypeToCPPType<ScalarType::Half>::t);
|
||||
c10::impl::ScalarTypeToCPPTypeT<ScalarType::Half>;
|
||||
const auto exp = exp_scalar.to<scalar_t>();
|
||||
using Vec = Vectorized<scalar_t>;
|
||||
cpu_kernel_vec(iter,
|
||||
|
||||
@ -2322,23 +2322,12 @@ _scaled_nvfp4_nvfp4(
|
||||
const Tensor& scale_b, const SwizzleType swizzle_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
Tensor& out,
|
||||
const std::optional<Tensor>& global_scale_a = std::nullopt,
|
||||
const std::optional<Tensor>& global_scale_b = std::nullopt) {
|
||||
const bool single_scale,
|
||||
Tensor& out) {
|
||||
#ifdef USE_ROCM
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "NVFP4 scaling not supported on ROCM");
|
||||
#endif
|
||||
std::optional<Tensor> alpha = std::nullopt;
|
||||
// Note: "Or" here means that if only one scale is passed, we check for the other. Otherwise,
|
||||
// if this is "And" we would silently do nothing in the case where one global scale is
|
||||
// passed and not the other.
|
||||
if (global_scale_a.has_value() || global_scale_b.has_value()) {
|
||||
TORCH_CHECK_VALUE(global_scale_a.has_value(),
|
||||
"For two-level-scaled NVFP4, global_scale_a must have a value");
|
||||
TORCH_CHECK_VALUE(global_scale_b.has_value(),
|
||||
"For two-level-scaled NVFP4, global_scale_b must have a value");
|
||||
alpha = global_scale_a.value().mul(global_scale_b.value());
|
||||
}
|
||||
TORCH_CHECK_VALUE(single_scale, "Only single-scaled NVFP4 currently supported");
|
||||
// Restrictions:
|
||||
// A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
|
||||
// Scales must be swizzled
|
||||
@ -2360,7 +2349,7 @@ _scaled_nvfp4_nvfp4(
|
||||
|
||||
auto scaling_choice_a = ScalingType::BlockWise1x16;
|
||||
auto scaling_choice_b = ScalingType::BlockWise1x16;
|
||||
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out, alpha);
|
||||
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
|
||||
}
|
||||
|
||||
|
||||
@ -2566,10 +2555,9 @@ _scaled_mm_cuda_v2_out(
|
||||
} else if (gemm_impl == ScaledGemmImplementation::MXFP8_MXFP8) {
|
||||
return _scaled_mxfp8_mxfp8(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
|
||||
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4) {
|
||||
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out,
|
||||
scale_a[1], scale_b[1]);
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Only single-scale NVFP4 currently supported");
|
||||
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE) {
|
||||
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
|
||||
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, true /* single_scale */, out);
|
||||
} else if (gemm_impl == ScaledGemmImplementation::MXFP4_MXFP4) {
|
||||
return _scaled_mxfp4_mxfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
|
||||
} else {
|
||||
|
||||
@ -856,9 +856,13 @@ struct type_specialized_kernel_launcher {
|
||||
out_calc_t output_offset_calculator,
|
||||
loader_t loader,
|
||||
storer_t storer) {
|
||||
if (ret_t == rt_binary_specializations[arg_index][0] &&
|
||||
arg0_t == rt_binary_specializations[arg_index][1] &&
|
||||
arg1_t == rt_binary_specializations[arg_index][2])
|
||||
constexpr ScalarType sret_t = rt_binary_specializations[arg_index][0];
|
||||
constexpr ScalarType sarg0_t = rt_binary_specializations[arg_index][1];
|
||||
constexpr ScalarType sarg1_t = rt_binary_specializations[arg_index][2];
|
||||
if (ret_t == sret_t && arg0_t == sarg0_t && arg1_t == sarg1_t) {
|
||||
using cret_t = c10::impl::ScalarTypeToCPPTypeT<sret_t>;
|
||||
using carg0_t = c10::impl::ScalarTypeToCPPTypeT<sarg0_t>;
|
||||
using carg1_t = c10::impl::ScalarTypeToCPPTypeT<sarg1_t>;
|
||||
launch_vectorized_templated_kernel<
|
||||
func_t,
|
||||
array_t,
|
||||
@ -866,12 +870,9 @@ struct type_specialized_kernel_launcher {
|
||||
out_calc_t,
|
||||
loader_t,
|
||||
storer_t,
|
||||
decltype(c10::impl::ScalarTypeToCPPType<
|
||||
rt_binary_specializations[arg_index][0]>::t),
|
||||
decltype(c10::impl::ScalarTypeToCPPType<
|
||||
rt_binary_specializations[arg_index][1]>::t),
|
||||
decltype(c10::impl::ScalarTypeToCPPType<
|
||||
rt_binary_specializations[arg_index][2]>::t)>(
|
||||
cret_t,
|
||||
carg0_t,
|
||||
carg1_t>(
|
||||
numel,
|
||||
f,
|
||||
data,
|
||||
@ -879,6 +880,7 @@ struct type_specialized_kernel_launcher {
|
||||
output_offset_calculator,
|
||||
loader,
|
||||
storer);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -15,7 +15,9 @@
|
||||
#include <ATen/native/cuda/block_reduce.cuh>
|
||||
#include <ATen/native/cuda/thread_constants.h>
|
||||
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
#include <thrust/iterator/reverse_iterator.h>
|
||||
#endif
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
@ -238,6 +240,10 @@ __global__ void renorm_kernel(
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
#if !CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
template<typename index_t>
|
||||
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
|
||||
#endif
|
||||
|
||||
Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_,
|
||||
int64_t num_weights, int64_t padding_idx,
|
||||
@ -300,6 +306,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
|
||||
|
||||
if (scale_grad_by_freq) {
|
||||
count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
@ -326,6 +333,11 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
|
||||
num_indices
|
||||
);
|
||||
});
|
||||
#else
|
||||
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
|
||||
embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
|
||||
});
|
||||
#endif
|
||||
}
|
||||
|
||||
return embedding_backward_cuda_kernel(grad, orig_indices,
|
||||
|
||||
@ -10,7 +10,9 @@
|
||||
|
||||
#include <c10/macros/Macros.h>
|
||||
|
||||
#if CUB_SUPPORTS_UNIQUE_BY_KEY()
|
||||
#include <thrust/iterator/counting_iterator.h>
|
||||
#endif
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
@ -194,9 +196,18 @@ __global__ void compute_num_of_partial_segments(const index_t *partials_per_segm
|
||||
partials_per_segment_offset[num_of_segments-1];
|
||||
}
|
||||
|
||||
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
|
||||
__global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) {
|
||||
*num_of_segments_ptr = num_of_segments;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // anon namespace
|
||||
|
||||
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
|
||||
template<typename index_t>
|
||||
int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets);
|
||||
#endif
|
||||
|
||||
Tensor embedding_backward_cuda_kernel(
|
||||
const Tensor &grad,
|
||||
@ -223,12 +234,20 @@ Tensor embedding_backward_cuda_kernel(
|
||||
auto segment_offsets = at::empty({numel}, orig_indices.options());
|
||||
auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong));
|
||||
int64_t *num_of_segments_ptr = num_of_segments_tensor.mutable_data_ptr<int64_t>();
|
||||
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
|
||||
AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
|
||||
int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key<index_t>(sorted_indices, segment_offsets);
|
||||
write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
});
|
||||
#else
|
||||
AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
|
||||
cuda::cub::unique_by_key(
|
||||
sorted_indices.const_data_ptr<index_t>(), thrust::make_counting_iterator(0),
|
||||
segment_offsets.mutable_data_ptr<index_t>(),
|
||||
num_of_segments_ptr, sorted_indices.numel());
|
||||
});
|
||||
#endif
|
||||
|
||||
int64_t max_segments = std::min<int64_t>(numel, num_weights);
|
||||
|
||||
|
||||
@ -31,10 +31,16 @@
|
||||
|
||||
#include <c10/macros/Macros.h>
|
||||
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
#include <thrust/iterator/reverse_iterator.h>
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
|
||||
#if !CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
template<typename index_t>
|
||||
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
@ -193,6 +199,7 @@ Tensor embedding_bag_backward_cuda_sum_avg(
|
||||
|
||||
if (scale_grad_by_freq) {
|
||||
count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
@ -219,6 +226,11 @@ Tensor embedding_bag_backward_cuda_sum_avg(
|
||||
num_indices
|
||||
);
|
||||
});
|
||||
#else
|
||||
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
|
||||
embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
|
||||
});
|
||||
#endif
|
||||
}
|
||||
return embedding_backward_cuda_kernel(grad, orig_indices, sorted_indices,
|
||||
count, num_weights, padding_idx, mode == EmbeddingBagMode::MEAN, offset2bag,
|
||||
|
||||
90 aten/src/ATen/native/cuda/LegacyThrustHelpers.cu (new file)
@ -0,0 +1,90 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/native/cuda/SortingCommon.cuh>
|
||||
#include <ATen/cuda/cub_definitions.cuh>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#else
|
||||
#include <ATen/ops/empty_like.h>
|
||||
#endif
|
||||
|
||||
#include <ATen/cuda/ThrustAllocator.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
#include <thrust/sort.h>
|
||||
#include <thrust/unique.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/iterator/constant_iterator.h>
|
||||
|
||||
namespace at::native {
|
||||
|
||||
#if !CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
|
||||
template<typename index_t>
|
||||
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count) {
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
at::cuda::ThrustAllocator allocator;
|
||||
auto policy = thrust::cuda::par(allocator).on(stream);
|
||||
|
||||
auto num_indices = count.numel();
|
||||
|
||||
// Compute an increasing sequence per unique item in sortedIndices:
|
||||
// sorted: 2 5 5 5 7 7 8 9 9
|
||||
// count: 1 1 2 3 1 2 1 1 2
|
||||
auto sorted_data = thrust::device_ptr<const index_t>(sorted_indices.const_data_ptr<index_t>());
|
||||
auto count_data = thrust::device_ptr<index_t>(count.mutable_data_ptr<index_t>());
|
||||
thrust::inclusive_scan_by_key(
|
||||
policy,
|
||||
sorted_data,
|
||||
sorted_data + num_indices,
|
||||
thrust::make_constant_iterator(1),
|
||||
count_data
|
||||
);
|
||||
|
||||
// Take the maximum of each count per unique key in reverse:
|
||||
// sorted: 2 5 5 5 7 7 8 9 9
|
||||
// count: 1 3 3 3 2 2 1 2 2
|
||||
thrust::inclusive_scan_by_key(
|
||||
policy,
|
||||
thrust::make_reverse_iterator(sorted_data + num_indices),
|
||||
thrust::make_reverse_iterator(sorted_data),
|
||||
thrust::make_reverse_iterator(count_data + num_indices),
|
||||
thrust::make_reverse_iterator(count_data + num_indices),
|
||||
thrust::equal_to<index_t>(),
|
||||
thrust::maximum<index_t>()
|
||||
);
|
||||
}
|
||||
|
||||
template
|
||||
void embedding_dense_backward_cuda_scan<int>(Tensor &sorted_indices, Tensor &count);
|
||||
template
|
||||
void embedding_dense_backward_cuda_scan<int64_t>(Tensor &sorted_indices, Tensor &count);
|
||||
|
||||
#endif
|
||||
|
||||
template<typename index_t>
|
||||
int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets) {
|
||||
auto stream = at::cuda::getCurrentCUDAStream();
|
||||
at::cuda::ThrustAllocator allocator;
|
||||
auto policy = thrust::cuda::par(allocator).on(stream);
|
||||
const ptrdiff_t numel = sorted_indices.numel();
|
||||
auto sorted_indices_dev = thrust::device_ptr<const index_t>(sorted_indices.const_data_ptr<index_t>());
|
||||
auto dummy = at::empty_like(sorted_indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
|
||||
auto dummy_dev = thrust::device_ptr<index_t>(dummy.mutable_data_ptr<index_t>());
|
||||
auto ends = thrust::unique_by_key_copy(
|
||||
policy,
|
||||
sorted_indices_dev,
|
||||
sorted_indices_dev + numel,
|
||||
thrust::make_counting_iterator(0),
|
||||
dummy_dev,
|
||||
thrust::device_ptr<index_t>(segment_offsets.mutable_data_ptr<index_t>()));
|
||||
return thrust::get<0>(ends) - dummy_dev;
|
||||
}
|
||||
|
||||
template
|
||||
int64_t embedding_backward_cuda_kernel_unique_by_key<int>(const Tensor &sorted_indices, Tensor &segment_offsets);
|
||||
template
|
||||
int64_t embedding_backward_cuda_kernel_unique_by_key<int64_t>(const Tensor &sorted_indices, Tensor &segment_offsets);
|
||||
|
||||
} // namespace at::native
|
||||
@ -146,7 +146,6 @@ __global__ void nll_loss2d_backward_no_reduce_kernel(
|
||||
int64_t batch_size = target.size(0);
|
||||
int64_t H = target.size(1);
|
||||
int64_t W = target.size(2);
|
||||
int64_t n_classes = grad_input.size(1);
|
||||
|
||||
CUDA_KERNEL_LOOP(index, n_threads) {
|
||||
const int64_t b = index % batch_size;
|
||||
@ -157,7 +156,6 @@ __global__ void nll_loss2d_backward_no_reduce_kernel(
|
||||
if (cur_target == ignore_index) {
|
||||
continue;
|
||||
}
|
||||
CUDA_KERNEL_ASSERT(cur_target >= 0 && cur_target < n_classes);
|
||||
scalar_t value = -(weight != nullptr ? weight[cur_target] : static_cast<scalar_t>(1));
|
||||
grad_input[b][cur_target][h][w] = value * grad_output[b][h][w];
|
||||
}
|
||||
|
||||
@ -413,12 +413,14 @@ struct ReduceOp {
|
||||
value = thread_reduce<output_vec_size>(input_slice);
|
||||
}
|
||||
|
||||
if (config.should_block_x_reduce()) {
|
||||
value = block_x_reduce<output_vec_size>(value, shared_memory);
|
||||
}
|
||||
if (config.should_block_y_reduce()) {
|
||||
value = block_y_reduce<output_vec_size>(value, shared_memory);
|
||||
}
|
||||
__syncthreads();
|
||||
if (config.should_block_x_reduce()) {
|
||||
value = block_x_reduce<output_vec_size>(value, shared_memory);
|
||||
}
|
||||
|
||||
using out_ptr_vec_t = std::array<out_scalar_t*, output_vec_size>;
|
||||
using offset_vec_t = std::array<index_t, output_vec_size>;
|
||||
offset_vec_t base_offsets;
|
||||
@ -655,8 +657,8 @@ struct ReduceOp {
|
||||
__syncthreads();
|
||||
// Intra-warp reduction, fix CUDA to have offset decreasing for better numerics
|
||||
// matching Triton, etc.
|
||||
// TODO(PaulZhang12): AMD and internal
|
||||
#if defined(USE_ROCM) || defined(FBCODE_CAFFE2)
|
||||
// todo for AMD
|
||||
#ifdef USE_ROCM
|
||||
for (int offset = 1; offset < dim_x; offset <<= 1) {
|
||||
#else
|
||||
for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {
|
||||
|
||||
@ -19,6 +19,7 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
// TODO: remove this when CUDA <11.6 is no longer supported
|
||||
void topk_out_with_sort(
|
||||
const Tensor& self,
|
||||
int64_t k, int64_t dim, bool largest,
|
||||
@ -30,12 +31,21 @@ void topk_out_with_sort(
|
||||
indices.copy_(sorted_indices.narrow(dim, 0, k));
|
||||
}
|
||||
|
||||
// TODO: remove this when CUDA <11.6 is no longer supported
|
||||
bool disable_sort_for_topk();
|
||||
bool should_use_sort(const Tensor& self, int64_t dim) {
|
||||
#if defined(USE_ROCM)
|
||||
if (self.dtype() == kBool) return false; // Bool sort not supported in ROCm: https://github.com/pytorch/pytorch/issues/139972
|
||||
return (self.numel() >= 10000 && self.numel() == self.size(dim)); // based on the experiments in https://github.com/pytorch/pytorch/pull/146387
|
||||
#else
|
||||
return false;
|
||||
if (disable_sort_for_topk()) return false;
|
||||
// This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/68632
|
||||
if (self.dim() == 0) return false;
|
||||
if (self.dtype() == kBool) return false; // Bool is not support by topk
|
||||
int64_t slice_size = self.size(dim);
|
||||
if (slice_size == 0) return false;
|
||||
int64_t num_slices = self.numel() / slice_size;
|
||||
return num_slices <= 10 && slice_size >= 100000;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -21,6 +21,11 @@ using namespace at::native;
|
||||
|
||||
namespace at::native {
|
||||
|
||||
// TODO: remove this when CUDA <11.6 is no longer supported
|
||||
bool disable_sort_for_topk() {
|
||||
return CUB_SUPPORTS_SCAN_BY_KEY();
|
||||
}
|
||||
|
||||
namespace sbtopk { // single_block_topk
|
||||
|
||||
template <typename T>
|
||||
@ -413,6 +418,10 @@ __global__ void computeBlockwiseWithinKCounts(
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
#if !CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
return;
|
||||
#endif
|
||||
|
||||
Bitwise desired_digit = at::cuda::Bitfield<Bitwise>::getBitfield(desired, current_bit, RADIX_BITS);
|
||||
|
||||
// if largest, then only threads that has tidx > desired_digit are active
|
||||
@ -468,6 +477,7 @@ __global__ void computeBlockwiseWithinKCounts(
|
||||
}
|
||||
}
|
||||
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
// Assumption: slice_size can not be larger than UINT32_MAX
|
||||
template <typename Bitwise>
|
||||
__global__ void computeBlockwiseKthCounts(
|
||||
@ -599,6 +609,7 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo<const T, IndexType> inpu
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) {
|
||||
// occupancy of this kernel is limited by registers per threads
|
||||
@ -676,12 +687,16 @@ void launch(
|
||||
uint32_t* digit_cum_sum = reinterpret_cast<uint32_t*>(digit_cum_sum_buffer.get());
|
||||
AT_CUDA_CHECK(cudaMemsetAsync(digit_cum_sum, 0, numInputSlices * RADIX_DIGITS * sizeof(uint32_t), stream));
|
||||
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
|
||||
uint32_t* withinKCounts = reinterpret_cast<uint32_t*>(withinKCounts_buffer.get());
|
||||
AT_CUDA_CHECK(cudaMemsetAsync(withinKCounts, 0, num_blocks * sizeof(uint32_t), stream));
|
||||
|
||||
auto kthCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
|
||||
uint32_t* kthCounts = reinterpret_cast<uint32_t*>(kthCounts_buffer.get());
|
||||
#else
|
||||
uint32_t* withinKCounts = nullptr;
|
||||
#endif
|
||||
|
||||
Bitwise desiredMask = 0;
|
||||
dim3 grid;
|
||||
@ -728,6 +743,7 @@ void launch(
|
||||
}
|
||||
desired = desired_in;
|
||||
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
computeBlockwiseKthCounts<Bitwise><<<std::min(((int64_t)numInputSlices + 255) / 256, (int64_t)1073741824), 256, 0, stream>>>(
|
||||
desired, counts, num_blocks, blocks_per_slice, kthCounts);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
@ -743,6 +759,28 @@ void launch(
|
||||
topK, topKWithinSliceStride, indices, indicesWithinSliceStride, items_per_thread,
|
||||
blocks_per_slice, kthValues, withinKCounts, kthCounts, num_blocks);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
#else
|
||||
// Find topk values based on kth values
|
||||
{
|
||||
dim3 grid;
|
||||
TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices for topk");
|
||||
int warp_size = at::cuda::warp_size();
|
||||
dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024));
|
||||
sbtopk::gatherTopK<T, IndexType, Dim, /* WithKthValues= */true><<<grid, block, 0, stream>>>(
|
||||
input,
|
||||
inputSliceSize,
|
||||
outputSliceSize,
|
||||
largest,
|
||||
numInputSlices,
|
||||
inputWithinSliceStride,
|
||||
topK,
|
||||
topKWithinSliceStride,
|
||||
indices,
|
||||
indicesWithinSliceStride,
|
||||
kthValues);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace mbtopk
|
||||
@ -750,6 +788,7 @@ void launch(
|
||||
bool should_use_multiblock(int64_t num_slices, int64_t slice_size) {
|
||||
if (num_slices > std::numeric_limits<uint32_t>::max() ||
|
||||
slice_size > std::numeric_limits<uint32_t>::max()) return false;
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
// This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/74267
|
||||
return (num_slices <= 20 && slice_size >= 20000) ||
|
||||
(num_slices > 20 && num_slices <= 40 && slice_size >= 10000) ||
|
||||
@ -758,6 +797,12 @@ bool should_use_multiblock(int64_t num_slices, int64_t slice_size) {
|
||||
(num_slices >= 200 && num_slices < 800 && slice_size >= 3000) ||
|
||||
(num_slices >= 800 && num_slices <= 4000 && slice_size >= 800) ||
|
||||
(num_slices > 4000 && slice_size >= 400);
|
||||
#else
|
||||
// This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/71081
|
||||
return (num_slices <= 400 && slice_size >= 5000) ||
|
||||
(num_slices > 400 && num_slices < 4000 && slice_size >= 1000) ||
|
||||
(num_slices >= 4000 && slice_size >= 300);
|
||||
#endif
|
||||
}
|
||||
|
||||
void launch_gather_topk_kernel(
|
||||
|
||||
@ -5,7 +5,6 @@
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/MemoryOverlap.h>
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <ATen/native/TriangularOpsUtils.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
@ -45,7 +44,7 @@ __global__ void triu_tril_kernel(
|
||||
const int64_t k,
|
||||
const int64_t N_padded,
|
||||
const IndexType last_dim_padded) {
|
||||
int64_t linear_idx = (((int64_t)blockIdx.x) * blockDim.x + threadIdx.x) * elements_per_thread;
|
||||
int64_t linear_idx = (blockIdx.x * blockDim.x + threadIdx.x) * elements_per_thread;
|
||||
if (linear_idx >= N_padded) {
|
||||
return;
|
||||
}
|
||||
@ -111,8 +110,6 @@ __global__ void triu_tril_kernel(
|
||||
|
||||
template <bool upper>
|
||||
void triu_tril_cuda_template(const Tensor& result, const Tensor& self, int64_t k, const char* name) {
|
||||
checkTrilTriuMemoryOverlap(result, self);
|
||||
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
|
||||
at::ScalarType::ComplexHalf,
|
||||
at::ScalarType::Half,
|
||||
|
||||
@ -466,7 +466,7 @@ struct ReduceJitOp {
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#if defined(USE_ROCM) || defined(FBCODE_CAFFE2)
|
||||
#ifdef USE_ROCM
|
||||
for (int offset = 1; offset < dim_x; offset <<= 1) {
|
||||
#else
|
||||
for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {
|
||||
|
||||
@ -441,7 +441,7 @@ kernel void applySYRK(
|
||||
uint3 tid [[thread_position_in_threadgroup]],
|
||||
uint3 tgid [[threadgroup_position_in_grid]],
|
||||
uint3 tpg [[threads_per_threadgroup]],
|
||||
uint warp_id [[simdgroup_index_in_threadgroup]]) {
|
||||
uint sgitg [[simdgroup_index_in_threadgroup]]) {
|
||||
const uint tx = tid.x;
|
||||
const uint ty = tid.y;
|
||||
const uint simdGroupsPerThreadgroup = (tpg.x * tpg.y + 31) / 32;
|
||||
@ -474,8 +474,11 @@ kernel void applySYRK(
|
||||
(actSize_j % 8 == 0) && (actSize_h % 8 == 0) && (actSize_k % 8 == 0);
|
||||
|
||||
if (use_simdgroup) {
|
||||
uint warp_id = sgitg;
|
||||
|
||||
simdgroup_matrix<float, 8, 8> negative_identity =
|
||||
simdgroup_matrix<float, 8, 8>(-1.0);
|
||||
simdgroup_matrix<float, 8, 8> identity = simdgroup_matrix<float, 8, 8>(1.0);
|
||||
simdgroup_matrix<float, 8, 8> Prod;
|
||||
simdgroup_matrix<float, 8, 8> Afrag;
|
||||
simdgroup_matrix<float, 8, 8> Bfrag;
|
||||
@ -518,7 +521,8 @@ kernel void applySYRK(
|
||||
/* transpose = */ upper);
|
||||
|
||||
simdgroup_multiply(Prod, Afrag, Bfrag);
|
||||
simdgroup_multiply_accumulate(Cfrag, Prod, negative_identity, Cfrag);
|
||||
simdgroup_multiply(Prod, Prod, negative_identity);
|
||||
simdgroup_multiply_accumulate(Cfrag, Cfrag, identity, Prod);
|
||||
}
|
||||
|
||||
simdgroup_store(
|
||||
|
||||
@ -706,7 +706,6 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_all
|
||||
tags: reduction
|
||||
|
||||
|
||||
- func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
|
||||
@ -716,7 +715,6 @@
|
||||
cpp_no_default_args: ['dim']
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: all_dims_default
|
||||
tags: reduction
|
||||
|
||||
- func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -725,7 +723,6 @@
|
||||
CPU, CUDA: all_out
|
||||
MPS: all_out_mps
|
||||
MTIA: all_out_mtia
|
||||
tags: reduction
|
||||
|
||||
- func: all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -734,16 +731,13 @@
|
||||
CPU, CUDA: all_dims_out
|
||||
CompositeExplicitAutograd: all_dims_out_default
|
||||
cpp_no_default_args: ['dim']
|
||||
tags: reduction
|
||||
|
||||
- func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool
|
||||
variants: function, method
|
||||
@ -755,14 +749,14 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
structured_delegate: any.out
|
||||
variants: function, method
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
structured_delegate: any.dims_out
|
||||
variants: function, method
|
||||
cpp_no_default_args: ['dim']
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: any_dims_default
|
||||
|
||||
@ -772,7 +766,6 @@
|
||||
dispatch:
|
||||
CPU, CUDA: any_out
|
||||
MPS: any_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -781,16 +774,13 @@
|
||||
CPU, CUDA: any_dims_out
|
||||
CompositeExplicitAutograd: any_dims_out_default
|
||||
cpp_no_default_args: ['dim']
|
||||
tags: reduction
|
||||
|
||||
- func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
|
||||
dispatch:
|
||||
@ -836,27 +826,25 @@
|
||||
structured_delegate: argmax.out
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU, CUDA: argmax_out
|
||||
MPS: argmax_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
|
||||
structured_delegate: argmin.out
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU, CUDA: argmin_out
|
||||
MPS: argmin_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: acosh(Tensor self) -> Tensor
|
||||
variants: function, method
|
||||
@ -1382,7 +1370,6 @@
|
||||
dispatch:
|
||||
SparseCPU: bmm_sparse_cpu
|
||||
SparseCUDA: bmm_sparse_cuda
|
||||
SparseMPS: bmm_sparse_mps
|
||||
NestedTensorCPU: bmm_nested
|
||||
NestedTensorCUDA: bmm_nested_cuda
|
||||
tags: core
|
||||
@ -1398,7 +1385,6 @@
|
||||
MTIA: bmm_out_mtia
|
||||
SparseCPU: bmm_out_sparse_cpu
|
||||
SparseCUDA: bmm_out_sparse_cuda
|
||||
SparseMPS: bmm_out_sparse_mps
|
||||
SparseCsrCUDA: bmm_out_sparse_csr_cuda
|
||||
|
||||
- func: bmm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
|
||||
@ -1881,14 +1867,12 @@
|
||||
CUDA: count_nonzero_cuda
|
||||
MPS: count_nonzero_mps
|
||||
autogen: count_nonzero.dim_IntList_out
|
||||
tags: reduction
|
||||
|
||||
- func: count_nonzero(Tensor self, int? dim=None) -> Tensor
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: count_nonzero
|
||||
autogen: count_nonzero.out
|
||||
tags: reduction
|
||||
|
||||
- func: cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor
|
||||
variants: function, method
|
||||
@ -3809,23 +3793,19 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: logsumexp
|
||||
tags: reduction
|
||||
|
||||
- func: logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
# calls squeeze
|
||||
CompositeExplicitAutogradNonFunctional: logsumexp_out
|
||||
tags: reduction
|
||||
|
||||
- func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
|
||||
|
||||
@ -3875,7 +3855,6 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
structured_delegate: aminmax.out
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -3883,7 +3862,6 @@
|
||||
dispatch:
|
||||
CPU, CUDA, MTIA: aminmax_out
|
||||
MPS: aminmax_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
|
||||
dispatch:
|
||||
@ -3899,7 +3877,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
QuantizedCPU, QuantizedCUDA: qmax
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -3909,16 +3887,13 @@
|
||||
dispatch:
|
||||
CPU, CUDA, MTIA: max_out
|
||||
MPS: max_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, SymInt[] sizes, bool keepdim) -> Tensor
|
||||
variants: function
|
||||
@ -3931,14 +3906,13 @@
|
||||
- func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
|
||||
variants: function, method
|
||||
structured_delegate: amax.out
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU, CUDA, MTIA: amax_out
|
||||
MPS: amax_out_mps
|
||||
tags: reduction
|
||||
|
||||
# Return: (Tensor output, Tensor indices)
|
||||
- func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
|
||||
@ -4000,14 +3974,13 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: mean
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
# For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this.
|
||||
- func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: mean_dtype_out
|
||||
tags: reduction
|
||||
|
||||
- func: mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
structured_delegate: mean.out
|
||||
@ -4015,7 +3988,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
QuantizedCPU: mean_quantized_cpu
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
@ -4024,16 +3997,13 @@
|
||||
CPU, CUDA: mean_out
|
||||
MPS: mean_out_mps
|
||||
QuantizedCPU: mean_out_quantized_cpu
|
||||
tags: reduction
|
||||
|
||||
- func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: nanmean(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
device_check: NoCheck # Composite
|
||||
@ -4096,7 +4066,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
QuantizedCPU, QuantizedCUDA: qmin
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -4106,28 +4076,24 @@
|
||||
dispatch:
|
||||
CPU, CUDA, MTIA: min_out
|
||||
MPS: min_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
|
||||
variants: function, method
|
||||
structured_delegate: amin.out
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU, CUDA, MTIA: amin_out
|
||||
MPS: amin_out_mps
|
||||
tags: reduction
|
||||
|
||||
# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
|
||||
# native_functions.yaml
|
||||
@ -4207,7 +4173,7 @@
|
||||
structured_delegate: mm.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: _sparse_mm
|
||||
SparseCPU, SparseCUDA: _sparse_mm
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm
|
||||
tags: core
|
||||
|
||||
@ -5892,7 +5858,6 @@
|
||||
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
|
||||
autogen: sum.out
|
||||
tags: reduction
|
||||
|
||||
- func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
# TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype
|
||||
@ -5903,12 +5868,11 @@
|
||||
NestedTensorCPU: NestedTensor_sum_dim_CPU
|
||||
SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
@ -5916,11 +5880,9 @@
|
||||
dispatch:
|
||||
CPU, CUDA: sum_out
|
||||
MPS: sum_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
# TODO: this function will be replaced once nested expand semantics have been settled on
|
||||
- func: _nested_sum_backward(Tensor grad, Tensor self, int[1]? dim, bool keepdim=False) -> Tensor
|
||||
@ -5932,13 +5894,11 @@
|
||||
dispatch:
|
||||
CPU, CUDA: nansum
|
||||
MPS: nansum_mps
|
||||
tags: reduction
|
||||
|
||||
- func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
dispatch:
|
||||
CPU, CUDA: nansum_out
|
||||
MPS: nansum_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: hash_tensor(Tensor self, int[1] dim=[], *, bool keepdim=False, int mode=0) -> Tensor
|
||||
variants: function, method
|
||||
@ -6002,13 +5962,11 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -6017,19 +5975,16 @@
|
||||
CPU, CUDA: std
|
||||
MPS: std_mps
|
||||
QuantizedCPU: std_quantized_cpu
|
||||
tags: reduction
|
||||
|
||||
- func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -6038,51 +5993,42 @@
|
||||
CPU, CUDA: std_mean
|
||||
MPS: std_mean_mps
|
||||
autogen: std_mean.correction_out
|
||||
tags: reduction
|
||||
|
||||
- func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
tags: reduction
|
||||
|
||||
- func: std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA: std_out
|
||||
QuantizedCPU: std_out_quantized_cpu
|
||||
tags: reduction
|
||||
|
||||
- func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
tags: reduction
|
||||
|
||||
- func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -6091,13 +6037,13 @@
|
||||
CPU, CUDA: prod
|
||||
MPS: prod_mps
|
||||
autogen: prod.out
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
structured_delegate: prod.int_out
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
@ -6105,16 +6051,13 @@
|
||||
dispatch:
|
||||
CPU, CUDA: prod_out
|
||||
MPS: prod_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: t(Tensor(a) self) -> Tensor(a)
|
||||
device_check: NoCheck
|
||||
@ -6575,12 +6518,11 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
|
||||
- func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
|
||||
@ -6590,51 +6532,43 @@
|
||||
CPU, CUDA: var
|
||||
MPS: var_mps
|
||||
MTIA: var_mtia
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA: var_out
|
||||
tags: reduction
|
||||
|
||||
- func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
tags: reduction
|
||||
|
||||
- func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -6643,18 +6577,15 @@
|
||||
CPU, CUDA: var_mean
|
||||
MPS: var_mean_mps
|
||||
autogen: var_mean.correction_out
|
||||
tags: reduction
|
||||
|
||||
- func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
tags: reduction
|
||||
|
||||
- func: view_as(Tensor(a) self, Tensor other) -> Tensor(a)
|
||||
variants: method
|
||||
@ -6914,7 +6845,6 @@
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: norm
|
||||
autogen: norm.ScalarOpt_dtype_out
|
||||
tags: reduction
|
||||
|
||||
- func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -6922,7 +6852,6 @@
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: norm
|
||||
autogen: norm.Scalar_out
|
||||
tags: reduction
|
||||
|
||||
- func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
|
||||
structured_delegate: norm.dtype_out
|
||||
@ -6930,7 +6859,6 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sparse_dtype_norm
|
||||
tags: reduction
|
||||
|
||||
- func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
|
||||
structured_delegate: norm.out
|
||||
@ -6938,7 +6866,6 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sparse_norm
|
||||
tags: reduction
|
||||
|
||||
- func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
@ -6946,7 +6873,6 @@
|
||||
dispatch:
|
||||
CPU, CUDA: norm_dtype_out
|
||||
MPS: norm_dtype_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
@ -6954,26 +6880,21 @@
|
||||
dispatch:
|
||||
CPU, CUDA: norm_out
|
||||
MPS: norm_out_mps
|
||||
tags: reduction
|
||||
|
||||
# These four redispatch in their implementation, so OK to be CompositeImplicitAutograd
|
||||
- func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent)
|
||||
variants: method, function
|
||||
@ -7191,7 +7112,6 @@
|
||||
MTIA: addmm_out_mtia
|
||||
SparseCPU: addmm_out_sparse_dense_cpu
|
||||
SparseCUDA: addmm_out_sparse_dense_cuda
|
||||
SparseMPS: addmm_out_sparse_dense_mps
|
||||
SparseCsrCPU: addmm_out_sparse_compressed_cpu
|
||||
SparseCsrCUDA: addmm_out_sparse_compressed_cuda
|
||||
|
||||
@ -7201,7 +7121,6 @@
|
||||
dispatch:
|
||||
SparseCPU: addmm_sparse_dense_cpu
|
||||
SparseCUDA: addmm_sparse_dense_cuda
|
||||
SparseMPS: addmm_sparse_dense_mps
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: addmm_sparse_compressed_dense
|
||||
tags: core
|
||||
|
||||
@ -10159,14 +10078,12 @@
|
||||
CPU, CUDA: min
|
||||
MPS: min_mps
|
||||
QuantizedCPU: min_quantized_cpu
|
||||
tags: [reduction]
|
||||
|
||||
- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA: min_unary_out
|
||||
QuantizedCPU: min_quantized_unary_out
|
||||
tags: [reduction]
|
||||
|
||||
- func: fmin(Tensor self, Tensor other) -> Tensor
|
||||
structured_delegate: fmin.out
|
||||
@ -10189,7 +10106,6 @@
|
||||
CPU, CUDA: max
|
||||
MPS: max_mps
|
||||
QuantizedCPU: max_quantized_cpu
|
||||
tags: [reduction]
|
||||
|
||||
- func: fmax(Tensor self, Tensor other) -> Tensor
|
||||
structured_delegate: fmax.out
|
||||
@ -10236,7 +10152,6 @@
|
||||
dispatch:
|
||||
CPU, CUDA: max_unary_out
|
||||
QuantizedCPU: max_quantized_unary_out
|
||||
tags: [reduction]
|
||||
|
||||
- func: minimum(Tensor self, Tensor other) -> Tensor
|
||||
structured_delegate: minimum.out
|
||||
@ -10356,7 +10271,6 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
structured_delegate: all.all_out
|
||||
variants: method, function
|
||||
tags: reduction
|
||||
|
||||
- func: all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck
|
||||
@ -10365,7 +10279,6 @@
|
||||
CPU, CUDA: all_all_out
|
||||
MTIA: all_all_out_mtia
|
||||
MPS: all_all_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: any(Tensor self) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -10373,7 +10286,7 @@
|
||||
variants: method, function
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: any_sparse
|
||||
tags: [core, reduction]
|
||||
tags: core
|
||||
|
||||
- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck
|
||||
@ -10381,7 +10294,6 @@
|
||||
dispatch:
|
||||
CPU, CUDA: any_all_out
|
||||
MPS: any_all_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -14433,7 +14345,6 @@
|
||||
python_module: linalg
|
||||
variants: function
|
||||
structured_delegate: linalg_vector_norm.out
|
||||
tags: reduction
|
||||
|
||||
- func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
python_module: linalg
|
||||
@ -14441,7 +14352,6 @@
|
||||
dispatch:
|
||||
CPU, CUDA: linalg_vector_norm_out
|
||||
MPS: linalg_vector_norm_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
python_module: linalg
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/native/SparseTensorUtils.h>
|
||||
#include <ATen/ExpandUtils.h>
|
||||
#include <ATen/native/mps/OperationUtils.h>
|
||||
#include <ATen/native/sparse/SparseStubs.h>
|
||||
#include <ATen/native/sparse/SparseBinaryOpIntersectionCommon.h>
|
||||
@ -19,8 +18,6 @@
|
||||
#include <ATen/ops/ones_like.h>
|
||||
#include <ATen/ops/argsort.h>
|
||||
#include <ATen/ops/result_type.h>
|
||||
#include <ATen/ops/bmm_native.h>
|
||||
#include <ATen/ops/addmm_native.h>
|
||||
#include <ATen/ops/copy_sparse_to_sparse.h>
|
||||
#include <ATen/ops/mul.h>
|
||||
#endif
|
||||
@ -36,305 +33,6 @@ static auto& lib = MetalShaderLibrary::getBundledLibrary();
|
||||
#include <ATen/native/mps/Mul_metallib.h>
|
||||
#endif
|
||||
|
||||
static Tensor& s_addmm_out_sparse_dense_mps(
|
||||
Tensor& r,
|
||||
const Tensor& t,
|
||||
const SparseTensor& sparse_,
|
||||
const Tensor& dense,
|
||||
const Scalar& beta,
|
||||
const Scalar& alpha) {
|
||||
TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: sparse_dim must be 2, got ", sparse_.sparse_dim());
|
||||
TORCH_CHECK(sparse_.dense_dim() == 0, "addmm: sparse values must be 0-dense-dim, got ", sparse_.dense_dim());
|
||||
TORCH_CHECK(dense.dim() == 2, "addmm: 'dense' must be 2D, got ", dense.dim());
|
||||
TORCH_CHECK(t.dim() == 2, "addmm: 't' must be 2D, got ", t.dim());
|
||||
|
||||
const int64_t I = sparse_.size(0);
|
||||
const int64_t J = sparse_.size(1);
|
||||
const int64_t K = dense.size(1);
|
||||
|
||||
TORCH_CHECK(dense.size(0) == J,
|
||||
"addmm: dense (mat2) dim0 must be ", J, ", got ", dense.size(0));
|
||||
TORCH_CHECK(t.size(0) == I && t.size(1) == K,
|
||||
"addmm: 't' shape must be (", I, ", ", K, "), got (", t.size(0), ", ", t.size(1), ")");
|
||||
|
||||
r.resize_({I, K});
|
||||
|
||||
auto sparse = sparse_.coalesce();
|
||||
const int64_t nnz = sparse._nnz();
|
||||
|
||||
if (nnz == 0 || I == 0 || K == 0) {
|
||||
at::mul_out(r, t, beta);
|
||||
return r;
|
||||
}
|
||||
|
||||
const auto v_dtype = sparse._values().scalar_type();
|
||||
const auto d_dtype = dense.scalar_type();
|
||||
const auto t_dtype = t.scalar_type();
|
||||
auto compute_dtype = c10::promoteTypes(c10::promoteTypes(v_dtype, d_dtype), t_dtype);
|
||||
|
||||
TORCH_CHECK(canCast(compute_dtype, r.scalar_type()),
|
||||
"Can't convert computed type ", compute_dtype, " to output ", r.scalar_type());
|
||||
|
||||
auto indices2d = sparse._indices().contiguous();
|
||||
auto values = sparse._values().to(compute_dtype);
|
||||
auto dense_c = dense.to(compute_dtype).contiguous();
|
||||
auto t_c = t.to(compute_dtype).contiguous();
|
||||
|
||||
const bool out_needs_cast = (r.scalar_type() != compute_dtype) || !r.is_contiguous();
|
||||
Tensor out_buf = out_needs_cast
|
||||
? at::empty({I, K}, r.options().dtype(compute_dtype))
|
||||
: r;
|
||||
auto out_contig = out_buf.contiguous();
|
||||
|
||||
auto device = r.device();
|
||||
auto stream = getCurrentMPSStream();
|
||||
|
||||
const float alpha_f = alpha.to<float>();
|
||||
const float beta_f = beta.to<float>();
|
||||
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
const std::string func = "spmm_addmm_coo_" + mps::scalarToMetalTypeString(values);
|
||||
auto pso = lib.getPipelineStateForFunc(func);
|
||||
auto enc = stream->commandEncoder();
|
||||
[enc setComputePipelineState:pso];
|
||||
|
||||
const uint32_t tew = pso.threadExecutionWidth;
|
||||
const uint32_t gridX = static_cast<uint32_t>(K);
|
||||
const uint32_t gridZ = static_cast<uint32_t>(I);
|
||||
const uint32_t tgW = std::min<uint32_t>(gridX, tew);
|
||||
|
||||
MTLSize grid = MTLSizeMake(gridX, 1, gridZ);
|
||||
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
|
||||
|
||||
mtl_setArgs(enc,
|
||||
indices2d,
|
||||
values,
|
||||
dense_c,
|
||||
t_c,
|
||||
out_contig,
|
||||
std::array<uint32_t, 3>{static_cast<uint32_t>(I),
|
||||
static_cast<uint32_t>(J),
|
||||
static_cast<uint32_t>(K)},
|
||||
std::array<float, 2>{alpha_f, beta_f},
|
||||
static_cast<uint32_t>(nnz));
|
||||
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
|
||||
}
|
||||
});
|
||||
|
||||
if (out_needs_cast) {
|
||||
r.copy_(out_contig.to(r.scalar_type()));
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
static void build_batch_ptr_mps(
|
||||
const Tensor& indices_dim0,
|
||||
int64_t B,
|
||||
Tensor& batch_ptr
|
||||
) {
|
||||
// Builds an array of pointers which point to each batches elements. Example:
|
||||
// idx_b = [0, 0, 0, 1, 1, 2, 2, 2, 2] // 9 non-zero elements
|
||||
// └─────┘ └──┘ └─────────┘
|
||||
// batch 0 batch 1 batch 2
|
||||
// batch_ptr = [0, 3, 5, 9]
|
||||
// │ │ │ └─ end of batch 2 (total nnz)
|
||||
// │ │ └──── batch 2 starts at index 5
|
||||
// │ └─────── batch 1 starts at index 3
|
||||
// └────────── batch 0 starts at index 0
|
||||
TORCH_CHECK(indices_dim0.is_mps() && batch_ptr.is_mps(), "MPS device expected");
|
||||
auto device = indices_dim0.device();
|
||||
auto stream = getCurrentMPSStream();
|
||||
|
||||
const int64_t nnz = indices_dim0.numel();
|
||||
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
auto pso = lib.getPipelineStateForFunc("build_batch_ptr_from_sorted_batches");
|
||||
auto enc = stream->commandEncoder();
|
||||
[enc setComputePipelineState:pso];
|
||||
|
||||
const uint32_t tew = pso.threadExecutionWidth;
|
||||
const uint32_t Q = static_cast<uint32_t>(B + 1);
|
||||
const uint32_t tgW = std::min<uint32_t>(Q, tew);
|
||||
MTLSize grid = MTLSizeMake(Q, 1, 1);
|
||||
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
|
||||
|
||||
mtl_setArgs(enc,
|
||||
indices_dim0,
|
||||
batch_ptr,
|
||||
std::array<uint32_t, 2>{static_cast<uint32_t>(nnz),
|
||||
static_cast<uint32_t>(B)});
|
||||
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
static void build_row_ptr_per_batch_mps(
|
||||
const Tensor& rows,
|
||||
const Tensor& batch_ptr,
|
||||
int64_t B,
|
||||
int64_t I,
|
||||
Tensor& row_ptr
|
||||
) {
|
||||
// Build per-batch CSR-style row pointer arrays from row indices sorted by batch
|
||||
// Given:
|
||||
// rows: 1-D array of length nnz with row ids in [0, I), sorted within each batch
|
||||
// batch_ptr: length B+1, where [batch_ptr[b], batch_ptr[b+1]) is the subrange for batch b
|
||||
// Produces:
|
||||
// - row_ptr: shape [B, I+1]
|
||||
//
|
||||
// Example (B = 2, I = 4):
|
||||
// rows = [0, 0, 1, 3, 0, 2, 2] // 7 non-zero elements
|
||||
// └─── batch 0 ──┘ └─ batch 1 ─┘
|
||||
// batch_ptr = [0, 4, 7]
|
||||
// │ │ └─ end of batch 1 (total nnz)
|
||||
// │ └──── end of batch 0/start of batch 1
|
||||
// └─────── start of batch 0
|
||||
//
|
||||
// per-batch row pointers (I+1 entries each):
|
||||
// row_ptr[0] = [0, 2, 3, 3, 4]
|
||||
// row_ptr[1] = [0, 1, 1, 3, 3]
|
||||
// laid out in memory: [0, 2, 3, 3, 4, 0, 1, 1, 3, 3]
|
||||
TORCH_CHECK(rows.is_mps() && batch_ptr.is_mps() && row_ptr.is_mps(), "MPS device expected");
|
||||
auto stream = getCurrentMPSStream();
|
||||
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
auto pso = lib.getPipelineStateForFunc("build_row_ptr_from_sorted_rows_by_batch");
|
||||
auto enc = stream->commandEncoder();
|
||||
[enc setComputePipelineState:pso];
|
||||
|
||||
const uint32_t tew = pso.threadExecutionWidth;
|
||||
const uint32_t Qx = static_cast<uint32_t>(I + 1);
|
||||
const uint32_t Qy = static_cast<uint32_t>(B);
|
||||
const uint32_t tgW = std::min<uint32_t>(Qx, tew);
|
||||
|
||||
MTLSize grid = MTLSizeMake(Qx, Qy, 1);
|
||||
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
|
||||
|
||||
mtl_setArgs(enc,
|
||||
rows,
|
||||
batch_ptr,
|
||||
row_ptr,
|
||||
std::array<uint32_t, 2>{static_cast<uint32_t>(I),
|
||||
static_cast<uint32_t>(B)});
|
||||
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
Tensor& bmm_out_sparse_mps(const SparseTensor& self_, const Tensor& mat2_, Tensor& result_) {
|
||||
TORCH_CHECK(result_.is_mps(), "bmm_sparse: expected 'out' to be MPS, got ", result_.device());
|
||||
TORCH_CHECK(self_.is_mps(), "bmm_sparse: expected 'self' to be MPS, got ", self_.device());
|
||||
TORCH_CHECK(mat2_.is_mps(), "bmm_sparse: expected 'mat2' to be MPS, got ", mat2_.device());
|
||||
|
||||
TORCH_CHECK(self_.dense_dim() == 0, "bmm_sparse: Tensor 'self' must have 0 dense dims, but has ", self_.dense_dim());
|
||||
TORCH_CHECK(self_.sparse_dim() == 3, "bmm_sparse: Tensor 'self' must have 3 sparse dims, but has ", self_.sparse_dim());
|
||||
TORCH_CHECK(mat2_.dim() == 3, "bmm_sparse: Tensor 'mat2' must have 3 dims, but has ", mat2_.dim());
|
||||
|
||||
TORCH_CHECK(self_.size(0) == mat2_.size(0), "bmm_sparse: 'self.size(0)' and 'mat2.size(0)' must match");
|
||||
TORCH_CHECK(self_.size(2) == mat2_.size(1), "bmm_sparse: 'self.size(2)' and 'mat2.size(1)' must match");
|
||||
|
||||
const int64_t B = self_.size(0);
|
||||
const int64_t I = self_.size(1);
|
||||
const int64_t J = self_.size(2);
|
||||
const int64_t K = mat2_.size(2);
|
||||
|
||||
auto self = self_.coalesce();
|
||||
const int64_t nnz = self._nnz();
|
||||
if (nnz == 0) {
|
||||
return result_.zero_();
|
||||
}
|
||||
|
||||
const auto computeDtype = at::kFloat;
|
||||
|
||||
auto indices = self._indices();
|
||||
auto values = self._values();
|
||||
|
||||
auto values_c = values.scalar_type() == computeDtype ? values : values.to(computeDtype);
|
||||
auto mat2_c = mat2_.scalar_type() == computeDtype ? mat2_ : mat2_.to(computeDtype);
|
||||
auto mat2_contig = mat2_c.contiguous();
|
||||
|
||||
auto idx_b = indices.select(0, 0).contiguous();
|
||||
auto idx_i = indices.select(0, 1).contiguous();
|
||||
auto idx_j = indices.select(0, 2).contiguous();
|
||||
|
||||
// builds an array of pointers of where the batch_idx's pointer starts and ends
|
||||
// look in function for better explanation
|
||||
auto batch_ptr = at::empty({B + 1}, at::device(result_.device()).dtype(kLong));
|
||||
build_batch_ptr_mps(idx_b, B, batch_ptr);
|
||||
// build row_ptr per batch: for each (b, i) get [start, end) into rows/cols/vals
|
||||
auto row_ptr = at::empty({B * (I + 1)}, at::device(result_.device()).dtype(kLong));
|
||||
build_row_ptr_per_batch_mps(idx_i, batch_ptr, B, I, row_ptr);
|
||||
|
||||
const bool out_needs_cast = (result_.scalar_type() != computeDtype) || !result_.is_contiguous();
|
||||
Tensor out_buf = out_needs_cast
|
||||
? at::empty({B, I, K}, result_.options().dtype(computeDtype))
|
||||
: result_;
|
||||
auto out_contig = out_buf.contiguous();
|
||||
|
||||
auto stream = getCurrentMPSStream();
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
auto pso = lib.getPipelineStateForFunc("spmm_bmm_coo_rows_grouped_" + mps::scalarToMetalTypeString(values));
|
||||
auto enc = stream->commandEncoder();
|
||||
[enc setComputePipelineState:pso];
|
||||
|
||||
const uint32_t tew = pso.threadExecutionWidth;
|
||||
const uint32_t tgW = std::min<uint32_t>((uint32_t)K, tew);
|
||||
|
||||
// One threadgroup per (row i, batch b), lanes cover K
|
||||
MTLSize grid = MTLSizeMake(tgW, (uint32_t)I, (uint32_t)B);
|
||||
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
|
||||
|
||||
mtl_setArgs(enc,
|
||||
idx_i,
|
||||
idx_j,
|
||||
values_c,
|
||||
mat2_contig,
|
||||
out_contig,
|
||||
row_ptr,
|
||||
std::array<uint32_t, 4>{(uint32_t)B, (uint32_t)I, (uint32_t)J, (uint32_t)K});
|
||||
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
|
||||
}
|
||||
});
|
||||
if (out_needs_cast) {
|
||||
result_.copy_(out_contig.to(result_.scalar_type()));
|
||||
}
|
||||
return result_;
|
||||
}
|
||||
|
||||
Tensor bmm_sparse_mps(const Tensor& self, const Tensor& mat2) {
|
||||
Tensor result = at::zeros({self.size(0), self.size(1), mat2.size(2)}, mat2.options());
|
||||
return bmm_out_sparse_mps(self, mat2, result);
|
||||
}
|
||||
|
||||
Tensor& addmm_out_sparse_dense_mps(
|
||||
const Tensor& self,
|
||||
const SparseTensor& mat1,
|
||||
const Tensor& mat2,
|
||||
const Scalar& beta,
|
||||
const Scalar& alpha,
|
||||
Tensor& result) {
|
||||
c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out");
|
||||
return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha);
|
||||
}
|
||||
|
||||
Tensor addmm_sparse_dense_mps(
|
||||
const Tensor& self,
|
||||
const SparseTensor& mat1,
|
||||
const Tensor& mat2,
|
||||
const Scalar& beta,
|
||||
const Scalar& alpha
|
||||
) {
|
||||
c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out");
|
||||
Tensor result = at::empty({0}, self.options());
|
||||
return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha);
|
||||
}
|
||||
|
||||
static SparseTensor& mul_out_dense_sparse_mps(
|
||||
const Tensor& dense,
|
||||
const Tensor& sparse,
|
||||
|
||||
@ -1,105 +1,10 @@
|
||||
#include <metal_stdlib>
|
||||
#include <c10/metal/indexing.h>
|
||||
#include <c10/metal/utils.h>
|
||||
using namespace c10::metal;
|
||||
using namespace metal;
|
||||
|
||||
inline uint lower_bound_i64(device const long* arr, uint lo, uint hi, long key) {
|
||||
uint l = lo, r = hi;
|
||||
while (l < r) {
|
||||
uint m = (l + r) >> 1;
|
||||
long v = arr[m];
|
||||
if (v < key) {
|
||||
l = m + 1;
|
||||
} else {
|
||||
r = m;
|
||||
}
|
||||
}
|
||||
return l;
|
||||
}
|
||||
|
||||
inline uint upper_bound_i64(device const long* arr, uint lo, uint hi, long key) {
|
||||
uint l = lo, r = hi;
|
||||
while (l < r) {
|
||||
uint m = (l + r) >> 1;
|
||||
long v = arr[m];
|
||||
if (v <= key) {
|
||||
l = m + 1;
|
||||
} else {
|
||||
r = m;
|
||||
}
|
||||
}
|
||||
return l;
|
||||
}
|
||||
|
||||
kernel void build_row_ptr_from_sorted_rows_by_batch(
|
||||
device const long* rows [[buffer(0)]],
|
||||
device const long* batch_ptr [[buffer(1)]],
|
||||
device long* row_ptr [[buffer(2)]],
|
||||
constant uint2& dims [[buffer(3)]],
|
||||
uint3 tid [[thread_position_in_grid]])
|
||||
{
|
||||
const uint I = dims.x;
|
||||
const uint B = dims.y;
|
||||
|
||||
const uint i = tid.x;
|
||||
const uint b = tid.y;
|
||||
|
||||
if (b >= B || i > I) return;
|
||||
|
||||
const uint base = (uint)batch_ptr[b];
|
||||
const uint lim = (uint)batch_ptr[b + 1];
|
||||
|
||||
const ulong out_base = (ulong)b * (ulong)(I + 1);
|
||||
|
||||
if (i == I) {
|
||||
row_ptr[out_base + (ulong)I] = (long)lim;
|
||||
} else {
|
||||
const long key = (long)i;
|
||||
const uint pos = lower_bound_i64(rows, base, lim, key);
|
||||
row_ptr[out_base + (ulong)i] = (long)pos;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
kernel void spmm_bmm_coo_rows_grouped(
|
||||
device const long* rows [[buffer(0)]],
|
||||
device const long* cols [[buffer(1)]],
|
||||
device const T* vals [[buffer(2)]],
|
||||
device const T* dense [[buffer(3)]],
|
||||
device T* out [[buffer(4)]],
|
||||
device const long* row_ptr [[buffer(5)]],
|
||||
constant uint4& dims [[buffer(6)]],
|
||||
uint3 tid [[thread_position_in_grid]],
|
||||
uint3 ltid [[thread_position_in_threadgroup]],
|
||||
uint3 tptg [[threads_per_threadgroup]])
|
||||
{
|
||||
const uint B = dims.x;
|
||||
const uint I = dims.y;
|
||||
const uint J = dims.z;
|
||||
const uint K = dims.w;
|
||||
|
||||
const uint b = tid.z;
|
||||
const uint i = tid.y;
|
||||
const uint lane = ltid.x;
|
||||
const uint tgW = tptg.x;
|
||||
|
||||
const ulong rp_base = (ulong)b * (ulong)(I + 1);
|
||||
const uint start = (uint)row_ptr[rp_base + (ulong)i];
|
||||
const uint end = (uint)row_ptr[rp_base + (ulong)i + 1];
|
||||
|
||||
for (uint k = lane; k < K; k += tgW) {
|
||||
auto acc = static_cast<accum_t<T>>(T(0));
|
||||
for (uint p = start; p < end; ++p) {
|
||||
const uint c = (uint)cols[p];
|
||||
const auto v = static_cast<accum_t<T>>(vals[p]);
|
||||
const uint d_off = ((b * J) + c) * K + k;
|
||||
const auto d = static_cast<accum_t<T>>(dense[d_off]);
|
||||
acc += mul(v, d);
|
||||
}
|
||||
const uint y_off = ((b * I) + i) * K + k;
|
||||
out[y_off] = static_cast<T>(acc);
|
||||
}
|
||||
}
|
||||
template <typename T> struct MulAccum { using type = float; };
|
||||
template <> struct MulAccum<float2> { using type = float2; };
|
||||
|
||||
template <typename T>
|
||||
kernel void dense_sparse_mul_kernel(
|
||||
@ -127,9 +32,10 @@ kernel void dense_sparse_mul_kernel(
|
||||
ulong dense_idx = (ulong)key * (ulong)view_cols + (ulong)col;
|
||||
ulong val_idx = (ulong)i * (ulong)view_cols + (ulong)col;
|
||||
|
||||
const auto a = static_cast<accum_t<T>>(values[val_idx]);
|
||||
const auto b = static_cast<accum_t<T>>(dense[dense_idx]);
|
||||
out_values[val_idx] = static_cast<T>(mul(a, b));
|
||||
using accum_t = typename MulAccum<T>::type;
|
||||
const accum_t a = static_cast<accum_t>(values[val_idx]);
|
||||
const accum_t b = static_cast<accum_t>(dense[dense_idx]);
|
||||
out_values[val_idx] = static_cast<T>(a * b);
|
||||
}
|
||||
|
||||
kernel void intersect_binary_search(
|
||||
@ -214,76 +120,6 @@ kernel void fused_gather_mul_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
kernel void build_batch_ptr_from_sorted_batches(
|
||||
device const long* batches [[buffer(0)]],
|
||||
device long* batch_ptr [[buffer(1)]],
|
||||
constant uint2& nnz_B [[buffer(2)]],
|
||||
uint3 tid [[thread_position_in_grid]])
|
||||
{
|
||||
uint b = tid.x;
|
||||
uint nnz = nnz_B.x;
|
||||
uint batch = nnz_B.y;
|
||||
|
||||
if (b == batch) {
|
||||
batch_ptr[b] = (long)nnz;
|
||||
return;
|
||||
}
|
||||
|
||||
uint lo = 0;
|
||||
uint hi = nnz;
|
||||
long key = (long)b;
|
||||
while (lo < hi) {
|
||||
uint mid = (lo + hi) >> 1;
|
||||
long v = batches[mid];
|
||||
if (v < key) lo = mid + 1;
|
||||
else hi = mid;
|
||||
}
|
||||
batch_ptr[b] = (long)lo;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
kernel void spmm_addmm_coo(
|
||||
device const long* indices2d [[buffer(0)]],
|
||||
device const T* vals [[buffer(1)]],
|
||||
device const T* dense [[buffer(2)]],
|
||||
device const T* t_in [[buffer(3)]],
|
||||
device T* out [[buffer(4)]],
|
||||
constant uint3& dims [[buffer(5)]],
|
||||
constant float2& alpha_beta [[buffer(6)]],
|
||||
constant uint& nnz [[buffer(7)]],
|
||||
uint3 tid [[thread_position_in_grid]])
|
||||
{
|
||||
const uint K = dims.z;
|
||||
const uint k = tid.x;
|
||||
const uint i = tid.z;
|
||||
const float alpha = alpha_beta.x;
|
||||
const float beta = alpha_beta.y;
|
||||
|
||||
device const long* rows = indices2d;
|
||||
device const long* cols = indices2d + nnz;
|
||||
|
||||
const uint start = lower_bound_i64(rows, 0u, nnz, (long)i);
|
||||
const uint end = upper_bound_i64(rows, 0u, nnz, (long)i);
|
||||
|
||||
// accumulator is float for scalar/half/bfloat and float2 for float2
|
||||
auto acc = static_cast<accum_t<T>>(T(0));
|
||||
|
||||
for (uint p = start; p < end; ++p) {
|
||||
const uint c = (uint)cols[p];
|
||||
const auto v = static_cast<accum_t<T>>(vals[p]);
|
||||
const uint dense_off = c * K + k;
|
||||
const auto d = static_cast<accum_t<T>>(dense[dense_off]);
|
||||
acc += mul(v, d);
|
||||
}
|
||||
|
||||
const uint off = i * K + k;
|
||||
const auto base = (beta != 0.0f) ? (static_cast<accum_t<T>>(t_in[off]) * beta) : static_cast<accum_t<T>>(T(0));
|
||||
const auto y = base + alpha * acc;
|
||||
out[off] = static_cast<T>(y);
|
||||
}
|
||||
|
||||
|
||||
#define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE) \
|
||||
template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void \
|
||||
dense_sparse_mul_kernel<DTYPE>( \
|
||||
@ -315,36 +151,6 @@ INSTANTIATE_DENSE_SPARSE_MUL(float2);
|
||||
constant uint2& dims_output [[buffer(8)]], \
|
||||
uint3 gid [[thread_position_in_grid]]);
|
||||
|
||||
INSTANTIATE_FOR_FLOAT_TYPES(INSTANTIATE_FUSED_GATHER_MUL);
|
||||
|
||||
|
||||
#define INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED(DTYPE) \
|
||||
template [[host_name("spmm_bmm_coo_rows_grouped_" #DTYPE)]] kernel void \
|
||||
spmm_bmm_coo_rows_grouped<DTYPE>( \
|
||||
device const long* rows [[buffer(0)]], \
|
||||
device const long* cols [[buffer(1)]], \
|
||||
device const DTYPE* vals [[buffer(2)]], \
|
||||
device const DTYPE* dense [[buffer(3)]], \
|
||||
device DTYPE* out [[buffer(4)]], \
|
||||
device const long* row_ptr [[buffer(5)]], \
|
||||
constant uint4& dims [[buffer(6)]], \
|
||||
uint3 tid [[thread_position_in_grid]], \
|
||||
uint3 ltid [[thread_position_in_threadgroup]], \
|
||||
uint3 tptg [[threads_per_threadgroup]]);
|
||||
|
||||
INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED);
|
||||
|
||||
#define INSTANTIATE_SPMM_ADDMM_COO(DTYPE) \
|
||||
template [[host_name("spmm_addmm_coo_" #DTYPE)]] kernel void \
|
||||
spmm_addmm_coo<DTYPE>( \
|
||||
device const long* indices2d [[buffer(0)]], \
|
||||
device const DTYPE* vals [[buffer(1)]], \
|
||||
device const DTYPE* dense [[buffer(2)]], \
|
||||
device const DTYPE* t_in [[buffer(3)]], \
|
||||
device DTYPE* out [[buffer(4)]], \
|
||||
constant uint3& dims [[buffer(5)]], \
|
||||
constant float2& alpha_beta [[buffer(6)]], \
|
||||
constant uint& nnz [[buffer(7)]], \
|
||||
uint3 tid [[thread_position_in_grid]]);
|
||||
|
||||
INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_ADDMM_COO);
|
||||
INSTANTIATE_FUSED_GATHER_MUL(float);
|
||||
INSTANTIATE_FUSED_GATHER_MUL(half);
|
||||
INSTANTIATE_FUSED_GATHER_MUL(bfloat);
|
||||
@ -93,7 +93,3 @@
|
||||
This operator does not support cudagraphs. The presence of this tag on an operator will cause
|
||||
Inductor to split the graph around this operator. Note that operators without this tag may still
|
||||
not support CUDAGraphs. Inductor may have other hardcoded lists around that.
|
||||
- tag: reduction
|
||||
desc: |
|
||||
This tag indicates that an operator performs a reduction operation, computing aggregate values
|
||||
(sum, mean, max, min, etc.) across one or more dimensions of the input tensor(s).
|
||||
|
||||
@ -1751,8 +1751,8 @@ def maybe_snapshot_memory(should_snapshot_memory, suffix):
|
||||
f"{output_filename.rstrip('.csv')}_{suffix}.pickle",
|
||||
)
|
||||
)
|
||||
except Exception:
|
||||
log.exception("Failed to save memory snapshot")
|
||||
except Exception as e:
|
||||
log.error("Failed to save memory snapshot, %s", e)
|
||||
|
||||
torch.cuda.memory._record_memory_history(enabled=None)
|
||||
|
||||
|
||||
@ -124,7 +124,7 @@ with open(MODELS_FILENAME) as fh:
|
||||
continue
|
||||
batch_size = int(batch_size)
|
||||
BATCH_SIZE_KNOWN_MODELS[model_name] = batch_size
|
||||
assert BATCH_SIZE_KNOWN_MODELS
|
||||
assert len(BATCH_SIZE_KNOWN_MODELS)
|
||||
|
||||
|
||||
try:
|
||||
|
||||
@ -296,8 +296,8 @@ class OperatorInputsLoader:
|
||||
for key in self.operator_db.keys():
|
||||
try:
|
||||
op = eval(key)
|
||||
except AttributeError:
|
||||
log.warning("Evaluating an op name into an OpOverload", exc_info=True)
|
||||
except AttributeError as ae:
|
||||
log.warning("Evaluating an op name into an OpOverload: %s", ae)
|
||||
continue
|
||||
yield op
|
||||
|
||||
|
||||
@ -3,7 +3,6 @@ import sys
|
||||
from benchmark_base import BenchmarkBase
|
||||
|
||||
import torch
|
||||
from torch._dynamo.utils import CompileTimeInstructionCounter
|
||||
|
||||
|
||||
class Benchmark(BenchmarkBase):
|
||||
@ -33,11 +32,7 @@ class Benchmark(BenchmarkBase):
|
||||
def _work(self):
|
||||
# enable_cpp_symbolic_shape_guards has impact on this benchmark
|
||||
# Keep using False value for consistency.
|
||||
with (
|
||||
torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False),
|
||||
torch._export.config.patch(use_new_tracer_experimental=True),
|
||||
CompileTimeInstructionCounter.record(),
|
||||
):
|
||||
with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False):
|
||||
torch.export.export(self.m, (self.input,), strict=True)
|
||||
|
||||
|
||||
|
||||
@ -38,7 +38,7 @@ update_hint_regression,compile_time_instruction_count,1719000000,0.1



sum_floordiv_regression,compile_time_instruction_count,3686995725,0.1
sum_floordiv_regression,compile_time_instruction_count,966100000,0.1
@ -127,7 +127,7 @@ def trainbench(
    bwd_time = bwd_start_event.elapsed_time(bwd_end_event)
    return fwd_time, bwd_time

creator_args = {
creator_args = creator_args = {
    "seqLength": seqLength,
    "numLayers": numLayers,
    "inputSize": inputSize,
@ -12,7 +12,7 @@ def modeldef(request, net_name, executor, fuser):

    # Given a 'net_name' provided by generate_tests, build the thing
    name, rnn_creator, context = get_nn_runners(net_name)[0]
    creator_args = {
    creator_args = creator_args = {
        "seqLength": 100,
        "numLayers": 1,
        "inputSize": 512,
@ -85,7 +85,7 @@ class WeightOnlyInt8QuantHandler:
            cur_state_dict[f"{fqn}.weight"] = int8_weight
            cur_state_dict[f"{fqn}.scales"] = scales.to(mod.weight.dtype)
        elif isinstance(mod, ConditionalFeedForward):
            for weight_idx in range(3):
            for weight_idx in range(0, 3):
                weight_name = f"w{weight_idx + 1}"
                scales_name = f"scales{weight_idx + 1}"
                weight = getattr(mod, weight_name)
@ -1729,8 +1729,10 @@ def define_buck_targets(
|
||||
"torch/csrc/jit/backends/backend_debug_info.cpp",
|
||||
"torch/csrc/jit/backends/backend_interface.cpp",
|
||||
],
|
||||
compiler_flags = get_pt_compiler_flags(),
|
||||
fbandroid_compiler_flags = c2_fbandroid_xplat_compiler_flags,
|
||||
compiler_flags = get_pt_compiler_flags() + select({
|
||||
"DEFAULT": [],
|
||||
"ovr_config//os:android": c2_fbandroid_xplat_compiler_flags
|
||||
}),
|
||||
# @lint-ignore BUCKLINT link_whole
|
||||
link_whole = True,
|
||||
linker_flags = get_no_as_needed_linker_flag(),
|
||||
@ -2023,6 +2025,9 @@ def define_buck_targets(
|
||||
"ovr_config//os:android-x86_64": [
|
||||
"-mssse3",
|
||||
],
|
||||
}) + select({
|
||||
"DEFAULT": [],
|
||||
"ovr_config//os:android": c2_fbandroid_xplat_compiler_flags,
|
||||
}),
|
||||
exported_preprocessor_flags = get_aten_preprocessor_flags(),
|
||||
exported_deps = [
|
||||
|
||||
@ -46,7 +46,7 @@ size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
|
||||
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
|
||||
const size_t interval_end =
|
||||
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
|
||||
"kRoundUpPowerOfTwoIntervals mismatch");
|
||||
|
||||
@ -65,7 +65,7 @@ size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
|
||||
std::numeric_limits<size_t>::max() / kMB;
|
||||
|
||||
size_t val_env = tokenizer.toSizeT(++i);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
val_env >= min_allowed_split_size_mb,
|
||||
"CachingAllocator option max_split_size_mb too small, must be >= ",
|
||||
min_allowed_split_size_mb);
|
||||
@ -84,7 +84,7 @@ size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
|
||||
std::numeric_limits<size_t>::max() / kMB;
|
||||
|
||||
size_t val_env = tokenizer.toSizeT(++i);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
val_env >= min_allowed_split_size_mb,
|
||||
"CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
|
||||
min_allowed_split_size_mb);
|
||||
@ -99,7 +99,7 @@ size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
|
||||
size_t i) {
|
||||
tokenizer.checkToken(++i, ":");
|
||||
double val_env = tokenizer.toDouble(++i);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
val_env > 0 && val_env < 1.0,
|
||||
"garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
|
||||
garbage_collection_threshold_ = val_env;
|
||||
@ -120,7 +120,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
|
||||
size_t value_index = i;
|
||||
tokenizer.checkToken(++i, ":");
|
||||
size_t value = tokenizer.toSizeT(++i);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
value == 0 || llvm::isPowerOf2_64(value),
|
||||
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ");
|
||||
|
||||
@ -128,13 +128,12 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
|
||||
std::fill(
|
||||
std::next(
|
||||
roundup_power2_divisions_.begin(),
|
||||
static_cast<std::vector<size_t>::difference_type>(
|
||||
last_index + 1)),
|
||||
static_cast<std::vector<size_t>::difference_type>(last_index)),
|
||||
roundup_power2_divisions_.end(),
|
||||
value);
|
||||
} else {
|
||||
size_t boundary = tokenizer.toSizeT(value_index);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
llvm::isPowerOf2_64(boundary),
|
||||
"For roundups, the intervals have to be power of 2 ");
|
||||
|
||||
@ -164,7 +163,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
|
||||
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
|
||||
} else { // Keep this for backwards compatibility
|
||||
size_t value = tokenizer.toSizeT(i);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
llvm::isPowerOf2_64(value),
|
||||
"For roundups, the divisions has to be power of 2 ");
|
||||
std::fill(
|
||||
@ -224,7 +223,7 @@ void AcceleratorAllocatorConfig::parseArgs(const std::string& env) {
|
||||
// If a device-specific configuration parser hook is registered, it will
|
||||
// check if the key is unrecognized.
|
||||
if (device_config_parser_hook_) {
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
getKeys().find(key) != getKeys().end(),
|
||||
"Unrecognized key '",
|
||||
key,
|
||||
|
||||
@ -76,7 +76,7 @@ class ConfigTokenizer {
|
||||
} else if (token == "False") {
|
||||
return false;
|
||||
} else {
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Expected 'True' or 'False' at index ",
|
||||
i,
|
||||
@ -253,7 +253,7 @@ class C10_API AcceleratorAllocatorConfig {
|
||||
device_config_parser_hook_ = std::move(hook);
|
||||
auto& mutable_keys = getMutableKeys();
|
||||
for (auto& key : keys) {
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
mutable_keys.insert(key).second,
|
||||
"Duplicated key '",
|
||||
key,
|
||||
|
||||
@ -102,7 +102,7 @@ uint64_t getNonDeterministicRandom(bool is_cuda) {
  } else {
    std::random_device rd;
    // limit to 53 bits to ensure unique representation in double
    s = (((static_cast<uint64_t>(rd())) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
    s = ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
  }
  return s;
}
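A quick aside on why the 53-bit mask matters (a standalone sketch, not part of the diff): an IEEE-754 double has a 53-bit significand, so any unsigned integer below 2^53 survives a round trip through double exactly, while larger values may be rounded.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t mask = 0x1FFFFFFFFFFFFF;  // 2^53 - 1, same mask as above
  uint64_t small = mask;                   // fits in a double's 53-bit significand
  uint64_t big = (1ULL << 53) + 1;         // needs 54 bits
  assert(static_cast<uint64_t>(static_cast<double>(small)) == small);  // exact
  assert(static_cast<uint64_t>(static_cast<double>(big)) != big);      // rounded
  return 0;
}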
@ -20,8 +20,7 @@ void maybeApplyRefcountedDeleter(const c10::Storage& storage) {
|
||||
std::lock_guard<std::mutex> guard(replace_data_ptr_mutex);
|
||||
c10::DataPtr& data_ptr = storage.mutable_data_ptr();
|
||||
|
||||
if (reinterpret_cast<const void*>(data_ptr.get_deleter()) ==
|
||||
reinterpret_cast<const void*>(&c10::refcounted_deleter)) {
|
||||
if ((void*)data_ptr.get_deleter() == (void*)&c10::refcounted_deleter) {
|
||||
// Data pointer is already shared
|
||||
return;
|
||||
}
|
||||
|
||||
@ -52,19 +52,6 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
|
||||
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CONSTANT)
|
||||
#undef DEFINE_CONSTANT
|
||||
|
||||
inline const char* toString(ScalarType t) {
|
||||
#define DEFINE_CASE(_, name) \
|
||||
case ScalarType::name: \
|
||||
return #name;
|
||||
|
||||
switch (t) {
|
||||
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CASE)
|
||||
default:
|
||||
return "UNKNOWN_SCALAR";
|
||||
}
|
||||
#undef DEFINE_CASE
|
||||
}
|
||||
|
||||
inline size_t elementSize(ScalarType t) {
|
||||
#define CASE_ELEMENTSIZE_CASE(ctype, name) \
|
||||
case ScalarType::name: \
|
||||
@ -308,12 +295,6 @@ inline bool canCast(const ScalarType from, const ScalarType to) {
|
||||
|
||||
C10_API ScalarType promoteTypes(ScalarType a, ScalarType b);
|
||||
|
||||
inline std::ostream& operator<<(
|
||||
std::ostream& stream,
|
||||
at::ScalarType scalar_type) {
|
||||
return stream << toString(scalar_type);
|
||||
}
|
||||
|
||||
// Returns a pair of strings representing the names for each dtype.
|
||||
// The returned pair is (name, legacy_name_if_applicable)
|
||||
C10_API std::pair<std::string, std::string> getDtypeNames(
|
||||
|
||||
@ -1,4 +1,5 @@
#include <c10/core/SymBool.h>
#include <c10/core/SymInt.h>
#include <c10/core/SymNodeImpl.h>

namespace c10 {
@ -111,4 +112,17 @@ bool SymBool::has_hint() const {
  return toSymNodeImpl()->has_hint();
}

SymInt SymBool::toSymInt() const {
  // If concrete bool, return concrete SymInt
  if (auto ma = maybe_as_bool()) {
    return SymInt(*ma ? 1 : 0);
  }

  // Symbolic case: use sym_ite to convert bool to int (0 or 1)
  auto node = toSymNodeImpl();
  auto one_node = node->wrap_int(1);
  auto zero_node = node->wrap_int(0);
  return SymInt(node->sym_ite(one_node, zero_node));
}

} // namespace c10
@ -12,6 +12,8 @@

namespace c10 {

class SymInt;

class C10_API SymBool {
 public:
  /*implicit*/ SymBool(bool b) : data_(b) {}
@ -80,6 +82,10 @@ class C10_API SymBool {
    return toSymNodeImplUnowned()->constant_bool();
  }

  // Convert SymBool to SymInt (0 or 1)
  // This is the C++ equivalent of Python's cast_symbool_to_symint_guardless
  SymInt toSymInt() const;

  bool is_heap_allocated() const {
    return ptr_;
  }
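For intuition, here is a minimal, hedged sketch of how the new toSymInt() behaves on the concrete (non-symbolic) path; it is written against the declarations above as an illustration, not code taken from the diff:

#include <c10/core/SymBool.h>
#include <c10/core/SymInt.h>
#include <cassert>

// Concrete SymBools carry a plain bool, so toSymInt() yields a concrete
// SymInt (1 or 0) without touching a SymNode; symbolic SymBools instead go
// through sym_ite(wrap_int(1), wrap_int(0)) as shown in the diff above.
void demo() {
  c10::SymBool t(true);
  c10::SymBool f(false);
  assert(t.toSymInt().maybe_as_int() == 1);
  assert(f.toSymInt().maybe_as_int() == 0);
}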
|
||||
|
||||
@ -83,7 +83,7 @@ DEFINE_BINARY(max_slow_path, sym_max, SymInt)
|
||||
|
||||
SymInt::operator SymFloat() const {
|
||||
if (auto ma = maybe_as_int()) {
|
||||
return SymFloat(static_cast<double>(*ma));
|
||||
return SymFloat(double(*ma));
|
||||
} else {
|
||||
return SymFloat(toSymNodeImplUnowned()->sym_float());
|
||||
}
|
||||
|
||||
@ -44,8 +44,7 @@ bool has_simple_data_ptr(const c10::StorageImpl& storage) {
|
||||
}
|
||||
|
||||
bool is_cow_data_ptr(const c10::DataPtr& data_ptr) {
|
||||
return reinterpret_cast<const void*>(data_ptr.get_deleter()) ==
|
||||
reinterpret_cast<const void*>(&cow::cow_deleter);
|
||||
return (void*)data_ptr.get_deleter() == (void*)&cow::cow_deleter;
|
||||
}
|
||||
|
||||
c10::intrusive_ptr<StorageImpl> lazy_clone_storage(StorageImpl& storage) {
|
||||
|
||||
@ -20,7 +20,7 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig(
|
||||
tokenizer.checkToken(++i, ":");
|
||||
i++; // Move to the value after the colon
|
||||
#ifdef USE_ROCM
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) ||
|
||||
(tokenizer[i] == PYTORCH_TOKEN2)),
|
||||
"Unknown allocator backend, "
|
||||
@ -36,7 +36,7 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig(
|
||||
" != ",
|
||||
get()->name());
|
||||
#else // USE_ROCM
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1)),
|
||||
"Unknown allocator backend, "
|
||||
"options are native and " PYTORCH_TOKEN1);
|
||||
@ -109,7 +109,7 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
|
||||
} else {
|
||||
const auto& keys =
|
||||
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
keys.find(key) != keys.end(),
|
||||
"Unrecognized key '",
|
||||
key,
|
||||
@ -151,12 +151,12 @@ size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
|
||||
size_t i) {
|
||||
tokenizer.checkToken(++i, ":");
|
||||
size_t val2 = tokenizer.toSizeT(++i);
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
llvm::isPowerOf2_64(val2),
|
||||
"Number of register threads has to be power of 2, got ",
|
||||
val2);
|
||||
auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
|
||||
TORCH_CHECK_VALUE(
|
||||
TORCH_CHECK(
|
||||
val2 <= maxThreads,
|
||||
"Number of register threads should be less than or equal to ",
|
||||
maxThreads,
|
||||
@ -171,8 +171,7 @@ size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
|
||||
size_t i) {
|
||||
tokenizer.checkToken(++i, ":");
|
||||
size_t val2 = tokenizer.toSizeT(++i);
|
||||
TORCH_CHECK_VALUE(
|
||||
val2 > 0, "Pinned reserve segment size has to be greater than 0");
|
||||
TORCH_CHECK(val2 > 0, "Pinned reserve segment size has to be greater than 0");
|
||||
m_pinned_reserve_segment_size_mb = val2;
|
||||
return i;
|
||||
}
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
#include <c10/core/AllocatorConfig.h>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
#include <c10/cuda/CUDAMacros.h>
|
||||
#include <c10/util/Deprecated.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/env.h>
|
||||
|
||||
@ -18,14 +17,9 @@ enum class Expandable_Segments_Handle_Type : int {
|
||||
// Environment config parser
|
||||
class C10_CUDA_API CUDAAllocatorConfig {
|
||||
public:
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.")
|
||||
static size_t max_split_size() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
|
||||
}
|
||||
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.")
|
||||
static double garbage_collection_threshold() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
garbage_collection_threshold();
|
||||
@ -70,8 +64,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
|
||||
return instance().m_pinned_num_register_threads;
|
||||
}
|
||||
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.")
|
||||
static bool pinned_use_background_threads() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
pinned_use_background_threads();
|
||||
@ -88,15 +80,11 @@ class C10_CUDA_API CUDAAllocatorConfig {
|
||||
return 128;
|
||||
}
|
||||
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
|
||||
static size_t roundup_power2_divisions(size_t size) {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
roundup_power2_divisions(size);
|
||||
}
|
||||
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
|
||||
static std::vector<size_t> roundup_power2_divisions() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
roundup_power2_divisions();
|
||||
@ -107,8 +95,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
|
||||
max_non_split_rounding_size();
|
||||
}
|
||||
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.")
|
||||
static std::string last_allocator_settings() {
|
||||
return c10::CachingAllocator::getAllocatorSettings();
|
||||
}
|
||||
|
||||
@ -512,7 +512,7 @@ struct ExpandableSegment {
|
||||
header.segment_size = segment_size_;
|
||||
header.num_handles = end - begin;
|
||||
|
||||
buf.write(reinterpret_cast<const char*>(&header), sizeof(ShareHeader));
|
||||
buf.write((const char*)&header, sizeof(ShareHeader));
|
||||
for (auto i : c10::irange(begin, end)) {
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
auto& handle = handles_.at(i).value();
|
||||
@ -528,9 +528,7 @@ struct ExpandableSegment {
|
||||
TORCH_CHECK(
|
||||
handle.shareable_handle != std::nullopt,
|
||||
"shareable_handle is null");
|
||||
buf.write(
|
||||
reinterpret_cast<const char*>(&*handle.shareable_handle),
|
||||
sizeof(int));
|
||||
buf.write((const char*)&*handle.shareable_handle, sizeof(int));
|
||||
} else {
|
||||
if (!handle.shareable_handle) {
|
||||
CUmemFabricHandle fabric_handle;
|
||||
@ -543,8 +541,7 @@ struct ExpandableSegment {
|
||||
handle.shareable_handle != std::nullopt,
|
||||
"shareable_handle is null");
|
||||
buf.write(
|
||||
reinterpret_cast<const char*>(&*handle.shareable_handle),
|
||||
sizeof(CUmemFabricHandle));
|
||||
(const char*)&*handle.shareable_handle, sizeof(CUmemFabricHandle));
|
||||
}
|
||||
}
|
||||
return rangeFromHandles(begin, end);
|
||||
@ -555,7 +552,7 @@ struct ExpandableSegment {
|
||||
std::vector<c10::DeviceIndex> peers,
|
||||
std::istream& buf) {
|
||||
ShareHeader header{};
|
||||
buf.read(reinterpret_cast<char*>(&header), sizeof(ShareHeader));
|
||||
buf.read((char*)&header, sizeof(ShareHeader));
|
||||
auto segment = std::make_unique<ExpandableSegment>(
|
||||
device, std::nullopt, header.segment_size, std::move(peers));
|
||||
// older build setups (e.g. multiwheels) do not have this syscall, added 2020
|
||||
@ -577,11 +574,11 @@ struct ExpandableSegment {
|
||||
for (auto i : c10::irange(header.num_handles)) {
|
||||
(void)i;
|
||||
int fd = 0;
|
||||
buf.read(reinterpret_cast<char*>(&fd), sizeof(int));
|
||||
buf.read((char*)&fd, sizeof(int));
|
||||
auto myfd = syscall(SYS_pidfd_getfd, pidfd, fd, 0);
|
||||
if (myfd == -1) {
|
||||
auto err = errno;
|
||||
close(static_cast<int>(pidfd));
|
||||
close((int)pidfd);
|
||||
for (auto& h : segment->handles_) {
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
@ -601,16 +598,15 @@ struct ExpandableSegment {
|
||||
(void*)(uintptr_t)myfd,
|
||||
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
|
||||
LOG(INFO) << "use posix fd to import expandable segments.";
|
||||
close(static_cast<int>(myfd));
|
||||
close((int)myfd);
|
||||
segment->handles_.emplace_back(Handle{handle, std::nullopt});
|
||||
}
|
||||
close(static_cast<int>(pidfd));
|
||||
close((int)pidfd);
|
||||
} else {
|
||||
for (auto i : c10::irange(header.num_handles)) {
|
||||
(void)i;
|
||||
CUmemFabricHandle fabric_handle;
|
||||
buf.read(
|
||||
reinterpret_cast<char*>(&fabric_handle), sizeof(CUmemFabricHandle));
|
||||
buf.read((char*)&fabric_handle, sizeof(CUmemFabricHandle));
|
||||
CUmemGenericAllocationHandle handle = 0;
|
||||
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemImportFromShareableHandle_(
|
||||
&handle,
|
||||
@ -1063,7 +1059,7 @@ class RingBuffer {
|
||||
|
||||
void setMaxEntries(size_t size) {
|
||||
std::lock_guard<std::mutex> lk(alloc_trace_lock);
|
||||
alloc_trace_max_entries_ = std::max(static_cast<size_t>(1), size);
|
||||
alloc_trace_max_entries_ = std::max(size_t(1), size);
|
||||
}
|
||||
|
||||
void insertEntries(const T& entry) {
|
||||
@ -1274,7 +1270,7 @@ class DeviceCachingAllocator {
|
||||
large_blocks(/*small=*/false),
|
||||
small_blocks(/*small=*/true) {
|
||||
stats.max_split_size =
|
||||
static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
|
||||
static_cast<int64_t>(CUDAAllocatorConfig::max_split_size());
|
||||
context_recorder_.store(nullptr);
|
||||
}
|
||||
|
||||
@ -1409,8 +1405,7 @@ class DeviceCachingAllocator {
|
||||
// Do garbage collection if the flag is set.
|
||||
if (C10_UNLIKELY(
|
||||
set_fraction &&
|
||||
AcceleratorAllocatorConfig::garbage_collection_threshold() >
|
||||
0.0)) {
|
||||
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
|
||||
garbage_collect_cached_blocks(context);
|
||||
}
|
||||
// Attempt allocate
|
||||
@ -1662,7 +1657,7 @@ class DeviceCachingAllocator {
|
||||
stats.active_bytes[stat_type].increase(block->size);
|
||||
stats.requested_bytes[stat_type].increase(block->requested_size);
|
||||
});
|
||||
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
|
||||
if (block->size >= CUDAAllocatorConfig::max_split_size())
|
||||
stats.oversize_allocations.increase(1);
|
||||
|
||||
auto allocated_bytes_gauge =
|
||||
@ -1931,7 +1926,7 @@ class DeviceCachingAllocator {
|
||||
block->pool->owner_MempoolId(),
|
||||
context ? context : block->context_when_allocated);
|
||||
|
||||
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
|
||||
if (block->size >= CUDAAllocatorConfig::max_split_size())
|
||||
stats.oversize_allocations.decrease(1);
|
||||
|
||||
// If the block has been used on more than one stream, handle accordingly.
|
||||
@ -1995,16 +1990,15 @@ class DeviceCachingAllocator {
|
||||
while (base_block->prev) {
|
||||
base_block = base_block->prev;
|
||||
}
|
||||
offset = static_cast<const char*>(block->ptr) -
|
||||
static_cast<const char*>(base_block->ptr);
|
||||
offset = (char*)block->ptr - (char*)base_block->ptr;
|
||||
cudaIpcMemHandle_t handle;
|
||||
C10_CUDA_CHECK(cudaIpcGetMemHandle(&handle, base_block->ptr));
|
||||
ss.write(reinterpret_cast<const char*>(&handle), CUDA_IPC_HANDLE_SIZE);
|
||||
ss.write((char*)&handle, CUDA_IPC_HANDLE_SIZE);
|
||||
} else {
|
||||
ss.put(SHAREABLE_CUDA_EXPANDABLE_SEGMENT);
|
||||
auto full_range = block->expandable_segment_->share(
|
||||
SegmentRange(block->ptr, block->size), ss);
|
||||
offset = static_cast<const char*>(block->ptr) - full_range.ptr;
|
||||
offset = (char*)block->ptr - full_range.ptr;
|
||||
}
|
||||
return ShareableHandle{offset, ss.str()};
|
||||
}
|
||||
@ -2505,8 +2499,7 @@ class DeviceCachingAllocator {
|
||||
if (size < kMinBlockSize) {
|
||||
return kMinBlockSize;
|
||||
} else {
|
||||
auto divisions =
|
||||
AcceleratorAllocatorConfig::roundup_power2_divisions(size);
|
||||
auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
|
||||
if (divisions > 1 && size > (kMinBlockSize * divisions)) {
|
||||
return roundup_power2_next_division(size, divisions);
|
||||
} else {
|
||||
@ -3000,7 +2993,7 @@ class DeviceCachingAllocator {
|
||||
if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) {
|
||||
return remaining >= kMinBlockSize;
|
||||
} else {
|
||||
return (size < AcceleratorAllocatorConfig::max_split_size()) &&
|
||||
return (size < CUDAAllocatorConfig::max_split_size()) &&
|
||||
(remaining > kSmallSize);
|
||||
}
|
||||
}
|
||||
@ -3020,7 +3013,7 @@ class DeviceCachingAllocator {
|
||||
|
||||
if (C10_UNLIKELY(
|
||||
set_fraction &&
|
||||
AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
|
||||
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
|
||||
// Track block reuse interval only when garbage collection is enabled.
|
||||
++pool.get_free_blocks_call_count;
|
||||
}
|
||||
@ -3062,13 +3055,13 @@ class DeviceCachingAllocator {
|
||||
}
|
||||
|
||||
// Do not return an oversized block for a large request
|
||||
if ((p.size() < AcceleratorAllocatorConfig::max_split_size()) &&
|
||||
((*it)->size >= AcceleratorAllocatorConfig::max_split_size()))
|
||||
if ((p.size() < CUDAAllocatorConfig::max_split_size()) &&
|
||||
((*it)->size >= CUDAAllocatorConfig::max_split_size()))
|
||||
return false;
|
||||
// Allow oversized block size to be rounded up but within a limit
|
||||
if ((p.size() >= AcceleratorAllocatorConfig::max_split_size()) &&
|
||||
if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
|
||||
((*it)->size >=
|
||||
p.size() + AcceleratorAllocatorConfig::max_non_split_rounding_size()))
|
||||
p.size() + CUDAAllocatorConfig::max_non_split_rounding_size()))
|
||||
return false;
|
||||
p.block = *it;
|
||||
pool.blocks.erase(it);
|
||||
@ -3091,7 +3084,7 @@ class DeviceCachingAllocator {
|
||||
// therefore should be of less overheads.
|
||||
|
||||
size_t gc_threshold = static_cast<size_t>(
|
||||
AcceleratorAllocatorConfig::garbage_collection_threshold() *
|
||||
CUDAAllocatorConfig::garbage_collection_threshold() *
|
||||
static_cast<double>(allowed_memory_maximum));
|
||||
// No need to trigger GC yet
|
||||
if (total_allocated_memory <= gc_threshold) {
|
||||
@ -3234,13 +3227,12 @@ class DeviceCachingAllocator {
|
||||
}
|
||||
|
||||
total_allocated_memory += size;
|
||||
p.block = new Block(
|
||||
p.device(), p.stream(), size, p.pool, static_cast<char*>(ptr));
|
||||
p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr);
|
||||
for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
|
||||
stats.segment[stat_type].increase(1);
|
||||
stats.reserved_bytes[stat_type].increase(size);
|
||||
});
|
||||
if (size >= AcceleratorAllocatorConfig::max_split_size())
|
||||
if (size >= CUDAAllocatorConfig::max_split_size())
|
||||
stats.oversize_segments.increase(1);
|
||||
auto reserved_bytes_gauge =
|
||||
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
|
||||
@ -3269,7 +3261,7 @@ class DeviceCachingAllocator {
|
||||
bool release_available_cached_blocks(
|
||||
const AllocParams& p,
|
||||
const std::shared_ptr<GatheredContext>& context) {
|
||||
if (AcceleratorAllocatorConfig::max_split_size() ==
|
||||
if (CUDAAllocatorConfig::max_split_size() ==
|
||||
std::numeric_limits<size_t>::max())
|
||||
return false;
|
||||
BlockPool& pool = *p.pool;
|
||||
@ -3277,8 +3269,8 @@ class DeviceCachingAllocator {
|
||||
// because of std::unique_ptr, block cannot be trivially copied
|
||||
// Use constructor for search key.
|
||||
Block key(p.search_key.device, p.search_key.stream, p.search_key.size);
|
||||
key.size = (key.size < AcceleratorAllocatorConfig::max_split_size())
|
||||
? AcceleratorAllocatorConfig::max_split_size()
|
||||
key.size = (key.size < CUDAAllocatorConfig::max_split_size())
|
||||
? CUDAAllocatorConfig::max_split_size()
|
||||
: key.size;
|
||||
auto it = pool.blocks.lower_bound(&key);
|
||||
if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
|
||||
@ -3291,7 +3283,7 @@ class DeviceCachingAllocator {
|
||||
--it; // Back up one item. Now on the largest block for the correct
|
||||
// stream
|
||||
while ((totalReleased < key.size) &&
|
||||
((*it)->size >= AcceleratorAllocatorConfig::max_split_size()) &&
|
||||
((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
|
||||
((*it)->stream == p.stream())) {
|
||||
auto cur = it;
|
||||
bool is_first = cur == pool.blocks.begin();
|
||||
@ -3416,7 +3408,7 @@ class DeviceCachingAllocator {
|
||||
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
|
||||
.current);
|
||||
|
||||
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
|
||||
if (block->size >= CUDAAllocatorConfig::max_split_size())
|
||||
stats.oversize_segments.decrease(1);
|
||||
pool->blocks.erase(block);
|
||||
delete block;
|
||||
@ -3783,7 +3775,7 @@ class NativeCachingAllocator : public CUDAAllocator {
|
||||
allocated_blocks;
|
||||
|
||||
static size_t get_mutex_shard_id(void* ptr) {
|
||||
return twang_mix64(reinterpret_cast<uintptr_t>(ptr)) % kNumMutexShard;
|
||||
return twang_mix64((size_t)ptr) % kNumMutexShard;
|
||||
}
|
||||
|
||||
void add_allocated_block(Block* block) {
|
||||
@ -3820,8 +3812,8 @@ class NativeCachingAllocator : public CUDAAllocator {
|
||||
if (size < device_count) {
|
||||
device_allocator.resize(device_count);
|
||||
for (const auto i : c10::irange(size, device_count)) {
|
||||
device_allocator[i] = std::make_unique<DeviceCachingAllocator>(
|
||||
static_cast<c10::DeviceIndex>(i));
|
||||
device_allocator[i] =
|
||||
std::make_unique<DeviceCachingAllocator>(c10::DeviceIndex(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -4067,8 +4059,8 @@ class NativeCachingAllocator : public CUDAAllocator {
|
||||
|
||||
auto& md = result.config_metadata;
|
||||
md.garbage_collection_threshold =
|
||||
AcceleratorAllocatorConfig::garbage_collection_threshold();
|
||||
md.max_split_size = AcceleratorAllocatorConfig::max_split_size();
|
||||
CUDAAllocatorConfig::garbage_collection_threshold();
|
||||
md.max_split_size = CUDAAllocatorConfig::max_split_size();
|
||||
md.pinned_num_register_threads =
|
||||
CUDAAllocatorConfig::pinned_num_register_threads();
|
||||
md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
|
||||
@ -4076,12 +4068,11 @@ class NativeCachingAllocator : public CUDAAllocator {
|
||||
CUDAAllocatorConfig::release_lock_on_cudamalloc();
|
||||
md.pinned_use_host_register =
|
||||
CUDAAllocatorConfig::pinned_use_cuda_host_register();
|
||||
md.last_allocator_settings =
|
||||
AcceleratorAllocatorConfig::last_allocator_settings();
|
||||
md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
|
||||
md.graph_capture_record_stream_reuse =
|
||||
CUDAAllocatorConfig::graph_capture_record_stream_reuse();
|
||||
md.roundup_power2_divisions =
|
||||
AcceleratorAllocatorConfig::roundup_power2_divisions();
|
||||
CUDAAllocatorConfig::roundup_power2_divisions();
|
||||
|
||||
return result;
|
||||
}
|
||||
@ -4350,7 +4341,7 @@ class NativeCachingAllocator : public CUDAAllocator {
|
||||
// SHARABLE_CUDA_MALLOC
|
||||
if (type == SHAREABLE_CUDA_MALLOC) {
|
||||
cudaIpcMemHandle_t cuda_handle;
|
||||
ss.read(reinterpret_cast<char*>(&cuda_handle), CUDA_IPC_HANDLE_SIZE);
|
||||
ss.read((char*)&cuda_handle, CUDA_IPC_HANDLE_SIZE);
|
||||
C10_CUDA_CHECK(cudaIpcOpenMemHandle(
|
||||
&cuda_ipc_ptr_, cuda_handle, cudaIpcMemLazyEnablePeerAccess));
|
||||
} else if (type == SHAREABLE_CUDA_EXPANDABLE_SEGMENT) {
|
||||
@ -4459,12 +4450,11 @@ CUDAAllocator* allocator();
|
||||
} // namespace CudaMallocAsync
|
||||
|
||||
struct BackendStaticInitializer {
|
||||
// Parses the environment configuration for CUDA/ROCm allocator backend at
|
||||
// load time. This duplicates some logic from CUDAAllocatorConfig to ensure
|
||||
// lazy initialization without triggering global static constructors. The
|
||||
// function looks for the key "backend" and returns the appropriate allocator
|
||||
// instance based on its value. If no valid configuration is found, it falls
|
||||
// back to the default Native allocator.
|
||||
// Parses env for backend at load time, duplicating some logic from
|
||||
// CUDAAllocatorConfig. CUDAAllocatorConfig double-checks it later (at
|
||||
// runtime). Defers verbose exceptions and error checks, including Cuda
|
||||
// version checks, to CUDAAllocatorConfig's runtime doublecheck. If this
|
||||
// works, maybe we should move all of CUDAAllocatorConfig here?
|
||||
CUDAAllocator* parseEnvForBackend() {
|
||||
auto val = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
|
||||
#ifdef USE_ROCM
|
||||
@ -4473,35 +4463,34 @@ struct BackendStaticInitializer {
|
||||
val = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
|
||||
}
|
||||
#endif
|
||||
if (!val.has_value()) {
|
||||
val = c10::utils::get_env("PYTORCH_ALLOC_CONF");
|
||||
}
|
||||
if (val.has_value()) {
|
||||
c10::CachingAllocator::ConfigTokenizer tokenizer(val.value());
|
||||
for (size_t i = 0; i < tokenizer.size(); i++) {
|
||||
const auto& key = tokenizer[i];
|
||||
if (key == "backend") {
|
||||
tokenizer.checkToken(++i, ":");
|
||||
i++; // Move to the value after the colon
|
||||
if (tokenizer[i] == "cudaMallocAsync"
|
||||
const std::string& config = val.value();
|
||||
|
||||
std::regex exp("[\\s,]+");
|
||||
std::sregex_token_iterator it(config.begin(), config.end(), exp, -1);
|
||||
std::sregex_token_iterator end;
|
||||
std::vector<std::string> options(it, end);
|
||||
|
||||
for (auto option : options) {
|
||||
std::regex exp2("[:]+");
|
||||
std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1);
|
||||
std::sregex_token_iterator end2;
|
||||
std::vector<std::string> kv(it2, end2);
|
||||
if (kv.size() >= 2) {
|
||||
if (kv[0] == "backend") {
|
||||
#ifdef USE_ROCM
|
||||
// convenience for ROCm users to allow either CUDA or HIP env var
|
||||
|| tokenizer[i] == "hipMallocAsync"
|
||||
// convenience for ROCm users to allow either CUDA or HIP env var
|
||||
if (kv[1] == "cudaMallocAsync" || kv[1] == "hipMallocAsync")
|
||||
#else
|
||||
if (kv[1] == "cudaMallocAsync")
|
||||
#endif
|
||||
) {
|
||||
return CudaMallocAsync::allocator();
|
||||
return CudaMallocAsync::allocator();
|
||||
if (kv[1] == "native")
|
||||
return &Native::allocator;
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
// Skip the key and its value
|
||||
i = tokenizer.skipKey(i);
|
||||
}
|
||||
if (i + 1 < tokenizer.size()) {
|
||||
tokenizer.checkToken(++i, ",");
|
||||
}
|
||||
}
|
||||
}
|
||||
// Default fallback allocator.
|
||||
return &Native::allocator;
|
||||
}
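As a rough illustration of the config format this initializer scans for (a standalone sketch using only the standard library, not the allocator's actual ConfigTokenizer), the environment variable holds comma-separated key:value pairs and only the backend key is consulted here:

#include <cstdlib>
#include <sstream>
#include <string>

// Example value: PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,backend:cudaMallocAsync"
// Returns the backend name if present, otherwise "native" (the default fallback).
std::string pick_backend_from_env() {
  const char* raw = std::getenv("PYTORCH_CUDA_ALLOC_CONF");
  if (raw == nullptr) {
    return "native";
  }
  std::stringstream ss(raw);
  std::string option;
  while (std::getline(ss, option, ',')) {        // split on commas
    auto colon = option.find(':');
    if (colon == std::string::npos) {
      continue;
    }
    if (option.substr(0, colon) == "backend") {  // only this key matters here
      return option.substr(colon + 1);           // e.g. "cudaMallocAsync" or "native"
    }
  }
  return "native";
}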
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ bool operator==(const UsageStream& lhs, const UsageStream& rhs) {
|
||||
|
||||
struct UsageStreamHash {
|
||||
size_t operator()(const UsageStream& us) const noexcept {
|
||||
return std::hash<void*>{}(us.stream) + static_cast<size_t>(us.device);
|
||||
return std::hash<void*>{}(us.stream) + size_t(us.device);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -128,7 +128,7 @@ std::ostream& operator<<(std::ostream& stream, StreamIdType s) {
|
||||
} else if (s.isExt()) {
|
||||
stream << "EXT";
|
||||
} else {
|
||||
stream << "PRIORITY " << static_cast<int>(s.getStreamType());
|
||||
stream << "PRIORITY " << int(s.getStreamType());
|
||||
}
|
||||
return stream;
|
||||
}
|
||||
|
||||
@ -328,21 +328,5 @@ struct pair {
|
||||
T2 second;
|
||||
};
|
||||
|
||||
#define INSTANTIATE_FOR_ALL_TYPES(MACRO) \
|
||||
MACRO(float); \
|
||||
MACRO(half); \
|
||||
MACRO(bfloat); \
|
||||
MACRO(float2); \
|
||||
MACRO(long); \
|
||||
MACRO(char); \
|
||||
MACRO(uchar); \
|
||||
MACRO(short); \
|
||||
MACRO(int);
|
||||
|
||||
#define INSTANTIATE_FOR_FLOAT_TYPES(MACRO) \
|
||||
MACRO(float); \
|
||||
MACRO(half); \
|
||||
MACRO(bfloat);
|
||||
|
||||
} // namespace metal
|
||||
} // namespace c10
|
||||
|
||||
@ -67,8 +67,8 @@ TEST(AllocatorConfigTest, allocator_config_test) {
|
||||
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 2);
|
||||
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
|
||||
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 2);
|
||||
EXPECT_EQ(
|
||||
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4);
|
||||
// EXPECT_EQ(
|
||||
// AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4);
|
||||
EXPECT_EQ(
|
||||
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 1);
|
||||
EXPECT_EQ(
|
||||
@ -101,8 +101,8 @@ TEST(AllocatorConfigTest, allocator_config_test) {
|
||||
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 1);
|
||||
EXPECT_EQ(
|
||||
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 0);
|
||||
EXPECT_EQ(
|
||||
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8);
|
||||
// EXPECT_EQ(
|
||||
// AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8);
|
||||
EXPECT_EQ(
|
||||
AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 2);
|
||||
|
||||
|
||||
@ -46,8 +46,7 @@ std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter::
|
||||
for (const auto i : c10::irange(replicates)) {
|
||||
auto delta_ns = end_times[i].t_ - start_times_[i].t_;
|
||||
auto delta_approx = end_times[i].approx_t_ - start_times_[i].approx_t_;
|
||||
scale_factors[i] =
|
||||
static_cast<double>(delta_ns) / static_cast<double>(delta_approx);
|
||||
scale_factors[i] = (double)delta_ns / (double)delta_approx;
|
||||
}
|
||||
std::sort(scale_factors.begin(), scale_factors.end());
|
||||
long double scale_factor = scale_factors[replicates / 2 + 1];
|
||||
@ -65,8 +64,7 @@ std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter::
|
||||
for (const auto i : c10::irange(replicates)) {
|
||||
auto dt = start_times_[i].t_ - t0;
|
||||
auto dt_approx =
|
||||
static_cast<double>(start_times_[i].approx_t_ - t0_approx) *
|
||||
scale_factor;
|
||||
(double)(start_times_[i].approx_t_ - t0_approx) * scale_factor;
|
||||
t0_correction[i] = dt - (time_t)dt_approx; // NOLINT
|
||||
}
|
||||
t0 += t0_correction[t0_correction.size() / 2 + 1]; // NOLINT
|
||||
@ -74,9 +72,7 @@ std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter::
|
||||
return [=](approx_time_t t_approx) {
|
||||
// See above for why this is more stable than `A * t_approx + B`.
|
||||
return t_approx > t0_approx
|
||||
? static_cast<time_t>(
|
||||
static_cast<double>(t_approx - t0_approx) * scale_factor) +
|
||||
t0
|
||||
? (time_t)((double)(t_approx - t0_approx) * scale_factor) + t0
|
||||
: 0;
|
||||
};
|
||||
}
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/SmallVector.h>
|
||||
#include <torch/headeronly/util/HeaderOnlyArrayRef.h>
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
@ -40,200 +41,106 @@ namespace c10 {
|
||||
///
|
||||
/// This is intended to be trivially copyable, so it should be passed by
|
||||
/// value.
|
||||
///
|
||||
/// NOTE: We have refactored out the headeronly parts of the ArrayRef struct
|
||||
/// into HeaderOnlyArrayRef. As adding `virtual` would change the performance of
|
||||
/// the underlying constexpr calls, we rely on apparent-type dispatch for
|
||||
/// inheritance. This should be fine because their memory format is the same,
|
||||
/// and it is never incorrect for ArrayRef to call HeaderOnlyArrayRef methods.
|
||||
/// However, you should prefer to use ArrayRef when possible, because its use
|
||||
/// of TORCH_CHECK will lead to better user-facing error messages.
|
||||
template <typename T>
|
||||
class ArrayRef final {
|
||||
class ArrayRef final : public HeaderOnlyArrayRef<T> {
|
||||
public:
|
||||
using iterator = const T*;
|
||||
using const_iterator = const T*;
|
||||
using size_type = size_t;
|
||||
using value_type = T;
|
||||
|
||||
using reverse_iterator = std::reverse_iterator<iterator>;
|
||||
|
||||
private:
|
||||
/// The start of the array, in an external buffer.
|
||||
const T* Data;
|
||||
|
||||
/// The number of elements.
|
||||
size_type Length;
|
||||
|
||||
void debugCheckNullptrInvariant() {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
||||
Data != nullptr || Length == 0,
|
||||
"created ArrayRef with nullptr and non-zero length! std::optional relies on this being illegal");
|
||||
}
|
||||
|
||||
public:
|
||||
/// @name Constructors
|
||||
/// @name Constructors, all inherited from HeaderOnlyArrayRef except for
|
||||
/// SmallVector.
|
||||
/// @{
|
||||
|
||||
/// Construct an empty ArrayRef.
|
||||
/* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {}
|
||||
using HeaderOnlyArrayRef<T>::HeaderOnlyArrayRef;
|
||||
|
||||
/// Construct an ArrayRef from a single element.
|
||||
// TODO Make this explicit
|
||||
constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {}
|
||||
|
||||
/// Construct an ArrayRef from a pointer and length.
|
||||
constexpr ArrayRef(const T* data, size_t length)
|
||||
: Data(data), Length(length) {
|
||||
debugCheckNullptrInvariant();
|
||||
}
|
||||
|
||||
/// Construct an ArrayRef from a range.
|
||||
constexpr ArrayRef(const T* begin, const T* end)
|
||||
: Data(begin), Length(end - begin) {
|
||||
debugCheckNullptrInvariant();
|
||||
}
|
||||
/// Construct an ArrayRef from a std::vector.
|
||||
/// This constructor is identical to the one in HeaderOnlyArrayRef, but we
|
||||
/// include it to help with Class Template Argument Deduction (CTAD).
|
||||
/// Without it, CTAD can fail sometimes due to the indirect constructor
|
||||
/// inheritance. So we explicitly include this constructor.
|
||||
template <typename A>
|
||||
/* implicit */ ArrayRef(const std::vector<T, A>& Vec)
|
||||
: HeaderOnlyArrayRef<T>(Vec.data(), Vec.size()) {}
|
||||
|
||||
/// Construct an ArrayRef from a SmallVector. This is templated in order to
|
||||
/// avoid instantiating SmallVectorTemplateCommon<T> whenever we
|
||||
/// copy-construct an ArrayRef.
|
||||
/// NOTE: this is the only constructor that is not inherited from
|
||||
/// HeaderOnlyArrayRef.
|
||||
template <typename U>
|
||||
/* implicit */ ArrayRef(const SmallVectorTemplateCommon<T, U>& Vec)
|
||||
: Data(Vec.data()), Length(Vec.size()) {
|
||||
debugCheckNullptrInvariant();
|
||||
}
|
||||
|
||||
template <
|
||||
typename Container,
|
||||
typename U = decltype(std::declval<Container>().data()),
|
||||
typename = std::enable_if_t<
|
||||
(std::is_same_v<U, T*> || std::is_same_v<U, T const*>)>>
|
||||
/* implicit */ ArrayRef(const Container& container)
|
||||
: Data(container.data()), Length(container.size()) {
|
||||
debugCheckNullptrInvariant();
|
||||
}
|
||||
|
||||
/// Construct an ArrayRef from a std::vector.
|
||||
// The enable_if stuff here makes sure that this isn't used for
|
||||
// std::vector<bool>, because ArrayRef can't work on a std::vector<bool>
|
||||
// bitfield.
|
||||
template <typename A>
|
||||
/* implicit */ ArrayRef(const std::vector<T, A>& Vec)
|
||||
: Data(Vec.data()), Length(Vec.size()) {
|
||||
static_assert(
|
||||
!std::is_same_v<T, bool>,
|
||||
"ArrayRef<bool> cannot be constructed from a std::vector<bool> bitfield.");
|
||||
}
|
||||
|
||||
/// Construct an ArrayRef from a std::array
|
||||
template <size_t N>
|
||||
/* implicit */ constexpr ArrayRef(const std::array<T, N>& Arr)
|
||||
: Data(Arr.data()), Length(N) {}
|
||||
|
||||
/// Construct an ArrayRef from a C array.
|
||||
template <size_t N>
|
||||
// NOLINTNEXTLINE(*c-arrays*)
|
||||
/* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {}
|
||||
|
||||
/// Construct an ArrayRef from a std::initializer_list.
|
||||
/* implicit */ constexpr ArrayRef(const std::initializer_list<T>& Vec)
|
||||
: Data(
|
||||
std::begin(Vec) == std::end(Vec) ? static_cast<T*>(nullptr)
|
||||
: std::begin(Vec)),
|
||||
Length(Vec.size()) {}
|
||||
: HeaderOnlyArrayRef<T>(Vec.data(), Vec.size()) {}
|
||||
|
||||
/// @}
|
||||
/// @name Simple Operations
|
||||
/// @name Simple Operations, mostly inherited from HeaderOnlyArrayRef
|
||||
/// @{
|
||||
|
||||
constexpr iterator begin() const {
|
||||
return Data;
|
||||
}
|
||||
constexpr iterator end() const {
|
||||
return Data + Length;
|
||||
}
|
||||
|
||||
// These are actually the same as iterator, since ArrayRef only
|
||||
// gives you const iterators.
|
||||
constexpr const_iterator cbegin() const {
|
||||
return Data;
|
||||
}
|
||||
constexpr const_iterator cend() const {
|
||||
return Data + Length;
|
||||
}
|
||||
|
||||
constexpr reverse_iterator rbegin() const {
|
||||
return reverse_iterator(end());
|
||||
}
|
||||
constexpr reverse_iterator rend() const {
|
||||
return reverse_iterator(begin());
|
||||
}
|
||||
|
||||
/// Check if all elements in the array satisfy the given expression
|
||||
constexpr bool allMatch(const std::function<bool(const T&)>& pred) const {
|
||||
return std::all_of(cbegin(), cend(), pred);
|
||||
}
|
||||
|
||||
/// empty - Check if the array is empty.
|
||||
constexpr bool empty() const {
|
||||
return Length == 0;
|
||||
}
|
||||
|
||||
constexpr const T* data() const {
|
||||
return Data;
|
||||
}
|
||||
|
||||
/// size - Get the array size.
|
||||
constexpr size_t size() const {
|
||||
return Length;
|
||||
}
|
||||
|
||||
/// front - Get the first element.
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr const T& front() const {
|
||||
TORCH_CHECK(
|
||||
!empty(), "ArrayRef: attempted to access front() of empty list");
|
||||
return Data[0];
|
||||
!this->empty(), "ArrayRef: attempted to access front() of empty list");
|
||||
return this->Data[0];
|
||||
}
|
||||
|
||||
/// back - Get the last element.
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr const T& back() const {
|
||||
TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list");
|
||||
return Data[Length - 1];
|
||||
}
|
||||
|
||||
/// equals - Check for element-wise equality.
|
||||
constexpr bool equals(ArrayRef RHS) const {
|
||||
return Length == RHS.Length && std::equal(begin(), end(), RHS.begin());
|
||||
TORCH_CHECK(
|
||||
!this->empty(), "ArrayRef: attempted to access back() of empty list");
|
||||
return this->Data[this->Length - 1];
|
||||
}
|
||||
|
||||
/// slice(n, m) - Take M elements of the array starting at element N
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr ArrayRef<T> slice(size_t N, size_t M) const {
|
||||
TORCH_CHECK(
|
||||
N + M <= size(),
|
||||
N + M <= this->size(),
|
||||
"ArrayRef: invalid slice, N = ",
|
||||
N,
|
||||
"; M = ",
|
||||
M,
|
||||
"; size = ",
|
||||
size());
|
||||
return ArrayRef<T>(data() + N, M);
|
||||
this->size());
|
||||
return ArrayRef<T>(this->data() + N, M);
|
||||
}
|
||||
|
||||
/// slice(n) - Chop off the first N elements of the array.
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr ArrayRef<T> slice(size_t N) const {
|
||||
TORCH_CHECK(
|
||||
N <= size(), "ArrayRef: invalid slice, N = ", N, "; size = ", size());
|
||||
return slice(N, size() - N);
|
||||
N <= this->size(),
|
||||
"ArrayRef: invalid slice, N = ",
|
||||
N,
|
||||
"; size = ",
|
||||
this->size());
|
||||
return slice(N, this->size() - N); // should this slice be this->slice?
|
||||
}
|
||||
|
||||
/// @}
|
||||
/// @name Operator Overloads
|
||||
/// @{
|
||||
constexpr const T& operator[](size_t Index) const {
|
||||
return Data[Index];
|
||||
}
|
||||
|
||||
/// Vector compatibility
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr const T& at(size_t Index) const {
|
||||
TORCH_CHECK(
|
||||
Index < Length,
|
||||
Index < this->Length,
|
||||
"ArrayRef: invalid index Index = ",
|
||||
Index,
|
||||
"; Length = ",
|
||||
Length);
|
||||
return Data[Index];
|
||||
this->Length);
|
||||
return this->Data[Index];
|
||||
}
|
||||
|
||||
/// Disallow accidental assignment from a temporary.
|
||||
@ -253,13 +160,6 @@ class ArrayRef final {
|
||||
std::enable_if_t<std::is_same_v<U, T>, ArrayRef<T>>& operator=(
|
||||
std::initializer_list<U>) = delete;
|
||||
|
||||
/// @}
|
||||
/// @name Expensive Operations
|
||||
/// @{
|
||||
std::vector<T> vec() const {
|
||||
return std::vector<T>(Data, Data + Length);
|
||||
}
|
||||
|
||||
/// @}
|
||||
};
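To make the CTAD note above concrete, here is a minimal, hedged sketch of the construction patterns the retained std::vector constructor keeps working; it is illustrative usage written against the class as shown, not code from the diff:

#include <c10/util/ArrayRef.h>
#include <iostream>
#include <vector>

// A helper that accepts any contiguous int64_t sequence by value.
int64_t product(c10::ArrayRef<int64_t> sizes) {
  int64_t p = 1;
  for (auto s : sizes) {
    p *= s;
  }
  return p;
}

void demo() {
  std::vector<int64_t> sizes = {2, 3, 4};

  // CTAD: the explicitly re-declared std::vector constructor lets the element
  // type be deduced, which indirect constructor inheritance alone can miss.
  c10::ArrayRef ref(sizes);                       // c10::ArrayRef<int64_t>

  std::cout << product(ref) << "\n";              // 24
  std::cout << product({5, 6}) << "\n";           // 30, via initializer_list
  std::cout << product(ref.slice(1, 2)) << "\n";  // 12, elements {3, 4}
}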
|
||||
|
||||
|
||||
@ -132,15 +132,15 @@ std::ostream& operator<<(std::ostream& o, const uint128& b) {
|
||||
int div_base_log = 0;
|
||||
switch (flags & std::ios::basefield) {
|
||||
case std::ios::hex:
|
||||
div = static_cast<uint64_t>(0x1000000000000000u); // 16^15
|
||||
div = (uint64_t)0x1000000000000000u; // 16^15
|
||||
div_base_log = 15;
|
||||
break;
|
||||
case std::ios::oct:
|
||||
div = static_cast<uint64_t>(01000000000000000000000u); // 8^21
|
||||
div = (uint64_t)01000000000000000000000u; // 8^21
|
||||
div_base_log = 21;
|
||||
break;
|
||||
default: // std::ios::dec
|
||||
div = static_cast<uint64_t>(10000000000000000000u); // 10^19
|
||||
div = (uint64_t)10000000000000000000u; // 10^19
|
||||
div_base_log = 19;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -74,7 +74,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
|
||||
)
|
||||
|
||||
code.append(" " + OutType + "* op = &out[rangeIndex * block_size];")
|
||||
for i in range(uf):
|
||||
for i in range(0, uf):
|
||||
j = 8 * i
|
||||
code.append(" __m256 vop" + str(j) + " = _mm256_setzero_ps();")
|
||||
|
||||
@ -158,7 +158,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
|
||||
"&input[idx_pref_T0 * fused_block_size];"
|
||||
)
|
||||
|
||||
for i in range(uf):
|
||||
for i in range(0, uf):
|
||||
j = 8 * i
|
||||
cachelinesize = 64
|
||||
byteoffset = sizeof[InType] * j
|
||||
@ -170,7 +170,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
|
||||
code.append(" if (!normalize_by_lengths || length == 0) {")
|
||||
else:
|
||||
code.append(" if (!normalize_by_lengths || lengths[rangeIndex] == 0) {")
|
||||
for i in range(uf):
|
||||
for i in range(0, uf):
|
||||
j = 8 * i
|
||||
code.append(" _mm256_storeu_ps(&op[" + str(j) + "], vop" + str(j) + ");")
|
||||
code.append(" } else {")
|
||||
@ -181,7 +181,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
|
||||
code.append(
|
||||
" __m256 vlen_inv = _mm256_set1_ps(1.0f / lengths[rangeIndex]);"
|
||||
)
|
||||
for i in range(uf):
|
||||
for i in range(0, uf):
|
||||
j = 8 * i
|
||||
code.append(
|
||||
" _mm256_storeu_ps(&op["
|
||||
|
||||
@ -224,12 +224,6 @@ AMD/ROCm/HIP
- Jithun Nair (`jithunnair-amd <https://github.com/jithunnair-amd>`__)
- (emeritus) Junjie Bai (`bddppq <https://github.com/bddppq>`__)

XPU/Intel GPU
~~~~~~~~~~~~~

- Eikan Wang (`EikanWang <https://github.com/EikanWang>`__)
- Guangye Yu (`guangyey <https://github.com/guangyey>`__)

Build + CI
~~~~~~~~~~
47 example3.py Normal file
@ -0,0 +1,47 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
# 1. Define a simple neural network.
|
||||
class SimpleNet(nn.Module):
|
||||
def __init__(self):
|
||||
super(SimpleNet, self).__init__()
|
||||
self.fc1 = nn.Linear(5, 3)
|
||||
self.fc2 = nn.Linear(3, 1)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.fc1(x)
|
||||
x = self.fc2(x)
|
||||
return x
|
||||
|
||||
# Instantiate the model.
|
||||
model = SimpleNet()
|
||||
|
||||
# Create dummy input data.
|
||||
input_data = torch.randn(1, 5)
|
||||
|
||||
# --- After the forward pass, before the backward pass ---
|
||||
# 2. Perform a forward pass.
|
||||
output = model(input_data)
|
||||
|
||||
# Print gradients before backward(). They will be None.
|
||||
print("--- Gradients after forward pass (before backward pass) ---")
|
||||
print("fc1.weight.grad:", model.fc1.weight.grad)
|
||||
print("fc1.bias.grad:", model.fc1.bias.grad)
|
||||
print("fc2.weight.grad:", model.fc2.weight.grad)
|
||||
print("fc2.bias.grad:", model.fc2.bias.grad)
|
||||
print("-" * 50)
|
||||
|
||||
# 3. Calculate a scalar loss.
|
||||
target_data = torch.randn(1, 1)
|
||||
loss = nn.MSELoss()(output, target_data)
|
||||
|
||||
# --- After the backward pass ---
|
||||
# 4. Perform the backward pass to compute gradients.
|
||||
loss.backward()
|
||||
|
||||
# Print gradients after backward(). They will now be populated with values.
|
||||
print("--- Gradients after backward pass ---")
|
||||
print("fc1.weight.grad:\n", model.fc1.weight.grad)
|
||||
print("fc1.bias.grad:\n", model.fc1.bias.grad)
|
||||
print("fc2.weight.grad:\n", model.fc2.weight.grad)
|
||||
print("fc2.bias.grad:\n", model.fc2.bias.grad)
|
||||
@ -159,6 +159,8 @@ ignore = [
|
||||
"EXE001",
|
||||
"F405",
|
||||
"FURB122", # writelines
|
||||
# these ignores are from flake8-logging-format; please fix!
|
||||
"G101",
|
||||
# these ignores are from ruff NPY; please fix!
|
||||
"NPY002",
|
||||
# these ignores are from ruff PERF; please fix!
|
||||
@ -202,10 +204,14 @@ select = [
|
||||
"NPY",
|
||||
"PERF",
|
||||
"PGH004",
|
||||
"PIE",
|
||||
"PIE790",
|
||||
"PIE794",
|
||||
"PIE800",
|
||||
"PIE804",
|
||||
"PIE807",
|
||||
"PIE810",
|
||||
"PLC0131", # type bivariance
|
||||
"PLC0132", # type param mismatch
|
||||
"PLC1802", # len({expression}) used as condition without comparison
|
||||
"PLC0205", # string as __slots__
|
||||
"PLC3002", # unnecessary-direct-lambda-call
|
||||
"PLE",
|
||||
@ -213,7 +219,6 @@ select = [
|
||||
"PLR0206", # property with params
|
||||
"PLR1722", # use sys exit
|
||||
"PLR1736", # unnecessary list index
|
||||
"PLW0127", # Self-assignment of variable
|
||||
"PLW0129", # assert on string literal
|
||||
"PLW0131", # named expr without context
|
||||
"PLW0133", # useless exception statement
|
||||
|
||||
@ -23,12 +23,10 @@ project-includes = [
|
||||
project-excludes = [
|
||||
# ==== below will be enabled directory by directory ====
|
||||
# ==== to test Pyrefly on a specific directory, simply comment it out ====
|
||||
"torch/_inductor/runtime",
|
||||
"torch/_inductor/codegen/triton.py",
|
||||
"tools/linter/adapters/test_device_bias_linter.py",
|
||||
"tools/code_analyzer/gen_operators_yaml.py",
|
||||
"torch/_inductor/runtime/triton_heuristics.py",
|
||||
"torch/_inductor/runtime/triton_helpers.py",
|
||||
"torch/_inductor/runtime/halide_helpers.py",
|
||||
# formatting issues, will turn on after adjusting where suppressions can be
|
||||
# in import statements
|
||||
"tools/flight_recorder/components/types.py",
|
||||
|
||||
@ -190,7 +190,7 @@ class TestActivationSparsifier(TestCase):
|
||||
if features is None:
|
||||
assert torch.all(mask * input_data == output)
|
||||
else:
|
||||
for feature_idx in range(len(features)):
|
||||
for feature_idx in range(0, len(features)):
|
||||
feature = torch.Tensor(
|
||||
[features[feature_idx]], device=input_data.device
|
||||
).long()
|
||||
@ -378,7 +378,7 @@ class TestActivationSparsifier(TestCase):
|
||||
# some dummy data
|
||||
data_list = []
|
||||
num_data_points = 5
|
||||
for _ in range(num_data_points):
|
||||
for _ in range(0, num_data_points):
|
||||
rand_data = torch.randn(16, 1, 28, 28)
|
||||
activation_sparsifier.model(rand_data)
|
||||
data_list.append(rand_data)
|
||||
|
||||
@ -143,7 +143,7 @@ class TestBaseDataScheduler(TestCase):
|
||||
|
||||
# checking step count
|
||||
step_cnt = 5
|
||||
for _ in range(step_cnt):
|
||||
for _ in range(0, step_cnt):
|
||||
sparsifier.step()
|
||||
scheduler.step()
|
||||
|
||||
|
||||
@ -123,7 +123,7 @@ class _BaseDataSparsiferTestCase(TestCase):
|
||||
|
||||
step_count = 3
|
||||
|
||||
for _ in range(step_count):
|
||||
for _ in range(0, step_count):
|
||||
sparsifier.step()
|
||||
for some_data in all_data:
|
||||
name, data, _ = self._get_name_data_config(some_data)
|
||||
|
||||
@ -472,8 +472,8 @@ class TestNearlyDiagonalSparsifier(TestCase):
|
||||
else:
|
||||
height, width = mask.shape
|
||||
dist_to_diagonal = nearliness // 2
|
||||
for row in range(height):
|
||||
for col in range(width):
|
||||
for row in range(0, height):
|
||||
for col in range(0, width):
|
||||
if abs(row - col) <= dist_to_diagonal:
|
||||
assert mask[row, col] == 1
|
||||
else:
|
||||
|
||||
@ -7,6 +7,7 @@ set(AOTI_ABI_CHECK_TEST_SRCS
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_devicetype.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_dtype.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_exception.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_headeronlyarrayref.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_macros.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_math.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_rand.cpp
|
||||
|
||||
52 test/cpp/aoti_abi_check/test_headeronlyarrayref.cpp Normal file
@ -0,0 +1,52 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <torch/headeronly/util/HeaderOnlyArrayRef.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
using torch::headeronly::HeaderOnlyArrayRef;
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestEmpty) {
|
||||
HeaderOnlyArrayRef<float> arr;
|
||||
ASSERT_TRUE(arr.empty());
|
||||
}
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestSingleton) {
|
||||
float val = 5.0f;
|
||||
HeaderOnlyArrayRef<float> arr(val);
|
||||
ASSERT_FALSE(arr.empty());
|
||||
EXPECT_EQ(arr.size(), 1);
|
||||
EXPECT_EQ(arr[0], val);
|
||||
}
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestAPIs) {
|
||||
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
|
||||
HeaderOnlyArrayRef<int> arr(vec);
|
||||
ASSERT_FALSE(arr.empty());
|
||||
EXPECT_EQ(arr.size(), 7);
|
||||
for (size_t i = 0; i < arr.size(); i++) {
|
||||
EXPECT_EQ(arr[i], i + 1);
|
||||
EXPECT_EQ(arr.at(i), i + 1);
|
||||
}
|
||||
EXPECT_EQ(arr.front(), 1);
|
||||
EXPECT_EQ(arr.back(), 7);
|
||||
ASSERT_TRUE(arr.slice(3, 4).equals(arr.slice(3)));
|
||||
}
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestFromInitializerList) {
|
||||
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
|
||||
HeaderOnlyArrayRef<int> arr({1, 2, 3, 4, 5, 6, 7});
|
||||
auto res_vec = arr.vec();
|
||||
for (size_t i = 0; i < vec.size(); i++) {
|
||||
EXPECT_EQ(vec[i], res_vec[i]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestFromRange) {
|
||||
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
|
||||
HeaderOnlyArrayRef<int> arr(vec.data() + 3, vec.data() + 7);
|
||||
auto res_vec = arr.vec();
|
||||
for (size_t i = 0; i < res_vec.size(); i++) {
|
||||
EXPECT_EQ(vec[i + 3], res_vec[i]);
|
||||
}
|
||||
}
|
||||
@ -53,3 +53,24 @@ TEST_FORALL(AT_FORALL_COMPLEX_TYPES, 2)

#undef DEFINE_CHECK
#undef TEST_FORALL

TEST(TestScalarType, toString) {
using torch::headeronly::ScalarType;

#define DEFINE_CHECK(_, name) EXPECT_EQ(toString(ScalarType::name), #name);
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CHECK);
#undef DEFINE_CHECK
}

TEST(TestScalarType, operator_left_shift) {
using torch::headeronly::ScalarType;

#define DEFINE_CHECK(_, name) \
{ \
std::stringstream ss; \
ss << ScalarType::name; \
EXPECT_EQ(ss.str(), #name); \
}
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CHECK);
#undef DEFINE_CHECK
}
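The two new tests above pin down the header-only printing behavior: toString(ScalarType::name) and streaming a ScalarType both produce the enumerator name. A minimal usage sketch, written as if it sat in the same test file (it assumes that file's existing includes and the gtest/stringstream machinery shown above; the test name and the specific dtypes picked here are illustrative only):

// Sketch only: relies on the surrounding test file for includes.
TEST(TestScalarType, printing_example_sketch) {
  using torch::headeronly::ScalarType;

  // toString maps an enumerator to its name string, per the test above.
  EXPECT_EQ(toString(ScalarType::BFloat16), "BFloat16");

  // operator<< streams the same name for every dtype in the FORALL list.
  std::stringstream ss;
  ss << ScalarType::Half;
  EXPECT_EQ(ss.str(), "Half");
}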
@ -311,10 +311,9 @@ void boxed_fill_infinity(
}

Tensor my_pad(Tensor t) {
std::vector<int64_t> padding = {1, 2, 2, 1};
std::string mode = "constant";
double value = 0.0;
return pad(t, padding, mode, value);
return pad(t, {1, 2, 2, 1}, mode, value);
}

void boxed_my_pad(
@ -342,6 +341,9 @@ void boxed_my_narrow(
}

Tensor my_new_empty_dtype_variant(Tensor t) {
// Still using a std::vector below even though people can just pass in an
// initializer list (which will be implicitly converted to an HeaderOnlyArrayRef)
// directly.
std::vector<int64_t> sizes = {2, 5};
auto dtype = std::make_optional(torch::headeronly::ScalarType::BFloat16);
return new_empty(t, sizes, dtype);
@ -353,9 +355,8 @@ void boxed_my_new_empty_dtype_variant(StableIValue* stack, uint64_t num_args, ui
}

Tensor my_new_zeros_dtype_variant(Tensor t) {
std::vector<int64_t> sizes = {2, 5};
auto dtype = std::make_optional(at::ScalarType::Float);
return new_zeros(t, sizes, dtype);
return new_zeros(t, {2, 5}, dtype);
}

void boxed_my_new_zeros_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
@ -429,8 +430,7 @@ void boxed_my_amax(StableIValue* stack, uint64_t num_args, uint64_t num_outputs)
}

Tensor my_amax_vec(Tensor t) {
std::vector<int64_t> v = {0,1};
return amax(t, v, false);
return amax(t, {0,1}, false);
}

void boxed_my_amax_vec(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
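The comment kept in my_new_empty_dtype_variant above states the point of these rewrites: a parameter of type HeaderOnlyArrayRef<int64_t> accepts either a std::vector or a braced initializer list, which is what lets my_pad, my_new_zeros_dtype_variant, and my_amax_vec drop their temporary vectors. A self-contained sketch of that conversion, assuming only the HeaderOnlyArrayRef header exercised by the new test file; the helper function here is hypothetical and exists only to show both call styles:

#include <cstdint>
#include <vector>

#include <torch/headeronly/util/HeaderOnlyArrayRef.h>

using torch::headeronly::HeaderOnlyArrayRef;

// Hypothetical helper taking sizes the same way the stable ops above do.
int64_t numel_of(HeaderOnlyArrayRef<int64_t> sizes) {
  int64_t n = 1;
  for (size_t i = 0; i < sizes.size(); i++) {
    n *= sizes[i];
  }
  return n;
}

int main() {
  // An explicit std::vector still works, mirroring my_new_empty_dtype_variant.
  std::vector<int64_t> sizes = {2, 5};
  int64_t a = numel_of(sizes);

  // A braced initializer list converts implicitly, mirroring my_pad and my_amax_vec.
  int64_t b = numel_of({2, 5});

  return (a == b) ? 0 : 1;
}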
@ -1166,7 +1166,7 @@ class TestFullyShardPrefetch(FSDPTest):
loss = model(inp)
events.clear()
loss.sum().backward()
expected_backward_events = [
expected_backward_events = expected_backward_events = [
("unshard", "norm, output", TrainingState.PRE_BACKWARD),
# root explicit prefetch layers.2
("unshard", "layers.2", TrainingState.PRE_BACKWARD),
@ -79,7 +79,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="gloo"
)
group = list(range(self.world_size))
group = list(range(0, self.world_size))
group_id = dist.group.WORLD
self._test_all_gather(
group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.FP16
@ -94,7 +94,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="gloo"
)
group = list(range(self.world_size))
group = list(range(0, self.world_size))
group_id = dist.group.WORLD
self._test_all_gather(
group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.BFP16
@ -111,7 +111,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="nccl"
)
group = list(range(self.world_size))
group = list(range(0, self.world_size))
group_id = dist.new_group(range(self.world_size))
rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND)
self._test_all_to_all(
@ -135,7 +135,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="nccl"
)
group = list(range(self.world_size))
group = list(range(0, self.world_size))
group_id = dist.new_group(range(self.world_size))
rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND)
self._test_all_to_all(
@ -158,7 +158,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="nccl"
)
group = list(range(self.world_size))
group = list(range(0, self.world_size))
group_id = dist.new_group(range(self.world_size))
rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND)
self._test_all_to_all_single(
@ -181,7 +181,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="nccl"
)
group = list(range(self.world_size))
group = list(range(0, self.world_size))
group_id = dist.new_group(range(self.world_size))
rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND)
self._test_all_to_all_single(
@ -66,7 +66,7 @@ if TEST_WITH_DEV_DBG_ASAN:
def create_sharded_tensor(rank, world_size, shards_per_rank, shard_size=8):
shards_metadata = []
local_shards = []
for idx in range(world_size * shards_per_rank):
for idx in range(0, world_size * shards_per_rank):
shard_rank = idx // shards_per_rank
shard_md = ShardMetadata(
shard_offsets=[idx * shard_size],

@ -45,7 +45,7 @@ if TEST_WITH_DEV_DBG_ASAN:
def create_sharded_tensor(rank, world_size, shards_per_rank):
shards_metadata = []
local_shards = []
for idx in range(world_size * shards_per_rank):
for idx in range(0, world_size * shards_per_rank):
shard_rank = idx // shards_per_rank
shard_md = ShardMetadata(
shard_offsets=[idx * 8], shard_sizes=[8], placement=f"rank:{shard_rank}/cpu"
@ -633,7 +633,7 @@ class SimpleElasticAgentTest(unittest.TestCase):
worker_group = agent.get_worker_group()

num_restarts = 3
for _ in range(num_restarts):
for _ in range(0, num_restarts):
agent._restart_workers(worker_group)
self.assertEqual(WorkerState.HEALTHY, worker_group.state)

@ -146,7 +146,7 @@ def echo_large(size: int) -> dict[int, str]:
returns a large output ({0: test0", 1: "test1", ..., (size-1):f"test{size-1}"})
"""
out = {}
for idx in range(size):
for idx in range(0, size):
out[idx] = f"test{idx}"
return out
@ -191,7 +191,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64):
"""
client = timer.FileTimerClient(file_path)
sem.release()
for _ in range(n):
for _ in range(0, n):
client.acquire("test_scope", 0)
time.sleep(interval)

@ -102,7 +102,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64):

world_size = 8
processes = []
for i in range(world_size):
for i in range(0, world_size):
if i % 2 == 0:
p = spawn_ctx.Process(target=_stuck_function, args=(i, mp_queue))
else:
@ -110,7 +110,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64):
p.start()
processes.append(p)

for i in range(world_size):
for i in range(0, world_size):
p = processes[i]
p.join()
if i % 2 == 0:

@ -127,7 +127,7 @@ if not INVALID_PLATFORMS:
interval seconds. Releases the given semaphore once before going to work.
"""
sem.release()
for i in range(n):
for i in range(0, n):
mp_queue.put(TimerRequest(i, "test_scope", 0))
time.sleep(interval)
@ -15,7 +15,7 @@ class CyclingIteratorTest(unittest.TestCase):
def generator(self, epoch, stride, max_epochs):
# generate an continuously incrementing list each epoch
# e.g. [0,1,2] [3,4,5] [6,7,8] ...
return iter([stride * epoch + i for i in range(stride)])
return iter([stride * epoch + i for i in range(0, stride)])

def test_cycling_iterator(self):
stride = 3
@ -25,7 +25,7 @@ class CyclingIteratorTest(unittest.TestCase):
return self.generator(epoch, stride, max_epochs)

it = CyclingIterator(n=max_epochs, generator_fn=generator_fn)
for i in range(stride * max_epochs):
for i in range(0, stride * max_epochs):
self.assertEqual(i, next(it))

with self.assertRaises(StopIteration):
@ -124,7 +124,7 @@ class TestFSDPHybridShard(FSDPTest):
model = MyModel().to(device_type)
num_node_devices = torch.accelerator.device_count()
shard_rank_lists = (
list(range(num_node_devices // 2)),
list(range(0, num_node_devices // 2)),
list(range(num_node_devices // 2, num_node_devices)),
)
shard_groups = (
@ -175,7 +175,7 @@ class TestFSDPHybridShard(FSDPTest):
model = MyModel().to(device_type)
num_node_devices = torch.accelerator.device_count()
shard_rank_lists = (
list(range(num_node_devices // 2)),
list(range(0, num_node_devices // 2)),
list(range(num_node_devices // 2, num_node_devices)),
)
shard_groups = (
@ -337,70 +337,6 @@ class ScheduleTest(MultiProcContinuousTest):
if self.rank == self.world_size - 1:
self.assertTrue(len(losses) > 0, "Losses should be computed during eval()")

@requires_accelerator_dist_backend(["nccl", "xccl"])
@skip_but_pass_in_sandcastle_if(
not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
)
@parametrize(
"ScheduleClass",
[
ScheduleGPipe,
Schedule1F1B,
ScheduleInterleaved1F1B,
ScheduleLoopedBFS,
ScheduleInterleavedZeroBubble,
],
)
def test_return_output(self, ScheduleClass):
num_microbatches = 4
if ScheduleClass in [
ScheduleInterleaved1F1B,
ScheduleLoopedBFS,
ScheduleInterleavedZeroBubble,
]:
# Multi-stage schedules
stages_per_rank = 2
n_stages = stages_per_rank * self.world_size
mod, _, x, target, loss_fn = setup_models_and_data(
self.config, n_layers=n_stages
)

# Create multi-stage pipeline
stages, stage_modules, _ = create_multi_stage_pipeline(
self.config, mod, stages_per_rank, n_stages
)
schedule = ScheduleClass(
stages,
num_microbatches,
loss_fn=loss_fn,
scale_grads=False,
)
else:
# Single-stage schedules
mod, _, x, target, loss_fn = setup_models_and_data(self.config)

# Create single-stage pipeline
stage, stage_module, _ = create_single_stage_pipeline(
self.config, mod, x, num_microbatches
)
schedule = ScheduleClass(
stage,
num_microbatches,
loss_fn=loss_fn,
scale_grads=False,
)

losses = []

if self.rank == self.world_size - 1:
output = schedule.step(target=target, losses=losses, return_outputs=False)
else:
schedule.step(x)

# Verify that output is None
if self.rank == self.world_size - 1:
self.assertTrue(output is None, "Output should be None")

@requires_accelerator_dist_backend(["nccl", "xccl"])
@skip_but_pass_in_sandcastle_if(
not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
@ -771,40 +771,5 @@ class TestCPCustomOps(DTensorTestBase):
torch.library.opcheck(flex_cp_allgather, example)


class TestSharding(DTensorTestBase):
@property
def world_size(self) -> int:
return 2

@skip_if_lt_x_gpu(2)
@with_comms
def test_context_parallel_shard(self) -> None:
B = 4
seq_len = 32

device_mesh = init_device_mesh(
mesh_shape=(2,), mesh_dim_names=("cp",), device_type=self.device_type
)
freqs_cis = torch.arange(0, seq_len, device=self.device_type)
q = torch.ones(B * seq_len, device=self.device_type).reshape(B, seq_len)
k = torch.ones(B * seq_len, device=self.device_type).reshape(B, seq_len)
v = torch.ones(B * seq_len, device=self.device_type).reshape(B, seq_len)

load_balancer = _HeadTailLoadBalancer(
seq_len, self.world_size, torch.device(self.device_type)
)
freqs_cis_shard, q_shard, k_shard, v_shard = _context_parallel_shard(
device_mesh, [freqs_cis, q, k, v], [0, 1, 1, 1], load_balancer=load_balancer
)
self.assertEqual(freqs_cis_shard.size(), (seq_len // 2,))
chunks = freqs_cis.chunk(self.world_size * 2)
self.assertEqual(
freqs_cis_shard,
torch.cat(
[chunks[self.rank], chunks[self.world_size * 2 - self.rank - 1]], dim=0
),
)


if __name__ == "__main__":
run_tests()
@ -802,7 +802,7 @@ class TestLocalDTensorOps(TestDTensorOps):
self.run_opinfo_test(dtype, op)

def test_mean(self):
with LocalTensorMode(frozenset(range(self.world_size))):
with LocalTensorMode(frozenset(range(0, self.world_size))):
self.run_mean()

def test_one_hot(self):
@ -811,7 +811,7 @@ class TestLocalDTensorOps(TestDTensorOps):
def run_opinfo_test(
self, dtype, op, requires_grad=True, sample_inputs_filter=lambda s: True
):
with LocalTensorMode(frozenset(range(self.world_size))):
with LocalTensorMode(frozenset(range(0, self.world_size))):
super().run_opinfo_test(dtype, op, requires_grad, sample_inputs_filter)

def assertEqualOnRank(self, x, y, msg=None, *, rank=0):
@ -55,7 +55,7 @@ if TEST_WITH_DEV_DBG_ASAN:

# load_tests from common_utils is used to automatically filter tests for
# sharding on sandcastle. This line silences flake warnings
load_tests = load_tests # noqa: PLW0127
load_tests = load_tests

if platform == "darwin":
LOOPBACK = "lo0"

@ -1459,7 +1459,7 @@ class ProcessGroupGlooTest(MultiProcessTestCase):
@requires_gloo()
def test_reduce_checks(self):
store = c10d.FileStore(self.file_name, self.world_size)
pg = self._create_process_group_gloo(
pg = pg = self._create_process_group_gloo(
store, self.rank, self.world_size, self.opts()
)
@ -21,7 +21,7 @@ except ImportError:

# load_tests from common_utils is used to automatically filter tests for
# sharding on sandcastle. This line silences flake warnings
load_tests = load_tests # noqa: PLW0127
load_tests = load_tests

if not c10d.is_available():
print("c10d not available, skipping tests", file=sys.stderr)

@ -536,7 +536,7 @@ class DeviceMeshTestNDim(DTensorTestBase):
# Create shard groups (e.g. (0, 1, 2, 3), (4, 5, 6, 7))
# and assign the correct shard group to each rank
shard_rank_lists = (
list(range(self.world_size // 2)),
list(range(0, self.world_size // 2)),
list(range(self.world_size // 2, self.world_size)),
)
shard_groups = (
@ -53,13 +53,7 @@ class ProcessGroupTest(TestCase):


class Dist2MultiProcessTestCase(MultiProcessTestCase):
@property
def device(self) -> torch.device:
raise NotImplementedError

# @device.setter
# def device(self, value: torch.device) -> None:
#     self._device = value
device: torch.device

@property
def world_size(self) -> int:
@ -263,9 +257,7 @@ class Dist2MultiProcessTestCase(MultiProcessTestCase):


class ProcessGroupGlooTest(Dist2MultiProcessTestCase):
@property
def device(self) -> torch.device:
return torch.device("cpu")
device = torch.device("cpu")

@requires_gloo()
def new_group(self) -> torch.distributed.ProcessGroup:
@ -282,10 +274,6 @@ class ProcessGroupGlooTest(Dist2MultiProcessTestCase):


class ProcessGroupNCCLTest(Dist2MultiProcessTestCase):
@property
def device(self) -> torch.device:
return torch.device("cuda", self.rank)

@requires_nccl()
@skip_if_lt_x_gpu(2)
def new_group(self) -> torch.distributed.ProcessGroup:
@ -294,6 +282,8 @@ class ProcessGroupNCCLTest(Dist2MultiProcessTestCase):
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "29501"

self.device = torch.device("cuda", self.rank)

return dist2.new_group(
backend="nccl",
timeout=timedelta(seconds=60),
@ -30,7 +30,7 @@ from torch.testing._internal.common_utils import (

# load_tests from common_utils is used to automatically filter tests for
# sharding on sandcastle. This line silences flake warnings
load_tests = load_tests # noqa: PLW0127
load_tests = load_tests

nGPUs = torch.cuda.device_count()
if not TEST_CUDA:

@ -43,7 +43,7 @@ from torch.testing._internal.common_utils import (

# load_tests from common_utils is used to automatically filter tests for
# sharding on sandcastle. This line silences flake warnings
load_tests = load_tests # noqa: PLW0127
load_tests = load_tests

if platform == "darwin":
LOOPBACK = "lo0"
Some files were not shown because too many files have changed in this diff.