Compare commits

..

1 Commits

Author SHA1 Message Date
d2096e78e5 Manylinux ROCm docker images. use devtoolset-13 2025-10-31 18:56:53 -04:00
381 changed files with 1415 additions and 4147 deletions

View File

@ -149,7 +149,7 @@ FROM cpu_final as rocm_final
ARG ROCM_VERSION=6.0
ARG PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
ARG DEVTOOLSET_VERSION=11
ARG DEVTOOLSET_VERSION=13
ENV LDFLAGS="-Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64 -Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib"
# Somewhere in ROCm stack, we still use non-existing /opt/rocm/hip path,
# below workaround helps avoid error

View File

@ -184,9 +184,6 @@ if [[ "$TEST_CONFIG" == *crossref* ]]; then
export PYTORCH_TEST_WITH_CROSSREF=1
fi
# Give full Dynamo stack traces on error in CI
export TORCHDYNAMO_VERBOSE=1
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
# regression in ROCm 6.0 on MI50 CI runners due to hipblaslt; remove in 6.1
export VALGRIND=OFF

View File

@ -60,11 +60,9 @@ performance-*,
readability-container-size-empty,
readability-delete-null-pointer,
readability-duplicate-include,
readability-named-parameter,
readability-misplaced-array-index,
readability-redundant*,
readability-simplify-subscript-expr,
readability-static-definition-in-anonymous-namespace
readability-string-compare,
-readability-redundant-access-specifiers,
-readability-redundant-control-flow,

View File

@ -1,319 +0,0 @@
---
name: add-uint-support
description: Add unsigned integer (uint) type support to PyTorch operators by updating AT_DISPATCH macros. Use when adding support for uint16, uint32, uint64 types to operators, kernels, or when user mentions enabling unsigned types, barebones unsigned types, or uint support.
---
# Add Unsigned Integer (uint) Support to Operators
This skill helps add support for unsigned integer types (uint16, uint32, uint64) to PyTorch operators by updating their AT_DISPATCH macros.
## When to use this skill
Use this skill when:
- Adding uint16, uint32, or uint64 support to an operator
- User mentions "unsigned types", "uint support", "barebones unsigned types"
- Enabling support for kUInt16, kUInt32, kUInt64 in kernels
- Working with operator implementations that need expanded type coverage
## Quick reference
**Add unsigned types to existing dispatch:**
```cpp
// Before
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES));
// After (method 1: add unsigned types explicitly)
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
// After (method 2: use V2 integral types if AT_INTEGRAL_TYPES present)
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_INTEGRAL_TYPES_V2), AT_EXPAND(AT_FLOATING_TYPES));
```
## Type group reference
**Unsigned type groups:**
- `AT_BAREBONES_UNSIGNED_TYPES`: kUInt16, kUInt32, kUInt64
- `AT_INTEGRAL_TYPES_V2`: AT_INTEGRAL_TYPES + AT_BAREBONES_UNSIGNED_TYPES
**Relationship:**
```cpp
AT_INTEGRAL_TYPES // kByte, kChar, kInt, kLong, kShort
AT_BAREBONES_UNSIGNED_TYPES // kUInt16, kUInt32, kUInt64
AT_INTEGRAL_TYPES_V2 // INTEGRAL_TYPES + BAREBONES_UNSIGNED_TYPES
```
## Instructions
### Step 1: Determine if conversion to V2 is needed
Check if the file uses AT_DISPATCH_V2:
**If using old AT_DISPATCH:**
- First convert to AT_DISPATCH_V2 using the at-dispatch-v2 skill
- Then proceed with adding uint support
**If already using AT_DISPATCH_V2:**
- Proceed directly to Step 2
### Step 2: Analyze the current dispatch macro
Identify what type groups are currently in use:
```cpp
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
// body
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
^^^^^^^^^^^^^^^^^^^^^^^^^
Current type coverage
```
Common patterns:
- `AT_EXPAND(AT_ALL_TYPES)` → includes AT_INTEGRAL_TYPES + AT_FLOATING_TYPES
- `AT_EXPAND(AT_INTEGRAL_TYPES)` → signed integers only
- `AT_EXPAND(AT_FLOATING_TYPES)` → floating point types
### Step 3: Choose the uint addition method
Two approaches:
**Method 1: Add AT_BAREBONES_UNSIGNED_TYPES explicitly**
- Use when: You want to be explicit about adding uint support
- Add `AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)` to the type list
**Method 2: Substitute AT_INTEGRAL_TYPES with AT_INTEGRAL_TYPES_V2**
- Use when: The dispatch already uses `AT_EXPAND(AT_INTEGRAL_TYPES)`
- More concise: replaces one type group with its superset
- Only applicable if AT_INTEGRAL_TYPES is present
### Step 4: Apply the transformation
**Method 1 example:**
```cpp
// Before
AT_DISPATCH_V2(
dtype,
"min_values_cuda",
AT_WRAP([&]() {
kernel_impl<scalar_t>(iter);
}),
AT_EXPAND(AT_ALL_TYPES),
kBFloat16, kHalf, kBool
);
// After (add unsigned types)
AT_DISPATCH_V2(
dtype,
"min_values_cuda",
AT_WRAP([&]() {
kernel_impl<scalar_t>(iter);
}),
AT_EXPAND(AT_ALL_TYPES),
AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
kBFloat16, kHalf, kBool
);
```
**Method 2 example:**
```cpp
// Before
AT_DISPATCH_V2(
dtype,
"integral_op",
AT_WRAP([&]() {
kernel<scalar_t>();
}),
AT_EXPAND(AT_INTEGRAL_TYPES)
);
// After (substitute with V2)
AT_DISPATCH_V2(
dtype,
"integral_op",
AT_WRAP([&]() {
kernel<scalar_t>();
}),
AT_EXPAND(AT_INTEGRAL_TYPES_V2)
);
```
### Step 5: Handle AT_ALL_TYPES vs individual type groups
If the dispatch uses `AT_EXPAND(AT_ALL_TYPES)`:
- `AT_ALL_TYPES` = `AT_INTEGRAL_TYPES` + `AT_FLOATING_TYPES`
- To add uint: add `AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)` to the list
If the dispatch separately lists INTEGRAL and FLOATING:
```cpp
// Before
AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES)
// After (Method 2 preferred)
AT_EXPAND(AT_INTEGRAL_TYPES_V2), AT_EXPAND(AT_FLOATING_TYPES)
```
### Step 6: Verify all dispatch sites
Check the file for ALL dispatch macros that need uint support:
- Some operators have multiple dispatch sites (CPU, CUDA, different functions)
- Apply the transformation consistently across all sites
- Ensure each gets the same type coverage updates
### Step 7: Validate the changes
Check that:
- [ ] AT_DISPATCH_V2 format is used (not old AT_DISPATCH)
- [ ] Unsigned types are added via one of the two methods
- [ ] All relevant dispatch sites in the file are updated
- [ ] Type groups use `AT_EXPAND()`
- [ ] Arguments are properly formatted and comma-separated
## Common patterns
### Pattern 1: AT_ALL_TYPES + extras
```cpp
// Before
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
// After
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
```
### Pattern 2: Separate INTEGRAL + FLOATING
```cpp
// Before
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES));
// After
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_INTEGRAL_TYPES_V2), AT_EXPAND(AT_FLOATING_TYPES));
```
### Pattern 3: Old dispatch needs conversion first
```cpp
// Before (needs v2 conversion first)
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "op", [&]() {
kernel<scalar_t>();
});
// After v2 conversion
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
// After adding uint support
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
```
## Multiple dispatch sites example
For a file with multiple functions:
```cpp
void min_values_kernel_cuda(TensorIterator& iter) {
AT_DISPATCH_V2(iter.dtype(), "min_values_cuda", AT_WRAP([&]() {
impl<scalar_t>(iter);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// Added uint support
}
void min_launch_kernel(TensorIterator &iter) {
AT_DISPATCH_V2(iter.input_dtype(), "min_cuda", AT_WRAP([&]() {
gpu_reduce_kernel<scalar_t>(iter);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// Added uint support here too
}
```
## Decision tree
Use this decision tree to determine the approach:
```
Is the file using AT_DISPATCH_V2?
├─ No → Use at-dispatch-v2 skill first, then continue
└─ Yes
└─ Does it use AT_EXPAND(AT_INTEGRAL_TYPES)?
├─ Yes → Replace with AT_EXPAND(AT_INTEGRAL_TYPES_V2)
└─ No → Add AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES) to type list
```
## Edge cases
### Case 1: Dispatch with only floating types
If the operator only supports floating point types, don't add uint support:
```cpp
// Leave as-is - floating point only operator
AT_DISPATCH_V2(dtype, "float_op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_FLOATING_TYPES), kHalf);
```
### Case 2: Complex types present
Unsigned types work alongside complex types:
```cpp
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES),
AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
AT_EXPAND(AT_COMPLEX_TYPES),
kHalf, kBFloat16);
```
### Case 3: Already has uint support
Check if uint types are already present:
- If `AT_INTEGRAL_TYPES_V2` is used → already has uint support
- If `AT_BAREBONES_UNSIGNED_TYPES` is already in list → already has uint support
- Skip the file if uint support is already present
## Workflow
When asked to add uint support:
1. Read the target file
2. Check if using AT_DISPATCH_V2:
- If not → use at-dispatch-v2 skill first
3. Identify all dispatch macro sites
4. For each dispatch:
- Analyze current type groups
- Choose method (add BAREBONES_UNSIGNED or upgrade to V2)
- Apply transformation with Edit tool
5. Show the user the changes
6. Explain what was modified
## Important notes
- Always check if v2 conversion is needed first
- Apply changes consistently across all dispatch sites in the file
- Method 2 (AT_INTEGRAL_TYPES_V2) is cleaner when applicable
- Method 1 (explicit AT_BAREBONES_UNSIGNED_TYPES) is more explicit
- Unsigned types are: kUInt16, kUInt32, kUInt64 (not kByte which is uint8)
- Some operators may not semantically support unsigned types - use judgment
## Testing
After adding uint support, the operator should accept uint16, uint32, and uint64 tensors. The user is responsible for functional testing.

View File

@ -1,305 +0,0 @@
---
name: at-dispatch-v2
description: Convert PyTorch AT_DISPATCH macros to AT_DISPATCH_V2 format in ATen C++ code. Use when porting AT_DISPATCH_ALL_TYPES_AND*, AT_DISPATCH_FLOATING_TYPES*, or other dispatch macros to the new v2 API. For ATen kernel files, CUDA kernels, and native operator implementations.
---
# AT_DISPATCH to AT_DISPATCH_V2 Converter
This skill helps convert PyTorch's legacy AT_DISPATCH macros to the new AT_DISPATCH_V2 format, as defined in `aten/src/ATen/Dispatch_v2.h`.
## When to use this skill
Use this skill when:
- Converting AT_DISPATCH_* macros to AT_DISPATCH_V2
- Porting ATen kernels to use the new dispatch API
- Working with files in `aten/src/ATen/native/` that use dispatch macros
- User mentions "AT_DISPATCH", "dispatch v2", "Dispatch_v2.h", or macro conversion
## Quick reference
**Old format:**
```cpp
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, dtype, "kernel_name", [&]() {
// lambda body
});
```
**New format:**
```cpp
AT_DISPATCH_V2(dtype, "kernel_name", AT_WRAP([&]() {
// lambda body
}), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, kBool);
```
## Key transformations
1. **Reorder arguments**: `scalar_type` and `name` come first, then lambda, then types
2. **Wrap the lambda**: Use `AT_WRAP(lambda)` to handle internal commas
3. **Expand type groups**: Use `AT_EXPAND(AT_ALL_TYPES)` instead of implicit expansion
4. **List individual types**: Add extra types (kHalf, kBFloat16, etc.) after expanded groups
5. **Add include**: `#include <ATen/Dispatch_v2.h>` near other Dispatch includes
## Instructions
### Step 1: Add the Dispatch_v2.h include
Add the v2 header near the existing `#include <ATen/Dispatch.h>`:
```cpp
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
```
Keep the old Dispatch.h include for now (other code may still need it).
### Step 2: Identify the old dispatch pattern
Common patterns to convert:
- `AT_DISPATCH_ALL_TYPES_AND{2,3,4}(type1, type2, ..., scalar_type, name, lambda)`
- `AT_DISPATCH_FLOATING_TYPES_AND{2,3}(type1, type2, ..., scalar_type, name, lambda)`
- `AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND{2,3}(type1, ..., scalar_type, name, lambda)`
- `AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND{2,3}(type1, ..., scalar_type, name, lambda)`
### Step 3: Map the old macro to type groups
Identify which type group macro corresponds to the base types:
| Old macro base | AT_DISPATCH_V2 type group |
|----------------|---------------------------|
| `ALL_TYPES` | `AT_EXPAND(AT_ALL_TYPES)` |
| `FLOATING_TYPES` | `AT_EXPAND(AT_FLOATING_TYPES)` |
| `INTEGRAL_TYPES` | `AT_EXPAND(AT_INTEGRAL_TYPES)` |
| `COMPLEX_TYPES` | `AT_EXPAND(AT_COMPLEX_TYPES)` |
| `ALL_TYPES_AND_COMPLEX` | `AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX)` |
For combined patterns, use multiple `AT_EXPAND()` entries:
```cpp
// Old: AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(...)
// New: AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES), type1, type2
```
### Step 4: Extract the individual types
From `AT_DISPATCH_*_AND2(type1, type2, ...)` or `AT_DISPATCH_*_AND3(type1, type2, type3, ...)`, extract the individual types (type1, type2, etc.).
These become the trailing arguments after the type group:
```cpp
AT_DISPATCH_V2(..., AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, kBool)
^^^^^^^^^^^^^^^^^^^^^^^^
Individual types from AND3
```
### Step 5: Transform to AT_DISPATCH_V2
Apply the transformation:
**Pattern:**
```cpp
AT_DISPATCH_V2(
scalar_type, // 1st: The dtype expression
"name", // 2nd: The debug string
AT_WRAP(lambda), // 3rd: The lambda wrapped in AT_WRAP
type_groups, // 4th+: Type groups with AT_EXPAND()
individual_types // Last: Individual types
)
```
**Example transformation:**
```cpp
// BEFORE
AT_DISPATCH_ALL_TYPES_AND3(
kBFloat16, kHalf, kBool,
iter.dtype(),
"min_values_cuda",
[&]() {
min_values_kernel_cuda_impl<scalar_t>(iter);
}
);
// AFTER
AT_DISPATCH_V2(
iter.dtype(),
"min_values_cuda",
AT_WRAP([&]() {
min_values_kernel_cuda_impl<scalar_t>(iter);
}),
AT_EXPAND(AT_ALL_TYPES),
kBFloat16, kHalf, kBool
);
```
### Step 6: Handle multi-line lambdas
For lambdas with internal commas or complex expressions, AT_WRAP is essential:
```cpp
AT_DISPATCH_V2(
dtype,
"complex_kernel",
AT_WRAP([&]() {
gpu_reduce_kernel<scalar_t, scalar_t>(
iter,
MinOps<scalar_t>{},
thrust::pair<scalar_t, int64_t>(upper_bound(), 0) // Commas inside!
);
}),
AT_EXPAND(AT_ALL_TYPES)
);
```
### Step 7: Verify the conversion
Check that:
- [ ] `AT_WRAP()` wraps the entire lambda
- [ ] Type groups use `AT_EXPAND()`
- [ ] Individual types don't have `AT_EXPAND()` (just `kBFloat16`, not `AT_EXPAND(kBFloat16)`)
- [ ] Argument order is: scalar_type, name, lambda, types
- [ ] Include added: `#include <ATen/Dispatch_v2.h>`
## Type group reference
Available type group macros (use with `AT_EXPAND()`):
```cpp
AT_INTEGRAL_TYPES // kByte, kChar, kInt, kLong, kShort
AT_FLOATING_TYPES // kDouble, kFloat
AT_COMPLEX_TYPES // kComplexDouble, kComplexFloat
AT_QINT_TYPES // kQInt8, kQUInt8, kQInt32
AT_ALL_TYPES // INTEGRAL_TYPES + FLOATING_TYPES
AT_ALL_TYPES_AND_COMPLEX // ALL_TYPES + COMPLEX_TYPES
AT_INTEGRAL_TYPES_V2 // INTEGRAL_TYPES + unsigned types
AT_BAREBONES_UNSIGNED_TYPES // kUInt16, kUInt32, kUInt64
AT_FLOAT8_TYPES // Float8 variants
```
## Common patterns
### Pattern: AT_DISPATCH_ALL_TYPES_AND2
```cpp
// Before
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "op", [&]() {
kernel<scalar_t>(data);
});
// After
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>(data);
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
```
### Pattern: AT_DISPATCH_FLOATING_TYPES_AND3
```cpp
// Before
AT_DISPATCH_FLOATING_TYPES_AND3(kHalf, kBFloat16, kFloat8_e4m3fn,
tensor.scalar_type(), "float_op", [&] {
process<scalar_t>(tensor);
});
// After
AT_DISPATCH_V2(tensor.scalar_type(), "float_op", AT_WRAP([&] {
process<scalar_t>(tensor);
}), AT_EXPAND(AT_FLOATING_TYPES), kHalf, kBFloat16, kFloat8_e4m3fn);
```
### Pattern: AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2
```cpp
// Before
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(
kComplexHalf, kHalf,
self.scalar_type(),
"complex_op",
[&] {
result = compute<scalar_t>(self);
}
);
// After
AT_DISPATCH_V2(
self.scalar_type(),
"complex_op",
AT_WRAP([&] {
result = compute<scalar_t>(self);
}),
AT_EXPAND(AT_ALL_TYPES),
AT_EXPAND(AT_COMPLEX_TYPES),
kComplexHalf,
kHalf
);
```
## Edge cases
### Case 1: No extra types (rare)
```cpp
// Before
AT_DISPATCH_ALL_TYPES(dtype, "op", [&]() { kernel<scalar_t>(); });
// After
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES));
```
### Case 2: Many individual types (AND4, AND5, etc.)
```cpp
// Before
AT_DISPATCH_FLOATING_TYPES_AND4(kHalf, kBFloat16, kFloat8_e4m3fn, kFloat8_e5m2,
dtype, "float8_op", [&]() { kernel<scalar_t>(); });
// After
AT_DISPATCH_V2(dtype, "float8_op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_FLOATING_TYPES), kHalf, kBFloat16, kFloat8_e4m3fn, kFloat8_e5m2);
```
### Case 3: Lambda with no captures
```cpp
// Before
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, dtype, "op", []() {
static_kernel<scalar_t>();
});
// After
AT_DISPATCH_V2(dtype, "op", AT_WRAP([]() {
static_kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBool);
```
## Benefits of AT_DISPATCH_V2
1. **No arity in macro name**: Don't need different macros for AND2, AND3, AND4
2. **Composable type sets**: Mix and match type groups with `AT_EXPAND()`
3. **Extensible**: Easy to add more types without hitting macro limits
4. **Clearer**: Type groups are explicit, not implicit in macro name
## Important notes
- Keep `#include <ATen/Dispatch.h>` - other code may need it
- The `AT_WRAP()` is mandatory - prevents comma parsing issues in the lambda
- Type groups need `AT_EXPAND()`, individual types don't
- The v2 API is in `aten/src/ATen/Dispatch_v2.h` - refer to it for full docs
- See the header file for the Python script to regenerate the macro implementation
## Workflow
When asked to convert AT_DISPATCH macros:
1. Read the file to identify all AT_DISPATCH uses
2. Add `#include <ATen/Dispatch_v2.h>` if not present
3. For each dispatch macro:
- Identify the pattern and extract components
- Map the base type group
- Extract individual types
- Construct the AT_DISPATCH_V2 call
- Apply with Edit tool
4. Show the user the complete converted file
5. Explain what was changed
Do NOT compile or test the code - focus on accurate conversion only.

View File

@ -1 +1 @@
cfbc5c2f1c798991715a6b06bb3ce46478c4487c
218d2ab791d437309f91e0486eb9fa7f00badc17

View File

@ -1 +1 @@
c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9
df6798dfb931ce7c7fe5bed2447cd1092a5981af

1
.gitignore vendored
View File

@ -398,4 +398,3 @@ CLAUDE.local.md
/test_*.py
/debug_*.py
CLAUDE_CONTEXT/
/.claude/settings.local.json

View File

@ -11,6 +11,7 @@ aspects of contributing to PyTorch.
<!-- toc -->
- [Developing PyTorch](#developing-pytorch)
- [Setup the development environment](#setup-the-development-environment)
- [Tips and Debugging](#tips-and-debugging)
- [Nightly Checkout & Pull](#nightly-checkout--pull)
- [Codebase structure](#codebase-structure)
@ -66,6 +67,23 @@ aspects of contributing to PyTorch.
Follow the instructions for [installing PyTorch from source](https://github.com/pytorch/pytorch#from-source). If you get stuck when developing PyTorch on your machine, check out the [tips and debugging](#tips-and-debugging) section below for common solutions.
### Setup the development environment
First, you need to [fork the PyTorch project on GitHub](https://github.com/pytorch/pytorch/fork) and follow the instructions at [Connecting to GitHub with SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh) to setup your SSH authentication credentials.
Then clone the PyTorch project and setup the development environment:
```bash
git clone git@github.com:<USERNAME>/pytorch.git
cd pytorch
git remote add upstream git@github.com:pytorch/pytorch.git
make setup-env
# Or run `make setup-env-cuda` for pre-built CUDA binaries
# Or run `make setup-env-rocm` for pre-built ROCm binaries
source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows
```
### Tips and Debugging
* If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below.

View File

@ -181,7 +181,7 @@ c10::intrusive_ptr<c10::TensorImpl> CPUGeneratorImpl::get_state() const {
static const size_t size = sizeof(CPUGeneratorImplState);
static_assert(std::is_standard_layout_v<CPUGeneratorImplState>, "CPUGeneratorImplState is not a PODType");
auto state_tensor = at::detail::empty_cpu({static_cast<int64_t>(size)}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
auto state_tensor = at::detail::empty_cpu({(int64_t)size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
auto rng_state = state_tensor.data_ptr();
// accumulate generator data to be copied into byte tensor

View File

@ -223,7 +223,7 @@ void Context::setSDPPriorityOrder(const std::vector<int64_t>& order) {
"setSDPPriority order expected ", sdp_priority_order.size() - 1, " but got ",
at::num_sdp_backends, " unique backends specified in priority order.");
for (uint32_t i = 0; i < order.size(); i++) {
sdp_priority_order[i] = static_cast<at::SDPBackend>(order[i]);
sdp_priority_order[i] = (at::SDPBackend) order[i];
}
}
@ -825,14 +825,6 @@ void Context::setDisplayVmapFallbackWarnings(bool enabled) {
display_vmap_fallback_warnings_ = enabled;
}
bool Context::warnOnAccumulateGradStreamMismatch() const {
return warn_on_accumulate_grad_stream_mismatch_;
}
void Context::setWarnOnAccumulateGradStreamMismatch(bool enabled) {
warn_on_accumulate_grad_stream_mismatch_ = enabled;
}
bool Context::isDefaultMobileCPUAllocatorSet() {
return prev_allocator_ptr_ != nullptr;
}

View File

@ -404,9 +404,6 @@ class TORCH_API Context {
void setDisplayVmapFallbackWarnings(bool enabled);
bool areVmapFallbackWarningsEnabled() const;
void setWarnOnAccumulateGradStreamMismatch(bool enabled);
bool warnOnAccumulateGradStreamMismatch() const;
bool isDefaultMobileCPUAllocatorSet();
void setDefaultMobileCPUAllocator();
void unsetDefaultMobileCPUAllocator();
@ -497,7 +494,6 @@ class TORCH_API Context {
bool release_original_weights = false;
#endif
bool display_vmap_fallback_warnings_ = false;
bool warn_on_accumulate_grad_stream_mismatch_ = true;
std::atomic<at::QEngine> quantized_engine = at::QEngine::NoQEngine;
bool enable_sparse_tensor_invariant_checks = false;
bool allow_fp16_reduction_cpu = false;

View File

@ -197,7 +197,6 @@ inline at::ScalarType scalar_type(at::ScalarType s) {
/* don't use TYPE again in case it is an expensive or side-effect op */ \
at::ScalarType _st = ::detail::scalar_type(the_type); \
RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st); \
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") \
switch (_st) { \
__VA_ARGS__ \
default: \
@ -209,7 +208,6 @@ inline at::ScalarType scalar_type(at::ScalarType s) {
toString(_st), \
"'"); \
} \
C10_DIAGNOSTIC_POP() \
}()
#define AT_DISPATCH_CASE_FLOATING_TYPES(...) \

View File

@ -252,13 +252,13 @@ MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd,
if (!(flags_ & ALLOCATOR_MAPPED_FROMFD)) {
if (flags_ & ALLOCATOR_MAPPED_SHARED) {
// NOLINTNEXTLINE(bugprone-assignment-in-if-condition)
if ((fd = open(filename_.c_str(), flags, static_cast<mode_t>(0600))) == -1) {
if ((fd = open(filename_.c_str(), flags, (mode_t)0600)) == -1) {
TORCH_CHECK(false, "unable to open file <", filename_, "> in read-write mode: ", c10::utils::str_error(errno), " (", errno, ")");
}
} else if (flags_ & ALLOCATOR_MAPPED_SHAREDMEM) {
#ifdef HAVE_SHM_OPEN
// NOLINTNEXTLINE(bugprone-assignment-in-if-condition)
if((fd = shm_open(filename_.c_str(), flags, static_cast<mode_t>(0600))) == -1) {
if((fd = shm_open(filename_.c_str(), flags, (mode_t)0600)) == -1) {
TORCH_CHECK(false, "unable to open shared memory object <", filename_, "> in read-write mode: ", c10::utils::str_error(errno), " (", errno, ")");
}
#else
@ -503,7 +503,7 @@ RefcountedMapAllocator::RefcountedMapAllocator(WithFd /*unused*/, const char *fi
void RefcountedMapAllocator::initializeAlloc() {
TORCH_CHECK(base_ptr_, "base_ptr_ is null");
MapInfo *map_info = static_cast<MapInfo*>(base_ptr_);
MapInfo *map_info = (MapInfo*)base_ptr_;
#ifdef _WIN32
ReleaseContext* r_ctx = new ReleaseContext;
@ -539,7 +539,7 @@ void RefcountedMapAllocator::close() {
}
#else /* _WIN32 */
MapInfo *info = static_cast<MapInfo*>(data);
MapInfo *info = (MapInfo*)(data);
if (--info->refcount == 0) {
#ifdef HAVE_SHM_UNLINK
if (shm_unlink(filename_.c_str()) == -1) {

View File

@ -862,7 +862,7 @@ void TensorIteratorBase::narrow(int dim, int64_t start, int64_t size) {
shape_[dim] = size;
view_offsets_[dim] += start;
for (auto& op : operands_) {
op.data = (static_cast<char*>(op.data)) + op.stride_bytes[dim] * start;
op.data = ((char*)op.data) + op.stride_bytes[dim] * start;
}
if (size == 1 && !is_reduction_) {
coalesce_dimensions();
@ -873,7 +873,7 @@ void TensorIteratorBase::select_all_keeping_dim(int start_dim, IntArrayRef indic
TORCH_INTERNAL_ASSERT(start_dim <= ndim());
for (const auto i : c10::irange(start_dim, ndim())) {
for (auto& op : operands_) {
op.data = (static_cast<char*>(op.data)) + op.stride_bytes[i] * indices[i - start_dim];
op.data = ((char*)op.data) + op.stride_bytes[i] * indices[i - start_dim];
}
shape_[i] = 1;
}

View File

@ -41,7 +41,7 @@ inline void serial_for_each(
IntArrayRef strides,
char** base_ptrs,
size_t ntensors,
TensorIteratorBase::loop2d_t loop,
typename TensorIteratorBase::loop2d_t loop,
Range range) {
const auto ndim = shape.size();
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(

View File

@ -190,14 +190,12 @@ class IListRef;
* it to a function (e.g. `ImplT::<dispatch-function>(this_)`).
*/
#define TORCH_ILISTREF_UNWRAP(TAG, BODY) \
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") \
switch (TAG) { \
TORCH_ILISTREF_FORALL_TAGS(TORCH_ILISTREF_UNWRAP_CASE, BODY) \
break; \
default: \
TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag."); \
} \
C10_DIAGNOSTIC_POP()
}
enum class IListRefTag {
#define DEFINE_TAG(tag, ...) tag,

View File

@ -56,7 +56,7 @@ C10_HOST_DEVICE inline T uniform_int_full_range(V val) {
* in this overloaded version
*/
template <typename T, typename V>
C10_HOST_DEVICE inline std::enable_if_t<!std::is_floating_point_v<T>, T>uniform_int(V val) {
C10_HOST_DEVICE inline std::enable_if_t<!(std::is_floating_point_v<T>), T>uniform_int(V val) {
if constexpr (std::is_same_v<T, bool>) {
return static_cast<bool>(val & 1);
} else if constexpr (std::is_same_v<T, int64_t>) {

View File

@ -114,25 +114,25 @@ inline typename remove_symint<T>::type unpackSymInt(T x) {
}
template <>
inline remove_symint<c10::SymInt>::type unpackSymInt(c10::SymInt x) {
inline typename remove_symint<c10::SymInt>::type unpackSymInt(c10::SymInt x) {
return x.guard_int(__FILE__, __LINE__);
}
template <>
inline remove_symint<c10::SymIntArrayRef>::type unpackSymInt(
inline typename remove_symint<c10::SymIntArrayRef>::type unpackSymInt(
c10::SymIntArrayRef x) {
return C10_AS_INTARRAYREF_SLOW(x);
}
template <>
inline remove_symint<std::optional<c10::SymInt>>::type unpackSymInt(
inline typename remove_symint<std::optional<c10::SymInt>>::type unpackSymInt(
std::optional<c10::SymInt> x) {
return x.has_value() ? std::make_optional(x->guard_int(__FILE__, __LINE__))
: std::nullopt;
}
template <>
inline remove_symint<at::OptionalSymIntArrayRef>::type unpackSymInt(
inline typename remove_symint<at::OptionalSymIntArrayRef>::type unpackSymInt(
at::OptionalSymIntArrayRef x) {
return x.has_value() ? std::make_optional(C10_AS_INTARRAYREF_SLOW(*x))
: std::nullopt;

View File

@ -631,8 +631,8 @@ call_functor_with_args_from_stack_(
Stack* stack,
std::index_sequence<ivalue_arg_indices...> /*unused*/,
guts::typelist::typelist<ArgTypes...>* /*unused*/) {
(void)stack; // when sizeof...(ivalue_arg_indices) == 0, this argument would
// be unused and we have to silence the compiler warning.
(void)(stack); // when sizeof...(ivalue_arg_indices) == 0, this argument would
// be unused and we have to silence the compiler warning.
// We're explicitly filtering out DispatchKeySet from the argument list.
// Some kernels take a DispatchKeySet as their first argument in order to

View File

@ -18,7 +18,6 @@ struct TORCH_API EnumType : public NamedType {
TypePtr value,
std::vector<EnumNameValue> enum_names_values,
std::weak_ptr<::torch::jit::CompilationUnit> cu) {
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
switch (value->kind()) {
case TypeKind::IntType:
case TypeKind::FloatType:
@ -35,7 +34,6 @@ struct TORCH_API EnumType : public NamedType {
value->str(),
"', only int, float and string are supported");
}
C10_DIAGNOSTIC_POP()
}
std::string str() const override {

View File

@ -601,8 +601,8 @@ std::ostream& IValue::repr(
double d = v.toDouble();
int c = std::fpclassify(d);
if ((c == FP_NORMAL || c == FP_ZERO ) && std::abs(d) < 1e10) {
int64_t i = static_cast<int64_t>(d);
if (static_cast<double>(i) == d) {
int64_t i = int64_t(d);
if (double(i) == d) {
// -0.0 (signed zero) needs to be parsed as -0.
if (i == 0 && std::signbit(d)) {
return out << "-" << i << ".";
@ -799,8 +799,8 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
double d = v.toDouble();
int c = std::fpclassify(d);
if (c == FP_NORMAL || c == FP_ZERO) {
int64_t i = static_cast<int64_t>(d);
if (static_cast<double>(i) == d) {
int64_t i = int64_t(d);
if (double(i) == d) {
return out << i << ".";
}
}

View File

@ -41,7 +41,7 @@ void standardizeVectorForUnion(std::vector<TypePtr>* to_flatten);
inline bool is_contiguous_strides(
const IntArrayRef sizes,
const IntArrayRef strides) {
size_t n_dim = sizes.size();
int n_dim = static_cast<int>(sizes.size());
if (n_dim == 0) {
return true;
}
@ -50,7 +50,7 @@ inline bool is_contiguous_strides(
return false;
}
for (int i = static_cast<int>(n_dim) - 2; i >= 0; i--) {
for (int i = n_dim - 2; i >= 0; i--) {
if (strides[i] != strides[i + 1] * sizes[i + 1]) {
return false;
}
@ -922,7 +922,6 @@ struct TORCH_API DictType : public SharedType {
if (auto dyn = key->castRaw<DynamicType>()) {
kind = dyn->dynamicKind();
}
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
switch (kind) {
case TypeKind::AnyType:
case TypeKind::IntType:
@ -939,7 +938,6 @@ struct TORCH_API DictType : public SharedType {
key->str(),
"', only int, float, complex, Tensor, device and string keys are supported");
}
C10_DIAGNOSTIC_POP()
}
// aligned with the format in FunctionSchema
@ -2373,7 +2371,7 @@ private:
};
template<>
inline detail::CastReturnType<NamedType>::type Type::cast<NamedType>() {
inline typename detail::CastReturnType<NamedType>::type Type::cast<NamedType>() {
if (kind() == TypeKind::TupleType || kind() == TypeKind::FunctionType ||
kind() == TypeKind::ClassType || kind() == TypeKind::InterfaceType) {
return std::static_pointer_cast<NamedType>(static_cast<NamedType *>(this)->shared_from_this());
@ -2382,7 +2380,7 @@ inline detail::CastReturnType<NamedType>::type Type::cast<NamedType>() {
}
template<>
inline detail::CastConstReturnType<NamedType>::type Type::cast<NamedType>() const {
inline typename detail::CastConstReturnType<NamedType>::type Type::cast<NamedType>() const {
if (kind() == TypeKind::TupleType || kind() == TypeKind::FunctionType ||
kind() == TypeKind::ClassType || kind() == TypeKind::InterfaceType) {
return std::static_pointer_cast<const NamedType>(static_cast<const NamedType *>(this)->shared_from_this());

View File

@ -514,7 +514,7 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
using float_vec_return_type = std::array<Vectorized<float>, kFloatNumVecs>;
using int_vec_return_type = std::array<Vectorized<c10::qint32>, kIntNumVecs>;
using value_type = c10::qint8::underlying;
using value_type = typename c10::qint8::underlying;
public:
using Vectorizedqi::Vectorizedqi;
@ -727,7 +727,7 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
using float_vec_return_type = std::array<Vectorized<float>, kFloatNumVecs>;
using int_vec_return_type = std::array<Vectorized<c10::qint32>, kIntNumVecs>;
using value_type = c10::quint8::underlying;
using value_type = typename c10::quint8::underlying;
public:
using Vectorizedqi::Vectorizedqi;

View File

@ -567,7 +567,7 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
using float_vec_return_type = std::array<Vectorized<float>, 4>;
using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
using value_type = c10::qint8::underlying;
using value_type = typename c10::qint8::underlying;
public:
using Vectorizedqi::Vectorizedqi;
@ -804,7 +804,7 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
using float_vec_return_type = std::array<Vectorized<float>, 4>;
using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
using value_type = c10::quint8::underlying;
using value_type = typename c10::quint8::underlying;
public:
using Vectorizedqi::Vectorizedqi;

View File

@ -672,7 +672,7 @@ struct Vectorized {
return map(std::sqrt);
}
Vectorized<T> reciprocal() const {
return map([](T x) { return (T)1 / x; });
return map([](T x) { return (T)(1) / x; });
}
Vectorized<T> rsqrt() const {
return map([](T x) { return (T)1 / std::sqrt(x); });

View File

@ -46,7 +46,7 @@ inline void vrsqrt(scalar_t* out, scalar_t* in, int64_t size) {
parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) {
map(
[](const Vectorized<scalar_t>& x) {
return Vectorized<scalar_t>((scalar_t)1) / x.sqrt();
return Vectorized<scalar_t>((scalar_t)(1)) / x.sqrt();
},
out + begin,
in + begin,

View File

@ -194,8 +194,8 @@ void CUDAGeneratorState::unregister_graph(cuda::CUDAGraph* graph) {
void CUDAGeneratorState::capture_prologue() {
capturing_ = true;
offset_intragraph_ = 0;
seed_extragraph_.fill_(static_cast<int64_t>(seed_));
offset_extragraph_.fill_(0);
seed_extragraph_.fill_(int64_t(seed_));
offset_extragraph_.fill_(int64_t(0));
}
/**
@ -216,8 +216,8 @@ void CUDAGeneratorState::replay_prologue(uint64_t wholegraph_increment) {
at::cuda::assertNotCapturing(
"Cannot prepare for replay during capturing stage.");
if (wholegraph_increment) {
seed_extragraph_.fill_(static_cast<int64_t>(seed_));
offset_extragraph_.fill_(static_cast<int64_t>(philox_offset_per_thread_));
seed_extragraph_.fill_(int64_t(seed_));
offset_extragraph_.fill_(int64_t(philox_offset_per_thread_));
// Applies the total increment achieved during previous captures to update the
// offset.
increase(wholegraph_increment);
@ -329,7 +329,7 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
constexpr size_t offset_size = sizeof(int64_t);
constexpr size_t total_size = seed_size + offset_size;
auto state_tensor = at::detail::empty_cpu({static_cast<int64_t>(total_size)}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
auto rng_state = state_tensor.data_ptr<uint8_t>();
auto current_seed = this->current_seed();
auto offset = static_cast<int64_t>(this->philox_offset_per_thread()); // Note that old THCGeneratorState had offset as std::atomic<int64_t>

View File

@ -155,8 +155,8 @@ size_t parseChosenWorkspaceSize() {
while (next != end) {
std::smatch match = *next;
TORCH_CHECK(match.size() == 3, "Expected CUBLAS_WORKSPACE_SPACE_CONFIG match of size 3 (Format :SIZE:COUNT)");
size_t curr_size = std::stoull(match.str(1));
size_t count = std::stoull(match.str(2));
size_t curr_size = (size_t) std::stoi(match.str(1));
size_t count = (size_t) std::stoi(match.str(2));
total_size += curr_size * 1024 * count;
next++;
}

View File

@ -2,6 +2,8 @@
#include <ATen/Tensor.h>
#include <ATen/cuda/Exceptions.h>
#include <mutex>
namespace at {
namespace cuda {
namespace detail {
@ -10,36 +12,39 @@ __device__ __constant__ float cublas_one_device;
__device__ __constant__ float cublas_zero_device;
float *get_cublas_device_one() {
static float *ptr = nullptr;
static auto init_flag = [&]() {
static c10::once_flag init_flag;
c10::call_once(init_flag, []() {
const float one = 1.f;
AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float)));
AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
return true;
}();
});
float *ptr;
AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
return ptr;
}
float *get_cublas_device_zero() {
static float *ptr = nullptr;
static auto init_flag = [&]() {
static c10::once_flag init_flag;
c10::call_once(init_flag, []() {
const float zero = 0.f;
AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float)));
AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
return true;
}();
});
float *ptr;
AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
return ptr;
}
float *get_user_alpha_ptr() {
static float *alpha_ptr;
static bool init_flag [[maybe_unused]] = []() {
static c10::once_flag init_flag;
c10::call_once(init_flag, []() {
AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float)));
return true;
}();
});
return alpha_ptr;
}

View File

@ -3,7 +3,6 @@
#include <ATen/ATen.h>
#include <c10/util/irange.h>
#include <array>
#include <iostream>
#include <sstream>
@ -137,9 +136,9 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
"Weight strides: ", t.strides(), "\n",
"cuDNN suggested memory_format: ", memory_format);
std::array<int, CUDNN_DIM_MAX> size;
int size[CUDNN_DIM_MAX];
for (const auto i : c10::irange(dim)) {
size[i] = static_cast<int>(t.size(i));
size[i] = (int) t.size(i);
}
for (const auto i : c10::irange(dim, pad)) {
size[i] = 1;
@ -157,7 +156,7 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
default:
TORCH_INTERNAL_ASSERT(false, "unsupported memory_format for cuDNN filters");
}
set(getDataType(t), static_cast<int>(dim), size.data(), filter_format);
set(getDataType(t), static_cast<int>(dim), size, filter_format);
}
std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) {

View File

@ -198,7 +198,7 @@ static void autogradBasedTransformSendToNext(
}
// Step 6
stack->erase(stack->end() - static_cast<std::ptrdiff_t>(args_size + ret_size), stack->end() - static_cast<std::ptrdiff_t>(ret_size));
stack->erase(stack->end() - std::ptrdiff_t(args_size + ret_size), stack->end() - std::ptrdiff_t(ret_size));
}
void GradInterpreterPtr::processImpl(

View File

@ -443,14 +443,14 @@ static bool has_same_shape(
if (!tensor.defined()) {
return true;
}
if (rankWithoutBatchDim(tensor, tensor_bdim) != static_cast<int64_t>(normalized_shape.size())) {
if (rankWithoutBatchDim(tensor, tensor_bdim) != (int64_t) normalized_shape.size()) {
return false;
}
const auto tensor_shape = tensor.sizes();
for (const auto i : c10::irange(normalized_shape.size())) {
auto j = i;
// (0, 1, 2), 1 -> (0, 2, 3)
if (tensor_bdim.has_value() && static_cast<int64_t>(i) >= tensor_bdim.value()) {
if (tensor_bdim.has_value() && (int64_t)i >= tensor_bdim.value()) {
j = j + 1;
}
if (normalized_shape[i] != tensor_shape[j]) {

View File

@ -135,7 +135,7 @@ static void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit
reduction_case = ReductionCase::DimArray;
dims = arguments[dim_arg_pos].toIntList().vec();
if (dims.empty()) {
auto all_dims = range(0, std::max(static_cast<int64_t>(1), logical_dim));
auto all_dims = range(0, std::max((int64_t)1, logical_dim));
dims = std::vector<int64_t>(all_dims.begin(), all_dims.end());
}
} else if (arguments[dim_arg_pos].isInt()) {

View File

@ -432,7 +432,7 @@ namespace {
// Eg. Given `indexed_shape.size()` is 5 and
// shape of `values` is (N, 2, 3), then following block
// will reshape `values` to (N, 1, 1, 2, 3).
if ( static_cast<int64_t>(indexed_shape.size()) > values_.dim()) {
if ( (int64_t) indexed_shape.size() > values_.dim()) {
auto values_sizes = values_.sym_sizes();
// number of unit dims (for broadcasting value to indexed_shape)

View File

@ -109,7 +109,7 @@ std::tuple<Tensor, std::optional<int64_t>> repeat_batch_rule(
SymDimVector sizes_with_bdim = { sizes.begin(), sizes.end() };
sizes_with_bdim.insert(sizes_with_bdim.begin(), 1);
auto self_ = moveBatchDimToFront(self, self_bdim);
while (self_.dim() < static_cast<int64_t>(sizes_with_bdim.size())) {
while (self_.dim() < (int64_t)sizes_with_bdim.size()) {
self_ = self_.unsqueeze(1);
}
return std::make_tuple(self_.repeat_symint(sizes_with_bdim), 0);

View File

@ -191,7 +191,7 @@ static void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, t
// simplicity. When that is not the case, this code should be updated.
const auto& argument = (*stack)[arguments_begin + arg_idx];
if (batched_tensor_inputs_pos_iter == batched_tensor_inputs_position.end()
|| static_cast<int64_t>(arg_idx) != *batched_tensor_inputs_pos_iter) {
|| (int64_t)arg_idx != *batched_tensor_inputs_pos_iter) {
// argument isn't a BatchedTensor
torch::jit::push(stack, argument);
continue;
@ -345,7 +345,7 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta
// simplicity. When that is not the case, this code should be updated.
const auto& argument = (*stack)[arguments_begin + arg_idx];
if (batched_tensor_inputs_pos_iter == batched_tensor_inputs_position.end()
|| static_cast<int64_t>(arg_idx) != *batched_tensor_inputs_pos_iter) {
|| (int64_t)arg_idx != *batched_tensor_inputs_pos_iter) {
// argument isn't a BatchedTensor
torch::jit::push(stack, argument);
continue;
@ -473,7 +473,7 @@ void batchedNestedTensorForLoopFallback(const c10::OperatorHandle& op, torch::ji
// simplicity. When that is not the case, this code should be updated.
const auto& argument = (*stack)[arguments_begin + arg_idx];
if (batched_tensor_inputs_pos_iter == batched_tensor_inputs_position.end()
|| static_cast<int64_t>(arg_idx) != *batched_tensor_inputs_pos_iter) {
|| (int64_t)arg_idx != *batched_tensor_inputs_pos_iter) {
// argument isn't a BatchedTensor
torch::jit::push(stack, argument);
continue;

View File

@ -157,7 +157,7 @@ Tensor& squeeze__batching_rule(Tensor& self) {
const auto physical_shape = batched->value().sizes();
auto how_many_dims_of_size_1_before_bdim = 0;
for (const auto i : c10::irange(0, physical_shape.size())) {
if (static_cast<int64_t>(i) == bdim) {
if ((int64_t)i == bdim) {
break;
}
if (physical_shape[i] == 1) {
@ -573,7 +573,7 @@ Tensor cat_batching_rule(const ITensorListRef& tensors, int64_t dim) {
}
auto new_dim = bdim_size.has_value() ? dim + 1 : dim;
std::optional<int64_t> new_bdim = bdim_size.has_value() ? std::make_optional(static_cast<int64_t>(0)) : std::nullopt;
std::optional<int64_t> new_bdim = bdim_size.has_value() ? std::make_optional((int64_t)0) : std::nullopt;
auto result = at::cat(tensors_to_cat, new_dim);
return makeBatched(result, new_bdim, get_current_level());
}

View File

@ -1,5 +1,7 @@
// Copyright © 2022 Apple Inc.
#include <c10/util/CallOnce.h>
#include <ATen/mps/IndexKernels.h>
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSDevice.h>
@ -8,6 +10,9 @@
namespace at::mps {
static std::unique_ptr<MPSDevice> mps_device;
static c10::once_flag mpsdev_init;
static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& device) {
// MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants)
// host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+
@ -16,8 +21,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
}
MPSDevice* MPSDevice::getInstance() {
static MPSDevice mps_device;
return &mps_device;
c10::call_once(mpsdev_init, [] { mps_device = std::unique_ptr<MPSDevice>(new MPSDevice()); });
return mps_device.get();
}
MPSDevice::~MPSDevice() {

View File

@ -198,9 +198,9 @@ void avg_pool3d_out_frame(
int64_t hend = std::min(hstart + kH, iheight + padH);
int64_t wend = std::min(wstart + kW, iwidth + padW);
int64_t pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart);
tstart = std::max(tstart, static_cast<int64_t>(0));
hstart = std::max(hstart, static_cast<int64_t>(0));
wstart = std::max(wstart, static_cast<int64_t>(0));
tstart = std::max(tstart, (int64_t) 0);
hstart = std::max(hstart, (int64_t) 0);
wstart = std::max(wstart, (int64_t) 0);
tend = std::min(tend, itime);
hend = std::min(hend, iheight);
wend = std::min(wend, iwidth);
@ -377,9 +377,9 @@ void avg_pool3d_backward_out_frame(
int64_t hend = std::min(hstart + kH, iheight + padH);
int64_t wend = std::min(wstart + kW, iwidth + padW);
int64_t pool_size = (tend -tstart) * (hend - hstart) * (wend - wstart);
tstart = std::max(tstart, static_cast<int64_t>(0));
hstart = std::max(hstart, static_cast<int64_t>(0));
wstart = std::max(wstart, static_cast<int64_t>(0));
tstart = std::max(tstart, (int64_t) 0);
hstart = std::max(hstart, (int64_t) 0);
wstart = std::max(wstart, (int64_t) 0);
tend = std::min(tend, itime);
hend = std::min(hend, iheight);
wend = std::min(wend, iwidth);

View File

@ -946,10 +946,10 @@ void apply_lu_factor(const Tensor& input, const Tensor& pivots, const Tensor& in
}
};
// avoid overflow
auto matrix_rank = std::min(m, n);
float matrix_rank = float(std::min(m, n));
// A heuristic tested on a 32 core/socket ICX system
// https://github.com/pytorch/pytorch/pull/93037#discussion_r1090112948
int64_t chunk_size_per_thread = static_cast<int64_t>(
int64_t chunk_size_per_thread = int64_t(
std::min(1.0, 3200.0 / (matrix_rank * matrix_rank * matrix_rank)));
int64_t grain_size = chunk_size_per_thread * at::get_num_threads();
at::parallel_for(0, batch_size, grain_size, loop);

View File

@ -267,7 +267,7 @@ _scaled_mm_out_cpu_emulated(const Tensor& mat1, const Tensor& mat2,
float input_scale = scale_a.item<float>();
float weight_scale = scale_b.item<float>();
float output_scale = 1.0f;
float output_scale = float(1.0);
if (scale_result.has_value() &&
(*out_dtype == ScalarType::Float8_e4m3fn ||
*out_dtype == ScalarType::Float8_e5m2)) {

View File

@ -331,7 +331,7 @@ bool gemv_use_fast_path<double>(
[[maybe_unused]] double beta,
int64_t incy) {
return gemv_use_fast_path<float>(
trans, m, n, static_cast<float>(alpha), lda, incx, static_cast<float>(beta), incy);
trans, m, n, (float)alpha, lda, incx, (float)beta, incy);
}
template <>
@ -523,8 +523,8 @@ static inline void scal(int64_t n, scalar_t a, scalar_t *x, int64_t incx)
if (n == 1) incx = 1;
#if AT_BUILD_WITH_BLAS()
if (blas_impl::scal_use_fast_path<scalar_t>(n, incx)) {
int i_n = static_cast<int>(n);
int i_incx = static_cast<int>(incx);
int i_n = (int)n;
int i_incx = (int)incx;
blas_impl::scal_fast_path<scalar_t>(&i_n, &a, x, &i_incx);
return;
}
@ -545,11 +545,11 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, const scalar_t *a, i
#if AT_BUILD_WITH_BLAS()
if (blas_impl::gemv_use_fast_path<scalar_t>(trans, m, n, alpha, lda, incx, beta, incy)) {
TORCH_CHECK(lda >= std::max<int64_t>(1L, m), "lda should be at least max(1,", m, "), but have ", lda);
int i_m = static_cast<int>(m);
int i_n = static_cast<int>(n);
int i_lda = static_cast<int>(lda);
int i_incx = static_cast<int>(incx);
int i_incy = static_cast<int>(incy);
int i_m = (int)m;
int i_n = (int)n;
int i_lda = (int)lda;
int i_incx = (int)incx;
int i_incy = (int)incy;
blas_impl::gemv_fast_path<scalar_t>(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy);
return;
}

View File

@ -680,9 +680,9 @@ void axpy(int64_t n, double a, const double *x, int64_t incx, double *y, int64_t
#if AT_BUILD_WITH_BLAS()
if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
{
int i_n = static_cast<int>(n);
int i_incx = static_cast<int>(incx);
int i_incy = static_cast<int>(incy);
int i_n = (int)n;
int i_incx = (int)incx;
int i_incy = (int)incy;
#if C10_IOS
cblas_daxpy(i_n, a, x, i_incx, y, i_incy);
#else
@ -705,9 +705,9 @@ void axpy(int64_t n, float a, const float *x, int64_t incx, float *y, int64_t in
#if AT_BUILD_WITH_BLAS()
if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
{
int i_n = static_cast<int>(n);
int i_incx = static_cast<int>(incx);
int i_incy = static_cast<int>(incy);
int i_n = (int)n;
int i_incx = (int)incx;
int i_incy = (int)incy;
#if C10_IOS
cblas_saxpy(i_n, a, x, i_incx, y, i_incy);
#else
@ -730,9 +730,9 @@ void axpy(int64_t n, c10::complex<double> a, const c10::complex<double> *x, int6
#if AT_BUILD_WITH_BLAS()
if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
{
int i_n = static_cast<int>(n);
int i_incx = static_cast<int>(incx);
int i_incy = static_cast<int>(incy);
int i_n = (int)n;
int i_incx = (int)incx;
int i_incy = (int)incy;
#if C10_IOS
cblas_zaxpy(i_n, &a, x, i_incx, y, i_incy);
#else
@ -755,9 +755,9 @@ void axpy(int64_t n, c10::complex<float> a, const c10::complex<float> *x, int64_
#if AT_BUILD_WITH_BLAS()
if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
{
int i_n = static_cast<int>(n);
int i_incx = static_cast<int>(incx);
int i_incy = static_cast<int>(incy);
int i_n = (int)n;
int i_incx = (int)incx;
int i_incy = (int)incy;
#if C10_IOS
cblas_caxpy(i_n, &a, x, i_incx, y, i_incy);
#else
@ -781,9 +781,9 @@ void copy(int64_t n, const double *x, int64_t incx, double *y, int64_t incy) {
}
#if AT_BUILD_WITH_BLAS()
if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) {
int i_n = static_cast<int>(n);
int i_incx = static_cast<int>(incx);
int i_incy = static_cast<int>(incy);
int i_n = (int)n;
int i_incx = (int)incx;
int i_incy = (int)incy;
#if C10_IOS
cblas_dcopy(i_n, x, i_incx, y, i_incy);
#else
@ -805,9 +805,9 @@ void copy(int64_t n, const float *x, int64_t incx, float *y, int64_t incy) {
}
#if AT_BUILD_WITH_BLAS()
if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) {
int i_n = static_cast<int>(n);
int i_incx = static_cast<int>(incx);
int i_incy = static_cast<int>(incy);
int i_n = (int)n;
int i_incx = (int)incx;
int i_incy = (int)incy;
#if C10_IOS
cblas_scopy(i_n, x, i_incx, y, i_incy);
#else
@ -829,9 +829,9 @@ void copy(int64_t n, const c10::complex<double> *x, int64_t incx, c10::complex<d
}
#if AT_BUILD_WITH_BLAS()
if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) {
int i_n = static_cast<int>(n);
int i_incx = static_cast<int>(incx);
int i_incy = static_cast<int>(incy);
int i_n = (int)n;
int i_incx = (int)incx;
int i_incy = (int)incy;
#if C10_IOS
cblas_zcopy(i_n, x, i_incx, y, i_incy);
#else
@ -853,9 +853,9 @@ void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<fl
}
#if AT_BUILD_WITH_BLAS()
if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) {
int i_n = static_cast<int>(n);
int i_incx = static_cast<int>(incx);
int i_incy = static_cast<int>(incy);
int i_n = (int)n;
int i_incx = (int)incx;
int i_incy = (int)incy;
#if C10_IOS
cblas_ccopy(i_n, &x, i_incx, y, i_incy);
#else
@ -1082,7 +1082,7 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
M,
N,
K,
1,
int64_t(1),
ld_a,
ld_b,
ld_c,
@ -1096,7 +1096,7 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
M,
N,
K,
1,
int64_t(1),
ld_a,
ld_b,
ld_c,

View File

@ -487,17 +487,17 @@ static Tensor _grid_sampler_2d_cpu_quantized(
int64_t out_sC = output.stride(1);
int64_t out_sH = output.stride(2);
int64_t out_sW = output.stride(3);
const uint8_t* inp_ptr = input.const_data_ptr<uint8_t>();
uint8_t* out_ptr = output.data_ptr<uint8_t>();
const float* grid_ptr = grid.const_data_ptr<float>();
uint8_t* inp_ptr = (uint8_t*)input.data_ptr<quint8>();
uint8_t* out_ptr = (uint8_t*)output.data_ptr<quint8>();
float* grid_ptr = grid.data_ptr<float>();
at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
for (const auto n : c10::irange(start, end)) {
const float* grid_ptr_N = grid_ptr + n * grid_sN;
const uint8_t* inp_ptr_N = inp_ptr + n * inp_sN;
float* grid_ptr_N = grid_ptr + n * grid_sN;
uint8_t* inp_ptr_N = inp_ptr + n * inp_sN;
for (const auto h : c10::irange(out_H)) {
for (const auto w : c10::irange(out_W)) {
// get the corresponding input x, y, z coordinates from grid
const float* grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
float* grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
float x = *grid_ptr_NHW;
float y = grid_ptr_NHW[grid_sCoor];
@ -527,7 +527,7 @@ static Tensor _grid_sampler_2d_cpu_quantized(
float se = (ix - ix_nw) * (iy - iy_nw);
// calculate bilinear weighted pixel value and set output pixel
const uint8_t* inp_ptr_NC = inp_ptr_N;
uint8_t* inp_ptr_NC = inp_ptr_N;
uint8_t* out_ptr_NCHW =
out_ptr + n * out_sN + h * out_sH + w * out_sW;
for (int64_t c = 0; c < C;

View File

@ -318,7 +318,7 @@ static std::vector<Tensor>& histogramdd_bin_edges_out(const Tensor& self, IntArr
const int64_t N = self.size(-1);
const int64_t M = std::accumulate(self.sizes().begin(), self.sizes().end() - 1,
static_cast<int64_t>(1), std::multiplies<int64_t>());
(int64_t)1, std::multiplies<int64_t>());
Tensor reshaped_self = self.reshape({ M, N });
auto outer_bin_edges = select_outer_bin_edges(reshaped_self, range);

View File

@ -40,7 +40,7 @@ Tensor do_trapezoid(const Tensor& y, const Tensor& dx, int64_t dim) {
// When dx is constant, the above formula simplifies
// to dx * [(\sum_{i=1}^n y_i) - (y_1 + y_n)/2]
Tensor do_trapezoid(const Tensor& y, double dx, int64_t dim) {
return (y.sum(dim) - (y.select(dim, 0) + y.select(dim, -1)) * 0.5) * dx;
return (y.sum(dim) - (y.select(dim, 0) + y.select(dim, -1)) * (0.5)) * dx;
}
Tensor zeros_like_except(const Tensor& y, int64_t dim) {

View File

@ -201,7 +201,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
out_size.reserve(out_num_dim);
for (auto& d : lro) out_size.push_back(left.sym_size(d));
for (auto& d : lo) out_size.push_back(left.sym_size(d));
for (auto& d : sum_dims_) { out_size.emplace_back(1); (void)d; }; // avoid warning about not using d
for (auto& d : sum_dims_) { out_size.emplace_back(1); (void)(d); }; // avoid warning about not using d
for (auto& d : ro) out_size.push_back(right.sym_size(d));
std::vector<int64_t> lpermutation(lro);
@ -640,7 +640,7 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
}
}
return std::move(ops[0]);
return ops[0];
}
// _trilinear computes a trilinear einstein sum with an unrolled dimension
@ -805,7 +805,7 @@ Tensor tensordot(const Tensor& input1, const Tensor& input2, IntArrayRef dims1,
std::vector<SymInt> rsizes; // rsizes: sizes of the result
p1.reserve(input1.dim());
p2.reserve(input2.dim());
rsizes.reserve(input1.dim() + input2.dim() - static_cast<int64_t>(dims1.size()));
rsizes.reserve(input1.dim() + input2.dim() - (int64_t) dims1.size());
SymInt size1 = 1; // number of non-contracted elements in input1
SymInt size2 = 1; // number of non-contracted elements in input2

View File

@ -1655,7 +1655,7 @@ static inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self,
auto s0 = self.accessor<const scalar_t, 3>();
auto m0 = mat2.accessor<const scalar_t, 3>();
int64_t grain_size = std::max(internal::GRAIN_SIZE / (is * js * ks), static_cast<int64_t>(1));
int64_t grain_size = std::max(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1);
using opmath_t = at::opmath_type<scalar_t>;
parallel_for(0, bs, grain_size, [&](int64_t b_begin, int64_t b_end) {
for (const auto b : c10::irange(b_begin, b_end)) {

View File

@ -235,7 +235,7 @@ void nll_loss_out_frame(
constexpr int64_t cascade_sum_num_levels = 8;
const int64_t level_power =
std::max(static_cast<int64_t>(4), utils::CeilLog2(batch_size) / cascade_sum_num_levels);
std::max(int64_t(4), utils::CeilLog2(batch_size) / cascade_sum_num_levels);
const int64_t level_step = (1 << level_power);
const int64_t level_mask = level_step - 1;

View File

@ -129,7 +129,7 @@ void nll_loss2d_forward_out_frame(
for (const auto b : c10::irange(start, end)) {
for (const auto h : c10::irange(H)) {
for (const auto w : c10::irange(W)) {
const int64_t cur_target = target_acc[b][h][w];
const int64_t cur_target = (int64_t)target_acc[b][h][w];
if (cur_target == ignore_index) {
output_acc[b][h][w] = static_cast<scalar_t>(0);
@ -188,7 +188,7 @@ void nll_loss2d_forward_out_frame(
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
scalar_t loss_partial_sums[cascade_sum_num_levels] = {0};
const int64_t level_power =
std::max(static_cast<int64_t>(4), utils::CeilLog2(numiter) / cascade_sum_num_levels);
std::max(int64_t(4), utils::CeilLog2(numiter) / cascade_sum_num_levels);
const int64_t level_step = (1 << level_power);
const int64_t level_mask = level_step - 1;

View File

@ -192,7 +192,7 @@ Date: February 1996
x = x - (std::erf(x) - y) / ((static_cast<T>(2.0)/static_cast<T>(std::sqrt(c10::pi<double>)))*std::exp(-x*x));
x = x - (std::erf(x) - y) / ((static_cast<T>(2.0)/static_cast<T>(std::sqrt(c10::pi<double>)))*std::exp(-x*x));
return x;
return(x);
}
#undef CENTRAL_RANGE
@ -3819,7 +3819,7 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_v_forward(T x, int64_t n)
if ((n > 6) && (std::abs(x + x - T(1.0)) < T(1.0))) {
if (std::sin(std::acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) {
return std::cos((n + T(0.5)) * std::acos(x + x - T(1.0))) / std::cos(std::acos(x + x - T(1.0)) / T(2.0));
return std::cos(((n) + T(0.5)) * std::acos(x + x - T(1.0))) / std::cos(std::acos(x + x - T(1.0)) / T(2.0));
}
if (n % 2 == 0) {

View File

@ -193,22 +193,22 @@ Tensor _nnpack_spatial_convolution(
const size_t input_channels = input.size(1);
const size_t output_channels = weight.size(0);
const struct nnp_size input_size = {
.width = static_cast<size_t>(input.size(3)),
.height = static_cast<size_t>(input.size(2)),
.width = (size_t)input.size(3),
.height = (size_t)input.size(2),
};
const struct nnp_padding input_padding = {
.top = static_cast<size_t>(padding[0]),
.right = static_cast<size_t>(padding[1]),
.bottom = static_cast<size_t>(padding[0]),
.left = static_cast<size_t>(padding[1]),
.top = (size_t)padding[0],
.right = (size_t)padding[1],
.bottom = (size_t)padding[0],
.left = (size_t)padding[1],
};
const struct nnp_size kernel_size = {
.width = static_cast<size_t>(weight.size(3)),
.height = static_cast<size_t>(weight.size(2)),
.width = (size_t)weight.size(3),
.height = (size_t)weight.size(2),
};
const struct nnp_size output_size = {
.width = static_cast<size_t>(output.size(3)),
.height = static_cast<size_t>(output.size(2)),
.width = (size_t)output.size(3),
.height = (size_t)output.size(2),
};
const nnp_size output_subsample = {
.width = static_cast<std::size_t>(stride[1]),

View File

@ -248,8 +248,8 @@ void slow_conv_transpose3d_out_cpu_template(
Tensor weight = weight_.contiguous();
Tensor bias = bias_.defined() ? bias_.contiguous() : bias_;
const auto n_input_plane = weight.size(0);
const auto n_output_plane = weight.size(1);
const int n_input_plane = (int)weight.size(0);
const int n_output_plane = (int)weight.size(1);
bool is_batch = false;
if (input.dim() == 4) {

View File

@ -84,8 +84,8 @@ static std::vector<int64_t> aligned_size(
DimnameList aligned_names,
bool is_aligning_two_tensors) {
std::vector<int64_t> expanded_sizes(aligned_names.size(), 1);
ptrdiff_t dim = static_cast<ptrdiff_t>(tensor_sizes.size()) - 1;
ptrdiff_t idx = static_cast<ptrdiff_t>(aligned_names.size()) - 1;
ptrdiff_t dim = (ptrdiff_t)tensor_sizes.size() - 1;
ptrdiff_t idx = (ptrdiff_t)aligned_names.size() - 1;
for (; idx >= 0 && dim >= 0; --idx) {
if (tensor_names[dim] != aligned_names[idx]) {
continue;

View File

@ -25,7 +25,7 @@ std::tuple<Tensor, Tensor> _rowwise_prune_helper(
auto mask_contig = mask.contiguous();
auto mask_data = mask_contig.data_ptr<bool>();
for (const auto i : c10::irange(mask.numel())) {
num_non_masked_rows += ((mask_data[i] == true) ? 1 : 0);
num_non_masked_rows += (((mask_data[i] == true)) ? 1 : 0);
}
int num_cols = weights.size(1);
auto pruned_2d_tensor = at::empty({num_non_masked_rows, num_cols},

View File

@ -176,7 +176,7 @@ void host_softmax(
scalar_t* input_data_base = input.data_ptr<scalar_t>();
scalar_t* output_data_base = output.data_ptr<scalar_t>();
bool* mask_data_base = mask;
int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, static_cast<int64_t>(1));
int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1);
parallel_for(
0, outer_size * inner_size, grain_size,
[&](int64_t begin, int64_t end) {
@ -265,7 +265,7 @@ void host_softmax_backward(
scalar_t* output_data_base = output.data_ptr<scalar_t>();
scalar_t* gradOutput_data_base = grad.data_ptr<scalar_t>();
bool* mask_data_base = mask;
int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, static_cast<int64_t>(1));
int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1);
parallel_for(
0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) {
for (const auto i : c10::irange(begin, end)) {

View File

@ -1701,13 +1701,13 @@ Tensor& index_select_out_cpu_(
TORCH_CHECK_INDEX(
(self_i >= 0) && (self_i < self_dim_size),
"index out of range in self");
auto self_data = const_cast<char*>(static_cast<const char*>(
selfSlice_data)) +
auto self_data = static_cast<const char*>(selfSlice_data) +
self_i * self_stride_bytes;
auto result_data = static_cast<char*>(resultSlice_data) +
i * result_stride_bytes;
sub_iter.unsafe_replace_operand(0, result_data);
sub_iter.unsafe_replace_operand(1, self_data);
sub_iter.unsafe_replace_operand(
1, const_cast<char*>(self_data));
copy_stub(sub_iter.device_type(), sub_iter, false);
};
});

View File

@ -1382,7 +1382,7 @@ void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) {
// use no-initialization Fischer-Yates variant
// https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_.22inside-out.22_algorithm
for (int64_t i = 0; i < n; i++) {
int64_t z = static_cast<int64_t>(generator->random64() % (i + 1));
int64_t z = (int64_t)(generator->random64() % (i + 1));
r__data[i * r__stride_0] = i;
r__data[i * r__stride_0] = r__data[z * r__stride_0];
r__data[z * r__stride_0] = i;

View File

@ -40,7 +40,7 @@ at::Tensor PackedLinearWeightQnnp::apply_dynamic_impl<false>(
"quantized_sparse_linear(): Input tensor rank should be >= 2");
const auto rows_input = c10::multiply_integers(input.sizes().begin(), input.sizes().end() - 1);
const auto cols_input = input.size(input.dim() - 1);
const auto cols_input = static_cast<int64_t>(input.size(input.dim() - 1));
TORCH_CHECK(
cols_input == input_channels_,
"quantized_sparse_linear: Input tensor's last and weight tensor's"

View File

@ -65,8 +65,8 @@ LinearPackedSerializationType PackedLinearWeight::unpack() {
#ifdef USE_PYTORCH_QNNPACK
LinearPackedSerializationType PackedLinearWeightQnnp::unpack() {
const int64_t N = output_channels_;
const int64_t K = input_channels_;
const int64_t N = static_cast<int64_t>(output_channels_);
const int64_t K = static_cast<int64_t>(input_channels_);
float* w_scales_ptr = w_scales_.data_ptr<float>();

View File

@ -998,7 +998,7 @@ void softplus_backward_kernel(TensorIteratorBase& iter, const Scalar& beta_, con
auto threshold = threshold_.to<float>();
const Vec beta_vec(beta);
const Vec threshold_vec(threshold);
const Vec one_vec(1.0f);
const Vec one_vec(static_cast<float>(1.0));
cpu_kernel_vec(
iter,
[beta, threshold](scalar_t a, scalar_t b) -> scalar_t {

View File

@ -17,7 +17,7 @@ static inline void cpu_atomic_add_float(float* dst, float fvalue)
} uf32_t;
uf32_t new_value, old_value;
std::atomic<unsigned>* dst_intV = (std::atomic<unsigned>*)dst;
std::atomic<unsigned>* dst_intV = (std::atomic<unsigned>*)(dst);
old_value.floatV = *dst;
new_value.floatV = old_value.floatV + fvalue;

View File

@ -851,7 +851,7 @@ void sigmoid_backward_kernel(TensorIteratorBase& iter) {
});
});
} else if (iter.dtype() == kBFloat16) {
auto one_vec = Vectorized<float>((float)1);
auto one_vec = Vectorized<float>((float)(1));
cpu_kernel_vec(
iter,
[=](BFloat16 a, BFloat16 b) -> BFloat16 {

View File

@ -298,7 +298,7 @@ void unfolded2d_copy(
memcpy(
dst + (size_t)y * output_width + x,
src + (size_t)iy * input_width + ix,
sizeof(scalar_t) * 1);
sizeof(scalar_t) * (1));
}
}
}
@ -317,7 +317,7 @@ void unfolded2d_copy(
memcpy(
dst + (size_t)y * output_width + x,
src + (size_t)iy * input_width + ix + x * dW,
sizeof(scalar_t) * 1);
sizeof(scalar_t) * (1));
}
}
}

View File

@ -342,7 +342,7 @@ void upsample_avx_bilinear_bicubic_uint8(
if (need_horizontal) {
int interp_dim = 3;
auto stride = skip_unpacking ? num_channels : 4;
auto stride = (skip_unpacking) ? num_channels : 4;
std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) =
F::compute_index_ranges_int16_weights(
/*input_size=*/xin,
@ -358,7 +358,7 @@ void upsample_avx_bilinear_bicubic_uint8(
if (need_vertical) {
int interp_dim = 2;
auto stride = skip_unpacking ? num_channels * xout : 4 * xout;
auto stride = (skip_unpacking) ? num_channels * xout : 4 * xout;
std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) =
F::compute_index_ranges_int16_weights(
/*input_size=*/yin,
@ -377,17 +377,17 @@ void upsample_avx_bilinear_bicubic_uint8(
// horizontal-only or vertical-only interpolation, and if the tensor doesn't
// need repacking
if (need_horizontal && (need_vertical || !skip_packing)) {
auto c = skip_unpacking ? num_channels : 4;
auto c = (skip_unpacking) ? num_channels : 4;
buffer_horiz = at::empty({c, yin, xout}, input.options());
}
if (need_vertical && !skip_packing) {
auto c = skip_unpacking ? num_channels : 4;
auto c = (skip_unpacking) ? num_channels : 4;
buffer_vert = at::empty({c, yout, xout}, input.options());
}
for (const auto i : c10::irange(batch_size)) {
at::Tensor unpacked_input = skip_unpacking ? input[i] : unpack_rgb(input[i]);
at::Tensor unpacked_input = (skip_unpacking) ? input[i] : unpack_rgb(input[i]);
at::Tensor unpacked_output;
if (need_horizontal) {
@ -411,7 +411,7 @@ void upsample_avx_bilinear_bicubic_uint8(
unpacked_output = unpacked_input = unpacked_output_temp;
}
if (need_vertical) {
unpacked_output = skip_packing ? output[i] : buffer_vert;
unpacked_output = (skip_packing) ? output[i] : buffer_vert;
ImagingResampleVertical(
unpacked_output,
@ -502,7 +502,7 @@ void ImagingResampleHorizontalConvolution8u4x(
// RGBA: b4_delta = b4_delta_soft = 3
// RGB : b4_delta = 5
// RGB : b4_delta_soft = 4
const auto b4_delta = (stride == 4) ? 3 : (is_last_line ? 5 : 4);
const auto b4_delta = (stride == 4) ? 3 : ((is_last_line) ? 5 : 4);
// In block 2 (2 means we process 2 weights values together), we read input data
// with _mm_loadl_epi64, i.e. 8 bytes, per one line:
@ -515,7 +515,7 @@ void ImagingResampleHorizontalConvolution8u4x(
// RGBA: b2_delta = b2_delta_soft = 1
// RGB : b2_delta = 2
// RGB : b2_delta_soft = 1
const auto b2_delta = (stride == 4) ? 1 : (is_last_line ? 2 : 1);
const auto b2_delta = (stride == 4) ? 1 : ((is_last_line) ? 2 : 1);
const auto max_out_x_strided = out_xsize * stride;
const auto max_in_x_strided = in_xsize * stride;
@ -819,7 +819,7 @@ void ImagingResampleHorizontalConvolution8u(
// RGBA: b8_delta = b8_delta_soft = 7
// RGB : b8_delta = 10
// RGB : b8_delta_soft = 9
const auto b8_delta = (stride == 4) ? 7 : (is_last_line ? 10 : 9);
const auto b8_delta = (stride == 4) ? 7 : ((is_last_line) ? 10 : 9);
// In block 4 (4 means we process 4 weight values together), we read
// 16 bytes of input data.
@ -832,7 +832,7 @@ void ImagingResampleHorizontalConvolution8u(
// RGBA: b4_delta = b4_delta_soft = 3
// RGB : b4_delta = 5
// RGB : b4_delta_soft = 4
const auto b4_delta = (stride == 4) ? 3 : (is_last_line ? 5 : 4);
const auto b4_delta = (stride == 4) ? 3 : ((is_last_line) ? 5 : 4);
// In block 2 (2 means we process 2 weight values together), we read
// 8 bytes of input data.
@ -845,7 +845,7 @@ void ImagingResampleHorizontalConvolution8u(
// RGBA: b2_delta = b2_delta_soft = 1
// RGB : b2_delta = 2
// RGB : b2_delta_soft = 1
const auto b2_delta = (stride == 4) ? 1 : (is_last_line ? 2 : 1);
const auto b2_delta = (stride == 4) ? 1 : ((is_last_line) ? 2 : 1);
const auto max_out_x_strided = out_xsize * stride;
const auto max_in_x_strided = in_xsize * stride;

View File

@ -644,8 +644,8 @@ void weight_to_int4pack_kernel(
int32_t val2 = src[(d + 32) * K + k];
int32_t val3 = src[(d + 48) * K + k];
uint8_t packed02 = ((uint8_t)val2 << 4) | ((uint8_t)val0);
uint8_t packed13 = ((uint8_t)val3 << 4) | ((uint8_t)val1);
uint8_t packed02 = (((uint8_t)(val2) << 4)) | ((uint8_t)(val0));
uint8_t packed13 = (((uint8_t)(val3) << 4)) | ((uint8_t)(val1));
dst[k * 32 + d] = packed02;
dst[k * 32 + 16 + d] = packed13;
@ -656,7 +656,7 @@ void weight_to_int4pack_kernel(
int32_t val0 = src[n * K + k];
int32_t val1 = src[n * K + K + k];
uint8_t packed = ((uint8_t)val1 << 4) | ((uint8_t)val0);
uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0));
dst[k * nb_size / 2 + n / 2] = packed;
}
}
@ -667,7 +667,7 @@ void weight_to_int4pack_kernel(
int32_t val0 = src[(d + 0) * K + k];
int32_t val1 = src[(d + 16) * K + k];
uint8_t packed01 = ((uint8_t)val1 << 4) | ((uint8_t)val0);
uint8_t packed01 = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0));
dst[k * 16 + d] = packed01;
}
} else {
@ -676,7 +676,7 @@ void weight_to_int4pack_kernel(
int32_t val0 = src[n * K + k];
int32_t val1 = src[n * K + K + k];
uint8_t packed = ((uint8_t)val1 << 4) | ((uint8_t)val0);
uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0));
dst[k * nb_size / 2 + n / 2] = packed;
}
}
@ -685,7 +685,7 @@ void weight_to_int4pack_kernel(
int32_t val0 = src[n * K + k];
int32_t val1 = src[n * K + K + k];
uint8_t packed = ((uint8_t)val1 << 4) | ((uint8_t)val0);
uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0));
dst[k * nb_size / 2 + n / 2] = packed;
}
#endif
@ -872,16 +872,16 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
for (size_t k_idx = 0; k_idx < k; ++k_idx) {
const float src0_0 = src_ptr[k_idx];
max0 = std::max(src0_0, max0);
min0 = std::min(src0_0, min0);
max0 = (std::max)(src0_0, max0);
min0 = (std::min)(src0_0, min0);
}
// Maximum/minimum int8 values
const float qmin = (float)INT8_MIN;
const float qmax = (float)INT8_MAX;
const float rmin0 = std::min(0.0f, min0);
const float rmax0 = std::max(0.0f, max0);
const float rmin0 = (std::min)(0.0f, min0);
const float rmax0 = (std::max)(0.0f, max0);
const float scale0 =
rmin0 == rmax0 ? 1.f : (qmax - qmin) / (rmax0 - rmin0);
@ -900,8 +900,8 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
? qmin - descaled_min0
: qmax - descaled_max0;
zero_point0 = std::max(zero_point0, qmin);
zero_point0 = std::min(zero_point0, qmax);
zero_point0 = (std::max)(zero_point0, qmin);
zero_point0 = (std::min)(zero_point0, qmax);
// Round to nearest integer
const int32_t nudged_zero_point0 = lrintf(zero_point0);
@ -909,9 +909,9 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
int8_t* dst_ptr = lhs_qa8dx + m_idx * dst_stride;
// LHS offset at the beginning of the row
*((float*)dst_ptr) = recip_scale0;
*((float*)(dst_ptr)) = recip_scale0;
dst_ptr += sizeof(float);
*((int32_t*)dst_ptr) = -nudged_zero_point0;
*((int32_t*)(dst_ptr)) = -nudged_zero_point0;
dst_ptr += sizeof(int32_t);
// Quantize the channels
@ -922,8 +922,8 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
int32_t v0_s32 = (int32_t)(std::round(src0_0 * scale0));
v0_s32 = v0_s32 + nudged_zero_point0;
v0_s32 = std::max(v0_s32, static_cast<int32_t>(INT8_MIN));
v0_s32 = std::min(v0_s32, static_cast<int32_t>(INT8_MAX));
v0_s32 = (std::max)(v0_s32, static_cast<int32_t>(INT8_MIN));
v0_s32 = (std::min)(v0_s32, static_cast<int32_t>(INT8_MAX));
dst_ptr[0] = (int8_t)v0_s32;
dst_ptr += sizeof(int8_t);
}
@ -988,8 +988,8 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
main_acc = main_acc * lhs_scale;
// Clamp (min-max) operation
main_acc = std::max(main_acc, scalar_min);
main_acc = std::min(main_acc, scalar_max);
main_acc = (std::max)(main_acc, scalar_min);
main_acc = (std::min)(main_acc, scalar_max);
dst_f32[0] = main_acc;
dst_f32 += 1;
@ -1024,15 +1024,15 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel(
for (size_t k_idx = 0; k_idx < k; ++k_idx) {
const float src0_0 = src_ptr[k_idx];
max0 = std::max(src0_0, max0);
min0 = std::min(src0_0, min0);
max0 = (std::max)(src0_0, max0);
min0 = (std::min)(src0_0, min0);
}
const float qmin = (float)INT8_MIN;
const float qmax = (float)INT8_MAX;
const float rmin0 = std::min(0.0f, min0);
const float rmax0 = std::max(0.0f, max0);
const float rmin0 = (std::min)(0.0f, min0);
const float rmax0 = (std::max)(0.0f, max0);
const float scale0 =
(rmin0 == rmax0) ? 1.f : (qmax - qmin) / (rmax0 - rmin0);
const float recip_scale0 = scale0 ? 1.0f / scale0 : 0.0f;
@ -1044,22 +1044,22 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel(
? qmin - descaled_min0
: qmax - descaled_max0;
zero_point0 = std::max(zero_point0, qmin);
zero_point0 = std::min(zero_point0, qmax);
zero_point0 = (std::max)(zero_point0, qmin);
zero_point0 = (std::min)(zero_point0, qmax);
const int32_t nudged_zero_point0 = lrintf(zero_point0);
int8_t* dst_ptr = lhs_qa8dx + row_idx * dst_stride;
*((float*)dst_ptr) = recip_scale0;
*((float*)(dst_ptr)) = recip_scale0;
dst_ptr += sizeof(float);
*((int32_t*)dst_ptr) = -nudged_zero_point0;
*((int32_t*)(dst_ptr)) = -nudged_zero_point0;
dst_ptr += sizeof(int32_t);
for (size_t k_idx = 0; k_idx < k; ++k_idx) {
const float src0_0 = src_ptr[k_idx];
int32_t v0_s32 = (int32_t)(std::round(src0_0 * scale0));
v0_s32 = std::max(
std::min(
v0_s32 = (std::max)(
(std::min)(
v0_s32 + nudged_zero_point0, static_cast<int32_t>(INT8_MAX)),
static_cast<int32_t>(INT8_MIN));
dst_ptr[0] = (int8_t)v0_s32;
@ -1118,8 +1118,8 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel(
}
main_acc = main_acc * lhs_scale;
main_acc = std::max(main_acc, scalar_min);
main_acc = std::min(main_acc, scalar_max);
main_acc = (std::max)(main_acc, scalar_min);
main_acc = (std::min)(main_acc, scalar_max);
dst_f32[0] = main_acc;
dst_f32 += 1;

View File

@ -4,6 +4,7 @@
#include <c10/util/SmallVector.h>
#include <c10/core/Scalar.h>
#include <c10/core/ScalarType.h>
#include <c10/util/Exception.h>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/core/NamedTensor.h>

View File

@ -753,8 +753,8 @@ static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_working_copy
handle, params, uplo, n, datatype,
self_working_copy_ptr + i * matrix_stride,
lda, datatype,
static_cast<char*>(workdata_device_ptr) + i * worksize_device, worksize_device,
static_cast<char*>(workdata_host_ptr) + i * worksize_host, worksize_host,
(char*)workdata_device_ptr + i * worksize_device, worksize_device,
(char*)workdata_host_ptr + i * worksize_host, worksize_host,
infos_ptr + i
);
}

View File

@ -119,8 +119,8 @@ void setConvolutionParams(
params->input_dim = input.dim();
params->memory_format = memory_format;
for (int i = 0; i != params->input_dim; ++i) {
params->input_size[i] = static_cast<int>(input.sizes()[i]);
params->weight_size[i] = static_cast<int>(weight.sizes()[i]);
params->input_size[i] = (int)input.sizes()[i];
params->weight_size[i] = (int)weight.sizes()[i];
}
// ASSERT(padding.size() == stride.size())
// ASSERT(padding.size() == dilation.size())

View File

@ -64,7 +64,7 @@
// fastest algorithm combination with a sub optimal mathType.
constexpr size_t operator"" _TiB(unsigned long long n) {
return static_cast<size_t>(n) * 1024 * 1024 * 1024 * 1024;
return size_t(n) * 1024 * 1024 * 1024 * 1024;
}
namespace at {

View File

@ -46,7 +46,7 @@ namespace {
// TODO: remove duplicate code in Conv_v7.cpp
constexpr int64_t operator"" _TiB(unsigned long long n) {
return static_cast<size_t>(n) << 40;
return size_t(n) << 40;
}
uint8_t getAlignment(const Tensor& t) {
@ -93,10 +93,7 @@ cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(
std::vector<int64_t> strides_copy(std::begin(strides), std::end(strides));
fixSizeOneDimStride<int64_t>(
sizes.size(),
&sizes[0],
static_cast<int64_t*>(&strides_copy[0]),
channels_last);
sizes.size(), &sizes[0], (int64_t*)&strides_copy[0], channels_last);
auto r = cudnn_frontend::TensorBuilder()
.setDim(sizes.size(), sizes.data())
.setStrides(strides_copy.size(), strides_copy.data())

View File

@ -44,7 +44,6 @@ std::tuple<Tensor, Tensor> cudnn_grid_sampler_backward(
#include <ATen/cudnn/Descriptors.h>
#include <ATen/cudnn/Types.h>
#include <ATen/cudnn/Utils.h>
#include <array>
#include <ATen/TensorUtils.h>
#include <c10/util/irange.h>
@ -60,11 +59,11 @@ void setSamplerDescriptor(
SpatialTransformerDescriptor& desc,
cudnnDataType_t dataType,
const at::Tensor& tensor) {
std::array<int, 4> inputSize{0};
int inputSize[4] = {0};
for (const auto i : c10::irange(tensor.dim())) {
inputSize[i] = static_cast<int>(tensor.size(i));
inputSize[i] = (int)tensor.size(i);
}
desc.set(dataType, 4, inputSize.data());
desc.set(dataType, 4, inputSize);
}
void checkGridSize(CheckedFrom c, TensorArg grid, TensorArg input) {

View File

@ -656,8 +656,7 @@ void add_projection_weights(
TORCH_INTERNAL_ASSERT(
nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim);
auto elem_size = dataSize(getCudnnDataType(weight_buf));
auto offset_bytes = static_cast<const char*>(matrix_pointer) -
static_cast<const char*>(weight_buf.data_ptr());
auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr();
TORCH_INTERNAL_ASSERT(
offset_bytes % elem_size == 0,
"offset_bytes = ",
@ -795,8 +794,8 @@ get_parameters(
"; min_dim = ",
min_dim);
auto elem_size = dataSize(getCudnnDataType(weight_buf));
auto offset_bytes = static_cast<const char*>(matrix_pointer) -
static_cast<const char*>(weight_buf.data_ptr());
auto offset_bytes =
(char*)matrix_pointer - (char*)weight_buf.data_ptr();
TORCH_INTERNAL_ASSERT(
offset_bytes % elem_size == 0,
"offset_bytes = ",

View File

@ -330,6 +330,7 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization,
}
#elif AT_MKL_ENABLED()
#include <ATen/Dispatch.h>
#include <algorithm>
#include <numeric>

View File

@ -535,7 +535,7 @@ mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2,
float input_scale = scale_a.item<float>();
float weight_scale = scale_b.item<float>();
float output_scale = 1.0f;
float output_scale = float(1.0);
if (scale_result.has_value() &&
(*out_dtype == ScalarType::Float8_e4m3fn ||
*out_dtype == ScalarType::Float8_e5m2)) {

View File

@ -530,7 +530,7 @@ static Tensor get_mkldnn_serialized_md(const Tensor& self) {
#else
TORCH_CHECK(false, "Unexpected IDeep version to do weight serialization.");
#endif
Tensor serialized_md = at::from_blob((void*)serialized_wei_desc.data(), {static_cast<int64_t>(serialized_wei_desc.size())}, at::TensorOptions(at::kByte));
Tensor serialized_md = at::from_blob((void*)serialized_wei_desc.data(), {(int64_t)serialized_wei_desc.size()}, at::TensorOptions(at::kByte));
auto res = at::empty_like(serialized_md);
// serialized_md shares the buffer with serialized_wei_desc,
// which will be released outside of this function thus invalidating the buffer of serialized_md.

View File

@ -576,14 +576,14 @@ static void _mkldnn_gemm_i8i8i32_with_blas(
n,
k,
alpha,
static_cast<int8_t*>(self.data_ptr()),
(int8_t*)self.data_ptr(),
lda,
ao,
static_cast<int8_t*>(mat2.data_ptr()),
(int8_t*)mat2.data_ptr(),
ldb,
bo,
beta,
static_cast<int32_t*>(result.data_ptr()),
(int32_t*)result.data_ptr(),
ldc,
&co);
}

View File

@ -41,7 +41,7 @@ void woq_matmul_int4_impl(
dst_usr_dims;
dnnl::memory::dims m1_usr_strides, m2_usr_strides, scale_usr_strides,
zp_usr_strides, dst_usr_strides;
int compressed_k = k / 8;
int compressed_k = (int)(k / 8);
int num_groups = (int)(k / group_size);
m1_usr_dims = {m, k};
m1_usr_strides = {m1.stride(0), m1.stride(1)};

View File

@ -10,7 +10,6 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/aminmax.h>
#include <ATen/ops/avg_pool2d.h>
#include <ATen/ops/avg_pool2d_backward.h>
#include <ATen/ops/avg_pool2d_backward_native.h>
@ -545,9 +544,8 @@ static void max_unpool_out_mps_template(const Tensor& input,
if (indices.defined() && indices.numel() > 0) {
auto output_image_size = c10::multiply_integers(output_size_);
auto [min_idx_tensor, max_idx_tensor] = indices.aminmax();
int64_t min_idx = min_idx_tensor.item<int64_t>();
int64_t max_idx = max_idx_tensor.item<int64_t>();
int64_t min_idx = indices.min().item<int64_t>();
int64_t max_idx = indices.max().item<int64_t>();
if (min_idx < 0 || max_idx >= output_image_size) {
int64_t error_idx = (min_idx < 0) ? min_idx : max_idx;

View File

@ -83,31 +83,6 @@ std::string get_type_str<int32_t>() {
return "int32_t";
}
// If all tensors are contiguous with the same dtype and the cat dimension is 0,
// then we can simply copy each tensor's underlying buffer contiguously into the
// output.
static void cat_out_mps_contiguous_impl(const ITensorListRef& inputs, const Tensor& output) {
MPSStream* stream = getCurrentMPSStream();
id<MTLBuffer> output_buffer = getMTLBufferStorage(output);
size_t output_offset = output.storage_offset() * output.itemsize();
for (const Tensor& input : inputs) {
if (cat_should_skip_tensor(input)) {
continue;
}
id<MTLBuffer> input_buffer = getMTLBufferStorage(input);
size_t input_offset = input.storage_offset() * input.itemsize();
auto nbytes = input.nbytes();
auto profile_id =
getMPSProfiler().beginProfileCopy(input_buffer, output_buffer, input, output, nbytes, /*non_blocking=*/true);
stream->copy(input_buffer, output_buffer, nbytes, input_offset, output_offset, profile_id, SyncType::NONE);
output_offset += nbytes;
}
}
// NOTE: `output` is expected to already have the correct size.
template <typename idx_type_t>
static void cat_out_mps_impl(const ITensorListRef& inputs, int64_t dimension, const Tensor& output) {
@ -130,7 +105,7 @@ static void cat_out_mps_impl(const ITensorListRef& inputs, int64_t dimension, co
// copy all the input tensor data into a packed buffer, which would not be
// ideal.
for (const Tensor& input : inputs) {
if (cat_should_skip_tensor(input)) {
if (input.numel() == 0) {
continue;
}
@ -268,16 +243,101 @@ TORCH_IMPL_FUNC(cat_out_mps)
if (out.numel() == 0) {
return;
}
auto materialized_inputs = inputs.materialize();
bool has_large_tensor =
isTooLargeForMPSGraph(out) || std::any_of(materialized_inputs.begin(), materialized_inputs.end(), [](auto& t) {
return !cat_should_skip_tensor(t) && isTooLargeForMPSGraph(t);
});
auto out_dtype = at::native::result_type(inputs);
if (all_contiguous && all_same_dtype && (memory_format == MemoryFormat::Contiguous) && (dimension == 0)) {
return mps::cat_out_mps_contiguous_impl(materialized_inputs, out);
} else if (has_large_tensor) {
int idx = 0;
for (const Tensor& t : materialized_inputs) {
TORCH_CHECK(t.dim() > 0, "zero-dimensional tensor (at position ", idx, ") cannot be concatenated");
auto lap = at::get_overlap_status(out, t);
TORCH_CHECK(lap != at::MemOverlapStatus::Partial && lap != at::MemOverlapStatus::Full,
"torch.cat(): unsupported operation: the input tensors cannot refer to any "
"of the output memory locations. Found overlap in input tensor ",
idx);
idx++;
}
// Check for type promotion
TORCH_CHECK(canCast(out_dtype, out.scalar_type()),
"torch.cat(): input types can't be cast to the desired output type ",
out.scalar_type());
TORCH_CHECK(!inputs.empty(), "torch.cat(): invalid number of inputs ", inputs.size());
dimension = legacy_cat_wrap_dim(dimension, materialized_inputs);
TORCH_CHECK(dimension >= 0, "torch.cat(): invalid dimension ", dimension);
// previously, size [0] tensors were the only possible empty tensors; thus, it
// wasn't possible to cat empty tensors unless all the other tensors were
// 1-dimensional, so we allowed these tensors to be "skipped". We maintain
// this behavior for backwards compatibility, but only for this specific size
// (i.e. other empty sizes are not skipped).
// FIXME: warn if this is the case
auto should_skip = [](const Tensor& t) { return t.dim() == 1 && t.size(0) == 0; };
at::assert_no_internal_overlap(out);
Tensor notSkippedTensor;
// Indices of tensors to be skipped because they're empty
std::vector<int64_t> skipped_tensor_indices;
// Tensors to be read
std::vector<Tensor> input_tensors;
int tensor_idx = 0;
for (const Tensor& t : materialized_inputs) {
if (t.numel() == 0 || should_skip(t)) {
skipped_tensor_indices.push_back(tensor_idx);
tensor_idx++;
continue;
}
input_tensors.push_back(t);
// TODO: Is this OK?
notSkippedTensor = t;
tensor_idx++;
}
// If all inputs are empty tensors, return an empty tensor
if (!notSkippedTensor.defined()) {
return;
}
for (const Tensor& t : inputs) {
TORCH_CHECK(t.device() == notSkippedTensor.device(),
"torch.cat(): all input tensors must be on the same device. Received ",
t.device(),
" and ",
notSkippedTensor.device());
}
TORCH_CHECK(out.device() == notSkippedTensor.device(),
"torch.cat(): all input tensors and out must be on the same device, but inputs are on ",
notSkippedTensor.device(),
" and out is on ",
out.device());
std::vector<int64_t> size(notSkippedTensor.sizes().vec());
// Compute size of the result in the cat dimension
int64_t cat_dim_size = 0;
idx = 0;
bool has_large_tensor = false;
for (const Tensor& tensor : materialized_inputs) {
if (isTooLargeForMPSGraph(tensor)) {
has_large_tensor |= true;
}
if (!should_skip(tensor)) {
// TODO: Factor out `check_shape_except_dim`
check_shape_except_dim(notSkippedTensor, tensor, dimension, idx);
cat_dim_size += tensor.size(dimension);
idx++;
}
}
// Compute the size of the result
size[dimension] = cat_dim_size;
// skip resizing if size of result is same as expected
if (out.sizes() != size) {
out.resize_(size, MemoryFormat::Contiguous);
}
if (out.numel() == 0) {
return;
}
has_large_tensor |= isTooLargeForMPSGraph(out);
if (has_large_tensor) {
return mps::cat_out_mps_impl<int64_t>(materialized_inputs, dimension, out);
} else {
return mps::cat_out_mps_impl<int32_t>(materialized_inputs, dimension, out);

View File

@ -2602,16 +2602,12 @@
device_check: NoCheck # TensorIterator
structured_delegate: exp.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: exp_sparse
tags: [core, pointwise]
- func: exp_(Tensor(a!) self) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured_delegate: exp.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: exp_sparse_
tags: pointwise
- func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -2620,7 +2616,6 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: exp_out
SparseCPU, SparseCUDA, SparseMPS: exp_sparse_out
tags: pointwise
- func: exp2(Tensor self) -> Tensor

View File

@ -65,7 +65,7 @@ void quantize_vec(
(typename T::underlying*)dst,
count,
fbgemm::TensorQuantizationParams{
static_cast<float>(scale), static_cast<int32_t>(zero_point), precision});
(float)scale, (int32_t)zero_point, precision});
}
#if defined(__ARM_NEON__) || defined(__aarch64__)

View File

@ -40,7 +40,7 @@ inline int start_index(int out_idx, int out_len, int in_len) {
* This function computes the start index on input matrix.
*/
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
return static_cast<int>(std::floor(static_cast<float>(out_idx * in_len) / out_len));
return (int)std::floor((float)(out_idx * in_len) / out_len);
}
inline int end_index(int out_idx, int out_len, int in_len) {
@ -49,7 +49,7 @@ inline int end_index(int out_idx, int out_len, int in_len) {
* This function computes the end index on input matrix.
*/
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
return static_cast<int>(std::ceil(static_cast<float>((out_idx + 1) * in_len) / out_len));
return (int)std::ceil((float)((out_idx + 1) * in_len) / out_len);
}
// adaptive avg pool for 2D and 3D inputs

View File

@ -71,8 +71,8 @@ void avg_pool2d_out_frame(
int64_t hend = std::min(hstart + kH, inputHeight + padH);
int64_t wend = std::min(wstart + kW, inputWidth + padW);
int64_t pool_size = (hend - hstart) * (wend - wstart);
hstart = std::max(hstart, static_cast<int64_t>(0));
wstart = std::max(wstart, static_cast<int64_t>(0));
hstart = std::max(hstart, (int64_t)0);
wstart = std::max(wstart, (int64_t)0);
hend = std::min(hend, inputHeight);
wend = std::min(wend, inputWidth);

View File

@ -646,7 +646,7 @@ class QConvPackWeightInt8 final {
torch::List<int64_t> output_padding;
output_padding.reserve(kSpatialDim);
for ([[maybe_unused]] const auto idx : c10::irange(kSpatialDim)) {
output_padding.push_back(0);
output_padding.push_back((int64_t)0);
}
return _run(weight, bias, stride, padding, output_padding, dilation, groups,
/*transpose=*/false);

View File

@ -301,10 +301,6 @@ def define_qnnpack(third_party, labels = []):
"-DQNNP_PRIVATE=",
"-DQNNP_INTERNAL=",
],
fbobjc_compiler_flags = [
"-Wno-switch-enum",
"-Wno-switch-default",
],
labels = [
"supermodule:android/default/pytorch",
"supermodule:ios/default/public.pytorch",

View File

@ -134,7 +134,7 @@ class QConvPackWeightInt8Cudnn final {
torch::List<int64_t> output_padding;
output_padding.reserve(kSpatialDim);
for ([[maybe_unused]] const auto idx : c10::irange(kSpatialDim)) {
output_padding.push_back(0);
output_padding.push_back((int64_t)0);
}
return _run(weight, bias, stride, padding, output_padding, dilation, groups,
/*transpose=*/false);

View File

@ -26,8 +26,6 @@
#include <ATen/ops/erf_native.h>
#include <ATen/ops/erfinv.h>
#include <ATen/ops/erfinv_native.h>
#include <ATen/ops/exp.h>
#include <ATen/ops/exp_native.h>
#include <ATen/ops/expm1.h>
#include <ATen/ops/expm1_native.h>
#include <ATen/ops/floor.h>
@ -177,7 +175,6 @@ COALESCED_UNARY_UFUNC(atanh)
COALESCED_UNARY_UFUNC(ceil)
COALESCED_UNARY_UFUNC(deg2rad)
COALESCED_UNARY_UFUNC(erf)
COALESCED_UNARY_UFUNC(exp)
COALESCED_UNARY_UFUNC(erfinv)
COALESCED_UNARY_UFUNC(expm1)
COALESCED_UNARY_UFUNC(floor)

View File

@ -16,8 +16,8 @@ void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr) {
"cusparseXcoo2csr only supports m, nnz with the bound [val] <= ",
INT_MAX);
int i_nnz = static_cast<int>(nnz);
int i_m = static_cast<int>(m);
int i_nnz = (int)nnz;
int i_m = (int)m;
auto handle = at::cuda::getCurrentCUDASparseHandle();
TORCH_CUDASPARSE_CHECK(cusparseXcoo2csr(handle, coorowind, i_nnz, i_m, csrrowptr, CUSPARSE_INDEX_BASE_ZERO));
@ -202,7 +202,7 @@ void CreateIdentityPermutation(int64_t nnz, int *P) {
TORCH_CHECK((nnz <= INT_MAX),
"Xcsrsort_bufferSizeExt only supports m, n, nnz with the bound [val] <= ",
INT_MAX);
int i_nnz = static_cast<int>(nnz);
int i_nnz = (int)nnz;
auto handle = at::cuda::getCurrentCUDASparseHandle();
cusparseCreateIdentityPermutation(handle, i_nnz, P);
@ -213,9 +213,9 @@ void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRow
TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX),
"Xcsrsort_bufferSizeExt only supports m, n, nnz with the bound [val] <=",
INT_MAX);
int i_m = static_cast<int>(m);
int i_n = static_cast<int>(n);
int i_nnz = static_cast<int>(nnz);
int i_m = (int)m;
int i_n = (int)n;
int i_nnz = (int)nnz;
auto handle = at::cuda::getCurrentCUDASparseHandle();
TORCH_CUDASPARSE_CHECK(cusparseXcsrsort_bufferSizeExt(handle, i_m, i_n, i_nnz, csrRowPtr, csrColInd, pBufferSizeInBytes));
@ -226,9 +226,9 @@ void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrC
TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX),
"Xcsrsort only supports m, n, nnz with the bound [val] <= ",
INT_MAX);
int i_m = static_cast<int>(m);
int i_n = static_cast<int>(n);
int i_nnz = static_cast<int>(nnz);
int i_m = (int)m;
int i_n = (int)n;
int i_nnz = (int)nnz;
auto handle = at::cuda::getCurrentCUDASparseHandle();
cusparseMatDescr_t desc;
@ -242,9 +242,9 @@ void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRow
TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX),
"Xcoosort_bufferSizeExt only supports m, n, nnz with the bound [val] <= ",
INT_MAX);
int i_m = static_cast<int>(m);
int i_n = static_cast<int>(n);
int i_nnz = static_cast<int>(nnz);
int i_m = (int)m;
int i_n = (int)n;
int i_nnz = (int)nnz;
auto handle = at::cuda::getCurrentCUDASparseHandle();
TORCH_CUDASPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, i_m, i_n, i_nnz, cooRows, cooCols, pBufferSizeInBytes));
@ -255,9 +255,9 @@ void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols
TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX),
"XcoosortByRow only supports m, n, nnz with the bound [val] <= ",
INT_MAX);
int i_m = static_cast<int>(m);
int i_n = static_cast<int>(n);
int i_nnz = static_cast<int>(nnz);
int i_m = (int)m;
int i_n = (int)n;
int i_nnz = (int)nnz;
auto handle = at::cuda::getCurrentCUDASparseHandle();
TORCH_CUDASPARSE_CHECK(cusparseXcoosortByRow(handle, i_m, i_n, i_nnz, cooRows, cooCols, P, pBuffer));

View File

@ -155,7 +155,7 @@ void set_params_fprop(Flash_fwd_params &params,
// [Minor] We want to round down since when we do the comparison we use <= instead of <
// params.p_dropout_in_uint = uint32_t(std::floor(params.p_dropout * 4294967295.0));
// params.p_dropout_in_uint16_t = uint16_t(std::floor(params.p_dropout * 65535.0));
params.p_dropout_in_uint8_t = static_cast<uint8_t>(std::floor(params.p_dropout * 255.0));
params.p_dropout_in_uint8_t = uint8_t(std::floor(params.p_dropout * 255.0));
params.rp_dropout = 1.f / params.p_dropout;
params.scale_softmax_rp_dropout = params.rp_dropout * params.scale_softmax;
TORCH_CHECK(p_dropout < 1.f);
@ -307,7 +307,7 @@ inline int num_splits_heuristic(int batch_nheads_mblocks, int num_SMs, int num_n
if (!is_split_eligible(num_splits)) {
efficiency.push_back(0.f);
} else {
float n_waves = static_cast<float>(batch_nheads_mblocks * num_splits) / num_SMs;
float n_waves = float(batch_nheads_mblocks * num_splits) / num_SMs;
float eff = n_waves / ceil(n_waves);
// printf("num_splits = %d, eff = %f\n", num_splits, eff);
if (eff > max_efficiency) { max_efficiency = eff; }

View File

@ -341,7 +341,7 @@ inline bool check_grouped_query_attention(sdp_params const& params, bool debug)
const auto v_num_heads = params.value.sym_size(-3);
const bool same_kv_heads = k_num_heads == v_num_heads;
if (requires_same_num_heads && !same_kv_heads){
if (requires_same_num_heads && !(same_kv_heads)){
if (debug) {
TORCH_WARN(
"Both fused kernels require key and value to have the same num_heads and batch_size but got: ",

View File

@ -43,9 +43,9 @@ bool available(
(kFloat == weight.scalar_type()) &&
// Bias
(bias_sizes_opt.has_value() ? ((1 == bias_sizes_opt->size()) &&
(transposed ? (weight.size(Layout::Filter::input) ==
((transposed ? (weight.size(Layout::Filter::input) ==
((*bias_sizes_opt)[0] / groups))
: (weight.size(Layout::Filter::output) == ((*bias_sizes_opt)[0]))))
: (weight.size(Layout::Filter::output) == ((*bias_sizes_opt)[0])))))
: true) &&
// Padding
(padding[Layout::Parameter::height] >= 0) &&
@ -133,10 +133,10 @@ const Tensor reorder_weights_for_transpose_conv(const Tensor& weight_nhwc,
int kernel_height = weight_nhwc.size(2);
int o_offset = 1;
int h_offset = output_channels_per_group;
int w_offset = output_channels_per_group*kernel_height;
int i_offset = output_channels_per_group*kernel_height*kernel_width;
int g_offset = output_channels_per_group*kernel_height*kernel_width*input_channels_per_group;
int h_offset = (output_channels_per_group);
int w_offset = (output_channels_per_group)*(kernel_height);
int i_offset = (output_channels_per_group)*(kernel_height)*(kernel_width);
int g_offset = (output_channels_per_group)*(kernel_height)*(kernel_width)*(input_channels_per_group);
Tensor reordered = mobile::empty_with_tail_padding(
weight_nhwc.sizes(),

View File

@ -28,7 +28,6 @@
#include <c10/util/OptionalArrayRef.h>
#include <c10/util/intrusive_ptr.h>
#include <c10/macros/Export.h>
#include <c10/macros/Macros.h>
#include <ATen/core/CheckMemoryFormat.h>
#include <ATen/core/DeprecatedTypePropertiesRegistry.h>
#include <ATen/core/DeprecatedTypeProperties.h>
@ -130,7 +129,6 @@ class TORCH_API Tensor: public TensorBase {
return *this;
}
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
switch (this->layout()) {
case at::kSparse:
case at::kSparseCsr:
@ -141,7 +139,6 @@ class TORCH_API Tensor: public TensorBase {
default:
return this->_conj();
}
C10_DIAGNOSTIC_POP()
}
// Aliased by Dimname overloads, so need explicit using

View File

@ -56,20 +56,6 @@ def list_benchmarks():
print(f"Available benchmarks: {list(BENCHMARK_REGISTRY.keys())}")
def _run_benchmark(
benchmark_cls,
script_args,
):
benchmark = benchmark_cls(script_args)
benchmark.benchmark()
benchmark.report_geomean_speedup()
if script_args.print_benchmark_result:
print(f"Benchmarking results {benchmark.name}:")
print(benchmark.profiling_results)
if script_args.visualize:
benchmark.visualize()
def run_benchmark(
benchmark_name: str,
script_args,
@ -85,7 +71,10 @@ def run_benchmark(
print("=" * 60)
benchmark_class = BENCHMARK_REGISTRY[benchmark_name]
_run_benchmark(benchmark_class, script_args)
benchmark = benchmark_class(script_args)
benchmark.benchmark()
if script_args.visualize:
benchmark.visualize()
return True
@ -98,7 +87,10 @@ def run_all_benchmarks(script_args):
for name, cls in BENCHMARK_REGISTRY.items():
print(f"\n{'=' * 20} {name.upper()} {'=' * 20}")
_run_benchmark(cls, script_args)
benchmark = cls(script_args)
benchmark.benchmark()
if script_args.visualize:
benchmark.visualize()
print()
@ -157,43 +149,8 @@ Examples:
help="Whether to exit with an error message for accuracy failure",
)
parser.add_argument(
"--print-benchmark-result",
action="store_true",
help="Whether to print the raw benchmarking result. Easier to quickly check the benchmark results on a server without GUI",
)
parser.add_argument(
"--custom-compile-name",
type=str,
default=None,
help="Name for the curve with customized compilation options",
)
parser.add_argument(
"--custom-compile-options",
type=str,
default=None,
help="Json string for the custom compile options.",
)
args = parser.parse_args()
if args.custom_compile_options:
import json
try:
args.custom_compile_options = json.loads(args.custom_compile_options)
except json.decoder.JSONDecodeError as e:
raise RuntimeError(
f"Invalid json string for --custom-compile-options: {args.custom_compile_options}"
) from e
if not args.custom_compile_options:
raise RuntimeError("Found no options for --custom-compile-options")
if not args.custom_compile_name:
raise RuntimeError("Missing label name for the custom compilation")
# Handle list option
if args.list:
list_benchmarks()

View File

@ -8,15 +8,6 @@ import torch
import torch.nn.functional as F
# more important shapes used by internal models
extra_shapes_for_norm = (
(1152 * 500, 384),
(1152 * 500, 512),
(1152 * 1000, 384),
(1152 * 1000, 512),
)
class CrossEntropyForward(BenchmarkKernel):
def __init__(self, script_args):
super().__init__(script_args)
@ -355,7 +346,7 @@ class RMSNormForward(BenchmarkKernel):
(32768, 65536),
(16384, 131072),
(8192, 262144),
) + extra_shapes_for_norm
)
def get_memory_bytes(self, args, kwargs) -> int:
x, w = args
@ -447,7 +438,8 @@ class RMSNormBackward(BenchmarkKernel):
(32768, 4096),
(32768, 8192),
(32768, 16384),
) + extra_shapes_for_norm
(32768, 32768),
)
def get_memory_bytes(self, args, kwargs) -> int:
x, w, dy = args
@ -561,7 +553,7 @@ class LayerNormForward(BenchmarkKernel):
(32768, 16384),
(32768, 32768),
(32768, 65536),
) + extra_shapes_for_norm
)
def get_memory_bytes(self, args, kwargs) -> int:
x, w = args
@ -635,7 +627,7 @@ class LayerNormBackward(BenchmarkKernel):
(32768, 16384),
(32768, 32768),
(32768, 65536),
) + extra_shapes_for_norm
)
def get_memory_bytes(self, args, kwargs) -> int:
x, w, dy = args

View File

@ -6,7 +6,6 @@ from dataclasses import dataclass
from typing import Any, Optional
import matplotlib.pyplot as plt
from scipy.stats import gmean
import torch
from torch._inductor.runtime.benchmarking import benchmarker
@ -108,18 +107,6 @@ class BenchmarkKernel:
for backend in self.available_backends:
args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
res[backend] = getattr(self, backend)(args_ref, kwargs_ref)()
if (
"compiled" in self.available_backends
and self.script_args.custom_compile_options
):
torch._dynamo.reset() # cause recompile
with torch._inductor.config.patch(self.script_args.custom_compile_options):
args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
res[self.script_args.custom_compile_name] = self.compiled(
args_ref, kwargs_ref
)()
gold = res["eager"]
tol = {}
@ -128,7 +115,7 @@ class BenchmarkKernel:
"atol": self.script_args.tolerance,
"rtol": self.script_args.tolerance,
}
for backend in res:
for backend in self.available_backends:
if backend == "eager":
continue
try:
@ -147,83 +134,37 @@ class BenchmarkKernel:
print("Exit right away since --exit-on-accuracy-failure is set")
sys.exit(1)
def benchmark_single_shape_for_backend(
self, backend, args, kwargs, setting, fn=None
) -> bool:
if fn is None:
fn = getattr(self, backend)
args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
try:
avg_time = benchmark_kernel_in_milliseconds(fn(args_ref, kwargs_ref))
except Exception as e:
print(
f"Failed to run {backend} backend on {self.name} kernel for {setting} due to {e}"
)
self.available_backends.remove(backend) # noqa: B909
return False
mem_bytes = self.get_memory_bytes(args_ref, kwargs_ref)
perf = Performance(setting, avg_time, mem_bytes)
print(f"{self.name} kernel on {backend} backend. {perf}")
self.profiling_results[backend].append(perf)
return True
def benchmark_single_shape(
self, args, kwargs=None, should_check_accuracy=True, setting: str = ""
):
for backend in self.available_backends:
self.benchmark_single_shape_for_backend(backend, args, kwargs, setting)
if (
"compiled" in self.available_backends
and self.script_args.custom_compile_options
):
torch._dynamo.reset() # cause recompile
with torch._inductor.config.patch(self.script_args.custom_compile_options):
status = self.benchmark_single_shape_for_backend(
self.script_args.custom_compile_name,
args,
kwargs,
setting,
fn=self.compiled,
args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
try:
avg_time = benchmark_kernel_in_milliseconds(
getattr(self, backend)(args_ref, kwargs_ref)
)
if not status:
self.script_args.custom_compile_options = (
None # once fail, don't run again
except Exception as e:
print(
f"Failed to run {backend} backend on {self.name} kernel for {setting} due to {e}"
)
self.available_backends.remove(backend) # noqa: B909
continue
mem_bytes = self.get_memory_bytes(args_ref, kwargs_ref)
perf = Performance(setting, avg_time, mem_bytes)
print(f"{self.name} kernel on {backend} backend. {perf}")
self.profiling_results[backend].append(perf)
if should_check_accuracy:
self.check_accuracy(args, kwargs)
def visualize(self) -> None:
device_name = torch.cuda.get_device_name(0)
visualize_comparison(
self.profiling_results,
title=f"{self.name} ({device_name})",
title=f"{self.name}",
output_path=f"{self.name}_bench",
)
return
def report_geomean_speedup(self) -> None:
print(f"Geomean speedup for benchmark {self.name}")
eager_result = {
result.setting: result for result in self.profiling_results["eager"]
}
print(f" eager {len(eager_result)} data points")
for backend, backend_result in self.profiling_results.items():
if backend == "eager":
continue
speeduplist = []
for result in backend_result:
eager_latency = eager_result[result.setting].latency
backend_latency = result.latency
speeduplist.append(
eager_latency / backend_latency if backend_latency != 0 else 0.0
)
if len(speeduplist) > 0:
print(
f" {backend} {len(speeduplist)} data points, {gmean(speeduplist):.2f}x speedup"
)
def get_backend_colors() -> dict[str, str]:
"""Get consistent color scheme for different backends."""
@ -311,6 +252,5 @@ def visualize_comparison(
os.makedirs("pics", exist_ok=True)
full_path = os.path.join("pics", output_path + ".png")
plt.savefig(full_path, dpi=300, bbox_inches="tight", facecolor="white")
print(f"Chart saved to {full_path}")
plt.close()

View File

@ -74,8 +74,7 @@ REQUIRE_HIGHER_TOLERANCE = {
REQUIRE_HIGHER_TOLERANCE_AMP = {}
REQUIRE_EVEN_HIGHER_TOLERANCE = {
"deit_base_distilled_patch16_224",
"vit_base_patch16_siglip_256",
"beit_base_patch16_224",
}
# These models need higher tolerance in MaxAutotune mode
@ -355,9 +354,7 @@ class TimmRunner(BenchmarkRunner):
if is_training:
from torch._inductor import config as inductor_config
if name == "beit_base_patch16_224":
tolerance = 16 * 1e-2
elif name in REQUIRE_EVEN_HIGHER_TOLERANCE or (
if name in REQUIRE_EVEN_HIGHER_TOLERANCE or (
inductor_config.max_autotune
and name in REQUIRE_EVEN_HIGHER_TOLERANCE_MAX_AUTOTUNE
):

View File

@ -3,13 +3,10 @@
#include <c10/core/DeviceType.h>
#include <c10/core/DispatchKey.h>
#include <c10/core/DispatchKeySet.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <stdexcept>
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
namespace c10 {
/**
@ -405,5 +402,3 @@ inline bool isSparseCsr(Backend b) {
}
} // namespace c10
C10_DIAGNOSTIC_POP()

View File

@ -15,8 +15,6 @@
#include <string>
#include <type_traits>
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
namespace c10 {
struct FunctionalityOffsetAndMask {
@ -968,5 +966,3 @@ using remove_DispatchKeySet_arg_from_func = guts::make_function_traits_t<
1>,
typename guts::infer_function_traits_t<FuncType>::parameter_types>>;
} // namespace c10
C10_DIAGNOSTIC_POP()

Some files were not shown because too many files have changed in this diff Show More