Mirror of https://github.com/pytorch/pytorch.git
Synced 2025-10-23 14:59:34 +08:00

Compare commits: cpp-docs-d ... ciflow/ind
77 Commits
Commits:
f7a13a6dfc, 98aff4e90e, 6cdf27661c, ecd4542830, d0b7578e17, e22a5ecb45, 9c8b449159, 613b0adb13,
12534b44b3, 831ad1f70b, 1eed70b417, ef230fcd2d, c283224b77, 9bd3d28afa, 1c2ad5fe9d, 5c57109190,
e3b463d216, e6bab78b38, b9f7dd6e77, 2b6c6e3d64, e289d12b73, bb2c89fc3b, d530a21122, 27fb875a70,
a5ecc01ef8, f7b574c862, 5d6924f0a4, cf7a543013, 55043b3ada, 7b61d461ab, 0ff1ddecad, 0f9b4aae53,
d6886407ef, c37103c16d, d7ecbf7243, 8080febac5, 408ab373f8, 23b1bbb810, 8268316590, a56e4e5fc7,
dac4caf77b, 1710c27341, 92fde5bdc5, eafc6437d6, 4a90ec1387, d9ab4e3ade, ce372a06e9, 67f0059726,
55473096e2, 2a15541b5d, 16875b228b, 808af988ae, 61c298ed56, 2d38369728, 74bd12415e, a8c0c0263c,
e3f14cdafa, b03cc2d9c8, 99917e659b, def4476b6b, 84bb803719, 8266849bda, f766c8ceea, b3cf7bc86d,
6a4a8b453d, cab14368a7, 0b87f606ca, 25f68b6b5b, 334489bfd0, ada5d90c25, a1e3e2026b, 8dc4932b0c,
1505d7a461, 1c842a9686, 692827ad29, 786d4646ee, 70df0aed59
@@ -1202,12 +1202,6 @@ exclude_patterns = [
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'torch/utils/model_dump/preact.mjs',
# These files are all grandfathered in, feel free to remove from this list
# as necessary
# NOTE: remove the patterns in the order they are listed
'aten/src/ATen/native/[a-pA-P]*/**',
'aten/src/ATen/[a-mA-M]*/**',
'test/**',
]
init_command = [
'python3',

@@ -94,11 +94,11 @@ struct PinnedReserveSegment {
struct TORCH_API HostStats {
// COUNT: total allocations (active)
Stat active_requests;
// SUM: bytes allocated/reserved by this memory alocator. (active)
// SUM: bytes allocated/reserved by this memory allocator. (active)
Stat active_bytes;
// COUNT: total allocations (active + free)
Stat allocations;
// SUM: bytes allocated/reserved by this memory alocator. This accounts
// SUM: bytes allocated/reserved by this memory allocator. This accounts
// for both free and in-use blocks.
Stat allocated_bytes;

@@ -127,7 +127,7 @@ struct alignas(64) HostStatsStaged {
// COUNT: total allocations (active + free)
// LOCK: access to this stat is protected by the allocator's blocks_mutex_
Stat allocations;
// SUM: bytes allocated/reserved by this memory alocator. This accounts
// SUM: bytes allocated/reserved by this memory allocator. This accounts
// for both free and in-use blocks.
Stat allocated_bytes;
// COUNT: number of allocations per bucket (active)

@@ -455,7 +455,7 @@ struct CachingHostAllocatorImpl {
}

void resetAccumulatedStats() {
// Reseting accumulated memory stats requires concurrently holding both the
// Resetting accumulated memory stats requires concurrently holding both the
// free list mutexes and the blocks mutex. Previously, this was only done in
// empty_cache function.
for (size_t i = 0; i < free_list_.size(); ++i) {

@@ -482,7 +482,7 @@ struct CachingHostAllocatorImpl {
}

void resetPeakStats() {
// Reseting peak memory stats requires concurrently holding both the
// Resetting peak memory stats requires concurrently holding both the
// free list mutexes and the blocks mutex. Previously, this was only done in
// empty_cache function.
for (size_t i = 0; i < free_list_.size(); ++i) {
@@ -3,7 +3,7 @@

namespace at {

// Re-declaring 'DimVector' type and size inside 'at' namespace.
// Redeclaring 'DimVector' type and size inside 'at' namespace.
// This is done to avoid modifying every use into their 'c10'
// equivalent.

@@ -16,7 +16,7 @@ _GeneratorRegister::_GeneratorRegister(const GeneratorFuncType& func) {

TORCH_WARN_DEPRECATION(
"REGISTER_GENERATOR_PRIVATEUSE1 is deprecated. \
Please derive PrivateUse1HooksInterface to implememt getNewGenerator instead.")
Please derive PrivateUse1HooksInterface to implement getNewGenerator instead.")

TORCH_CHECK(
!GetGeneratorPrivate().has_value(),

@@ -149,7 +149,7 @@
* First, keep in mind that we assume that boxed containers will
* have to deal with `IValue` (e.g. `c10::List`). In this context,
* what may be happening is that `IValue` doesn't store internally
* your type `T`. Instead, it constructs a type new `T` everytime
* your type `T`. Instead, it constructs a type new `T` every time
* you try to get `T` for it (see `IListRef<at::OptinalTensorRef>`).
*/

@@ -186,7 +186,7 @@ class IListRef;
* This macro is useful because it allows us to handle different
* types (that correspond to different tags) to be implemented
* only once. We can do it even when the implementation of the
* different tags aren't syntatically the same, by dispatching
* different tags aren't syntactically the same, by dispatching
* it to a function (e.g. `ImplT::<dispatch-function>(this_)`).
*/
#define TORCH_ILISTREF_UNWRAP(TAG, BODY) \

@@ -42,7 +42,7 @@ class IListRefTagImplBase<IListRefTag::Unboxed, T, ListElemT> {
/*
* We have these function (besides the `unwrap`s above) because the
* implementation for both `IListRef::operator[]` and `IListRefIterator::operator*`
* weren't syntatically equal for the existing tags at the time
* weren't syntactically equal for the existing tags at the time
* (`Unboxed` and `Boxed`).
*/
static IListRefConstRef<T> front(const list_type& lst) {

@@ -12,7 +12,7 @@ namespace at {
// in order. This is most commonly used in autogenerated code,
// where it is convenient to have a function that can uniformly
// take arguments of different types. If your arguments
// are homogenous consider using a std::initializer_list instead.
// are homogeneous consider using a std::initializer_list instead.
//
// For examples of this in use, see torch/csrc/utils/variadic.h
template <typename F>

@@ -148,7 +148,7 @@ struct TORCH_API ClassType : public NamedType {

void checkNotExist(const std::string& name, const std::string& what) const;

// Attributes are stored in a specific slot at runtime for effiency.
// Attributes are stored in a specific slot at runtime for efficiency.
// When emitting instructions we specify the slot so that attribute access is
// a constant lookup
std::optional<size_t> findAttributeSlot(const std::string& name) const {

@@ -412,7 +412,7 @@ struct TORCH_API ClassType : public NamedType {
// Holds method attributes
std::weak_ptr<CompilationUnit> compilation_unit_;

// Holds all atrributes, attribute details are found on ClassAttribute
// Holds all attributes, attribute details are found on ClassAttribute
std::vector<ClassAttribute> attributes_;
// Construct mirroring attributes_, only around due to the fact that `containedTypes()` method returns an ArrayRef.
// Never fill this without using the appropriate provideNewClassAttribute method
@@ -111,7 +111,7 @@ void Dispatcher::waitForDef(const FunctionSchema& schema) {
TORCH_INTERNAL_ASSERT(r,
"Expected main interpreter to define ", schema.operator_name(),
", but this didn't happen within timeout. Are you trying to load "
"different models in the same torchdeploy/multipy instance? You "
"different models in the same torchdeploy/multipy instance? You " // codespell:ignore
"must warmup each interpreter identically, e.g., import all "
"the same dependencies.");
}

@@ -129,7 +129,7 @@ void Dispatcher::waitForImpl(const OperatorName& op_name, std::optional<c10::Dis
TORCH_INTERNAL_ASSERT(r,
"Expected main interpreter to implement ", dk, " for ", op_name,
", but this didn't happen within timeout. Are you trying to load "
"different models in the same torchdeploy/multipy instance? You "
"different models in the same torchdeploy/multipy instance? You " // codespell:ignore
"must warmup each interpreter identically, e.g., import all "
"the same dependencies.");
}

@@ -531,7 +531,7 @@ int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchK

// Note: this records a sequence number for both Autograd keys, and for
// non-Autograd keys where the dispatchKeySet still contains an autograd key.
// This means that we might collect the same sequence nubmer two different
// This means that we might collect the same sequence number two different
// events if they all occurred above Autograd and still had the Autograd
// dispatch key in the dispatch key set.
// However, this usually doesn't happen: normally the first call will

@@ -222,7 +222,8 @@ class TORCH_API Dispatcher final {
return backendFallbackKernels_[dispatch_ix].kernel.isValid();
}

// Used by torchdeploy/multipy for multiple interpreters racing.
// Used by torchdeploy/multipy for multiple // codespell:ignore: multipy
// interpreters racing.
void waitForDef(const FunctionSchema& schema);
void waitForImpl(
const OperatorName& op_name,

@@ -414,7 +415,7 @@ class TORCH_API Dispatcher final {
std::unique_ptr<detail::RegistrationListenerList> listeners_;

// This condition variable gets notified whenever we add a new def/impl to the
// dispatch table. This is primarily used by multipy/torchdeploy, when
// dispatch table. This is primarily used by multiply/torchdeploy, when
// we have multiple interpreters trying to register to the dispatch table.
// In this situation, whenever the non-primary interpreter would have tried
// to register to the dispatch table, instead it will check to see if the

@@ -585,7 +586,7 @@ class TORCH_API OperatorHandle {

// We need to store this iterator in order to make
// Dispatcher::cleanup() fast -- it runs a lot on program
// termination (and presuambly library unloading).
// termination (and presumably library unloading).
std::list<Dispatcher::OperatorDef>::iterator operatorIterator_;
};

@@ -261,7 +261,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
//
// There are 2 cases
// 1. something like 'aten::items.str(Dict(str, t) self) -> ((str, t)[])'.
// without the extra parenthesis, the c++ schem parser can not parse it.
// without the extra parenthesis, the c++ scheme parser can not parse it.
// 2. something like '-> ((str, str))'. Need extra parenthesis so the return
// type is a single tuple rather than two strings.
// PR (https://github.com/pytorch/pytorch/pull/23204) has more context about
@@ -1176,7 +1176,7 @@ struct TORCH_API IValue final {
using HashIdentityIValueMap =
std::unordered_map<IValue, IValue, HashIdentityIValue, CompIdentityIValues>;

// Chechs if this and rhs has a subvalues in common.
// Checks if this and rhs has a subvalues in common.
// [t1,t2] and [t2, t3] returns true.
bool overlaps(const IValue& rhs) const;

@@ -990,7 +990,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
std::unique_lock<std::mutex> lock(mutex_);
if (completed_) {
// This should be rare and shouldn't cause log spew. Its important to
// log errors and thats why we have this log here.
// log errors and that's why we have this log here.
std::string msg = c10::str(
"Skipping setting following error on the Future since "
"it is already marked completed (this is not necessarily "

@@ -1501,7 +1501,7 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target {
// However, the CompilationUnit holds ownership of the type's graphs, so
// inserting a constant object into a Graph would create a reference cycle if
// that constant object held a shared_ptr to its CU. For these objects we
// instatiate them with non-owning references to its CU
// instantiate them with non-owning references to its CU
Object(WeakOrStrongTypePtr type, size_t numSlots) : type_(std::move(type)) {
slots_.resize(numSlots);
}

@@ -374,7 +374,7 @@ struct TORCH_API SymbolicShape {
// Unranked shape constructor.
SymbolicShape() : dims_(std::nullopt) {}

// Known rank but unknown dimentions.
// Known rank but unknown dimensions.
SymbolicShape(std::optional<size_t> rank) : dims_(std::nullopt) {
if(!rank) {
return;

@@ -891,10 +891,10 @@ struct TORCH_API ListType

// global singleton
// Given an inner type T and an identifier,
// this function wil return the global singleton type pointer
// this function will return the global singleton type pointer
// the type List<T>.
// The extra "identifier" argument is needed beccause we have multiple container types
// that all re-use this function (List<T>, array<T, N>, etc.)
// The extra "identifier" argument is needed because we have multiple container types
// that all reuse this function (List<T>, array<T, N>, etc.)
static TypePtr get(const std::string& identifier, TypePtr inner);

// common cast List[Tensor]

@@ -992,7 +992,7 @@ struct TORCH_API DictType : public SharedType {
// this function will return the global singleton type pointer
// the type List<T>.
// The extra "identifier" argument is needed because we have multiple container types
// that all re-use this function (Dict<K, V> and unordered_map<K, V>)
// that all reuse this function (Dict<K, V> and unordered_map<K, V>)
static TypePtr get(const std::string& identifier, TypePtr key, TypePtr val);

private:

@@ -21,7 +21,7 @@ namespace c10 {

namespace detail {
// The first argument of the schema might be of type DispatchKeySet, in which case we remove it.
// We do this because every argument in a function schema is expected to be convertable
// We do this because every argument in a function schema is expected to be convertible
// to an ivalue, but DispatchKeySet is not a type we want the jit to be aware of.
// See Note [Plumbing Keys Through The Dispatcher]
template<class KernelFunctor>
@@ -172,7 +172,7 @@ VaryingShape<Stride> TensorType::computeStrideProps(
// The logic below follows what TensorIterator uses in its logic:
// 1. Fast_set_up is the short-cut to identify a. channels_last and
// b. contiguous format, which is what we have in the below logic.
// 2. In more generla cases, it does best effort to preserve permutatoin.
// 2. In more general cases, it does best effort to preserve permutatoin.
if (is_channels_last_strides_2d(sizes, strides) || is_channels_last_strides_3d(sizes, strides)) {
// case 1.a. short cut channels last
std::iota(stride_indices.rbegin() + 1, stride_indices.rend() - 1, 2);

@@ -679,7 +679,7 @@ TORCH_API bool elementTypeCanBeInferredFromMembers(const TypePtr& elem_type) {
return false;
}
if (elem_type->kind() == AnyType::Kind) {
// List of Any can contains heterogenous types
// List of Any can contains heterogeneous types
return false;
}
return true;

@@ -234,7 +234,7 @@ class Vectorized<c10::Half> : public Vectorized16<
vshlq_u16(vandq_u16(is_zero_vec, vdupq_n_u16(1)), shift);
return vaddvq_u16(bits_vec);
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// use known working implmentation.
// use known working implementation.
__at_align__ value_type tmp[size()];
store(tmp);
int mask = 0;

@@ -1740,7 +1740,7 @@ Vectorized<int16_t> inline shift_256_16(

// Control masks for shuffle operation, treating 256 bits as an
// array of 16-bit elements, and considering pairs of neighboring
// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
// M!=N) is set so that shuffle will move element with index M from
// input pair into element with index N in output pair, and element
// with index M in output pair will be set to all 0s.

@@ -1875,7 +1875,7 @@ Vectorized<T> inline shift_256_8(

// Control masks for shuffle operation, treating 256 bits as an
// array of 8-bit elements, and considering quadruples of
// neighboring elements. Specifially, a mask named "ctl_M_N" (M,N
// neighboring elements. Specifically, a mask named "ctl_M_N" (M,N
// in [0,1,2,3], and M!=N) is set so that shuffle will move element
// with index M from input quadruple into element with index N in
// output quadruple, and other elements in output quadruple will be

@@ -143,7 +143,7 @@ class Vectorized<double> {
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256

return {
vec_sel(a._vec0, b._vec0, mask._vecb0),

@@ -142,7 +142,7 @@ class Vectorized<float> {
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
// assuming this we can use the same mask directly with vec_sel
return {
vec_sel(a._vec0, b._vec0, mask._vecb0),

@@ -202,7 +202,7 @@ class Vectorized<int16_t> {
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b,
const Vectorized<int16_t>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
// assuming this we can use the same mask directly with vec_sel
// warning intel style mask will not work properly
return {

@@ -155,7 +155,7 @@ class Vectorized<int32_t> {
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b,
const Vectorized<int32_t>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
// assuming this we can use the same mask directly with vec_sel
// warning intel style mask will not work properly
return {
@@ -119,7 +119,7 @@ class Vectorized<int64_t> {
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b,
const Vectorized<int64_t>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256

return {
vec_sel(a._vec0, b._vec0, mask._vecb0),

@@ -397,7 +397,7 @@ inline Vectorized<bool> operator&&(
const __m512i* other_ = reinterpret_cast<const __m512i*>(other.as_bytes());
__m512i out = _mm512_and_si512(*self_, *other_);
Vectorized<bool> ret;
// We do not have a constructer that takes __m512i, so we need to memcpy
// We do not have a constructor that takes __m512i, so we need to memcpy
std::memcpy(ret, &out, ret.size() * sizeof(bool));
return ret;
}

@@ -498,8 +498,8 @@ static inline Vectorized<T> binary_fp8_op_as_fp32(

// Refer to
// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353 FP8 +,
// -, *, /, planed to be deleted in the future and here is just to make compiler
// happy
// -, *, /, planned to be deleted in the future and here is just to make
// compiler happy
Vectorized<Float8_e4m3fn> inline operator+(
const Vectorized<Float8_e4m3fn>& a,
const Vectorized<Float8_e4m3fn>& b) {

@@ -585,8 +585,8 @@ class Vectorized<Float8_e5m2> : public Vectorizedf8<Float8_e5m2> {

// Refer to
// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353 FP8 +,
// -, *, /, planed to be deleted in the future and here is just to make compiler
// happy
// -, *, /, planned to be deleted in the future and here is just to make
// compiler happy
Vectorized<Float8_e5m2> inline operator+(
const Vectorized<Float8_e5m2>& a,
const Vectorized<Float8_e5m2>& b) {

@@ -1852,7 +1852,7 @@ Vectorized<T> inline shift_512_8(

// Control masks for shuffle operation, treating 512 bits as an
// array of 8-bit elements, and considering pairs of neighboring
// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
// M!=N) is set so that shuffle will move element with index M from
// input pair into element with index N in output pair, and element
// with index M in output pair will be set to all 0s.

@@ -1958,7 +1958,7 @@ void scaled_gemm(
ScalarType result_dtype,
bool use_fast_accum,
const std::optional<Tensor>& alpha) {
// Note: see `cublasCommonArgs` for various non-intuitive manupulations
// Note: see `cublasCommonArgs` for various non-intuitive manipulations
// of input arguments to this function.
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;

@@ -311,7 +311,7 @@ CUDAGraph::~CUDAGraph() {
// There are recent HIP changes where hipGraphExecDestroy doesn't immediately free memory.
// They wait for next sync point in order to free the memory, this is to ensure that all
// hipGraphLaunch are finished before we release any memory. This feature was enabled in rocm6.2.
// We need to ensure all async opreations finish before deleting the object.
// We need to ensure all async operations finish before deleting the object.
#if (defined(USE_ROCM) && ROCM_VERSION >= 60200)
if (capture_dev_ != UNDEFINED_DEVICE) // check if capture_dev_ contains the real device id
{

@@ -179,7 +179,7 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6
batch_offset * values_batch_stride * values.itemsize(),
index_type, // data type of row offsets index
index_type, // data type of col indices
CUSPARSE_INDEX_BASE_ZERO, // base index of row offset and col indes
CUSPARSE_INDEX_BASE_ZERO, // base index of row offset and col index
value_type // data type of values
));

@@ -137,7 +137,7 @@ struct CUDACachingHostAllocatorImpl
void free_block_slowpath(Block* block) {
auto start = std::chrono::steady_clock::now();
// Users may change the allocator config at will. torch unit tests do this.
// However, allocations using cudaHostRegister should use corresonding
// However, allocations using cudaHostRegister should use corresponding
// cudaHostUnregister and similarly for cudaHostAlloc / cudaFreeHost.
void* ptr = block->ptr_;
bool use_register = false;
@@ -10,7 +10,7 @@ namespace at::cuda {
//
// A caching allocator for CUDA host allocations (pinned memory).
//
// This provides a drop-in replacement for THCudaHostAllocator, which re-uses
// This provides a drop-in replacement for THCudaHostAllocator, which reuses
// freed pinned (page-locked) memory allocations. This avoids device
// synchronizations due to cudaFreeHost calls.
//

@@ -26,7 +26,7 @@ inline TORCH_CUDA_CPP_API at::HostAllocator* getCachingHostAllocator() {
}

// Records an event in the specified stream. The allocation corresponding to the
// input `ptr`/`ctx` will not be re-used until the event has occurred.
// input `ptr`/`ctx` will not be reused until the event has occurred.
C10_DEPRECATED_MESSAGE(
"at::cuda::CachingHostAllocator_recordEvent(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->record_event(...) instead.")
inline TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent(

@@ -4,7 +4,7 @@
#include <ATen/cuda/CUDAConfig.h>

// NOTE: These templates are intentionally not defined in this header,
// which aviods re-compiling them for each translation unit. If you get
// which avoids re-compiling them for each translation unit. If you get
// a link error, you need to add an explicit instantiation for your
// types in cub.cu

@@ -93,7 +93,7 @@ struct IndexToOffset {
}
};

// Uses dynamic (runtime) instead of static (compiletime) dims
// Uses dynamic (runtime) instead of static (compile time) dims
template <typename T, typename IndexType>
struct IndexToOffset<T, IndexType, -1> {
static inline __host__ __device__ IndexType get(

@@ -32,7 +32,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic(

// Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements)
// fn_ptr is set to the appropriate function based on the vec size and GPU used
// TODO: Memory use can probably be optimized by re-using kernels across GPUs with
// TODO: Memory use can probably be optimized by reusing kernels across GPUs with
// the same compute capability

std::string f_inputs_type_str = at::cuda::jit::typeName(common_dtype);

@@ -38,7 +38,7 @@ GemmTunableOp_float_NT,nt_25088_4096_64,1219,1.262
GemmTunableOp_float_NT,nt_4096_4096_64,1216,0.033
```

Note the "Validator" lines. If you change a library verison, or ROCm version, or PyTorch version, TunableOp will detect
Note the "Validator" lines. If you change a library version, or ROCm version, or PyTorch version, TunableOp will detect
this and reject the tunings file because the prior tunings are likely affected by other software changes.

The remaining lines are the tuned solutions for each TunableOp encountered during your execution. Each line consists of

@@ -235,7 +235,7 @@ class TunableOp {
// numeric check option is controlled by non-static env var, so check it once per tuned operator
bool do_numerics_check = ctx->IsNumericsCheckEnabled();

// calcaulte a reference answer for numerical check
// calculate a reference answer for numerical check
if (do_numerics_check) {
reference_params = params->DeepCopy(false);
TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK);

@@ -12,7 +12,7 @@ namespace at {

// AcceleratorHooksInterface is a shared interface provided by all
// accelerators to allow generic code.
// This inferface is hook-based as it corresponds to all the functions
// This interface is hook-based as it corresponds to all the functions
// that are going to be called in a generic way from the CPU code.

struct TORCH_API AcceleratorHooksInterface {

@@ -38,7 +38,7 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {

Generator getNewGenerator(
[[maybe_unused]] DeviceIndex device_index = -1) const override {
// TODO(FFFrog): Perserved for BC and will be removed in the future.
// TODO(FFFrog): Preserved for BC and will be removed in the future.
if (at::GetGeneratorPrivate().has_value())
return at::GetGeneratorForPrivateuse1(device_index);
@@ -283,7 +283,7 @@ inline void boxed_existing_bdim_all_batch_rule(
// Use when all tensors arguments accept one (normal) batch dim.
// This batching rule expands the batch dim on all Tensors, reshapes it into
// dim 0, calls the op, and then reshapes the batch dim out of dim 0.
// This is not the most efficient thing; if there are alternatives, plese try
// This is not the most efficient thing; if there are alternatives, please try
// to use them. Use this only as a last resort.
#define EXISTING_BDIM_ALL_BOXED(op) \
m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_existing_bdim_all_batch_rule>());

@@ -384,7 +384,7 @@ fourOutputs solve_ex_batch_rule(

// NOTE [ solve_ex Batch Rule Contiguity ]
// A determines whether or not linalg_solve takes an optimized path. We need the check on A_ to match the one run on
// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behvaior
// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behavior
// differs based on whether or not the optimized path was taken
const auto batched_A_was_contiguous = A_bdim.has_value() ? at::select(A, *A_bdim, 0).is_contiguous() : A.is_contiguous();
if (batched_A_was_contiguous && !A.is_complex()) {

@@ -282,7 +282,7 @@ static std::tuple<Tensor, std::optional<int64_t>> _softmax_backward_batch_rule(

dim = getPhysicalDim(output_, /*has_batch_dim*/true, dim);

// Not sure why output_ needs to be marked as .contiguous(). Someting must
// Not sure why output_ needs to be marked as .contiguous(). Something must
// have changed in PyTorch (and output of softmax is probably always contiguous)
return std::make_tuple(at::_softmax_backward_data(grad_output_, output_.contiguous(), dim, input_dtype), 0);
}

@@ -224,7 +224,7 @@ static Tensor safeStack(TensorList tensors) {
// is possible for the backward function to return an undefined grad for some
// grad_input for each example. In that case, we return an undefined grad.
//
// It is theoretically posssible for *some* of the examples to produce an
// It is theoretically possible for *some* of the examples to produce an
// undefined grad (a kernel could peek at the gradient values and return an
// undefined tensor if it determines the gradient is full of zeros). We
// could handle this by treating the undefined grad as a zero-filled tensor

@@ -113,7 +113,7 @@ SymIntArrayRef BatchedTensorImpl::sym_sizes_custom() const {
return sym_sizes_default();
}

// The following are publically exposed as methods of Tensor
// The following are publicly exposed as methods of Tensor

IntArrayRef BatchedTensorImpl::strides_custom() const {
return strides_default();

@@ -37,7 +37,7 @@ namespace at::functorch {
// how to perform the transform.
//
// TODO: we can excise DynamicLayer in favor of Interpreter,
// But I am going to leave it for now as a compatiblity shim to avoid
// But I am going to leave it for now as a compatibility shim to avoid
// needing to refactor a lot of callsites...
struct TORCH_API DynamicLayer {
explicit DynamicLayer(

@@ -88,7 +88,7 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t);
// >>> VmapInterpreterPtr(&interpreter).batchSize()
//
// Finally, Interpreter::process switches on the type of the interpreter
// and calls one of {Transform}Intepreter::processImpl under the hood.
// and calls one of {Transform}Interpreter::processImpl under the hood.
// Same for Interpreter::sendToNextInterpreter :)

struct VmapInterpreterMeta {

@@ -143,7 +143,7 @@ struct TORCH_API VmapPhysicalView {
// mapping a physical tensor to a new logical tensor (BatchedTensor)
VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const;

// Maps a logical shape to a physical shape by pre-pending the batch
// Maps a logical shape to a physical shape by prepending the batch
// sizes to the logical shape.
VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const;
SymDimVector getPhysicalShape(c10::SymIntArrayRef logical_shape) const;

@@ -27,7 +27,7 @@ namespace at::functorch {
//
// There are alternative designs we could have chosen (e.g. each grad transform
// stores a weak map of Tensor -> AutogradMeta); the benefit of the TensorWrapper
// design is that we can re-use existing VariableType kernels (i.e. Autograd kernels)
// design is that we can reuse existing VariableType kernels (i.e. Autograd kernels)
// without much modification. Since a TensorWrapper looks like a regular Tensor,
// the VariableType kernel can pull out the AutogradMeta struct from where it
// expects and extend the autograd graph
@@ -158,7 +158,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
endKernelCoalescing();
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];

// For some reason fillBufferfor stopped working for lengh > 4Gb on MacOS 26
// For some reason fillBufferfor stopped working for length > 4Gb on MacOS 26
// See https://github.com/pytorch/pytorch/issues/163962
// Workaround by batching copy commands into 4Gb chunks
constexpr size_t max_copy_size = 0x100000000; // 4GB

@@ -128,7 +128,7 @@ at::Tensor PackedLinearWeight::apply_impl(
auto* input_tr_ptr =
reinterpret_cast<uint8_t*>(input_tr.data_ptr<c10::quint8>());
// TODO: Activation transpose before and after the kernel can be removed if we
// keep activation tensor always tranposed.
// keep activation tensor always transposed.
fbgemm::transpose_simd<uint8_t>(
batch_size, K, input_ptr, K, input_tr_ptr, batch_size);

@@ -34,7 +34,7 @@ struct Dist {
// finish : This tells what to do with the aggregated value to compute
// the norm. Generally this is the result of val ^ (1 / p).
// backward : This is the gradient for that norm. Arguments are pretty
// self explanitory.
// self explanatory.
//
// There are a few cases where these aren't used. The 0 norm has no backward,
// because it's always 0, so that's shortcircuited earlier. There's a special

@@ -74,7 +74,7 @@ it to sum up the entire array into a single value.

`ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which
compiler flags it is currently compiled. This allows the programmer to write
generic code, which will be compiled under multipled compilation settings.
generic code, which will be compiled under multiplied compilation settings.

`../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains
a generic definition of `sumImplAll`. This function allows the user to reduce

@@ -1017,7 +1017,7 @@ struct HelperInterpBase {
while (aligned_interp_size % sizeof(int32_t) != 0) {
aligned_interp_size += 1;
}
// assert that we wont go out of bounds
// assert that we won't go out of bounds
TORCH_INTERNAL_ASSERT(aligned_interp_size * sizeof(int16_t) < interp_size * sizeof(double));
}

@@ -655,7 +655,7 @@ void ImagingResampleHorizontalConvolution8u4x(
// last element
auto mmk = _mm256_set1_epi32(k[i]);
// For num_channels == 3 (3 bytes = one pixel) we tolerate to read 4 bytes
// lines 0, 1 and 2 wont go out of allocated memory bounds
// lines 0, 1 and 2 won't go out of allocated memory bounds
auto pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)),
mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1);

@@ -889,7 +889,7 @@ void ImagingResampleHorizontalConvolution8u(
_mm_loadu_si128((__m128i *) (lineIn_min + stride * i))),
_mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1);

// Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA
// Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA
// RGBA: pix1 = [
// r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0
// r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0

@@ -1312,7 +1312,7 @@ void ImagingResampleVerticalConvolution8u(

// Here we write 4 bytes to the output even if num_channels < 4, e.g o = {r,g,b,X} for num_channels=3
// It is OK to write 4th byte (e.g. X) as on the next step we will overwrite it with new data.
// We also wont go out of bounds of lineOut memory allocation
// We also won't go out of bounds of lineOut memory allocation
std::memcpy(lineOut + j, (uint8_t *) &o, 4);
}

@@ -240,7 +240,7 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI

/* evaluation of 8 sines at onces using AVX intrinsics
/* evaluation of 8 sines at once using AVX intrinsics

The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to

@@ -311,7 +311,7 @@ void GroupNormKernelImplChannelsLastInternal(
const bool gamma_null = (gamma_data == nullptr);
const bool beta_null = beta_data == nullptr;

// NB: About algorithm choosen:
// NB: About algorithm chosen:
//
// On channels last, GroupNorm has a input shape of {N, H, W, GD},
// Mean and rstd are collected per each n and g, which involves reduction
@@ -930,7 +930,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
}
};

// Dynamically Quantize the float32 input to 8 bit assymetric
// Dynamically Quantize the float32 input to 8 bit asymmetric
input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx);

const size_t lhs_stride =

@@ -1163,7 +1163,7 @@ void dyn_quant_matmul_4bit_kernel(
const int64_t weight_packed_size =
kleidiai::kai_pack_rhs_int4_size(N, K, block_size);
if (weight_packed_size == packed_weights.numel()) {
// KleidiAI interface intenally handles the Channelwise and groupwise
// KleidiAI interface internally handles the Channelwise and groupwise
// distinction
kleidiai::kai_quant_pack_lhs_int4_mm(
output, inp, packed_weights, M, N, K, block_size);

@@ -705,7 +705,7 @@ namespace {
);
} while (!done && max_threads);
if (!done) {
TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accomodate sharedMemPerBlock limit");
TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accommodate sharedMemPerBlock limit");
}
break;
}

@@ -154,19 +154,19 @@ struct cublasCommonArgs {
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
bool transpose_result = false, transpose_a = false, transpose_b = false;
result = prepare_matrix_for_cublas(c, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); // codespell:ignore
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result); // codespell:ignore

// Handle scale tensors if provided
if (scale_a && scale_b) {
// By default since we return in row-major we run the gemm
// as B.T @ A.T, check transpose_result to determine if we flip the scales
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr(); // codespell:ignore
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); // codespell:ignore
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a; // codespell:ignore
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); // codespell:ignore
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); // codespell:ignore
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b; // codespell:ignore
}

if (scale_result) {

@@ -180,17 +180,17 @@ struct cublasCommonArgs {
transpose_b = !transpose_b;
}

auto sizes_a = mata->sizes();
auto sizes_b = matb->sizes();
auto sizes_a = mata->sizes(); // codespell:ignore
auto sizes_b = matb->sizes(); // codespell:ignore

m = sizes_a[transpose_result ? 1 : 0];
k = sizes_a[transpose_result ? 0 : 1];
n = sizes_b[transpose_result ? 0 : 1];
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); // codespell:ignore
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); // codespell:ignore
result_ld = result->stride(transpose_result ? 0 : 1);
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore

// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
// if the gemm operands are in packed float4

@@ -205,16 +205,16 @@ struct cublasCommonArgs {
char transa, transb;
int64_t m, n, k;
int64_t lda, ldb, result_ld;
c10::MaybeOwned<Tensor> mata, matb, result;
c10::MaybeOwned<Tensor> mata, matb, result; // codespell:ignore

// Scale members
void* scale_mata_ptr = nullptr;
void* scale_matb_ptr = nullptr;
void* scale_mata_ptr = nullptr; // codespell:ignore
void* scale_matb_ptr = nullptr; // codespell:ignore
void* scale_result_ptr = nullptr;
std::optional<c10::ScalarType> scale_mata_dtype;
std::optional<ScalingType> scaling_mata_type;
std::optional<c10::ScalarType> scale_matb_dtype;
std::optional<ScalingType> scaling_matb_type;
std::optional<c10::ScalarType> scale_mata_dtype; // codespell:ignore
std::optional<ScalingType> scaling_mata_type; // codespell:ignore
std::optional<c10::ScalarType> scale_matb_dtype; // codespell:ignore
std::optional<ScalingType> scaling_matb_type; // codespell:ignore
std::optional<c10::ScalarType> scale_result_dtype;
};
} // namespace
@@ -362,7 +362,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
static bool disable_addmm_cuda_lt = getDisableAddmmCudaLt();
#endif
// if lt path fails, we recurse back into this function here and force the lt path to off
// we cannot update varible disable_addmm_cuda_lt from above since it is static and would be permanent
// we cannot update variable disable_addmm_cuda_lt from above since it is static and would be permanent
bool disable_addmm_cuda_lt_final = disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
#if defined(USE_ROCM) && ROCM_VERSION == 60400
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use

@@ -2886,7 +2886,7 @@ _scaled_grouped_mm_cuda_v2(
"Contraction dimensions (", dim_a, ",", dim_b, ") of mat_a and mat_b must match, got: ", mat_a.size(dim_a), " and ",
mat_b.size(dim_b));
// Note: only (-1, -2) is currently supported
TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Curently contraction dims must be (-1, -2) only");
TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Currently contraction dims must be (-1, -2) only");
} else {
TORCH_CHECK_VALUE(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}

@@ -298,7 +298,7 @@ static void jitted_gpu_kernel_impl(
at::opmath_type<f_inputs_type> scalar_val,
const std::tuple<ExtraArgs...>& extra_args) {

// TODO: Memory use can probably be optimized by re-using kernels across GPUs with
// TODO: Memory use can probably be optimized by reusing kernels across GPUs with
// the same compute capability
static std::mutex jiterator_mutex;
static std::vector<JittedKernelVariantCache> device_caches(c10::cuda::device_count());

@@ -494,7 +494,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen)
auto value = static_cast<scalar_t>(rand * range + from);
// reverse the bounds of curand4 from (0, 1] to [0, 1)
// Note that this method is from legacy THCTensorRandom and is likely to give
// you more 0-s, since, the probability of gettings 1-s is higher than 0-s and
// you more 0-s, since, the probability of getting 1-s is higher than 0-s and
// by reversing the bounds, we are flipping the probabilities of 1-s and 0-s.
// BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
auto reverse_bound_value = value == to ? from : value;

@@ -75,7 +75,7 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo<const scalar_t, IndexType>
// We'll use this to actually cause vectorized loads later
LoadT *value = reinterpret_cast<LoadT*>(&src);

//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
// Note: need a new set of random values per 4 elements -- we'll handle VEC elements in this thread, so need ceil(VEC / 4)
// sets of rand.
if ((VEC >= 4) || (gridxvec_loop_state == 0)) {

@@ -159,7 +159,7 @@ fused_dropout_kernel(cuda::detail::TensorInfo<const scalar_t, IndexType> a,
for (IndexType linearIndex = idx;
linearIndex < rounded_size;
linearIndex += gridDim.x * blockDim.x*UNROLL) {
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
float4 rand = curand_uniform4(&state);
scalar_t src[UNROLL];
rand.x = rand.x < p;

@@ -24,7 +24,7 @@ namespace at::native {
namespace {

/* This code computes the sum of the weights in two-steps:
1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indeces`
1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indices`
2) Each partial-sum from 1) are summed and scatter into `grad_weight`

Notice, `NROWS_PER_THREAD` impacts the Achieved Occupancy of the

@@ -204,7 +204,7 @@ Scalar scalar_reciprocal(const Scalar& scalar) {
return Scalar(1. / scalar.toComplexDouble());
}
TORCH_INTERNAL_ASSERT(
false, "divison with ", scalar.type(), " not supported");
false, "division with ", scalar.type(), " not supported");
}

void foreach_tensor_div_scalar_kernel_cuda_(

@@ -57,7 +57,7 @@ namespace {
const index_t n = index / (out_H * out_W);
const index_t grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;

// get the corresponding input x, y co-ordinates from grid
// get the corresponding input x, y coordinates from grid
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];

@@ -193,7 +193,7 @@ namespace {
const index_t n = index / (out_D * out_H * out_W);
const index_t grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;

// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];
opmath_t z = grid.data[grid_offset + 2 * grid_sCoor];

@@ -358,7 +358,7 @@ namespace {
const index_t n = index / (out_H * out_W);
const auto grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;

// get the corresponding input x, y co-ordinates from grid
// get the corresponding input x, y coordinates from grid
scalar_t x = grid.data[grid_offset];
scalar_t y = grid.data[grid_offset + grid_sCoor];

@@ -572,7 +572,7 @@ namespace {
const index_t n = index / (out_D * out_H * out_W);
const auto grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;

// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
scalar_t ix = grid.data[grid_offset];
scalar_t iy = grid.data[grid_offset + grid_sCoor];
scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor];
@ -8,7 +8,7 @@
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
|
||||
// Three warninngs in Cutlass included header files
|
||||
// Three warnings in Cutlass included header files
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
|
||||
|
@ -377,7 +377,7 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) {
|
||||
* result at the boundary
|
||||
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
|
||||
* Large Parameter (see DLMF 8.12.4 [igam1])
|
||||
* - if x > 1.1 and x < a, using the substraction from the regularized lower
|
||||
* - if x > 1.1 and x < a, using the subtraction from the regularized lower
|
||||
* incomplete gamma
|
||||
* - otherwise, calculate the series from [igam2] eq (5)
|
||||
*/
|
||||
@ -460,7 +460,7 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) {
|
||||
* result at the boundary
|
||||
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
|
||||
* Large Parameter (see DLMF 8.12.3 [igam1])
|
||||
* - if x > 1 and x > a, using the substraction from the regularized upper
|
||||
* - if x > 1 and x > a, using the subtraction from the regularized upper
|
||||
* incomplete gamma
|
||||
* - otherwise, calculate the series from [igam2] eq (4)
|
||||
*/
|
||||
|
@ -332,7 +332,7 @@ void cuda_take_put_kernel(
|
||||
const auto offset_calc = make_offset_calculator<2>(iter);
|
||||
using uindex_t = std::make_unsigned_t<index_t>;
|
||||
|
||||
// OffsetCalculator needs the sizes and strides reveresed
|
||||
// OffsetCalculator needs the sizes and strides reversed
|
||||
const auto indexed_sizes = std::vector<int64_t>(indexed.sizes().rbegin(), indexed.sizes().rend());
|
||||
const auto indexed_strides = std::vector<int64_t>(indexed.strides().rbegin(), indexed.strides().rend());
|
||||
const auto* indexed_strides_data = indexed_strides.data();
|
||||
|
@ -1611,7 +1611,7 @@ void index_select_out_cuda_impl(
|
||||
|
||||
// SmallIndexKernel is more performant when the number of indices is small, and pre-loading
|
||||
// the index reduces memory accesses. When the number of indices is large, we avoid that
|
||||
// and increase parallellism by calling gather_out which is a generalization of index_select
|
||||
// and increase parallelism by calling gather_out which is a generalization of index_select
|
||||
if (cuda::detail::canUse32BitIndexMath(out) &&
|
||||
cuda::detail::canUse32BitIndexMath(self) &&
|
||||
cuda::detail::canUse32BitIndexMath(index) &&
|
||||
|
@ -273,7 +273,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
scalar_t* dst = self_ptr + index;
- //pack coalseced bf16 and fp16
+ //pack coalesced bf16 and fp16
if constexpr (std::is_same<scalar_t, c10::BFloat16>::value || std::is_same<scalar_t, c10::Half>::value)
{
typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;

@ -316,7 +316,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
}
}
- // not coalsced, so now let try to capture lane-matches...
+ // not coalesced, so now let try to capture lane-matches...
if (numel > 16 /*<-hueristic threshold*/ * 64 ) {
// well shucks, unlikely to capture same-dest atomics in a wave.

@ -343,7 +343,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
if (input_length == 0)
return;
- // "first" row, the beta initialization before eq (10) (t=target_length - differes per batch)
+ // "first" row, the beta initialization before eq (10) (t=target_length - differs per batch)
for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) {
int64_t s = threadIdx.x + block_s;
scalar_t lb;

@ -816,7 +816,7 @@ const auto erfcx_string = jiterator_stringify(
with the usual checks for overflow etcetera.
Performance-wise, it seems to be substantially faster than either
- the SLATEC DERFC function [or an erfcx function derived therefrom]
+ the SLATEC DERFC function [or an erfcx function derived there from]
or Cody's CALERF function (from netlib.org/specfun), while
retaining near machine precision in accuracy.
*/

@ -370,7 +370,7 @@ struct vectorized {
#ifdef USE_ROCM
// This is similar to vectorized policy above, but this one supports
- // heterogenous input tensor types as templated parameters.
+ // heterogeneous input tensor types as templated parameters.
// Its use should be limited to frequently used heterogeneous data types
// as each instantiation will generate a separate kernel, leading to code
// bloating if applied to all combinations supported in PyTorch. Assumption: all

@ -309,7 +309,7 @@ __global__ void sampleMultinomialOnce(
} else {
// This should address a rare bug where we don't select a valid index. This likely occurs when
// due to floating point arithmetic rounding errors, our cumulative sum does not add up to 1, but
- // and our uniform sample is greater than this value. In this case we likely have unitialized memory
+ // and our uniform sample is greater than this value. In this case we likely have uninitialized memory
// in dest[curDist]. So basically we will loop through the distribution and pick the largest index
// where the distribution is non-zero. This is obviously terribly inefficient, but due to the
// rarity in which this occurs, this should not be an issue.
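The comments above describe the fallback for the rare case where rounding makes the cumulative sum fall short of the drawn uniform value. A minimal single-threaded sketch of that fallback, assuming a per-distribution probability array; names are illustrative, not from the kernel:

#include <cstddef>
#include <vector>

// Sketch: scan from the end and return the largest index with non-zero probability,
// mirroring the recovery path the comment describes (the real kernel does this per
// distribution inside the CUDA block).
std::ptrdiff_t largest_nonzero_category(const std::vector<float>& probs) {
  for (std::ptrdiff_t i = static_cast<std::ptrdiff_t>(probs.size()) - 1; i >= 0; --i) {
    if (probs[i] > 0.0f) {
      return i;
    }
  }
  return 0;  // degenerate input; valid distributions never reach this
}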
@ -1623,7 +1623,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto stride = input.sizes()[1];
const auto reduction_size = input.numel() / stride;
- // Input is guarunteed to be channels-last compatible
+ // Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;

@ -1691,7 +1691,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto reduction_size = input.numel() / stride;
auto norm_fct = 1.0 / reduction_size;
- // Input is guarunteed to be channels-last compatible
+ // Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;

@ -37,7 +37,7 @@ namespace at::native {
// threshold probability for having non-duplicate keys, then it can be proved that[1]
// the number of bits required is: ceil(log2(n - (6 n^2 + 1) / (12 log(q))))
//
- // Then after sort, we lauch a separate kernel that additionally shuffles any islands
+ // Then after sort, we launch a separate kernel that additionally shuffles any islands
// of values whose keys matched. The algorithm of this kernel is as follows:
// Each thread reads its key and the keys of its neighbors to tell if it's part of an island.
// For each island, the first thread in the island sees a key match at index i+1 but not index i-1.
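The last two context lines above spell out how a thread decides whether it starts an "island" of equal keys. A compact sketch of that test, written as a host-side helper rather than the CUDA kernel; the names are illustrative:

#include <cstdint>

// An island is a maximal run of equal sort keys. Its first element matches the key
// to its right but not the key to its left, exactly as the comment states.
inline bool is_island_start(const int64_t* keys, int64_t i, int64_t n) {
  const bool matches_next = (i + 1 < n) && keys[i] == keys[i + 1];
  const bool matches_prev = (i > 0) && keys[i] == keys[i - 1];
  return matches_next && !matches_prev;
}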
@ -1088,12 +1088,12 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
// load instructions.
//
// Case 1: "vectorize along input"
- // This case happens when we are reducing along fastest moving dimesion. In such case, threads
+ // This case happens when we are reducing along fastest moving dimension. In such case, threads
// with the same threadIdx.y works on the same reduction cooperatively and will produce results
// for the same output. In such case, values in each loaded vector always correspond to the same output.
//
// Case 2: "vectorize along output"
- // This case happens when the fastest moving dimesion is not the dimension of reduction. In such case,
+ // This case happens when the fastest moving dimension is not the dimension of reduction. In such case,
// threads with different threadIdx.x are independent and will produce results for different outputs.
// In such case, values in each loaded vector always correspond to different outputs.
if (fastest_moving_stride == sizeof(scalar_t)) {
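The two cases above boil down to a single stride test, visible in the `if` on the last context line. A hedged sketch of that decision; the enum and function names are illustrative, not the ones used in setReduceConfig:

#include <cstdint>

// If the fastest-moving stride is exactly one element, the reduction dimension is
// contiguous, so a vector load covers inputs of the SAME output ("vectorize along
// input"). Otherwise adjacent elements belong to DIFFERENT outputs ("vectorize along
// output").
enum class VectorizeMode { AlongInput, AlongOutput };

template <typename scalar_t>
VectorizeMode pick_vectorize_mode(int64_t fastest_moving_stride) {
  return fastest_moving_stride == static_cast<int64_t>(sizeof(scalar_t))
      ? VectorizeMode::AlongInput
      : VectorizeMode::AlongOutput;
}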
@ -241,7 +241,7 @@ __global__ void reflection_pad2d_backward_det_out_kernel(
const int64_t dist_cols = ::abs(inp_col - (input_dim_x - 1));
// we were dist_rows after, now we want to be dist_rows before
- // we were dist_cols before, now we wnat to be dist_cols after
+ // we were dist_cols before, now we want to be dist_cols after
const int64_t reflect_tr_out_row = (corner_tr_out_row - dist_rows);
const int64_t reflect_tr_out_col = (corner_tr_out_col + dist_cols);
const int64_t reflect_tr_out =

@ -5,7 +5,7 @@
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <c10/macros/Macros.h>
- // Two warninngs in Cutlass included header files
+ // Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-field-initializers")

@ -7,7 +7,7 @@
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>
- // Two warninngs in Cutlass included header files
+ // Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

@ -460,7 +460,7 @@ __global__ void GammaBetaBackwardCUDAKernel2(
}
}
- // Do warp reduce for the 2st 16 cols in the tile.
+ // Do warp reduce for the 2nd 16 cols in the tile.
sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y];
sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y];
sum1 = cuda_utils::WarpReduceSum<T_ACC>(sum1);

@ -1532,7 +1532,7 @@ NvrtcFunction jit_pwise_function(
std::string file_path;
if (cache_dir.has_value()) {
- // Attemps to read from the cache.
+ // Attempts to read from the cache.
// Cubin name is <kernel name>_arch<major>.<minor>_nvrtc<major>.<minor>_<ptx or sass>_<program length>_<string hash>
// Note that the SHA1 hash used in the file name is NOT the SHA1 hash of the file's contents,
// because we hash on the CUDA code, but we save the compiled ptx or sass

@ -1556,19 +1556,19 @@ NvrtcFunction jit_pwise_function(
ss << "_" << hash_code;
file_path = ss.str();
- std::ifstream readin{file_path, std::ios::in | std::ifstream::binary};
- if (readin.fail()) {
+ std::ifstream read_stream{file_path, std::ios::in | std::ifstream::binary};
+ if (read_stream.fail()) {
// NOTE: this does not warn because the file might not exist
// TODO: consider if this should explicitly check for the file's existence or not to throw
// an informative warning
- readin.close();
+ read_stream.close();
} else {
// TODO: try passing the "mapped" file directly to cuModuleLoadCall instead of using an intermediate buffer
- std::vector<char> buffer(std::istreambuf_iterator<char>(readin), {});
+ std::vector<char> buffer(std::istreambuf_iterator<char>(read_stream), {});
AT_CUDA_DRIVER_CHECK(nvrtc.cuModuleLoadData(&(compiled_kernel_.module), buffer.data()));
AT_CUDA_DRIVER_CHECK(
nvrtc.cuModuleGetFunction(&(compiled_kernel_.function), compiled_kernel_.module, name.c_str()));
- readin.close();
+ read_stream.close();
return compiled_kernel_;
}
}
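The comment in the first of the two hunks above documents the cache-file naming scheme that the `ss << "_" << hash_code` line finishes off. A small sketch of assembling a name with that layout; all parameter names here are assumptions for illustration:

#include <cstddef>
#include <sstream>
#include <string>

// Builds: <kernel name>_arch<major>.<minor>_nvrtc<major>.<minor>_<ptx or sass>_<program length>_<string hash>
std::string make_cubin_cache_name(const std::string& kernel_name,
                                  int arch_major, int arch_minor,
                                  int nvrtc_major, int nvrtc_minor,
                                  bool compile_to_sass,
                                  std::size_t program_length,
                                  std::size_t code_hash) {
  std::stringstream ss;
  ss << kernel_name
     << "_arch" << arch_major << '.' << arch_minor
     << "_nvrtc" << nvrtc_major << '.' << nvrtc_minor
     << (compile_to_sass ? "_sass" : "_ptx")
     << '_' << program_length
     << '_' << code_hash;
  return ss.str();
}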
@ -1050,7 +1050,7 @@ void launch_vectorized_layer_norm_kernel(
C10_CUDA_KERNEL_LAUNCH_CHECK();
#ifdef USE_ROCM
- // the blocks.x contains the max grid x dimention without invalid configuration error
+ // the blocks.x contains the max grid x dimension without invalid configuration error
// Fix invalid configuration https://github.com/pytorch/pytorch/issues/136291
// Ensure all elements are processed. Prepare for next round
int64_t remaining = M - blocks.x;

@ -1346,7 +1346,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
});
if (input.dim() > 2) {
- // if upper=true we need to tranpose and conjugate the result tensor
+ // if upper=true we need to transpose and conjugate the result tensor
// because the cholesky decomposition is stored in the lower triangular part
if (upper) {
input.copy_(result.mH());
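The comment above leans on a standard identity: if A = L L^H with L lower triangular, then U = L^H is the upper-triangular factor with A = U^H U, which is why result.mH() is copied back when upper=true. A tiny sketch of that relation; the helper name is illustrative:

#include <ATen/ATen.h>

// Sketch: obtain the upper Cholesky factor from the lower one via the conjugate
// transpose over the last two dimensions (the same mH() used in the hunk above).
at::Tensor upper_from_lower_cholesky(const at::Tensor& lower_factor) {
  return lower_factor.mH();
}
// Usage idea: auto L = at::linalg_cholesky(A); auto U = upper_from_lower_cholesky(L);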
@ -1857,7 +1857,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
- // TODO Investigate whether the following magma bug is still occuring.
+ // TODO Investigate whether the following magma bug is still occurring.
// It may be the case that geqrf followed by orgqr is wrong for the magma backend
// geqrf_magma currently uses geqrf2_gpu
//

@ -82,7 +82,7 @@ void lu_factor_looped_cusolver(const Tensor& self, const Tensor& pivots, const T
#if defined(BUILD_LAZY_CUDA_LINALG)
namespace cuda { namespace detail {
// This is only used for an old-style dispatches
- // Please do not add any new entires to it
+ // Please do not add any new entries to it
struct LinalgDispatch {
Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper);
};

@ -177,7 +177,7 @@ bool use_ragged_in_dense(
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works with Q, K, V, and output in BSHD memory layout,"
"e.g., Q, K, V must be allocated with torch.randn((B, S, H, D).transpose(1, 2)."
- "Falling back to regualr dense case, which may trigger excessive recompilation.");
+ "Falling back to regular dense case, which may trigger excessive recompilation.");
}
return all_bshd;
}

@ -771,7 +771,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_nestedtensor(
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
- "attn_bias not yet supportd with cuDNN Attention and NestedTensor");
+ "attn_bias not yet supported with cuDNN Attention and NestedTensor");
scaled_dot_product_flash_attention_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)

@ -1196,7 +1196,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_backward_nestedtensor(
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
- "attn_bias not yet supportd with cuDNN Attention and NestedTensor");
+ "attn_bias not yet supported with cuDNN Attention and NestedTensor");
sdpa_backward_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)

@ -1864,7 +1864,7 @@ void run_cudnn_SDP_bprop_nestedtensor(
}
TORCH_CHECK(
!attn_bias.has_value(),
- "attn_bias not yet supportd with cuDNN Attention and NestedTensor");
+ "attn_bias not yet supported with cuDNN Attention and NestedTensor");
auto workspace_size = mha_graph.get_workspace_size();
auto workspace_ptr =
@ -30,7 +30,7 @@ static const std::unordered_map<
};
- // This is the heursitic to choose a kernel based on inputs
+ // This is the heuristic to choose a kernel based on inputs
BGEMMKernel_BFloat16 dispatch_bfloat16_bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
// Optional/future use: directly lookup shape tuples to map to instances
/*

@ -11,7 +11,7 @@ using S = ck::Sequence<Is...>;
namespace at::native {
void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
- // If any of the shapes cant be tiled, we must use padding.
+ // If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

@ -471,7 +471,7 @@ void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}
void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
- // If any of the shapes cant be tiled, we must use padding.
+ // If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

@ -11,7 +11,7 @@ using S = ck::Sequence<Is...>;
namespace at::native {
void dispatch_float_gemm(CUDABLAS_GEMM_ARGTYPES(float)) {
- // If any of the shapes cant be tiled, we must use padding.
+ // If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

@ -13,7 +13,7 @@ namespace at::native {
void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
#if 0
- // If any of the shapes cant be tiled, we must use padding.
+ // If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

@ -299,7 +299,7 @@ void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
#endif
}
void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
- // If any of the shapes cant be tiled, we must use padding.
+ // If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@ -545,7 +545,7 @@ kernel void reshape(texture2d_array<half, access::read> in_arr[[texture(0), func
const ushort slices2 = divRoundUp(C2, 4);
const ushort slices1 = divRoundUp(C1, 4);
const ushort n2 = gid.z / slices2; //image index
- const ushort s2 = gid.z - n2 * slices2; // slice offest
+ const ushort s2 = gid.z - n2 * slices2; // slice offset
half4 value;
for (int idx = 0; idx < 4; ++idx){
// we compute the "linear index" of the output element,

@ -86,4 +86,4 @@ TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid_"), TORCH_FN(hardsigmoid_));
}
- } // namepsace at::native::metal
+ } // namespace at::native::metal

@ -147,7 +147,7 @@ static void check_shape_forward(const Tensor& input,
// blocked format will propagate between layers. Input, output will be in blocked format.
//
// For inference case, weight can be prepacked into blocked format by
- // (so as to save weight reoder overhead):
+ // (so as to save weight reorder overhead):
// model = torch.utils.mkldnn.to_mkldnn(model)
//
// For training case, grad_output can be CPU tensor or MKLDNN tensor,

@ -540,7 +540,7 @@ static void _mkldnn_matmul_i8i8i32_with_primitive(
args.insert({DNNL_ARG_WEIGHTS, expected_weight});
args.insert({DNNL_ARG_DST, dst});
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
- // Create primitve and execute
+ // Create primitive and execute
auto primitive = dnnl::matmul(prim_desc);
primitive.execute(ideep::stream::default_stream(), args);
}

@ -215,7 +215,7 @@ partition create_sdpa_graph_partition(
// For optional additive mask
std::optional<op> mask_add;
- // For optional implicite causal mask
+ // For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;

@ -556,7 +556,7 @@ partition create_sdpa_backward_graph_partition(
// For optional additive mask
std::optional<op> mask_add;
- // For optional implicite causal mask
+ // For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;
@ -34,7 +34,7 @@ namespace at::native::onednn {
/*
oneDNN postops usage:
- Currently, oneDNN supports 5 kinds of post ops. More details can be refered
+ Currently, oneDNN supports 5 kinds of post ops. More details can be referred
to oneDNN doc.
https://oneapi-src.github.io/oneDNN/dev_guide_attributes_post_ops.html#doxid-dev-guide-attributes-post-ops-1dev-guide-attributes-post-ops-eltwise

@ -345,7 +345,7 @@ class Attr {
dnnl::memory binary_m;
auto binary = ops_params_[i].binary_;
auto md = ops_params_[i].meta_;
- // qeury expected_md to achieve peak performance
+ // query expected_md to achieve peak performance
auto expected_md = pd.query_md(
dnnl::query::exec_arg_md,
DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1);

@ -399,7 +399,7 @@ static inline void construct_attr_for_unary(
} else {
TORCH_CHECK(
unary_post_op == "none",
- "onednn qlinear: unspported unary post op",
+ "onednn qlinear: unsupported unary post op",
unary_post_op);
}
}

@ -301,7 +301,7 @@ bool is_onednn_matmul_strides(const at::Tensor& tensor) {
return false;
}
- // the overlaped cases are not supported
+ // the overlapped cases are not supported
dnnl::memory::dims strides = get_onednn_strides(tensor);
int64_t storage_size = 1;
for (size_t dim = 0; dim < tensor_dim; ++dim)
@ -29,7 +29,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
- // agruments results in
+ // arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {

@ -42,7 +42,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
- // agruments results in
+ // arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {

@ -539,7 +539,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
// Use gather kernel to solve strides for macOS < 15.0
- // Starting with macOS 15.0, MPS supports native strides direclty in the kernels
+ // Starting with macOS 15.0, MPS supports native strides directly in the kernels
if (!is_macOS_15_0_or_newer || !useMPSStridedAPI) {
if ((!src.is_contiguous() || src.storage_offset()) && gatherTensorData) {
Tensor emptyShell = Tensor();

@ -856,7 +856,7 @@ id<MTLLibrary> MetalShaderLibrary::getLibrary(const std::initializer_list<std::s
break;
}
default:
- TORCH_INTERNAL_ASSERT(false, "Unsupported number of paramaters ", nparams);
+ TORCH_INTERNAL_ASSERT(false, "Unsupported number of parameters ", nparams);
}
return libMap[key] = lib;
}

@ -1184,9 +1184,9 @@ void MetalKernelFunction::dispatch(uint64_t length, std::optional<uint64_t> grou
}
void MetalKernelFunction::dispatch(c10::ArrayRef<uint64_t> length, c10::OptionalArrayRef<uint64_t> group_size) {
- TORCH_CHECK(!length.empty() && length.size() < 4, "Dispatch dimentions must be less than 3 and non-empty");
+ TORCH_CHECK(!length.empty() && length.size() < 4, "Dispatch dimensions must be less than 3 and non-empty");
TORCH_CHECK(!group_size.has_value() || group_size->size() == length.size(),
- "size and group_size must have same number of dimentions");
+ "size and group_size must have same number of dimensions");
const auto max_tg_size = getMaxThreadsPerThreadgroup();
const auto group_size_length = group_size.has_value() ? group_size->size() : 0;
auto tg_size = MTLSizeMake(group_size_length > 0 ? group_size->at(0) : max_tg_size,
@ -59,7 +59,7 @@ static GridSamplerOffsets find_grid_sampler_offsets(
return offsets;
}
- // Mod function which gives postive output when `a` is negative
+ // Mod function which gives positive output when `a` is negative
static int32_t mod(int32_t a, int32_t b) {
auto r = a % b;
return r + (r < 0 ? b : 0);
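A quick note on the helper in the hunk above: C++'s built-in % keeps the sign of the dividend, so the added correction is what makes negative coordinates wrap into a valid index range. A self-contained check, repeating the same definition for illustration:

#include <cassert>
#include <cstdint>

static int32_t mod(int32_t a, int32_t b) {  // same definition as in the hunk above
  auto r = a % b;
  return r + (r < 0 ? b : 0);
}

int main() {
  assert(-1 % 5 == -1);     // built-in remainder keeps the dividend's sign
  assert(mod(-1, 5) == 4);  // shifted into [0, b), usable as an index
  assert(mod(7, 5) == 2);   // non-negative inputs behave like plain %
  return 0;
}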
@ -191,9 +191,9 @@ void grid_sampler_single_element(
int32_t right_indices[3];
opmath_t<T> scales[3];
- // For each dimension, find the pair of indices in the cooresponding dimension
+ // For each dimension, find the pair of indices in the corresponding dimension
// of `input` which surround the grid coordinate in that dimension. We'll do
- // this by mapping different coordiante spaces onto each other. There are
+ // this by mapping different coordinate spaces onto each other. There are
// basically three different coordinate spaces to keep in mind:
//
// * aligned grid space

@ -137,7 +137,7 @@ kernel void index_put_serial(
constant int64_t* index_strides,
constant uint4& ndim_nindices_numel,
uint thread_index [[thread_position_in_grid]]) {
- (void)thread_index; // Suppress unused vairable varning
+ (void)thread_index; // Suppress unused variable warning
for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) {
index_put_impl(
output,
Some files were not shown because too many files have changed in this diff.