Compare commits

...

77 Commits

Author SHA1 Message Date
f7a13a6dfc Update
[ghstack-poisoned]
2025-10-19 18:28:29 +08:00
98aff4e90e Update (base update)
[ghstack-poisoned]
2025-10-19 18:28:29 +08:00
6cdf27661c Update
[ghstack-poisoned]
2025-10-11 21:43:12 +08:00
ecd4542830 Update (base update)
[ghstack-poisoned]
2025-10-11 21:43:12 +08:00
d0b7578e17 Update
[ghstack-poisoned]
2025-10-08 22:35:50 +08:00
e22a5ecb45 Update (base update)
[ghstack-poisoned]
2025-10-08 22:35:50 +08:00
9c8b449159 Update
[ghstack-poisoned]
2025-09-20 23:28:20 +08:00
613b0adb13 Update (base update)
[ghstack-poisoned]
2025-09-20 14:37:09 +08:00
12534b44b3 Update
[ghstack-poisoned]
2025-09-20 14:37:09 +08:00
831ad1f70b Update
[ghstack-poisoned]
2025-09-20 14:30:33 +08:00
1eed70b417 Update (base update)
[ghstack-poisoned]
2025-09-19 17:48:16 +08:00
ef230fcd2d Update
[ghstack-poisoned]
2025-09-19 17:48:16 +08:00
c283224b77 Update (base update)
[ghstack-poisoned]
2025-09-05 23:43:05 +08:00
9bd3d28afa Update
[ghstack-poisoned]
2025-09-05 23:43:05 +08:00
1c2ad5fe9d Update
[ghstack-poisoned]
2025-08-31 23:15:35 +08:00
5c57109190 Update (base update)
[ghstack-poisoned]
2025-08-31 22:56:40 +08:00
e3b463d216 Update
[ghstack-poisoned]
2025-08-31 22:56:40 +08:00
e6bab78b38 Update (base update)
[ghstack-poisoned]
2025-08-27 19:30:05 +08:00
b9f7dd6e77 Update
[ghstack-poisoned]
2025-08-27 19:30:05 +08:00
2b6c6e3d64 Update
[ghstack-poisoned]
2025-08-25 13:08:58 +08:00
e289d12b73 Update (base update)
[ghstack-poisoned]
2025-08-25 13:05:28 +08:00
bb2c89fc3b Update
[ghstack-poisoned]
2025-08-25 13:05:28 +08:00
d530a21122 Update (base update)
[ghstack-poisoned]
2025-08-20 00:20:04 +08:00
27fb875a70 Update
[ghstack-poisoned]
2025-08-20 00:20:04 +08:00
a5ecc01ef8 Update (base update)
[ghstack-poisoned]
2025-08-16 19:42:28 +08:00
f7b574c862 Update
[ghstack-poisoned]
2025-08-16 19:42:28 +08:00
5d6924f0a4 Update (base update)
[ghstack-poisoned]
2025-08-14 12:26:15 +08:00
cf7a543013 Update
[ghstack-poisoned]
2025-08-14 12:26:15 +08:00
55043b3ada Update (base update)
[ghstack-poisoned]
2025-08-10 13:46:42 +08:00
7b61d461ab Update
[ghstack-poisoned]
2025-08-10 13:46:42 +08:00
0ff1ddecad Update (base update)
[ghstack-poisoned]
2025-08-05 13:35:59 +08:00
0f9b4aae53 Update
[ghstack-poisoned]
2025-08-05 13:35:59 +08:00
d6886407ef Update (base update)
[ghstack-poisoned]
2025-07-31 15:02:17 +08:00
c37103c16d Update
[ghstack-poisoned]
2025-07-31 15:02:17 +08:00
d7ecbf7243 Update (base update)
[ghstack-poisoned]
2025-07-30 15:15:56 +08:00
8080febac5 Update
[ghstack-poisoned]
2025-07-30 15:15:56 +08:00
408ab373f8 Update
[ghstack-poisoned]
2025-07-29 12:55:39 +08:00
23b1bbb810 Update (base update)
[ghstack-poisoned]
2025-07-29 11:38:05 +08:00
8268316590 Update
[ghstack-poisoned]
2025-07-29 11:38:05 +08:00
a56e4e5fc7 Update (base update)
[ghstack-poisoned]
2025-07-25 19:38:51 +08:00
dac4caf77b Update
[ghstack-poisoned]
2025-07-25 19:38:51 +08:00
1710c27341 Update (base update)
[ghstack-poisoned]
2025-07-24 18:43:42 +08:00
92fde5bdc5 Update
[ghstack-poisoned]
2025-07-24 18:43:42 +08:00
eafc6437d6 Update (base update)
[ghstack-poisoned]
2025-07-20 18:09:58 +08:00
4a90ec1387 Update
[ghstack-poisoned]
2025-07-20 18:09:58 +08:00
d9ab4e3ade Update (base update)
[ghstack-poisoned]
2025-07-18 13:10:59 +08:00
ce372a06e9 Update
[ghstack-poisoned]
2025-07-18 13:10:59 +08:00
67f0059726 Update (base update)
[ghstack-poisoned]
2025-07-17 14:55:19 +08:00
55473096e2 Update
[ghstack-poisoned]
2025-07-17 14:55:19 +08:00
2a15541b5d Update (base update)
[ghstack-poisoned]
2025-07-16 09:07:20 +08:00
16875b228b Update
[ghstack-poisoned]
2025-07-16 09:07:20 +08:00
808af988ae Update (base update)
[ghstack-poisoned]
2025-07-16 03:48:55 +08:00
61c298ed56 Update
[ghstack-poisoned]
2025-07-16 03:48:55 +08:00
2d38369728 Update
[ghstack-poisoned]
2025-07-15 12:00:06 +08:00
74bd12415e Update (base update)
[ghstack-poisoned]
2025-07-15 11:44:30 +08:00
a8c0c0263c Update
[ghstack-poisoned]
2025-07-15 11:44:30 +08:00
e3f14cdafa Update (base update)
[ghstack-poisoned]
2025-07-12 13:12:15 +08:00
b03cc2d9c8 Update
[ghstack-poisoned]
2025-07-12 13:12:15 +08:00
99917e659b Update (base update)
[ghstack-poisoned]
2025-07-11 22:11:06 +08:00
def4476b6b Update
[ghstack-poisoned]
2025-07-11 22:11:06 +08:00
84bb803719 Update (base update)
[ghstack-poisoned]
2025-07-11 15:05:13 +08:00
8266849bda Update
[ghstack-poisoned]
2025-07-11 15:05:13 +08:00
f766c8ceea Update (base update)
[ghstack-poisoned]
2025-07-09 13:24:04 +08:00
b3cf7bc86d Update
[ghstack-poisoned]
2025-07-09 13:24:04 +08:00
6a4a8b453d Update (base update)
[ghstack-poisoned]
2025-07-09 13:18:08 +08:00
cab14368a7 Update
[ghstack-poisoned]
2025-07-09 13:18:08 +08:00
0b87f606ca Update (base update)
[ghstack-poisoned]
2025-07-09 11:16:42 +08:00
25f68b6b5b Update
[ghstack-poisoned]
2025-07-09 11:16:42 +08:00
334489bfd0 Update (base update)
[ghstack-poisoned]
2025-07-08 19:50:26 +08:00
ada5d90c25 Update
[ghstack-poisoned]
2025-07-08 19:50:26 +08:00
a1e3e2026b Update (base update)
[ghstack-poisoned]
2025-07-08 13:52:13 +08:00
8dc4932b0c Update
[ghstack-poisoned]
2025-07-08 13:52:13 +08:00
1505d7a461 Update
[ghstack-poisoned]
2025-07-06 15:43:17 +08:00
1c842a9686 Update (base update)
[ghstack-poisoned]
2025-07-06 10:59:24 +08:00
692827ad29 Update
[ghstack-poisoned]
2025-07-06 10:59:24 +08:00
786d4646ee Update (base update)
[ghstack-poisoned]
2025-07-05 02:30:33 +08:00
70df0aed59 Update
[ghstack-poisoned]
2025-07-05 02:30:33 +08:00
183 changed files with 471 additions and 455 deletions

View File

@ -1202,12 +1202,6 @@ exclude_patterns = [
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'torch/utils/model_dump/preact.mjs',
# These files are all grandfathered in, feel free to remove from this list
# as necessary
# NOTE: remove the patterns in the order they are listed
'aten/src/ATen/native/[a-pA-P]*/**',
'aten/src/ATen/[a-mA-M]*/**',
'test/**',
]
init_command = [
'python3',

View File

@ -94,11 +94,11 @@ struct PinnedReserveSegment {
struct TORCH_API HostStats {
// COUNT: total allocations (active)
Stat active_requests;
// SUM: bytes allocated/reserved by this memory alocator. (active)
// SUM: bytes allocated/reserved by this memory allocator. (active)
Stat active_bytes;
// COUNT: total allocations (active + free)
Stat allocations;
// SUM: bytes allocated/reserved by this memory alocator. This accounts
// SUM: bytes allocated/reserved by this memory allocator. This accounts
// for both free and in-use blocks.
Stat allocated_bytes;
@ -127,7 +127,7 @@ struct alignas(64) HostStatsStaged {
// COUNT: total allocations (active + free)
// LOCK: access to this stat is protected by the allocator's blocks_mutex_
Stat allocations;
// SUM: bytes allocated/reserved by this memory alocator. This accounts
// SUM: bytes allocated/reserved by this memory allocator. This accounts
// for both free and in-use blocks.
Stat allocated_bytes;
// COUNT: number of allocations per bucket (active)
@ -455,7 +455,7 @@ struct CachingHostAllocatorImpl {
}
void resetAccumulatedStats() {
// Reseting accumulated memory stats requires concurrently holding both the
// Resetting accumulated memory stats requires concurrently holding both the
// free list mutexes and the blocks mutex. Previously, this was only done in
// empty_cache function.
for (size_t i = 0; i < free_list_.size(); ++i) {
@ -482,7 +482,7 @@ struct CachingHostAllocatorImpl {
}
void resetPeakStats() {
// Reseting peak memory stats requires concurrently holding both the
// Resetting peak memory stats requires concurrently holding both the
// free list mutexes and the blocks mutex. Previously, this was only done in
// empty_cache function.
for (size_t i = 0; i < free_list_.size(); ++i) {

View File

@ -3,7 +3,7 @@
namespace at {
// Re-declaring 'DimVector' type and size inside 'at' namespace.
// Redeclaring 'DimVector' type and size inside 'at' namespace.
// This is done to avoid modifying every use into their 'c10'
// equivalent.

View File

@ -16,7 +16,7 @@ _GeneratorRegister::_GeneratorRegister(const GeneratorFuncType& func) {
TORCH_WARN_DEPRECATION(
"REGISTER_GENERATOR_PRIVATEUSE1 is deprecated. \
Please derive PrivateUse1HooksInterface to implememt getNewGenerator instead.")
Please derive PrivateUse1HooksInterface to implement getNewGenerator instead.")
TORCH_CHECK(
!GetGeneratorPrivate().has_value(),

View File

@ -149,7 +149,7 @@
* First, keep in mind that we assume that boxed containers will
* have to deal with `IValue` (e.g. `c10::List`). In this context,
* what may be happening is that `IValue` doesn't store internally
* your type `T`. Instead, it constructs a type new `T` everytime
* your type `T`. Instead, it constructs a type new `T` every time
* you try to get `T` for it (see `IListRef<at::OptinalTensorRef>`).
*/
@ -186,7 +186,7 @@ class IListRef;
* This macro is useful because it allows us to handle different
* types (that correspond to different tags) to be implemented
* only once. We can do it even when the implementation of the
* different tags aren't syntatically the same, by dispatching
* different tags aren't syntactically the same, by dispatching
* it to a function (e.g. `ImplT::<dispatch-function>(this_)`).
*/
#define TORCH_ILISTREF_UNWRAP(TAG, BODY) \

View File

@ -42,7 +42,7 @@ class IListRefTagImplBase<IListRefTag::Unboxed, T, ListElemT> {
/*
* We have these function (besides the `unwrap`s above) because the
* implementation for both `IListRef::operator[]` and `IListRefIterator::operator*`
* weren't syntatically equal for the existing tags at the time
* weren't syntactically equal for the existing tags at the time
* (`Unboxed` and `Boxed`).
*/
static IListRefConstRef<T> front(const list_type& lst) {

View File

@ -12,7 +12,7 @@ namespace at {
// in order. This is most commonly used in autogenerated code,
// where it is convenient to have a function that can uniformly
// take arguments of different types. If your arguments
// are homogenous consider using a std::initializer_list instead.
// are homogeneous consider using a std::initializer_list instead.
//
// For examples of this in use, see torch/csrc/utils/variadic.h
template <typename F>

View File

@ -148,7 +148,7 @@ struct TORCH_API ClassType : public NamedType {
void checkNotExist(const std::string& name, const std::string& what) const;
// Attributes are stored in a specific slot at runtime for effiency.
// Attributes are stored in a specific slot at runtime for efficiency.
// When emitting instructions we specify the slot so that attribute access is
// a constant lookup
std::optional<size_t> findAttributeSlot(const std::string& name) const {
@ -412,7 +412,7 @@ struct TORCH_API ClassType : public NamedType {
// Holds method attributes
std::weak_ptr<CompilationUnit> compilation_unit_;
// Holds all atrributes, attribute details are found on ClassAttribute
// Holds all attributes, attribute details are found on ClassAttribute
std::vector<ClassAttribute> attributes_;
// Construct mirroring attributes_, only around due to the fact that `containedTypes()` method returns an ArrayRef.
// Never fill this without using the appropriate provideNewClassAttribute method

View File

@ -111,7 +111,7 @@ void Dispatcher::waitForDef(const FunctionSchema& schema) {
TORCH_INTERNAL_ASSERT(r,
"Expected main interpreter to define ", schema.operator_name(),
", but this didn't happen within timeout. Are you trying to load "
"different models in the same torchdeploy/multipy instance? You "
"different models in the same torchdeploy/multipy instance? You " // codespell:ignore
"must warmup each interpreter identically, e.g., import all "
"the same dependencies.");
}
@ -129,7 +129,7 @@ void Dispatcher::waitForImpl(const OperatorName& op_name, std::optional<c10::Dis
TORCH_INTERNAL_ASSERT(r,
"Expected main interpreter to implement ", dk, " for ", op_name,
", but this didn't happen within timeout. Are you trying to load "
"different models in the same torchdeploy/multipy instance? You "
"different models in the same torchdeploy/multipy instance? You " // codespell:ignore
"must warmup each interpreter identically, e.g., import all "
"the same dependencies.");
}
@ -531,7 +531,7 @@ int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchK
// Note: this records a sequence number for both Autograd keys, and for
// non-Autograd keys where the dispatchKeySet still contains an autograd key.
// This means that we might collect the same sequence nubmer two different
// This means that we might collect the same sequence number two different
// events if they all occurred above Autograd and still had the Autograd
// dispatch key in the dispatch key set.
// However, this usually doesn't happen: normally the first call will

View File

@ -222,7 +222,8 @@ class TORCH_API Dispatcher final {
return backendFallbackKernels_[dispatch_ix].kernel.isValid();
}
// Used by torchdeploy/multipy for multiple interpreters racing.
// Used by torchdeploy/multipy for multiple // codespell:ignore: multipy
// interpreters racing.
void waitForDef(const FunctionSchema& schema);
void waitForImpl(
const OperatorName& op_name,
@ -414,7 +415,7 @@ class TORCH_API Dispatcher final {
std::unique_ptr<detail::RegistrationListenerList> listeners_;
// This condition variable gets notified whenever we add a new def/impl to the
// dispatch table. This is primarily used by multipy/torchdeploy, when
// dispatch table. This is primarily used by multiply/torchdeploy, when
// we have multiple interpreters trying to register to the dispatch table.
// In this situation, whenever the non-primary interpreter would have tried
// to register to the dispatch table, instead it will check to see if the
@ -585,7 +586,7 @@ class TORCH_API OperatorHandle {
// We need to store this iterator in order to make
// Dispatcher::cleanup() fast -- it runs a lot on program
// termination (and presuambly library unloading).
// termination (and presumably library unloading).
std::list<Dispatcher::OperatorDef>::iterator operatorIterator_;
};

View File

@ -261,7 +261,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
//
// There are 2 cases
// 1. something like 'aten::items.str(Dict(str, t) self) -> ((str, t)[])'.
// without the extra parenthesis, the c++ schem parser can not parse it.
// without the extra parenthesis, the c++ scheme parser can not parse it.
// 2. something like '-> ((str, str))'. Need extra parenthesis so the return
// type is a single tuple rather than two strings.
// PR (https://github.com/pytorch/pytorch/pull/23204) has more context about

View File

@ -1176,7 +1176,7 @@ struct TORCH_API IValue final {
using HashIdentityIValueMap =
std::unordered_map<IValue, IValue, HashIdentityIValue, CompIdentityIValues>;
// Chechs if this and rhs has a subvalues in common.
// Checks if this and rhs has a subvalues in common.
// [t1,t2] and [t2, t3] returns true.
bool overlaps(const IValue& rhs) const;

View File

@ -990,7 +990,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
std::unique_lock<std::mutex> lock(mutex_);
if (completed_) {
// This should be rare and shouldn't cause log spew. Its important to
// log errors and thats why we have this log here.
// log errors and that's why we have this log here.
std::string msg = c10::str(
"Skipping setting following error on the Future since "
"it is already marked completed (this is not necessarily "
@ -1501,7 +1501,7 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target {
// However, the CompilationUnit holds ownership of the type's graphs, so
// inserting a constant object into a Graph would create a reference cycle if
// that constant object held a shared_ptr to its CU. For these objects we
// instatiate them with non-owning references to its CU
// instantiate them with non-owning references to its CU
Object(WeakOrStrongTypePtr type, size_t numSlots) : type_(std::move(type)) {
slots_.resize(numSlots);
}

View File

@ -374,7 +374,7 @@ struct TORCH_API SymbolicShape {
// Unranked shape constructor.
SymbolicShape() : dims_(std::nullopt) {}
// Known rank but unknown dimentions.
// Known rank but unknown dimensions.
SymbolicShape(std::optional<size_t> rank) : dims_(std::nullopt) {
if(!rank) {
return;
@ -891,10 +891,10 @@ struct TORCH_API ListType
// global singleton
// Given an inner type T and an identifier,
// this function wil return the global singleton type pointer
// this function will return the global singleton type pointer
// the type List<T>.
// The extra "identifier" argument is needed beccause we have multiple container types
// that all re-use this function (List<T>, array<T, N>, etc.)
// The extra "identifier" argument is needed because we have multiple container types
// that all reuse this function (List<T>, array<T, N>, etc.)
static TypePtr get(const std::string& identifier, TypePtr inner);
// common cast List[Tensor]
@ -992,7 +992,7 @@ struct TORCH_API DictType : public SharedType {
// this function will return the global singleton type pointer
// the type List<T>.
// The extra "identifier" argument is needed because we have multiple container types
// that all re-use this function (Dict<K, V> and unordered_map<K, V>)
// that all reuse this function (Dict<K, V> and unordered_map<K, V>)
static TypePtr get(const std::string& identifier, TypePtr key, TypePtr val);
private:

View File

@ -21,7 +21,7 @@ namespace c10 {
namespace detail {
// The first argument of the schema might be of type DispatchKeySet, in which case we remove it.
// We do this because every argument in a function schema is expected to be convertable
// We do this because every argument in a function schema is expected to be convertible
// to an ivalue, but DispatchKeySet is not a type we want the jit to be aware of.
// See Note [Plumbing Keys Through The Dispatcher]
template<class KernelFunctor>

View File

@ -172,7 +172,7 @@ VaryingShape<Stride> TensorType::computeStrideProps(
// The logic below follows what TensorIterator uses in its logic:
// 1. Fast_set_up is the short-cut to identify a. channels_last and
// b. contiguous format, which is what we have in the below logic.
// 2. In more generla cases, it does best effort to preserve permutatoin.
// 2. In more general cases, it does best effort to preserve permutatoin.
if (is_channels_last_strides_2d(sizes, strides) || is_channels_last_strides_3d(sizes, strides)) {
// case 1.a. short cut channels last
std::iota(stride_indices.rbegin() + 1, stride_indices.rend() - 1, 2);

View File

@ -679,7 +679,7 @@ TORCH_API bool elementTypeCanBeInferredFromMembers(const TypePtr& elem_type) {
return false;
}
if (elem_type->kind() == AnyType::Kind) {
// List of Any can contains heterogenous types
// List of Any can contains heterogeneous types
return false;
}
return true;

View File

@ -234,7 +234,7 @@ class Vectorized<c10::Half> : public Vectorized16<
vshlq_u16(vandq_u16(is_zero_vec, vdupq_n_u16(1)), shift);
return vaddvq_u16(bits_vec);
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// use known working implmentation.
// use known working implementation.
__at_align__ value_type tmp[size()];
store(tmp);
int mask = 0;

View File

@ -1740,7 +1740,7 @@ Vectorized<int16_t> inline shift_256_16(
// Control masks for shuffle operation, treating 256 bits as an
// array of 16-bit elements, and considering pairs of neighboring
// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
// M!=N) is set so that shuffle will move element with index M from
// input pair into element with index N in output pair, and element
// with index M in output pair will be set to all 0s.
@ -1875,7 +1875,7 @@ Vectorized<T> inline shift_256_8(
// Control masks for shuffle operation, treating 256 bits as an
// array of 8-bit elements, and considering quadruples of
// neighboring elements. Specifially, a mask named "ctl_M_N" (M,N
// neighboring elements. Specifically, a mask named "ctl_M_N" (M,N
// in [0,1,2,3], and M!=N) is set so that shuffle will move element
// with index M from input quadruple into element with index N in
// output quadruple, and other elements in output quadruple will be

View File

@ -143,7 +143,7 @@ class Vectorized<double> {
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
return {
vec_sel(a._vec0, b._vec0, mask._vecb0),

View File

@ -142,7 +142,7 @@ class Vectorized<float> {
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
// assuming this we can use the same mask directly with vec_sel
return {
vec_sel(a._vec0, b._vec0, mask._vecb0),

View File

@ -202,7 +202,7 @@ class Vectorized<int16_t> {
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b,
const Vectorized<int16_t>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
// assuming this we can use the same mask directly with vec_sel
// warning intel style mask will not work properly
return {

View File

@ -155,7 +155,7 @@ class Vectorized<int32_t> {
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b,
const Vectorized<int32_t>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
// assuming this we can use the same mask directly with vec_sel
// warning intel style mask will not work properly
return {

View File

@ -119,7 +119,7 @@ class Vectorized<int64_t> {
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b,
const Vectorized<int64_t>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
return {
vec_sel(a._vec0, b._vec0, mask._vecb0),

View File

@ -397,7 +397,7 @@ inline Vectorized<bool> operator&&(
const __m512i* other_ = reinterpret_cast<const __m512i*>(other.as_bytes());
__m512i out = _mm512_and_si512(*self_, *other_);
Vectorized<bool> ret;
// We do not have a constructer that takes __m512i, so we need to memcpy
// We do not have a constructor that takes __m512i, so we need to memcpy
std::memcpy(ret, &out, ret.size() * sizeof(bool));
return ret;
}

View File

@ -498,8 +498,8 @@ static inline Vectorized<T> binary_fp8_op_as_fp32(
// Refer to
// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353 FP8 +,
// -, *, /, planed to be deleted in the future and here is just to make compiler
// happy
// -, *, /, planned to be deleted in the future and here is just to make
// compiler happy
Vectorized<Float8_e4m3fn> inline operator+(
const Vectorized<Float8_e4m3fn>& a,
const Vectorized<Float8_e4m3fn>& b) {
@ -585,8 +585,8 @@ class Vectorized<Float8_e5m2> : public Vectorizedf8<Float8_e5m2> {
// Refer to
// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353 FP8 +,
// -, *, /, planed to be deleted in the future and here is just to make compiler
// happy
// -, *, /, planned to be deleted in the future and here is just to make
// compiler happy
Vectorized<Float8_e5m2> inline operator+(
const Vectorized<Float8_e5m2>& a,
const Vectorized<Float8_e5m2>& b) {

View File

@ -1852,7 +1852,7 @@ Vectorized<T> inline shift_512_8(
// Control masks for shuffle operation, treating 512 bits as an
// array of 8-bit elements, and considering pairs of neighboring
// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
// M!=N) is set so that shuffle will move element with index M from
// input pair into element with index N in output pair, and element
// with index M in output pair will be set to all 0s.

View File

@ -1958,7 +1958,7 @@ void scaled_gemm(
ScalarType result_dtype,
bool use_fast_accum,
const std::optional<Tensor>& alpha) {
// Note: see `cublasCommonArgs` for various non-intuitive manupulations
// Note: see `cublasCommonArgs` for various non-intuitive manipulations
// of input arguments to this function.
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;

View File

@ -311,7 +311,7 @@ CUDAGraph::~CUDAGraph() {
// There are recent HIP changes where hipGraphExecDestroy doesn't immediately free memory.
// They wait for next sync point in order to free the memory, this is to ensure that all
// hipGraphLaunch are finished before we release any memory. This feature was enabled in rocm6.2.
// We need to ensure all async opreations finish before deleting the object.
// We need to ensure all async operations finish before deleting the object.
#if (defined(USE_ROCM) && ROCM_VERSION >= 60200)
if (capture_dev_ != UNDEFINED_DEVICE) // check if capture_dev_ contains the real device id
{

View File

@ -179,7 +179,7 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6
batch_offset * values_batch_stride * values.itemsize(),
index_type, // data type of row offsets index
index_type, // data type of col indices
CUSPARSE_INDEX_BASE_ZERO, // base index of row offset and col indes
CUSPARSE_INDEX_BASE_ZERO, // base index of row offset and col index
value_type // data type of values
));

View File

@ -137,7 +137,7 @@ struct CUDACachingHostAllocatorImpl
void free_block_slowpath(Block* block) {
auto start = std::chrono::steady_clock::now();
// Users may change the allocator config at will. torch unit tests do this.
// However, allocations using cudaHostRegister should use corresonding
// However, allocations using cudaHostRegister should use corresponding
// cudaHostUnregister and similarly for cudaHostAlloc / cudaFreeHost.
void* ptr = block->ptr_;
bool use_register = false;

View File

@ -10,7 +10,7 @@ namespace at::cuda {
//
// A caching allocator for CUDA host allocations (pinned memory).
//
// This provides a drop-in replacement for THCudaHostAllocator, which re-uses
// This provides a drop-in replacement for THCudaHostAllocator, which reuses
// freed pinned (page-locked) memory allocations. This avoids device
// synchronizations due to cudaFreeHost calls.
//
@ -26,7 +26,7 @@ inline TORCH_CUDA_CPP_API at::HostAllocator* getCachingHostAllocator() {
}
// Records an event in the specified stream. The allocation corresponding to the
// input `ptr`/`ctx` will not be re-used until the event has occurred.
// input `ptr`/`ctx` will not be reused until the event has occurred.
C10_DEPRECATED_MESSAGE(
"at::cuda::CachingHostAllocator_recordEvent(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->record_event(...) instead.")
inline TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent(

View File

@ -4,7 +4,7 @@
#include <ATen/cuda/CUDAConfig.h>
// NOTE: These templates are intentionally not defined in this header,
// which aviods re-compiling them for each translation unit. If you get
// which avoids re-compiling them for each translation unit. If you get
// a link error, you need to add an explicit instantiation for your
// types in cub.cu

View File

@ -93,7 +93,7 @@ struct IndexToOffset {
}
};
// Uses dynamic (runtime) instead of static (compiletime) dims
// Uses dynamic (runtime) instead of static (compile time) dims
template <typename T, typename IndexType>
struct IndexToOffset<T, IndexType, -1> {
static inline __host__ __device__ IndexType get(

View File

@ -32,7 +32,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic(
// Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements)
// fn_ptr is set to the appropriate function based on the vec size and GPU used
// TODO: Memory use can probably be optimized by re-using kernels across GPUs with
// TODO: Memory use can probably be optimized by reusing kernels across GPUs with
// the same compute capability
std::string f_inputs_type_str = at::cuda::jit::typeName(common_dtype);

View File

@ -38,7 +38,7 @@ GemmTunableOp_float_NT,nt_25088_4096_64,1219,1.262
GemmTunableOp_float_NT,nt_4096_4096_64,1216,0.033
```
Note the "Validator" lines. If you change a library verison, or ROCm version, or PyTorch version, TunableOp will detect
Note the "Validator" lines. If you change a library version, or ROCm version, or PyTorch version, TunableOp will detect
this and reject the tunings file because the prior tunings are likely affected by other software changes.
The remaining lines are the tuned solutions for each TunableOp encountered during your execution. Each line consists of

View File

@ -235,7 +235,7 @@ class TunableOp {
// numeric check option is controlled by non-static env var, so check it once per tuned operator
bool do_numerics_check = ctx->IsNumericsCheckEnabled();
// calcaulte a reference answer for numerical check
// calculate a reference answer for numerical check
if (do_numerics_check) {
reference_params = params->DeepCopy(false);
TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK);

View File

@ -12,7 +12,7 @@ namespace at {
// AcceleratorHooksInterface is a shared interface provided by all
// accelerators to allow generic code.
// This inferface is hook-based as it corresponds to all the functions
// This interface is hook-based as it corresponds to all the functions
// that are going to be called in a generic way from the CPU code.
struct TORCH_API AcceleratorHooksInterface {

View File

@ -38,7 +38,7 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
Generator getNewGenerator(
[[maybe_unused]] DeviceIndex device_index = -1) const override {
// TODO(FFFrog): Perserved for BC and will be removed in the future.
// TODO(FFFrog): Preserved for BC and will be removed in the future.
if (at::GetGeneratorPrivate().has_value())
return at::GetGeneratorForPrivateuse1(device_index);

View File

@ -283,7 +283,7 @@ inline void boxed_existing_bdim_all_batch_rule(
// Use when all tensors arguments accept one (normal) batch dim.
// This batching rule expands the batch dim on all Tensors, reshapes it into
// dim 0, calls the op, and then reshapes the batch dim out of dim 0.
// This is not the most efficient thing; if there are alternatives, plese try
// This is not the most efficient thing; if there are alternatives, please try
// to use them. Use this only as a last resort.
#define EXISTING_BDIM_ALL_BOXED(op) \
m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_existing_bdim_all_batch_rule>());

View File

@ -384,7 +384,7 @@ fourOutputs solve_ex_batch_rule(
// NOTE [ solve_ex Batch Rule Contiguity ]
// A determines whether or not linalg_solve takes an optimized path. We need the check on A_ to match the one run on
// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behvaior
// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behavior
// differs based on whether or not the optimized path was taken
const auto batched_A_was_contiguous = A_bdim.has_value() ? at::select(A, *A_bdim, 0).is_contiguous() : A.is_contiguous();
if (batched_A_was_contiguous && !A.is_complex()) {

View File

@ -282,7 +282,7 @@ static std::tuple<Tensor, std::optional<int64_t>> _softmax_backward_batch_rule(
dim = getPhysicalDim(output_, /*has_batch_dim*/true, dim);
// Not sure why output_ needs to be marked as .contiguous(). Someting must
// Not sure why output_ needs to be marked as .contiguous(). Something must
// have changed in PyTorch (and output of softmax is probably always contiguous)
return std::make_tuple(at::_softmax_backward_data(grad_output_, output_.contiguous(), dim, input_dtype), 0);
}

View File

@ -224,7 +224,7 @@ static Tensor safeStack(TensorList tensors) {
// is possible for the backward function to return an undefined grad for some
// grad_input for each example. In that case, we return an undefined grad.
//
// It is theoretically posssible for *some* of the examples to produce an
// It is theoretically possible for *some* of the examples to produce an
// undefined grad (a kernel could peek at the gradient values and return an
// undefined tensor if it determines the gradient is full of zeros). We
// could handle this by treating the undefined grad as a zero-filled tensor

View File

@ -113,7 +113,7 @@ SymIntArrayRef BatchedTensorImpl::sym_sizes_custom() const {
return sym_sizes_default();
}
// The following are publically exposed as methods of Tensor
// The following are publicly exposed as methods of Tensor
IntArrayRef BatchedTensorImpl::strides_custom() const {
return strides_default();

View File

@ -37,7 +37,7 @@ namespace at::functorch {
// how to perform the transform.
//
// TODO: we can excise DynamicLayer in favor of Interpreter,
// But I am going to leave it for now as a compatiblity shim to avoid
// But I am going to leave it for now as a compatibility shim to avoid
// needing to refactor a lot of callsites...
struct TORCH_API DynamicLayer {
explicit DynamicLayer(

View File

@ -88,7 +88,7 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t);
// >>> VmapInterpreterPtr(&interpreter).batchSize()
//
// Finally, Interpreter::process switches on the type of the interpreter
// and calls one of {Transform}Intepreter::processImpl under the hood.
// and calls one of {Transform}Interpreter::processImpl under the hood.
// Same for Interpreter::sendToNextInterpreter :)
struct VmapInterpreterMeta {

View File

@ -143,7 +143,7 @@ struct TORCH_API VmapPhysicalView {
// mapping a physical tensor to a new logical tensor (BatchedTensor)
VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const;
// Maps a logical shape to a physical shape by pre-pending the batch
// Maps a logical shape to a physical shape by prepending the batch
// sizes to the logical shape.
VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const;
SymDimVector getPhysicalShape(c10::SymIntArrayRef logical_shape) const;

View File

@ -27,7 +27,7 @@ namespace at::functorch {
//
// There are alternative designs we could have chosen (e.g. each grad transform
// stores a weak map of Tensor -> AutogradMeta); the benefit of the TensorWrapper
// design is that we can re-use existing VariableType kernels (i.e. Autograd kernels)
// design is that we can reuse existing VariableType kernels (i.e. Autograd kernels)
// without much modification. Since a TensorWrapper looks like a regular Tensor,
// the VariableType kernel can pull out the AutogradMeta struct from where it
// expects and extend the autograd graph

View File

@ -158,7 +158,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
endKernelCoalescing();
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
// For some reason fillBufferfor stopped working for lengh > 4Gb on MacOS 26
// For some reason fillBufferfor stopped working for length > 4Gb on MacOS 26
// See https://github.com/pytorch/pytorch/issues/163962
// Workaround by batching copy commands into 4Gb chunks
constexpr size_t max_copy_size = 0x100000000; // 4GB

View File

@ -128,7 +128,7 @@ at::Tensor PackedLinearWeight::apply_impl(
auto* input_tr_ptr =
reinterpret_cast<uint8_t*>(input_tr.data_ptr<c10::quint8>());
// TODO: Activation transpose before and after the kernel can be removed if we
// keep activation tensor always tranposed.
// keep activation tensor always transposed.
fbgemm::transpose_simd<uint8_t>(
batch_size, K, input_ptr, K, input_tr_ptr, batch_size);

View File

@ -34,7 +34,7 @@ struct Dist {
// finish : This tells what to do with the aggregated value to compute
// the norm. Generally this is the result of val ^ (1 / p).
// backward : This is the gradient for that norm. Arguments are pretty
// self explanitory.
// self explanatory.
//
// There are a few cases where these aren't used. The 0 norm has no backward,
// because it's always 0, so that's shortcircuited earlier. There's a special

View File

@ -74,7 +74,7 @@ it to sum up the entire array into a single value.
`ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which
compiler flags it is currently compiled. This allows the programmer to write
generic code, which will be compiled under multipled compilation settings.
generic code, which will be compiled under multiplied compilation settings.
`../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains
a generic definition of `sumImplAll`. This function allows the user to reduce

View File

@ -1017,7 +1017,7 @@ struct HelperInterpBase {
while (aligned_interp_size % sizeof(int32_t) != 0) {
aligned_interp_size += 1;
}
// assert that we wont go out of bounds
// assert that we won't go out of bounds
TORCH_INTERNAL_ASSERT(aligned_interp_size * sizeof(int16_t) < interp_size * sizeof(double));
}

View File

@ -655,7 +655,7 @@ void ImagingResampleHorizontalConvolution8u4x(
// last element
auto mmk = _mm256_set1_epi32(k[i]);
// For num_channels == 3 (3 bytes = one pixel) we tolerate to read 4 bytes
// lines 0, 1 and 2 wont go out of allocated memory bounds
// lines 0, 1 and 2 won't go out of allocated memory bounds
auto pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)),
mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1);
@ -889,7 +889,7 @@ void ImagingResampleHorizontalConvolution8u(
_mm_loadu_si128((__m128i *) (lineIn_min + stride * i))),
_mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1);
// Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA
// Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA
// RGBA: pix1 = [
// r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0
// r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0
@ -1312,7 +1312,7 @@ void ImagingResampleVerticalConvolution8u(
// Here we write 4 bytes to the output even if num_channels < 4, e.g o = {r,g,b,X} for num_channels=3
// It is OK to write 4th byte (e.g. X) as on the next step we will overwrite it with new data.
// We also wont go out of bounds of lineOut memory allocation
// We also won't go out of bounds of lineOut memory allocation
std::memcpy(lineOut + j, (uint8_t *) &o, 4);
}

View File

@ -240,7 +240,7 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 8 sines at onces using AVX intrinsics
/* evaluation of 8 sines at once using AVX intrinsics
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to

View File

@ -311,7 +311,7 @@ void GroupNormKernelImplChannelsLastInternal(
const bool gamma_null = (gamma_data == nullptr);
const bool beta_null = beta_data == nullptr;
// NB: About algorithm choosen:
// NB: About algorithm chosen:
//
// On channels last, GroupNorm has a input shape of {N, H, W, GD},
// Mean and rstd are collected per each n and g, which involves reduction

View File

@ -930,7 +930,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
}
};
// Dynamically Quantize the float32 input to 8 bit assymetric
// Dynamically Quantize the float32 input to 8 bit asymmetric
input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx);
const size_t lhs_stride =
@ -1163,7 +1163,7 @@ void dyn_quant_matmul_4bit_kernel(
const int64_t weight_packed_size =
kleidiai::kai_pack_rhs_int4_size(N, K, block_size);
if (weight_packed_size == packed_weights.numel()) {
// KleidiAI interface intenally handles the Channelwise and groupwise
// KleidiAI interface internally handles the Channelwise and groupwise
// distinction
kleidiai::kai_quant_pack_lhs_int4_mm(
output, inp, packed_weights, M, N, K, block_size);

View File

@ -705,7 +705,7 @@ namespace {
);
} while (!done && max_threads);
if (!done) {
TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accomodate sharedMemPerBlock limit");
TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accommodate sharedMemPerBlock limit");
}
break;
}

View File

@ -154,19 +154,19 @@ struct cublasCommonArgs {
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
bool transpose_result = false, transpose_a = false, transpose_b = false;
result = prepare_matrix_for_cublas(c, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); // codespell:ignore
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result); // codespell:ignore
// Handle scale tensors if provided
if (scale_a && scale_b) {
// By default since we return in row-major we run the gemm
// as B.T @ A.T, check transpose_result to determine if we flip the scales
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr(); // codespell:ignore
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); // codespell:ignore
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a; // codespell:ignore
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); // codespell:ignore
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); // codespell:ignore
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b; // codespell:ignore
}
if (scale_result) {
@ -180,17 +180,17 @@ struct cublasCommonArgs {
transpose_b = !transpose_b;
}
auto sizes_a = mata->sizes();
auto sizes_b = matb->sizes();
auto sizes_a = mata->sizes(); // codespell:ignore
auto sizes_b = matb->sizes(); // codespell:ignore
m = sizes_a[transpose_result ? 1 : 0];
k = sizes_a[transpose_result ? 0 : 1];
n = sizes_b[transpose_result ? 0 : 1];
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); // codespell:ignore
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); // codespell:ignore
result_ld = result->stride(transpose_result ? 0 : 1);
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore
// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
// if the gemm operands are in packed float4
@ -205,16 +205,16 @@ struct cublasCommonArgs {
char transa, transb;
int64_t m, n, k;
int64_t lda, ldb, result_ld;
c10::MaybeOwned<Tensor> mata, matb, result;
c10::MaybeOwned<Tensor> mata, matb, result; // codespell:ignore
// Scale members
void* scale_mata_ptr = nullptr;
void* scale_matb_ptr = nullptr;
void* scale_mata_ptr = nullptr; // codespell:ignore
void* scale_matb_ptr = nullptr; // codespell:ignore
void* scale_result_ptr = nullptr;
std::optional<c10::ScalarType> scale_mata_dtype;
std::optional<ScalingType> scaling_mata_type;
std::optional<c10::ScalarType> scale_matb_dtype;
std::optional<ScalingType> scaling_matb_type;
std::optional<c10::ScalarType> scale_mata_dtype; // codespell:ignore
std::optional<ScalingType> scaling_mata_type; // codespell:ignore
std::optional<c10::ScalarType> scale_matb_dtype; // codespell:ignore
std::optional<ScalingType> scaling_matb_type; // codespell:ignore
std::optional<c10::ScalarType> scale_result_dtype;
};
} // namespace
@ -362,7 +362,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
static bool disable_addmm_cuda_lt = getDisableAddmmCudaLt();
#endif
// if lt path fails, we recurse back into this function here and force the lt path to off
// we cannot update varible disable_addmm_cuda_lt from above since it is static and would be permanent
// we cannot update variable disable_addmm_cuda_lt from above since it is static and would be permanent
bool disable_addmm_cuda_lt_final = disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
#if defined(USE_ROCM) && ROCM_VERSION == 60400
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use
@ -2886,7 +2886,7 @@ _scaled_grouped_mm_cuda_v2(
"Contraction dimensions (", dim_a, ",", dim_b, ") of mat_a and mat_b must match, got: ", mat_a.size(dim_a), " and ",
mat_b.size(dim_b));
// Note: only (-1, -2) is currently supported
TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Curently contraction dims must be (-1, -2) only");
TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Currently contraction dims must be (-1, -2) only");
} else {
TORCH_CHECK_VALUE(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}

View File

@ -298,7 +298,7 @@ static void jitted_gpu_kernel_impl(
at::opmath_type<f_inputs_type> scalar_val,
const std::tuple<ExtraArgs...>& extra_args) {
// TODO: Memory use can probably be optimized by re-using kernels across GPUs with
// TODO: Memory use can probably be optimized by reusing kernels across GPUs with
// the same compute capability
static std::mutex jiterator_mutex;
static std::vector<JittedKernelVariantCache> device_caches(c10::cuda::device_count());

View File

@ -494,7 +494,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen)
auto value = static_cast<scalar_t>(rand * range + from);
// reverse the bounds of curand4 from (0, 1] to [0, 1)
// Note that this method is from legacy THCTensorRandom and is likely to give
// you more 0-s, since, the probability of gettings 1-s is higher than 0-s and
// you more 0-s, since, the probability of getting 1-s is higher than 0-s and
// by reversing the bounds, we are flipping the probabilities of 1-s and 0-s.
// BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
auto reverse_bound_value = value == to ? from : value;

View File

@ -75,7 +75,7 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo<const scalar_t, IndexType>
// We'll use this to actually cause vectorized loads later
LoadT *value = reinterpret_cast<LoadT*>(&src);
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
// Note: need a new set of random values per 4 elements -- we'll handle VEC elements in this thread, so need ceil(VEC / 4)
// sets of rand.
if ((VEC >= 4) || (gridxvec_loop_state == 0)) {
@ -159,7 +159,7 @@ fused_dropout_kernel(cuda::detail::TensorInfo<const scalar_t, IndexType> a,
for (IndexType linearIndex = idx;
linearIndex < rounded_size;
linearIndex += gridDim.x * blockDim.x*UNROLL) {
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
float4 rand = curand_uniform4(&state);
scalar_t src[UNROLL];
rand.x = rand.x < p;

View File

@ -24,7 +24,7 @@ namespace at::native {
namespace {
/* This code computes the sum of the weights in two-steps:
1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indeces`
1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indices`
2) Each partial-sum from 1) are summed and scatter into `grad_weight`
Notice, `NROWS_PER_THREAD` impacts the Achieved Occupancy of the

View File

@ -204,7 +204,7 @@ Scalar scalar_reciprocal(const Scalar& scalar) {
return Scalar(1. / scalar.toComplexDouble());
}
TORCH_INTERNAL_ASSERT(
false, "divison with ", scalar.type(), " not supported");
false, "division with ", scalar.type(), " not supported");
}
void foreach_tensor_div_scalar_kernel_cuda_(

View File

@ -57,7 +57,7 @@ namespace {
const index_t n = index / (out_H * out_W);
const index_t grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
// get the corresponding input x, y co-ordinates from grid
// get the corresponding input x, y coordinates from grid
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];
@ -193,7 +193,7 @@ namespace {
const index_t n = index / (out_D * out_H * out_W);
const index_t grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];
opmath_t z = grid.data[grid_offset + 2 * grid_sCoor];
@ -358,7 +358,7 @@ namespace {
const index_t n = index / (out_H * out_W);
const auto grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
// get the corresponding input x, y co-ordinates from grid
// get the corresponding input x, y coordinates from grid
scalar_t x = grid.data[grid_offset];
scalar_t y = grid.data[grid_offset + grid_sCoor];
@ -572,7 +572,7 @@ namespace {
const index_t n = index / (out_D * out_H * out_W);
const auto grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
scalar_t ix = grid.data[grid_offset];
scalar_t iy = grid.data[grid_offset + grid_sCoor];
scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor];

View File

@ -8,7 +8,7 @@
#include <c10/util/irange.h>
// Three warninngs in Cutlass included header files
// Three warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

View File

@ -377,7 +377,7 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) {
* result at the boundary
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
* Large Parameter (see DLMF 8.12.4 [igam1])
* - if x > 1.1 and x < a, using the substraction from the regularized lower
* - if x > 1.1 and x < a, using the subtraction from the regularized lower
* incomplete gamma
* - otherwise, calculate the series from [igam2] eq (5)
*/
@ -460,7 +460,7 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) {
* result at the boundary
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
* Large Parameter (see DLMF 8.12.3 [igam1])
* - if x > 1 and x > a, using the substraction from the regularized upper
* - if x > 1 and x > a, using the subtraction from the regularized upper
* incomplete gamma
* - otherwise, calculate the series from [igam2] eq (4)
*/

View File

@ -332,7 +332,7 @@ void cuda_take_put_kernel(
const auto offset_calc = make_offset_calculator<2>(iter);
using uindex_t = std::make_unsigned_t<index_t>;
// OffsetCalculator needs the sizes and strides reveresed
// OffsetCalculator needs the sizes and strides reversed
const auto indexed_sizes = std::vector<int64_t>(indexed.sizes().rbegin(), indexed.sizes().rend());
const auto indexed_strides = std::vector<int64_t>(indexed.strides().rbegin(), indexed.strides().rend());
const auto* indexed_strides_data = indexed_strides.data();

View File

@ -1611,7 +1611,7 @@ void index_select_out_cuda_impl(
// SmallIndexKernel is more performant when the number of indices is small, and pre-loading
// the index reduces memory accesses. When the number of indices is large, we avoid that
// and increase parallellism by calling gather_out which is a generalization of index_select
// and increase parallelism by calling gather_out which is a generalization of index_select
if (cuda::detail::canUse32BitIndexMath(out) &&
cuda::detail::canUse32BitIndexMath(self) &&
cuda::detail::canUse32BitIndexMath(index) &&

View File

@ -273,7 +273,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
scalar_t* dst = self_ptr + index;
//pack coalseced bf16 and fp16
//pack coalesced bf16 and fp16
if constexpr (std::is_same<scalar_t, c10::BFloat16>::value || std::is_same<scalar_t, c10::Half>::value)
{
typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
@ -316,7 +316,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
}
}
// not coalsced, so now let try to capture lane-matches...
// not coalesced, so now let try to capture lane-matches...
if (numel > 16 /*<-hueristic threshold*/ * 64 ) {
// well shucks, unlikely to capture same-dest atomics in a wave.

View File

@ -343,7 +343,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
if (input_length == 0)
return;
// "first" row, the beta initialization before eq (10) (t=target_length - differes per batch)
// "first" row, the beta initialization before eq (10) (t=target_length - differs per batch)
for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) {
int64_t s = threadIdx.x + block_s;
scalar_t lb;

View File

@ -816,7 +816,7 @@ const auto erfcx_string = jiterator_stringify(
with the usual checks for overflow etcetera.
Performance-wise, it seems to be substantially faster than either
the SLATEC DERFC function [or an erfcx function derived therefrom]
the SLATEC DERFC function [or an erfcx function derived there from]
or Cody's CALERF function (from netlib.org/specfun), while
retaining near machine precision in accuracy.
*/

View File

@ -370,7 +370,7 @@ struct vectorized {
#ifdef USE_ROCM
// This is similar to vectorized policy above, but this one supports
// heterogenous input tensor types as templated parameters.
// heterogeneous input tensor types as templated parameters.
// Its use should be limited to frequently used heterogeneous data types
// as each instantiation will generate a separate kernel, leading to code
// bloating if applied to all combinations supported in PyTorch. Assumption: all

View File

@ -309,7 +309,7 @@ __global__ void sampleMultinomialOnce(
} else {
// This should address a rare bug where we don't select a valid index. This likely occurs when
// due to floating point arithmetic rounding errors, our cumulative sum does not add up to 1, but
// and our uniform sample is greater than this value. In this case we likely have unitialized memory
// and our uniform sample is greater than this value. In this case we likely have uninitialized memory
// in dest[curDist]. So basically we will loop through the distribution and pick the largest index
// where the distribution is non-zero. This is obviously terribly inefficient, but due to the
// rarity in which this occurs, this should not be an issue.

View File

@ -1623,7 +1623,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto stride = input.sizes()[1];
const auto reduction_size = input.numel() / stride;
// Input is guarunteed to be channels-last compatible
// Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;
@ -1691,7 +1691,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto reduction_size = input.numel() / stride;
auto norm_fct = 1.0 / reduction_size;
// Input is guarunteed to be channels-last compatible
// Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;

View File

@ -37,7 +37,7 @@ namespace at::native {
// threshold probability for having non-duplicate keys, then it can be proved that[1]
// the number of bits required is: ceil(log2(n - (6 n^2 + 1) / (12 log(q))))
//
// Then after sort, we lauch a separate kernel that additionally shuffles any islands
// Then after sort, we launch a separate kernel that additionally shuffles any islands
// of values whose keys matched. The algorithm of this kernel is as follows:
// Each thread reads its key and the keys of its neighbors to tell if it's part of an island.
// For each island, the first thread in the island sees a key match at index i+1 but not index i-1.

View File

@ -1088,12 +1088,12 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
// load instructions.
//
// Case 1: "vectorize along input"
// This case happens when we are reducing along fastest moving dimesion. In such case, threads
// This case happens when we are reducing along fastest moving dimension. In such case, threads
// with the same threadIdx.y works on the same reduction cooperatively and will produce results
// for the same output. In such case, values in each loaded vector always correspond to the same output.
//
// Case 2: "vectorize along output"
// This case happens when the fastest moving dimesion is not the dimension of reduction. In such case,
// This case happens when the fastest moving dimension is not the dimension of reduction. In such case,
// threads with different threadIdx.x are independent and will produce results for different outputs.
// In such case, values in each loaded vector always correspond to different outputs.
if (fastest_moving_stride == sizeof(scalar_t)) {

View File

@ -241,7 +241,7 @@ __global__ void reflection_pad2d_backward_det_out_kernel(
const int64_t dist_cols = ::abs(inp_col - (input_dim_x - 1));
// we were dist_rows after, now we want to be dist_rows before
// we were dist_cols before, now we wnat to be dist_cols after
// we were dist_cols before, now we want to be dist_cols after
const int64_t reflect_tr_out_row = (corner_tr_out_row - dist_rows);
const int64_t reflect_tr_out_col = (corner_tr_out_col + dist_cols);
const int64_t reflect_tr_out =

View File

@ -5,7 +5,7 @@
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <c10/macros/Macros.h>
// Two warninngs in Cutlass included header files
// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-field-initializers")

View File

@ -7,7 +7,7 @@
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>
// Two warninngs in Cutlass included header files
// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

View File

@ -460,7 +460,7 @@ __global__ void GammaBetaBackwardCUDAKernel2(
}
}
// Do warp reduce for the 2st 16 cols in the tile.
// Do warp reduce for the 2nd 16 cols in the tile.
sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y];
sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y];
sum1 = cuda_utils::WarpReduceSum<T_ACC>(sum1);

View File

@ -1532,7 +1532,7 @@ NvrtcFunction jit_pwise_function(
std::string file_path;
if (cache_dir.has_value()) {
// Attemps to read from the cache.
// Attempts to read from the cache.
// Cubin name is <kernel name>_arch<major>.<minor>_nvrtc<major>.<minor>_<ptx or sass>_<program length>_<string hash>
// Note that the SHA1 hash used in the file name is NOT the SHA1 hash of the file's contents,
// because we hash on the CUDA code, but we save the compiled ptx or sass
@ -1556,19 +1556,19 @@ NvrtcFunction jit_pwise_function(
ss << "_" << hash_code;
file_path = ss.str();
std::ifstream readin{file_path, std::ios::in | std::ifstream::binary};
if (readin.fail()) {
std::ifstream read_stream{file_path, std::ios::in | std::ifstream::binary};
if (read_stream.fail()) {
// NOTE: this does not warn because the file might not exist
// TODO: consider if this should explicitly check for the file's existence or not to throw
// an informative warning
readin.close();
read_stream.close();
} else {
// TODO: try passing the "mapped" file directly to cuModuleLoadCall instead of using an intermediate buffer
std::vector<char> buffer(std::istreambuf_iterator<char>(readin), {});
std::vector<char> buffer(std::istreambuf_iterator<char>(read_stream), {});
AT_CUDA_DRIVER_CHECK(nvrtc.cuModuleLoadData(&(compiled_kernel_.module), buffer.data()));
AT_CUDA_DRIVER_CHECK(
nvrtc.cuModuleGetFunction(&(compiled_kernel_.function), compiled_kernel_.module, name.c_str()));
readin.close();
read_stream.close();
return compiled_kernel_;
}
}

View File

@ -1050,7 +1050,7 @@ void launch_vectorized_layer_norm_kernel(
C10_CUDA_KERNEL_LAUNCH_CHECK();
#ifdef USE_ROCM
// the blocks.x contains the max grid x dimention without invalid configuration error
// the blocks.x contains the max grid x dimension without invalid configuration error
// Fix invalid configuration https://github.com/pytorch/pytorch/issues/136291
// Ensure all elements are processed. Prepare for next round
int64_t remaining = M - blocks.x;

View File

@ -1346,7 +1346,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
});
if (input.dim() > 2) {
// if upper=true we need to tranpose and conjugate the result tensor
// if upper=true we need to transpose and conjugate the result tensor
// because the cholesky decomposition is stored in the lower triangular part
if (upper) {
input.copy_(result.mH());
@ -1857,7 +1857,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
// TODO Investigate whether the following magma bug is still occuring.
// TODO Investigate whether the following magma bug is still occurring.
// It may be the case that geqrf followed by orgqr is wrong for the magma backend
// geqrf_magma currently uses geqrf2_gpu
//

View File

@ -82,7 +82,7 @@ void lu_factor_looped_cusolver(const Tensor& self, const Tensor& pivots, const T
#if defined(BUILD_LAZY_CUDA_LINALG)
namespace cuda { namespace detail {
// This is only used for an old-style dispatches
// Please do not add any new entires to it
// Please do not add any new entries to it
struct LinalgDispatch {
Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper);
};

View File

@ -177,7 +177,7 @@ bool use_ragged_in_dense(
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works with Q, K, V, and output in BSHD memory layout,"
"e.g., Q, K, V must be allocated with torch.randn((B, S, H, D).transpose(1, 2)."
"Falling back to regualr dense case, which may trigger excessive recompilation.");
"Falling back to regular dense case, which may trigger excessive recompilation.");
}
return all_bshd;
}
@ -771,7 +771,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_nestedtensor(
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
"attn_bias not yet supported with cuDNN Attention and NestedTensor");
scaled_dot_product_flash_attention_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)
@ -1196,7 +1196,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_backward_nestedtensor(
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
"attn_bias not yet supported with cuDNN Attention and NestedTensor");
sdpa_backward_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)
@ -1864,7 +1864,7 @@ void run_cudnn_SDP_bprop_nestedtensor(
}
TORCH_CHECK(
!attn_bias.has_value(),
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
"attn_bias not yet supported with cuDNN Attention and NestedTensor");
auto workspace_size = mha_graph.get_workspace_size();
auto workspace_ptr =

View File

@ -30,7 +30,7 @@ static const std::unordered_map<
};
// This is the heursitic to choose a kernel based on inputs
// This is the heuristic to choose a kernel based on inputs
BGEMMKernel_BFloat16 dispatch_bfloat16_bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
// Optional/future use: directly lookup shape tuples to map to instances
/*

View File

@ -11,7 +11,7 @@ using S = ck::Sequence<Is...>;
namespace at::native {
void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@ -471,7 +471,7 @@ void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}
void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

View File

@ -11,7 +11,7 @@ using S = ck::Sequence<Is...>;
namespace at::native {
void dispatch_float_gemm(CUDABLAS_GEMM_ARGTYPES(float)) {
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

View File

@ -13,7 +13,7 @@ namespace at::native {
void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
#if 0
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@ -299,7 +299,7 @@ void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
#endif
}
void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

View File

@ -545,7 +545,7 @@ kernel void reshape(texture2d_array<half, access::read> in_arr[[texture(0), func
const ushort slices2 = divRoundUp(C2, 4);
const ushort slices1 = divRoundUp(C1, 4);
const ushort n2 = gid.z / slices2; //image index
const ushort s2 = gid.z - n2 * slices2; // slice offest
const ushort s2 = gid.z - n2 * slices2; // slice offset
half4 value;
for (int idx = 0; idx < 4; ++idx){
// we compute the "linear index" of the output element,

View File

@ -86,4 +86,4 @@ TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid_"), TORCH_FN(hardsigmoid_));
}
} // namepsace at::native::metal
} // namespace at::native::metal

View File

@ -147,7 +147,7 @@ static void check_shape_forward(const Tensor& input,
// blocked format will propagate between layers. Input, output will be in blocked format.
//
// For inference case, weight can be prepacked into blocked format by
// (so as to save weight reoder overhead):
// (so as to save weight reorder overhead):
// model = torch.utils.mkldnn.to_mkldnn(model)
//
// For training case, grad_output can be CPU tensor or MKLDNN tensor,

View File

@ -540,7 +540,7 @@ static void _mkldnn_matmul_i8i8i32_with_primitive(
args.insert({DNNL_ARG_WEIGHTS, expected_weight});
args.insert({DNNL_ARG_DST, dst});
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
// Create primitve and execute
// Create primitive and execute
auto primitive = dnnl::matmul(prim_desc);
primitive.execute(ideep::stream::default_stream(), args);
}

View File

@ -215,7 +215,7 @@ partition create_sdpa_graph_partition(
// For optional additive mask
std::optional<op> mask_add;
// For optional implicite causal mask
// For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;
@ -556,7 +556,7 @@ partition create_sdpa_backward_graph_partition(
// For optional additive mask
std::optional<op> mask_add;
// For optional implicite causal mask
// For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;

View File

@ -34,7 +34,7 @@ namespace at::native::onednn {
/*
oneDNN postops usage:
Currently, oneDNN supports 5 kinds of post ops. More details can be refered
Currently, oneDNN supports 5 kinds of post ops. More details can be referred
to oneDNN doc.
https://oneapi-src.github.io/oneDNN/dev_guide_attributes_post_ops.html#doxid-dev-guide-attributes-post-ops-1dev-guide-attributes-post-ops-eltwise
@ -345,7 +345,7 @@ class Attr {
dnnl::memory binary_m;
auto binary = ops_params_[i].binary_;
auto md = ops_params_[i].meta_;
// qeury expected_md to achieve peak performance
// query expected_md to achieve peak performance
auto expected_md = pd.query_md(
dnnl::query::exec_arg_md,
DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1);
@ -399,7 +399,7 @@ static inline void construct_attr_for_unary(
} else {
TORCH_CHECK(
unary_post_op == "none",
"onednn qlinear: unspported unary post op",
"onednn qlinear: unsupported unary post op",
unary_post_op);
}
}

View File

@ -301,7 +301,7 @@ bool is_onednn_matmul_strides(const at::Tensor& tensor) {
return false;
}
// the overlaped cases are not supported
// the overlapped cases are not supported
dnnl::memory::dims strides = get_onednn_strides(tensor);
int64_t storage_size = 1;
for (size_t dim = 0; dim < tensor_dim; ++dim)

View File

@ -29,7 +29,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
// agruments results in
// arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {
@ -42,7 +42,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
// agruments results in
// arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {
@ -539,7 +539,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
// Use gather kernel to solve strides for macOS < 15.0
// Starting with macOS 15.0, MPS supports native strides direclty in the kernels
// Starting with macOS 15.0, MPS supports native strides directly in the kernels
if (!is_macOS_15_0_or_newer || !useMPSStridedAPI) {
if ((!src.is_contiguous() || src.storage_offset()) && gatherTensorData) {
Tensor emptyShell = Tensor();
@ -856,7 +856,7 @@ id<MTLLibrary> MetalShaderLibrary::getLibrary(const std::initializer_list<std::s
break;
}
default:
TORCH_INTERNAL_ASSERT(false, "Unsupported number of paramaters ", nparams);
TORCH_INTERNAL_ASSERT(false, "Unsupported number of parameters ", nparams);
}
return libMap[key] = lib;
}
@ -1184,9 +1184,9 @@ void MetalKernelFunction::dispatch(uint64_t length, std::optional<uint64_t> grou
}
void MetalKernelFunction::dispatch(c10::ArrayRef<uint64_t> length, c10::OptionalArrayRef<uint64_t> group_size) {
TORCH_CHECK(!length.empty() && length.size() < 4, "Dispatch dimentions must be less than 3 and non-empty");
TORCH_CHECK(!length.empty() && length.size() < 4, "Dispatch dimensions must be less than 3 and non-empty");
TORCH_CHECK(!group_size.has_value() || group_size->size() == length.size(),
"size and group_size must have same number of dimentions");
"size and group_size must have same number of dimensions");
const auto max_tg_size = getMaxThreadsPerThreadgroup();
const auto group_size_length = group_size.has_value() ? group_size->size() : 0;
auto tg_size = MTLSizeMake(group_size_length > 0 ? group_size->at(0) : max_tg_size,

View File

@ -59,7 +59,7 @@ static GridSamplerOffsets find_grid_sampler_offsets(
return offsets;
}
// Mod function which gives postive output when `a` is negative
// Mod function which gives positive output when `a` is negative
static int32_t mod(int32_t a, int32_t b) {
auto r = a % b;
return r + (r < 0 ? b : 0);
@ -191,9 +191,9 @@ void grid_sampler_single_element(
int32_t right_indices[3];
opmath_t<T> scales[3];
// For each dimension, find the pair of indices in the cooresponding dimension
// For each dimension, find the pair of indices in the corresponding dimension
// of `input` which surround the grid coordinate in that dimension. We'll do
// this by mapping different coordiante spaces onto each other. There are
// this by mapping different coordinate spaces onto each other. There are
// basically three different coordinate spaces to keep in mind:
//
// * aligned grid space

View File

@ -137,7 +137,7 @@ kernel void index_put_serial(
constant int64_t* index_strides,
constant uint4& ndim_nindices_numel,
uint thread_index [[thread_position_in_grid]]) {
(void)thread_index; // Suppress unused vairable varning
(void)thread_index; // Suppress unused variable warning
for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) {
index_put_impl(
output,

Some files were not shown because too many files have changed in this diff Show More