[BE][4/5] fix typos in aten/ (aten/src/ATen/native/)

ghstack-source-id: 288daaa86090e465421c81ae8373b0f6f63eab4a
Pull-Request: https://github.com/pytorch/pytorch/pull/157553
Xuehai Pan
2025-10-19 18:28:27 +08:00
parent 57ba575242
commit 0e32b4a985
60 changed files with 118 additions and 118 deletions

View File

@@ -1205,7 +1205,6 @@ exclude_patterns = [
# These files are all grandfathered in, feel free to remove from this list
# as necessary
# NOTE: remove the patterns in the order they are listed
'aten/src/ATen/native/[a-pA-P]*/**',
'aten/src/ATen/[a-mA-M]*/**',
'test/**',
]

View File

@@ -128,7 +128,7 @@ at::Tensor PackedLinearWeight::apply_impl(
auto* input_tr_ptr =
reinterpret_cast<uint8_t*>(input_tr.data_ptr<c10::quint8>());
// TODO: Activation transpose before and after the kernel can be removed if we
// keep activation tensor always tranposed.
// keep activation tensor always transposed.
fbgemm::transpose_simd<uint8_t>(
batch_size, K, input_ptr, K, input_tr_ptr, batch_size);

View File

@@ -34,7 +34,7 @@ struct Dist {
// finish : This tells what to do with the aggregated value to compute
// the norm. Generally this is the result of val ^ (1 / p).
// backward : This is the gradient for that norm. Arguments are pretty
// self explanitory.
// self explanatory.
//
// There are a few cases where these aren't used. The 0 norm has no backward,
// because it's always 0, so that's shortcircuited earlier. There's a special

View File

@@ -74,7 +74,7 @@ it to sum up the entire array into a single value.
`ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which
compiler flags it is currently compiled. This allows the programmer to write
generic code, which will be compiled under multipled compilation settings.
generic code, which will be compiled under multiplied compilation settings.
`../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains
a generic definition of `sumImplAll`. This function allows the user to reduce

View File

@@ -1017,7 +1017,7 @@ struct HelperInterpBase {
while (aligned_interp_size % sizeof(int32_t) != 0) {
aligned_interp_size += 1;
}
// assert that we wont go out of bounds
// assert that we won't go out of bounds
TORCH_INTERNAL_ASSERT(aligned_interp_size * sizeof(int16_t) < interp_size * sizeof(double));
}

View File

@@ -655,7 +655,7 @@ void ImagingResampleHorizontalConvolution8u4x(
// last element
auto mmk = _mm256_set1_epi32(k[i]);
// For num_channels == 3 (3 bytes = one pixel) we tolerate to read 4 bytes
// lines 0, 1 and 2 wont go out of allocated memory bounds
// lines 0, 1 and 2 won't go out of allocated memory bounds
auto pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)),
mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1);
@@ -889,7 +889,7 @@ void ImagingResampleHorizontalConvolution8u(
_mm_loadu_si128((__m128i *) (lineIn_min + stride * i))),
_mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1);
// Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA
// Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA
// RGBA: pix1 = [
// r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0
// r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0
@@ -1312,7 +1312,7 @@ void ImagingResampleVerticalConvolution8u(
// Here we write 4 bytes to the output even if num_channels < 4, e.g o = {r,g,b,X} for num_channels=3
// It is OK to write 4th byte (e.g. X) as on the next step we will overwrite it with new data.
// We also wont go out of bounds of lineOut memory allocation
// We also won't go out of bounds of lineOut memory allocation
std::memcpy(lineOut + j, (uint8_t *) &o, 4);
}

View File

@@ -240,7 +240,7 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 8 sines at onces using AVX intrinsics
/* evaluation of 8 sines at once using AVX intrinsics
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to

View File

@@ -311,7 +311,7 @@ void GroupNormKernelImplChannelsLastInternal(
const bool gamma_null = (gamma_data == nullptr);
const bool beta_null = beta_data == nullptr;
// NB: About algorithm choosen:
// NB: About algorithm chosen:
//
// On channels last, GroupNorm has a input shape of {N, H, W, GD},
// Mean and rstd are collected per each n and g, which involves reduction

View File

@@ -930,7 +930,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
}
};
// Dynamically Quantize the float32 input to 8 bit assymetric
// Dynamically Quantize the float32 input to 8 bit asymmetric
input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx);
const size_t lhs_stride =
@@ -1163,7 +1163,7 @@ void dyn_quant_matmul_4bit_kernel(
const int64_t weight_packed_size =
kleidiai::kai_pack_rhs_int4_size(N, K, block_size);
if (weight_packed_size == packed_weights.numel()) {
// KleidiAI interface intenally handles the Channelwise and groupwise
// KleidiAI interface internally handles the Channelwise and groupwise
// distinction
kleidiai::kai_quant_pack_lhs_int4_mm(
output, inp, packed_weights, M, N, K, block_size);

View File

@@ -705,7 +705,7 @@ namespace {
);
} while (!done && max_threads);
if (!done) {
TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accomodate sharedMemPerBlock limit");
TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accommodate sharedMemPerBlock limit");
}
break;
}

View File

@@ -154,19 +154,19 @@ struct cublasCommonArgs {
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
bool transpose_result = false, transpose_a = false, transpose_b = false;
result = prepare_matrix_for_cublas(c, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); // codespell:ignore
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result); // codespell:ignore
// Handle scale tensors if provided
if (scale_a && scale_b) {
// By default since we return in row-major we run the gemm
// as B.T @ A.T, check transpose_result to determine if we flip the scales
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr(); // codespell:ignore
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); // codespell:ignore
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a; // codespell:ignore
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); // codespell:ignore
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); // codespell:ignore
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b; // codespell:ignore
}
if (scale_result) {
@@ -180,17 +180,17 @@ struct cublasCommonArgs {
transpose_b = !transpose_b;
}
auto sizes_a = mata->sizes();
auto sizes_b = matb->sizes();
auto sizes_a = mata->sizes(); // codespell:ignore
auto sizes_b = matb->sizes(); // codespell:ignore
m = sizes_a[transpose_result ? 1 : 0];
k = sizes_a[transpose_result ? 0 : 1];
n = sizes_b[transpose_result ? 0 : 1];
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); // codespell:ignore
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); // codespell:ignore
result_ld = result->stride(transpose_result ? 0 : 1);
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore
// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
// if the gemm operands are in packed float4
@@ -205,16 +205,16 @@ struct cublasCommonArgs {
char transa, transb;
int64_t m, n, k;
int64_t lda, ldb, result_ld;
c10::MaybeOwned<Tensor> mata, matb, result;
c10::MaybeOwned<Tensor> mata, matb, result; // codespell:ignore
// Scale members
void* scale_mata_ptr = nullptr;
void* scale_matb_ptr = nullptr;
void* scale_mata_ptr = nullptr; // codespell:ignore
void* scale_matb_ptr = nullptr; // codespell:ignore
void* scale_result_ptr = nullptr;
std::optional<c10::ScalarType> scale_mata_dtype;
std::optional<ScalingType> scaling_mata_type;
std::optional<c10::ScalarType> scale_matb_dtype;
std::optional<ScalingType> scaling_matb_type;
std::optional<c10::ScalarType> scale_mata_dtype; // codespell:ignore
std::optional<ScalingType> scaling_mata_type; // codespell:ignore
std::optional<c10::ScalarType> scale_matb_dtype; // codespell:ignore
std::optional<ScalingType> scaling_matb_type; // codespell:ignore
std::optional<c10::ScalarType> scale_result_dtype;
};
} // namespace
@@ -362,7 +362,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
static bool disable_addmm_cuda_lt = getDisableAddmmCudaLt();
#endif
// if lt path fails, we recurse back into this function here and force the lt path to off
// we cannot update varible disable_addmm_cuda_lt from above since it is static and would be permanent
// we cannot update variable disable_addmm_cuda_lt from above since it is static and would be permanent
bool disable_addmm_cuda_lt_final = disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
#if defined(USE_ROCM) && ROCM_VERSION == 60400
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use
@@ -2886,7 +2886,7 @@ _scaled_grouped_mm_cuda_v2(
"Contraction dimensions (", dim_a, ",", dim_b, ") of mat_a and mat_b must match, got: ", mat_a.size(dim_a), " and ",
mat_b.size(dim_b));
// Note: only (-1, -2) is currently supported
TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Curently contraction dims must be (-1, -2) only");
TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Currently contraction dims must be (-1, -2) only");
} else {
TORCH_CHECK_VALUE(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}

View File

@@ -298,7 +298,7 @@ static void jitted_gpu_kernel_impl(
at::opmath_type<f_inputs_type> scalar_val,
const std::tuple<ExtraArgs...>& extra_args) {
// TODO: Memory use can probably be optimized by re-using kernels across GPUs with
// TODO: Memory use can probably be optimized by reusing kernels across GPUs with
// the same compute capability
static std::mutex jiterator_mutex;
static std::vector<JittedKernelVariantCache> device_caches(c10::cuda::device_count());

View File

@@ -494,7 +494,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen)
auto value = static_cast<scalar_t>(rand * range + from);
// reverse the bounds of curand4 from (0, 1] to [0, 1)
// Note that this method is from legacy THCTensorRandom and is likely to give
// you more 0-s, since, the probability of gettings 1-s is higher than 0-s and
// you more 0-s, since, the probability of getting 1-s is higher than 0-s and
// by reversing the bounds, we are flipping the probabilities of 1-s and 0-s.
// BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
auto reverse_bound_value = value == to ? from : value;

View File

@@ -75,7 +75,7 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo<const scalar_t, IndexType>
// We'll use this to actually cause vectorized loads later
LoadT *value = reinterpret_cast<LoadT*>(&src);
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
// Note: need a new set of random values per 4 elements -- we'll handle VEC elements in this thread, so need ceil(VEC / 4)
// sets of rand.
if ((VEC >= 4) || (gridxvec_loop_state == 0)) {
@@ -159,7 +159,7 @@ fused_dropout_kernel(cuda::detail::TensorInfo<const scalar_t, IndexType> a,
for (IndexType linearIndex = idx;
linearIndex < rounded_size;
linearIndex += gridDim.x * blockDim.x*UNROLL) {
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
float4 rand = curand_uniform4(&state);
scalar_t src[UNROLL];
rand.x = rand.x < p;

View File

@@ -24,7 +24,7 @@ namespace at::native {
namespace {
/* This code computes the sum of the weights in two-steps:
1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indeces`
1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indices`
2) Each partial-sum from 1) are summed and scatter into `grad_weight`
Notice, `NROWS_PER_THREAD` impacts the Achieved Occupancy of the

View File

@@ -204,7 +204,7 @@ Scalar scalar_reciprocal(const Scalar& scalar) {
return Scalar(1. / scalar.toComplexDouble());
}
TORCH_INTERNAL_ASSERT(
false, "divison with ", scalar.type(), " not supported");
false, "division with ", scalar.type(), " not supported");
}
void foreach_tensor_div_scalar_kernel_cuda_(

View File

@@ -57,7 +57,7 @@ namespace {
const index_t n = index / (out_H * out_W);
const index_t grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
// get the corresponding input x, y co-ordinates from grid
// get the corresponding input x, y coordinates from grid
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];
@@ -193,7 +193,7 @@ namespace {
const index_t n = index / (out_D * out_H * out_W);
const index_t grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];
opmath_t z = grid.data[grid_offset + 2 * grid_sCoor];
@@ -358,7 +358,7 @@ namespace {
const index_t n = index / (out_H * out_W);
const auto grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
// get the corresponding input x, y co-ordinates from grid
// get the corresponding input x, y coordinates from grid
scalar_t x = grid.data[grid_offset];
scalar_t y = grid.data[grid_offset + grid_sCoor];
@@ -572,7 +572,7 @@ namespace {
const index_t n = index / (out_D * out_H * out_W);
const auto grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
scalar_t ix = grid.data[grid_offset];
scalar_t iy = grid.data[grid_offset + grid_sCoor];
scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor];

View File

@@ -8,7 +8,7 @@
#include <c10/util/irange.h>
// Three warninngs in Cutlass included header files
// Three warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

View File

@@ -377,7 +377,7 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) {
* result at the boundary
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
* Large Parameter (see DLMF 8.12.4 [igam1])
* - if x > 1.1 and x < a, using the substraction from the regularized lower
* - if x > 1.1 and x < a, using the subtraction from the regularized lower
* incomplete gamma
* - otherwise, calculate the series from [igam2] eq (5)
*/
@@ -460,7 +460,7 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) {
* result at the boundary
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
* Large Parameter (see DLMF 8.12.3 [igam1])
* - if x > 1 and x > a, using the substraction from the regularized upper
* - if x > 1 and x > a, using the subtraction from the regularized upper
* incomplete gamma
* - otherwise, calculate the series from [igam2] eq (4)
*/

View File

@@ -332,7 +332,7 @@ void cuda_take_put_kernel(
const auto offset_calc = make_offset_calculator<2>(iter);
using uindex_t = std::make_unsigned_t<index_t>;
// OffsetCalculator needs the sizes and strides reveresed
// OffsetCalculator needs the sizes and strides reversed
const auto indexed_sizes = std::vector<int64_t>(indexed.sizes().rbegin(), indexed.sizes().rend());
const auto indexed_strides = std::vector<int64_t>(indexed.strides().rbegin(), indexed.strides().rend());
const auto* indexed_strides_data = indexed_strides.data();

View File

@@ -1611,7 +1611,7 @@ void index_select_out_cuda_impl(
// SmallIndexKernel is more performant when the number of indices is small, and pre-loading
// the index reduces memory accesses. When the number of indices is large, we avoid that
// and increase parallellism by calling gather_out which is a generalization of index_select
// and increase parallelism by calling gather_out which is a generalization of index_select
if (cuda::detail::canUse32BitIndexMath(out) &&
cuda::detail::canUse32BitIndexMath(self) &&
cuda::detail::canUse32BitIndexMath(index) &&

View File

@@ -273,7 +273,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
scalar_t* dst = self_ptr + index;
//pack coalseced bf16 and fp16
//pack coalesced bf16 and fp16
if constexpr (std::is_same<scalar_t, c10::BFloat16>::value || std::is_same<scalar_t, c10::Half>::value)
{
typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
@@ -316,7 +316,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
}
}
// not coalsced, so now let try to capture lane-matches...
// not coalesced, so now let try to capture lane-matches...
if (numel > 16 /*<-hueristic threshold*/ * 64 ) {
// well shucks, unlikely to capture same-dest atomics in a wave.

View File

@@ -343,7 +343,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
if (input_length == 0)
return;
// "first" row, the beta initialization before eq (10) (t=target_length - differes per batch)
// "first" row, the beta initialization before eq (10) (t=target_length - differs per batch)
for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) {
int64_t s = threadIdx.x + block_s;
scalar_t lb;

View File

@@ -816,7 +816,7 @@ const auto erfcx_string = jiterator_stringify(
with the usual checks for overflow etcetera.
Performance-wise, it seems to be substantially faster than either
the SLATEC DERFC function [or an erfcx function derived therefrom]
the SLATEC DERFC function [or an erfcx function derived there from]
or Cody's CALERF function (from netlib.org/specfun), while
retaining near machine precision in accuracy.
*/

View File

@@ -370,7 +370,7 @@ struct vectorized {
#ifdef USE_ROCM
// This is similar to vectorized policy above, but this one supports
// heterogenous input tensor types as templated parameters.
// heterogeneous input tensor types as templated parameters.
// Its use should be limited to frequently used heterogeneous data types
// as each instantiation will generate a separate kernel, leading to code
// bloating if applied to all combinations supported in PyTorch. Assumption: all

View File

@@ -309,7 +309,7 @@ __global__ void sampleMultinomialOnce(
} else {
// This should address a rare bug where we don't select a valid index. This likely occurs when
// due to floating point arithmetic rounding errors, our cumulative sum does not add up to 1, but
// and our uniform sample is greater than this value. In this case we likely have unitialized memory
// and our uniform sample is greater than this value. In this case we likely have uninitialized memory
// in dest[curDist]. So basically we will loop through the distribution and pick the largest index
// where the distribution is non-zero. This is obviously terribly inefficient, but due to the
// rarity in which this occurs, this should not be an issue.

View File

@@ -1623,7 +1623,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto stride = input.sizes()[1];
const auto reduction_size = input.numel() / stride;
// Input is guarunteed to be channels-last compatible
// Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;
@@ -1691,7 +1691,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto reduction_size = input.numel() / stride;
auto norm_fct = 1.0 / reduction_size;
// Input is guarunteed to be channels-last compatible
// Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;

View File

@@ -37,7 +37,7 @@ namespace at::native {
// threshold probability for having non-duplicate keys, then it can be proved that[1]
// the number of bits required is: ceil(log2(n - (6 n^2 + 1) / (12 log(q))))
//
// Then after sort, we lauch a separate kernel that additionally shuffles any islands
// Then after sort, we launch a separate kernel that additionally shuffles any islands
// of values whose keys matched. The algorithm of this kernel is as follows:
// Each thread reads its key and the keys of its neighbors to tell if it's part of an island.
// For each island, the first thread in the island sees a key match at index i+1 but not index i-1.

View File

@@ -1088,12 +1088,12 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
// load instructions.
//
// Case 1: "vectorize along input"
// This case happens when we are reducing along fastest moving dimesion. In such case, threads
// This case happens when we are reducing along fastest moving dimension. In such case, threads
// with the same threadIdx.y works on the same reduction cooperatively and will produce results
// for the same output. In such case, values in each loaded vector always correspond to the same output.
//
// Case 2: "vectorize along output"
// This case happens when the fastest moving dimesion is not the dimension of reduction. In such case,
// This case happens when the fastest moving dimension is not the dimension of reduction. In such case,
// threads with different threadIdx.x are independent and will produce results for different outputs.
// In such case, values in each loaded vector always correspond to different outputs.
if (fastest_moving_stride == sizeof(scalar_t)) {

View File

@@ -241,7 +241,7 @@ __global__ void reflection_pad2d_backward_det_out_kernel(
const int64_t dist_cols = ::abs(inp_col - (input_dim_x - 1));
// we were dist_rows after, now we want to be dist_rows before
// we were dist_cols before, now we wnat to be dist_cols after
// we were dist_cols before, now we want to be dist_cols after
const int64_t reflect_tr_out_row = (corner_tr_out_row - dist_rows);
const int64_t reflect_tr_out_col = (corner_tr_out_col + dist_cols);
const int64_t reflect_tr_out =

View File

@@ -5,7 +5,7 @@
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <c10/macros/Macros.h>
// Two warninngs in Cutlass included header files
// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-field-initializers")

View File

@@ -7,7 +7,7 @@
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>
// Two warninngs in Cutlass included header files
// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

View File

@@ -460,7 +460,7 @@ __global__ void GammaBetaBackwardCUDAKernel2(
}
}
// Do warp reduce for the 2st 16 cols in the tile.
// Do warp reduce for the 2nd 16 cols in the tile.
sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y];
sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y];
sum1 = cuda_utils::WarpReduceSum<T_ACC>(sum1);

View File

@@ -1532,7 +1532,7 @@ NvrtcFunction jit_pwise_function(
std::string file_path;
if (cache_dir.has_value()) {
// Attemps to read from the cache.
// Attempts to read from the cache.
// Cubin name is <kernel name>_arch<major>.<minor>_nvrtc<major>.<minor>_<ptx or sass>_<program length>_<string hash>
// Note that the SHA1 hash used in the file name is NOT the SHA1 hash of the file's contents,
// because we hash on the CUDA code, but we save the compiled ptx or sass
@@ -1556,19 +1556,19 @@ NvrtcFunction jit_pwise_function(
ss << "_" << hash_code;
file_path = ss.str();
std::ifstream readin{file_path, std::ios::in | std::ifstream::binary};
if (readin.fail()) {
std::ifstream read_stream{file_path, std::ios::in | std::ifstream::binary};
if (read_stream.fail()) {
// NOTE: this does not warn because the file might not exist
// TODO: consider if this should explicitly check for the file's existence or not to throw
// an informative warning
readin.close();
read_stream.close();
} else {
// TODO: try passing the "mapped" file directly to cuModuleLoadCall instead of using an intermediate buffer
std::vector<char> buffer(std::istreambuf_iterator<char>(readin), {});
std::vector<char> buffer(std::istreambuf_iterator<char>(read_stream), {});
AT_CUDA_DRIVER_CHECK(nvrtc.cuModuleLoadData(&(compiled_kernel_.module), buffer.data()));
AT_CUDA_DRIVER_CHECK(
nvrtc.cuModuleGetFunction(&(compiled_kernel_.function), compiled_kernel_.module, name.c_str()));
readin.close();
read_stream.close();
return compiled_kernel_;
}
}

View File

@@ -1050,7 +1050,7 @@ void launch_vectorized_layer_norm_kernel(
C10_CUDA_KERNEL_LAUNCH_CHECK();
#ifdef USE_ROCM
// the blocks.x contains the max grid x dimention without invalid configuration error
// the blocks.x contains the max grid x dimension without invalid configuration error
// Fix invalid configuration https://github.com/pytorch/pytorch/issues/136291
// Ensure all elements are processed. Prepare for next round
int64_t remaining = M - blocks.x;

View File

@@ -1346,7 +1346,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
});
if (input.dim() > 2) {
// if upper=true we need to tranpose and conjugate the result tensor
// if upper=true we need to transpose and conjugate the result tensor
// because the cholesky decomposition is stored in the lower triangular part
if (upper) {
input.copy_(result.mH());
@@ -1857,7 +1857,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
// TODO Investigate whether the following magma bug is still occuring.
// TODO Investigate whether the following magma bug is still occurring.
// It may be the case that geqrf followed by orgqr is wrong for the magma backend
// geqrf_magma currently uses geqrf2_gpu
//

View File

@@ -82,7 +82,7 @@ void lu_factor_looped_cusolver(const Tensor& self, const Tensor& pivots, const T
#if defined(BUILD_LAZY_CUDA_LINALG)
namespace cuda { namespace detail {
// This is only used for an old-style dispatches
// Please do not add any new entires to it
// Please do not add any new entries to it
struct LinalgDispatch {
Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper);
};

View File

@@ -177,7 +177,7 @@ bool use_ragged_in_dense(
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works with Q, K, V, and output in BSHD memory layout,"
"e.g., Q, K, V must be allocated with torch.randn((B, S, H, D).transpose(1, 2)."
"Falling back to regualr dense case, which may trigger excessive recompilation.");
"Falling back to regular dense case, which may trigger excessive recompilation.");
}
return all_bshd;
}
@@ -771,7 +771,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_nestedtensor(
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
"attn_bias not yet supported with cuDNN Attention and NestedTensor");
scaled_dot_product_flash_attention_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)
@@ -1196,7 +1196,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_backward_nestedtensor(
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
"attn_bias not yet supported with cuDNN Attention and NestedTensor");
sdpa_backward_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)
@@ -1864,7 +1864,7 @@ void run_cudnn_SDP_bprop_nestedtensor(
}
TORCH_CHECK(
!attn_bias.has_value(),
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
"attn_bias not yet supported with cuDNN Attention and NestedTensor");
auto workspace_size = mha_graph.get_workspace_size();
auto workspace_ptr =

View File

@@ -30,7 +30,7 @@ static const std::unordered_map<
};
// This is the heursitic to choose a kernel based on inputs
// This is the heuristic to choose a kernel based on inputs
BGEMMKernel_BFloat16 dispatch_bfloat16_bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
// Optional/future use: directly lookup shape tuples to map to instances
/*

View File

@@ -11,7 +11,7 @@ using S = ck::Sequence<Is...>;
namespace at::native {
void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@@ -471,7 +471,7 @@ void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}
void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

View File

@@ -11,7 +11,7 @@ using S = ck::Sequence<Is...>;
namespace at::native {
void dispatch_float_gemm(CUDABLAS_GEMM_ARGTYPES(float)) {
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

View File

@@ -13,7 +13,7 @@ namespace at::native {
void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
#if 0
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@@ -299,7 +299,7 @@ void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
#endif
}
void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

View File

@@ -545,7 +545,7 @@ kernel void reshape(texture2d_array<half, access::read> in_arr[[texture(0), func
const ushort slices2 = divRoundUp(C2, 4);
const ushort slices1 = divRoundUp(C1, 4);
const ushort n2 = gid.z / slices2; //image index
const ushort s2 = gid.z - n2 * slices2; // slice offest
const ushort s2 = gid.z - n2 * slices2; // slice offset
half4 value;
for (int idx = 0; idx < 4; ++idx){
// we compute the "linear index" of the output element,

View File

@@ -86,4 +86,4 @@ TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid_"), TORCH_FN(hardsigmoid_));
}
} // namepsace at::native::metal
} // namespace at::native::metal

View File

@@ -147,7 +147,7 @@ static void check_shape_forward(const Tensor& input,
// blocked format will propagate between layers. Input, output will be in blocked format.
//
// For inference case, weight can be prepacked into blocked format by
// (so as to save weight reoder overhead):
// (so as to save weight reorder overhead):
// model = torch.utils.mkldnn.to_mkldnn(model)
//
// For training case, grad_output can be CPU tensor or MKLDNN tensor,

View File

@@ -540,7 +540,7 @@ static void _mkldnn_matmul_i8i8i32_with_primitive(
args.insert({DNNL_ARG_WEIGHTS, expected_weight});
args.insert({DNNL_ARG_DST, dst});
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
// Create primitve and execute
// Create primitive and execute
auto primitive = dnnl::matmul(prim_desc);
primitive.execute(ideep::stream::default_stream(), args);
}

View File

@@ -215,7 +215,7 @@ partition create_sdpa_graph_partition(
// For optional additive mask
std::optional<op> mask_add;
// For optional implicite causal mask
// For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;
@@ -556,7 +556,7 @@ partition create_sdpa_backward_graph_partition(
// For optional additive mask
std::optional<op> mask_add;
// For optional implicite causal mask
// For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;

View File

@@ -34,7 +34,7 @@ namespace at::native::onednn {
/*
oneDNN postops usage:
Currently, oneDNN supports 5 kinds of post ops. More details can be refered
Currently, oneDNN supports 5 kinds of post ops. More details can be referred
to oneDNN doc.
https://oneapi-src.github.io/oneDNN/dev_guide_attributes_post_ops.html#doxid-dev-guide-attributes-post-ops-1dev-guide-attributes-post-ops-eltwise
@@ -345,7 +345,7 @@ class Attr {
dnnl::memory binary_m;
auto binary = ops_params_[i].binary_;
auto md = ops_params_[i].meta_;
// qeury expected_md to achieve peak performance
// query expected_md to achieve peak performance
auto expected_md = pd.query_md(
dnnl::query::exec_arg_md,
DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1);
@@ -399,7 +399,7 @@ static inline void construct_attr_for_unary(
} else {
TORCH_CHECK(
unary_post_op == "none",
"onednn qlinear: unspported unary post op",
"onednn qlinear: unsupported unary post op",
unary_post_op);
}
}

View File

@@ -301,7 +301,7 @@ bool is_onednn_matmul_strides(const at::Tensor& tensor) {
return false;
}
// the overlaped cases are not supported
// the overlapped cases are not supported
dnnl::memory::dims strides = get_onednn_strides(tensor);
int64_t storage_size = 1;
for (size_t dim = 0; dim < tensor_dim; ++dim)

View File

@@ -29,7 +29,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
// agruments results in
// arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {
@@ -42,7 +42,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
// agruments results in
// arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {
@@ -539,7 +539,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
// Use gather kernel to solve strides for macOS < 15.0
// Starting with macOS 15.0, MPS supports native strides direclty in the kernels
// Starting with macOS 15.0, MPS supports native strides directly in the kernels
if (!is_macOS_15_0_or_newer || !useMPSStridedAPI) {
if ((!src.is_contiguous() || src.storage_offset()) && gatherTensorData) {
Tensor emptyShell = Tensor();
@@ -856,7 +856,7 @@ id<MTLLibrary> MetalShaderLibrary::getLibrary(const std::initializer_list<std::s
break;
}
default:
TORCH_INTERNAL_ASSERT(false, "Unsupported number of paramaters ", nparams);
TORCH_INTERNAL_ASSERT(false, "Unsupported number of parameters ", nparams);
}
return libMap[key] = lib;
}
@@ -1184,9 +1184,9 @@ void MetalKernelFunction::dispatch(uint64_t length, std::optional<uint64_t> grou
}
void MetalKernelFunction::dispatch(c10::ArrayRef<uint64_t> length, c10::OptionalArrayRef<uint64_t> group_size) {
TORCH_CHECK(!length.empty() && length.size() < 4, "Dispatch dimentions must be less than 3 and non-empty");
TORCH_CHECK(!length.empty() && length.size() < 4, "Dispatch dimensions must be less than 3 and non-empty");
TORCH_CHECK(!group_size.has_value() || group_size->size() == length.size(),
"size and group_size must have same number of dimentions");
"size and group_size must have same number of dimensions");
const auto max_tg_size = getMaxThreadsPerThreadgroup();
const auto group_size_length = group_size.has_value() ? group_size->size() : 0;
auto tg_size = MTLSizeMake(group_size_length > 0 ? group_size->at(0) : max_tg_size,

View File

@@ -59,7 +59,7 @@ static GridSamplerOffsets find_grid_sampler_offsets(
return offsets;
}
// Mod function which gives postive output when `a` is negative
// Mod function which gives positive output when `a` is negative
static int32_t mod(int32_t a, int32_t b) {
auto r = a % b;
return r + (r < 0 ? b : 0);
@@ -191,9 +191,9 @@ void grid_sampler_single_element(
int32_t right_indices[3];
opmath_t<T> scales[3];
// For each dimension, find the pair of indices in the cooresponding dimension
// For each dimension, find the pair of indices in the corresponding dimension
// of `input` which surround the grid coordinate in that dimension. We'll do
// this by mapping different coordiante spaces onto each other. There are
// this by mapping different coordinate spaces onto each other. There are
// basically three different coordinate spaces to keep in mind:
//
// * aligned grid space

View File

@@ -137,7 +137,7 @@ kernel void index_put_serial(
constant int64_t* index_strides,
constant uint4& ndim_nindices_numel,
uint thread_index [[thread_position_in_grid]]) {
(void)thread_index; // Suppress unused vairable varning
(void)thread_index; // Suppress unused variable warning
for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) {
index_put_impl(
output,

View File

@@ -112,7 +112,7 @@ kernel void int4pack_mm(constant T *A [[buffer(0)]],
constant uchar *B_ptr = B + ((n * K) / k_pack_factor);
thread float4 result = float4(0.0);
// We multipy group of 4 channels with these scales.
// We multiply group of 4 channels with these scales.
// Because corresponding values from weight matrix are effectively left
// shifted. This is to avoid doing right shift on those values which ends up
// affecting performance. This is the trick applied in MLX kernels.

View File

@@ -372,7 +372,7 @@ struct log1p_functor {
}
template <typename T>
inline enable_if_t<is_complex_v<T>, T> operator()(const T x) {
// TODO: Implement proper log1p algoirthm
// TODO: Implement proper log1p algorithm
auto magnitude = ::precise::sqrt((1.0f + x.x) * (1.0f + x.x) + x.y * x.y);
auto real = ::precise::log(magnitude);
auto imag = (x.x == -1 && x.y == 0) ? 0 : ::precise::atan2(x.y, 1.0 + x.x);

View File

@@ -448,7 +448,7 @@ kernel void upsample_trilinear_backward(
// See Note [ Weights computation for uint8_t and multiplication trick ]
// Essentially fall back to fixed floating point arithmetic during uint8
// interpolation, which is not necesserily more accurate (see example below),
// interpolation, which is not necessarily more accurate (see example below),
// but matches closes to what CPU can deliver
// I.e. mid-point 152+249+172+35 is 152, but algorithm yields 153 as horizontal
// and vertical interpolation is done in separate steps and results are rounded

View File

@@ -1282,7 +1282,7 @@ static void all_any_common_impl_mps(const Tensor& input_t,
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
// reductionOrWithTensor:axis: will throw an internal assert if number of dimentions is more than 4
// reductionOrWithTensor:axis: will throw an internal assert if number of dimensions is more than 4
// See https://github.com/pytorch/pytorch/issues/95538
MPSGraphTensor* outputTensor = nil;
if (input_t.ndimension() > 4) {
@@ -1352,7 +1352,7 @@ TORCH_IMPL_FUNC(any_all_out_mps)(const Tensor& input_t, const Tensor& output_t)
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
// reductionOrWithTensor:axes: will throw an internal assert if number of dimentions is more than 4
// reductionOrWithTensor:axes: will throw an internal assert if number of dimensions is more than 4
// See https://github.com/pytorch/pytorch/issues/95538
if (input_t.dim() > 4) {
castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil];
@@ -1400,7 +1400,7 @@ TORCH_IMPL_FUNC(all_all_out_mps)(const Tensor& input_t, const Tensor& output_t)
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
// reductionAndWithTensor:axes: will throw an internal assert if number of dimentions is more than 4
// reductionAndWithTensor:axes: will throw an internal assert if number of dimensions is more than 4
// See https://github.com/pytorch/pytorch/issues/95538
if (input_t.ndimension() > 4) {
castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil];

View File

@@ -41,7 +41,7 @@ Tensor pad_tensor_to_shape(
const Tensor& t,
IntArrayRef goal_shape,
double value = 0) {
std::vector<int64_t> padd;
std::vector<int64_t> padding;
auto tup = t.sizes();
TORCH_CHECK(
t.dim() == (int64_t)(goal_shape.size()),
@@ -51,10 +51,10 @@ Tensor pad_tensor_to_shape(
goal_shape.size(),
" of goal shape.");
for (int64_t i = static_cast<int64_t>(tup.size()) - 1; i >= 0; i--) {
padd.push_back(0);
padd.push_back(goal_shape[i] - tup[i]);
padding.push_back(0);
padding.push_back(goal_shape[i] - tup[i]);
}
Tensor new_tensor = at::constant_pad_nd(t, IntArrayRef(padd), value);
Tensor new_tensor = at::constant_pad_nd(t, IntArrayRef(padding), value);
new_tensor = new_tensor.reshape(goal_shape);
return new_tensor;
}

View File

@@ -53,7 +53,7 @@ C10_ALWAYS_INLINE std::pair<int64_t, int64_t> _check_nested_layer_norm_inputs(
normalized_shape);
// Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input
// Also, compute M and N considering the idiosyncracies of NestedTensors
// Also, compute M and N considering the idiosyncrasies of NestedTensors
int64_t N = 1;
for (const auto i: c10::irange(normalized_ndim)) {
TORCH_CHECK(

View File

@@ -95,7 +95,7 @@ std::vector<Tensor> chunk_nested_tensor(const Tensor& self, int64_t chunks, int6
for (const auto split_idx : c10::irange(chunks)) {
auto new_sizes = sizes.clone();
auto new_strides = strides.clone();
// This copys offsets so we are safe to move
// This copies offsets so we are safe to move
auto new_offsets = offsets.clone();
int64_t *size_ptr = new_sizes.data_ptr<int64_t>();
int64_t *new_offsets_ptr = new_offsets.data_ptr<int64_t>();

View File

@@ -37,6 +37,7 @@ NotIn
nout
NowNs
numer
OffsetT
oH
optins
ot