Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
[BE][4/5] fix typos in aten/ (aten/src/ATen/native/)
ghstack-source-id: 288daaa86090e465421c81ae8373b0f6f63eab4a
Pull-Request: https://github.com/pytorch/pytorch/pull/157553
@@ -1205,7 +1205,6 @@ exclude_patterns = [
# These files are all grandfathered in, feel free to remove from this list
# as necessary
# NOTE: remove the patterns in the order they are listed
-'aten/src/ATen/native/[a-pA-P]*/**',
'aten/src/ATen/[a-mA-M]*/**',
'test/**',
]
@@ -128,7 +128,7 @@ at::Tensor PackedLinearWeight::apply_impl(
auto* input_tr_ptr =
reinterpret_cast<uint8_t*>(input_tr.data_ptr<c10::quint8>());
// TODO: Activation transpose before and after the kernel can be removed if we
-// keep activation tensor always tranposed.
+// keep activation tensor always transposed.
fbgemm::transpose_simd<uint8_t>(
batch_size, K, input_ptr, K, input_tr_ptr, batch_size);
@@ -34,7 +34,7 @@ struct Dist {
// finish : This tells what to do with the aggregated value to compute
// the norm. Generally this is the result of val ^ (1 / p).
// backward : This is the gradient for that norm. Arguments are pretty
-// self explanitory.
+// self explanatory.
//
// There are a few cases where these aren't used. The 0 norm has no backward,
// because it's always 0, so that's shortcircuited earlier. There's a special
@@ -74,7 +74,7 @@ it to sum up the entire array into a single value.

`ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which
compiler flags it is currently compiled. This allows the programmer to write
-generic code, which will be compiled under multipled compilation settings.
+generic code, which will be compiled under multiplied compilation settings.

`../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains
a generic definition of `sumImplAll`. This function allows the user to reduce
@@ -1017,7 +1017,7 @@ struct HelperInterpBase {
while (aligned_interp_size % sizeof(int32_t) != 0) {
aligned_interp_size += 1;
}
-// assert that we wont go out of bounds
+// assert that we won't go out of bounds
TORCH_INTERNAL_ASSERT(aligned_interp_size * sizeof(int16_t) < interp_size * sizeof(double));
}
@@ -655,7 +655,7 @@ void ImagingResampleHorizontalConvolution8u4x(
// last element
auto mmk = _mm256_set1_epi32(k[i]);
// For num_channels == 3 (3 bytes = one pixel) we tolerate to read 4 bytes
-// lines 0, 1 and 2 wont go out of allocated memory bounds
+// lines 0, 1 and 2 won't go out of allocated memory bounds
auto pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)),
mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1);
@@ -889,7 +889,7 @@ void ImagingResampleHorizontalConvolution8u(
_mm_loadu_si128((__m128i *) (lineIn_min + stride * i))),
_mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1);

-// Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA
+// Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA
// RGBA: pix1 = [
// r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0
// r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0
@@ -1312,7 +1312,7 @@ void ImagingResampleVerticalConvolution8u(

// Here we write 4 bytes to the output even if num_channels < 4, e.g o = {r,g,b,X} for num_channels=3
// It is OK to write 4th byte (e.g. X) as on the next step we will overwrite it with new data.
-// We also wont go out of bounds of lineOut memory allocation
+// We also won't go out of bounds of lineOut memory allocation
std::memcpy(lineOut + j, (uint8_t *) &o, 4);
}
@@ -240,7 +240,7 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


-/* evaluation of 8 sines at onces using AVX intrinsics
+/* evaluation of 8 sines at once using AVX intrinsics

The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
@@ -311,7 +311,7 @@ void GroupNormKernelImplChannelsLastInternal(
const bool gamma_null = (gamma_data == nullptr);
const bool beta_null = beta_data == nullptr;

-// NB: About algorithm choosen:
+// NB: About algorithm chosen:
//
// On channels last, GroupNorm has a input shape of {N, H, W, GD},
// Mean and rstd are collected per each n and g, which involves reduction
@@ -930,7 +930,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
}
};

-// Dynamically Quantize the float32 input to 8 bit assymetric
+// Dynamically Quantize the float32 input to 8 bit asymmetric
input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx);

const size_t lhs_stride =
@@ -1163,7 +1163,7 @@ void dyn_quant_matmul_4bit_kernel(
const int64_t weight_packed_size =
kleidiai::kai_pack_rhs_int4_size(N, K, block_size);
if (weight_packed_size == packed_weights.numel()) {
-// KleidiAI interface intenally handles the Channelwise and groupwise
+// KleidiAI interface internally handles the Channelwise and groupwise
// distinction
kleidiai::kai_quant_pack_lhs_int4_mm(
output, inp, packed_weights, M, N, K, block_size);
@@ -705,7 +705,7 @@ namespace {
);
} while (!done && max_threads);
if (!done) {
-TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accomodate sharedMemPerBlock limit");
+TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accommodate sharedMemPerBlock limit");
}
break;
}
@@ -154,19 +154,19 @@ struct cublasCommonArgs {
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
bool transpose_result = false, transpose_a = false, transpose_b = false;
result = prepare_matrix_for_cublas(c, transpose_result);
-mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
-matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
+mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); // codespell:ignore
+matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result); // codespell:ignore

// Handle scale tensors if provided
if (scale_a && scale_b) {
// By default since we return in row-major we run the gemm
// as B.T @ A.T, check transpose_result to determine if we flip the scales
-scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
-scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
-scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
-scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
-scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
-scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
+scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr(); // codespell:ignore
+scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); // codespell:ignore
+scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a; // codespell:ignore
+scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); // codespell:ignore
+scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); // codespell:ignore
+scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b; // codespell:ignore
}

if (scale_result) {
@@ -180,17 +180,17 @@ struct cublasCommonArgs {
transpose_b = !transpose_b;
}

-auto sizes_a = mata->sizes();
-auto sizes_b = matb->sizes();
+auto sizes_a = mata->sizes(); // codespell:ignore
+auto sizes_b = matb->sizes(); // codespell:ignore

m = sizes_a[transpose_result ? 1 : 0];
k = sizes_a[transpose_result ? 0 : 1];
n = sizes_b[transpose_result ? 0 : 1];
-lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
-ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
+lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); // codespell:ignore
+ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); // codespell:ignore
result_ld = result->stride(transpose_result ? 0 : 1);
-transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
-transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
+transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore
+transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore

// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
// if the gemm operands are in packed float4
@@ -205,16 +205,16 @@ struct cublasCommonArgs {
char transa, transb;
int64_t m, n, k;
int64_t lda, ldb, result_ld;
-c10::MaybeOwned<Tensor> mata, matb, result;
+c10::MaybeOwned<Tensor> mata, matb, result; // codespell:ignore

// Scale members
-void* scale_mata_ptr = nullptr;
-void* scale_matb_ptr = nullptr;
+void* scale_mata_ptr = nullptr; // codespell:ignore
+void* scale_matb_ptr = nullptr; // codespell:ignore
void* scale_result_ptr = nullptr;
-std::optional<c10::ScalarType> scale_mata_dtype;
-std::optional<ScalingType> scaling_mata_type;
-std::optional<c10::ScalarType> scale_matb_dtype;
-std::optional<ScalingType> scaling_matb_type;
+std::optional<c10::ScalarType> scale_mata_dtype; // codespell:ignore
+std::optional<ScalingType> scaling_mata_type; // codespell:ignore
+std::optional<c10::ScalarType> scale_matb_dtype; // codespell:ignore
+std::optional<ScalingType> scaling_matb_type; // codespell:ignore
std::optional<c10::ScalarType> scale_result_dtype;
};
} // namespace
@@ -362,7 +362,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
static bool disable_addmm_cuda_lt = getDisableAddmmCudaLt();
#endif
// if lt path fails, we recurse back into this function here and force the lt path to off
-// we cannot update varible disable_addmm_cuda_lt from above since it is static and would be permanent
+// we cannot update variable disable_addmm_cuda_lt from above since it is static and would be permanent
bool disable_addmm_cuda_lt_final = disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
#if defined(USE_ROCM) && ROCM_VERSION == 60400
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use
@@ -2886,7 +2886,7 @@ _scaled_grouped_mm_cuda_v2(
"Contraction dimensions (", dim_a, ",", dim_b, ") of mat_a and mat_b must match, got: ", mat_a.size(dim_a), " and ",
mat_b.size(dim_b));
// Note: only (-1, -2) is currently supported
-TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Curently contraction dims must be (-1, -2) only");
+TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Currently contraction dims must be (-1, -2) only");
} else {
TORCH_CHECK_VALUE(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}
@@ -298,7 +298,7 @@ static void jitted_gpu_kernel_impl(
at::opmath_type<f_inputs_type> scalar_val,
const std::tuple<ExtraArgs...>& extra_args) {

-// TODO: Memory use can probably be optimized by re-using kernels across GPUs with
+// TODO: Memory use can probably be optimized by reusing kernels across GPUs with
// the same compute capability
static std::mutex jiterator_mutex;
static std::vector<JittedKernelVariantCache> device_caches(c10::cuda::device_count());
@@ -494,7 +494,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen)
auto value = static_cast<scalar_t>(rand * range + from);
// reverse the bounds of curand4 from (0, 1] to [0, 1)
// Note that this method is from legacy THCTensorRandom and is likely to give
-// you more 0-s, since, the probability of gettings 1-s is higher than 0-s and
+// you more 0-s, since, the probability of getting 1-s is higher than 0-s and
// by reversing the bounds, we are flipping the probabilities of 1-s and 0-s.
// BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
auto reverse_bound_value = value == to ? from : value;
@@ -75,7 +75,7 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo<const scalar_t, IndexType>
// We'll use this to actually cause vectorized loads later
LoadT *value = reinterpret_cast<LoadT*>(&src);

-//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
+//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
// Note: need a new set of random values per 4 elements -- we'll handle VEC elements in this thread, so need ceil(VEC / 4)
// sets of rand.
if ((VEC >= 4) || (gridxvec_loop_state == 0)) {
@@ -159,7 +159,7 @@ fused_dropout_kernel(cuda::detail::TensorInfo<const scalar_t, IndexType> a,
for (IndexType linearIndex = idx;
linearIndex < rounded_size;
linearIndex += gridDim.x * blockDim.x*UNROLL) {
-//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
+//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
float4 rand = curand_uniform4(&state);
scalar_t src[UNROLL];
rand.x = rand.x < p;
@@ -24,7 +24,7 @@ namespace at::native {
namespace {

/* This code computes the sum of the weights in two-steps:
-1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indeces`
+1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indices`
2) Each partial-sum from 1) are summed and scatter into `grad_weight`

Notice, `NROWS_PER_THREAD` impacts the Achieved Occupancy of the
@@ -204,7 +204,7 @@ Scalar scalar_reciprocal(const Scalar& scalar) {
return Scalar(1. / scalar.toComplexDouble());
}
TORCH_INTERNAL_ASSERT(
-false, "divison with ", scalar.type(), " not supported");
+false, "division with ", scalar.type(), " not supported");
}

void foreach_tensor_div_scalar_kernel_cuda_(
@@ -57,7 +57,7 @@ namespace {
const index_t n = index / (out_H * out_W);
const index_t grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;

-// get the corresponding input x, y co-ordinates from grid
+// get the corresponding input x, y coordinates from grid
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];
@@ -193,7 +193,7 @@ namespace {
const index_t n = index / (out_D * out_H * out_W);
const index_t grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;

-// get the corresponding input x, y, z co-ordinates from grid
+// get the corresponding input x, y, z coordinates from grid
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];
opmath_t z = grid.data[grid_offset + 2 * grid_sCoor];
@@ -358,7 +358,7 @@ namespace {
const index_t n = index / (out_H * out_W);
const auto grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;

-// get the corresponding input x, y co-ordinates from grid
+// get the corresponding input x, y coordinates from grid
scalar_t x = grid.data[grid_offset];
scalar_t y = grid.data[grid_offset + grid_sCoor];
@@ -572,7 +572,7 @@ namespace {
const index_t n = index / (out_D * out_H * out_W);
const auto grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;

-// get the corresponding input x, y, z co-ordinates from grid
+// get the corresponding input x, y, z coordinates from grid
scalar_t ix = grid.data[grid_offset];
scalar_t iy = grid.data[grid_offset + grid_sCoor];
scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor];
@@ -8,7 +8,7 @@
#include <c10/util/irange.h>


-// Three warninngs in Cutlass included header files
+// Three warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
@@ -377,7 +377,7 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) {
* result at the boundary
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
* Large Parameter (see DLMF 8.12.4 [igam1])
-* - if x > 1.1 and x < a, using the substraction from the regularized lower
+* - if x > 1.1 and x < a, using the subtraction from the regularized lower
* incomplete gamma
* - otherwise, calculate the series from [igam2] eq (5)
*/
@@ -460,7 +460,7 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) {
* result at the boundary
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
* Large Parameter (see DLMF 8.12.3 [igam1])
-* - if x > 1 and x > a, using the substraction from the regularized upper
+* - if x > 1 and x > a, using the subtraction from the regularized upper
* incomplete gamma
* - otherwise, calculate the series from [igam2] eq (4)
*/
@@ -332,7 +332,7 @@ void cuda_take_put_kernel(
const auto offset_calc = make_offset_calculator<2>(iter);
using uindex_t = std::make_unsigned_t<index_t>;

-// OffsetCalculator needs the sizes and strides reveresed
+// OffsetCalculator needs the sizes and strides reversed
const auto indexed_sizes = std::vector<int64_t>(indexed.sizes().rbegin(), indexed.sizes().rend());
const auto indexed_strides = std::vector<int64_t>(indexed.strides().rbegin(), indexed.strides().rend());
const auto* indexed_strides_data = indexed_strides.data();
@@ -1611,7 +1611,7 @@ void index_select_out_cuda_impl(

// SmallIndexKernel is more performant when the number of indices is small, and pre-loading
// the index reduces memory accesses. When the number of indices is large, we avoid that
-// and increase parallellism by calling gather_out which is a generalization of index_select
+// and increase parallelism by calling gather_out which is a generalization of index_select
if (cuda::detail::canUse32BitIndexMath(out) &&
cuda::detail::canUse32BitIndexMath(self) &&
cuda::detail::canUse32BitIndexMath(index) &&
@@ -273,7 +273,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(

scalar_t* dst = self_ptr + index;

-//pack coalseced bf16 and fp16
+//pack coalesced bf16 and fp16
if constexpr (std::is_same<scalar_t, c10::BFloat16>::value || std::is_same<scalar_t, c10::Half>::value)
{
typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
@@ -316,7 +316,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
}
}

-// not coalsced, so now let try to capture lane-matches...
+// not coalesced, so now let try to capture lane-matches...

if (numel > 16 /*<-hueristic threshold*/ * 64 ) {
// well shucks, unlikely to capture same-dest atomics in a wave.
@@ -343,7 +343,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
if (input_length == 0)
return;

-// "first" row, the beta initialization before eq (10) (t=target_length - differes per batch)
+// "first" row, the beta initialization before eq (10) (t=target_length - differs per batch)
for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) {
int64_t s = threadIdx.x + block_s;
scalar_t lb;
@@ -816,7 +816,7 @@ const auto erfcx_string = jiterator_stringify(
with the usual checks for overflow etcetera.

Performance-wise, it seems to be substantially faster than either
-the SLATEC DERFC function [or an erfcx function derived therefrom]
+the SLATEC DERFC function [or an erfcx function derived there from]
or Cody's CALERF function (from netlib.org/specfun), while
retaining near machine precision in accuracy.
*/
@@ -370,7 +370,7 @@ struct vectorized {

#ifdef USE_ROCM
// This is similar to vectorized policy above, but this one supports
-// heterogenous input tensor types as templated parameters.
+// heterogeneous input tensor types as templated parameters.
// Its use should be limited to frequently used heterogeneous data types
// as each instantiation will generate a separate kernel, leading to code
// bloating if applied to all combinations supported in PyTorch. Assumption: all
@@ -309,7 +309,7 @@ __global__ void sampleMultinomialOnce(
} else {
// This should address a rare bug where we don't select a valid index. This likely occurs when
// due to floating point arithmetic rounding errors, our cumulative sum does not add up to 1, but
-// and our uniform sample is greater than this value. In this case we likely have unitialized memory
+// and our uniform sample is greater than this value. In this case we likely have uninitialized memory
// in dest[curDist]. So basically we will loop through the distribution and pick the largest index
// where the distribution is non-zero. This is obviously terribly inefficient, but due to the
// rarity in which this occurs, this should not be an issue.
@@ -1623,7 +1623,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto stride = input.sizes()[1];
const auto reduction_size = input.numel() / stride;

-// Input is guarunteed to be channels-last compatible
+// Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);

dim3 block;
@@ -1691,7 +1691,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto reduction_size = input.numel() / stride;
auto norm_fct = 1.0 / reduction_size;

-// Input is guarunteed to be channels-last compatible
+// Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);

dim3 block;
@@ -37,7 +37,7 @@ namespace at::native {
// threshold probability for having non-duplicate keys, then it can be proved that[1]
// the number of bits required is: ceil(log2(n - (6 n^2 + 1) / (12 log(q))))
//
-// Then after sort, we lauch a separate kernel that additionally shuffles any islands
+// Then after sort, we launch a separate kernel that additionally shuffles any islands
// of values whose keys matched. The algorithm of this kernel is as follows:
// Each thread reads its key and the keys of its neighbors to tell if it's part of an island.
// For each island, the first thread in the island sees a key match at index i+1 but not index i-1.
@@ -1088,12 +1088,12 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
// load instructions.
//
// Case 1: "vectorize along input"
-// This case happens when we are reducing along fastest moving dimesion. In such case, threads
+// This case happens when we are reducing along fastest moving dimension. In such case, threads
// with the same threadIdx.y works on the same reduction cooperatively and will produce results
// for the same output. In such case, values in each loaded vector always correspond to the same output.
//
// Case 2: "vectorize along output"
-// This case happens when the fastest moving dimesion is not the dimension of reduction. In such case,
+// This case happens when the fastest moving dimension is not the dimension of reduction. In such case,
// threads with different threadIdx.x are independent and will produce results for different outputs.
// In such case, values in each loaded vector always correspond to different outputs.
if (fastest_moving_stride == sizeof(scalar_t)) {
@@ -241,7 +241,7 @@ __global__ void reflection_pad2d_backward_det_out_kernel(
const int64_t dist_cols = ::abs(inp_col - (input_dim_x - 1));

// we were dist_rows after, now we want to be dist_rows before
-// we were dist_cols before, now we wnat to be dist_cols after
+// we were dist_cols before, now we want to be dist_cols after
const int64_t reflect_tr_out_row = (corner_tr_out_row - dist_rows);
const int64_t reflect_tr_out_col = (corner_tr_out_col + dist_cols);
const int64_t reflect_tr_out =
@@ -5,7 +5,7 @@
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <c10/macros/Macros.h>

-// Two warninngs in Cutlass included header files
+// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-field-initializers")
@@ -7,7 +7,7 @@
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>

-// Two warninngs in Cutlass included header files
+// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
@@ -460,7 +460,7 @@ __global__ void GammaBetaBackwardCUDAKernel2(
}
}

-// Do warp reduce for the 2st 16 cols in the tile.
+// Do warp reduce for the 2nd 16 cols in the tile.
sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y];
sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y];
sum1 = cuda_utils::WarpReduceSum<T_ACC>(sum1);
@@ -1532,7 +1532,7 @@ NvrtcFunction jit_pwise_function(

std::string file_path;
if (cache_dir.has_value()) {
-// Attemps to read from the cache.
+// Attempts to read from the cache.
// Cubin name is <kernel name>_arch<major>.<minor>_nvrtc<major>.<minor>_<ptx or sass>_<program length>_<string hash>
// Note that the SHA1 hash used in the file name is NOT the SHA1 hash of the file's contents,
// because we hash on the CUDA code, but we save the compiled ptx or sass
@@ -1556,19 +1556,19 @@ NvrtcFunction jit_pwise_function(
ss << "_" << hash_code;
file_path = ss.str();

-std::ifstream readin{file_path, std::ios::in | std::ifstream::binary};
-if (readin.fail()) {
+std::ifstream read_stream{file_path, std::ios::in | std::ifstream::binary};
+if (read_stream.fail()) {
// NOTE: this does not warn because the file might not exist
// TODO: consider if this should explicitly check for the file's existence or not to throw
// an informative warning
-readin.close();
+read_stream.close();
} else {
// TODO: try passing the "mapped" file directly to cuModuleLoadCall instead of using an intermediate buffer
-std::vector<char> buffer(std::istreambuf_iterator<char>(readin), {});
+std::vector<char> buffer(std::istreambuf_iterator<char>(read_stream), {});
AT_CUDA_DRIVER_CHECK(nvrtc.cuModuleLoadData(&(compiled_kernel_.module), buffer.data()));
AT_CUDA_DRIVER_CHECK(
nvrtc.cuModuleGetFunction(&(compiled_kernel_.function), compiled_kernel_.module, name.c_str()));
-readin.close();
+read_stream.close();
return compiled_kernel_;
}
}
@@ -1050,7 +1050,7 @@ void launch_vectorized_layer_norm_kernel(
C10_CUDA_KERNEL_LAUNCH_CHECK();

#ifdef USE_ROCM
-// the blocks.x contains the max grid x dimention without invalid configuration error
+// the blocks.x contains the max grid x dimension without invalid configuration error
// Fix invalid configuration https://github.com/pytorch/pytorch/issues/136291
// Ensure all elements are processed. Prepare for next round
int64_t remaining = M - blocks.x;
@@ -1346,7 +1346,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
});

if (input.dim() > 2) {
-// if upper=true we need to tranpose and conjugate the result tensor
+// if upper=true we need to transpose and conjugate the result tensor
// because the cholesky decomposition is stored in the lower triangular part
if (upper) {
input.copy_(result.mH());
@@ -1857,7 +1857,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {

auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
-// TODO Investigate whether the following magma bug is still occuring.
+// TODO Investigate whether the following magma bug is still occurring.
// It may be the case that geqrf followed by orgqr is wrong for the magma backend
// geqrf_magma currently uses geqrf2_gpu
//
@@ -82,7 +82,7 @@ void lu_factor_looped_cusolver(const Tensor& self, const Tensor& pivots, const T
#if defined(BUILD_LAZY_CUDA_LINALG)
namespace cuda { namespace detail {
// This is only used for an old-style dispatches
-// Please do not add any new entires to it
+// Please do not add any new entries to it
struct LinalgDispatch {
Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper);
};
@@ -177,7 +177,7 @@ bool use_ragged_in_dense(
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works with Q, K, V, and output in BSHD memory layout,"
"e.g., Q, K, V must be allocated with torch.randn((B, S, H, D).transpose(1, 2)."
-"Falling back to regualr dense case, which may trigger excessive recompilation.");
+"Falling back to regular dense case, which may trigger excessive recompilation.");
}
return all_bshd;
}
@@ -771,7 +771,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_nestedtensor(
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
-"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
+"attn_bias not yet supported with cuDNN Attention and NestedTensor");
scaled_dot_product_flash_attention_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)
@@ -1196,7 +1196,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_backward_nestedtensor(
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
-"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
+"attn_bias not yet supported with cuDNN Attention and NestedTensor");
sdpa_backward_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)
@@ -1864,7 +1864,7 @@ void run_cudnn_SDP_bprop_nestedtensor(
}
TORCH_CHECK(
!attn_bias.has_value(),
-"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
+"attn_bias not yet supported with cuDNN Attention and NestedTensor");

auto workspace_size = mha_graph.get_workspace_size();
auto workspace_ptr =
@@ -30,7 +30,7 @@ static const std::unordered_map<
};


-// This is the heursitic to choose a kernel based on inputs
+// This is the heuristic to choose a kernel based on inputs
BGEMMKernel_BFloat16 dispatch_bfloat16_bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
// Optional/future use: directly lookup shape tuples to map to instances
/*
@@ -11,7 +11,7 @@ using S = ck::Sequence<Is...>;
namespace at::native {

void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
-// If any of the shapes cant be tiled, we must use padding.
+// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@@ -471,7 +471,7 @@ void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}

void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
-// If any of the shapes cant be tiled, we must use padding.
+// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@@ -11,7 +11,7 @@ using S = ck::Sequence<Is...>;
namespace at::native {

void dispatch_float_gemm(CUDABLAS_GEMM_ARGTYPES(float)) {
-// If any of the shapes cant be tiled, we must use padding.
+// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@@ -13,7 +13,7 @@ namespace at::native {

void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
#if 0
-// If any of the shapes cant be tiled, we must use padding.
+// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@@ -299,7 +299,7 @@ void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
#endif
}
void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
-// If any of the shapes cant be tiled, we must use padding.
+// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@@ -545,7 +545,7 @@ kernel void reshape(texture2d_array<half, access::read> in_arr[[texture(0), func
const ushort slices2 = divRoundUp(C2, 4);
const ushort slices1 = divRoundUp(C1, 4);
const ushort n2 = gid.z / slices2; //image index
-const ushort s2 = gid.z - n2 * slices2; // slice offest
+const ushort s2 = gid.z - n2 * slices2; // slice offset
half4 value;
for (int idx = 0; idx < 4; ++idx){
// we compute the "linear index" of the output element,
@@ -86,4 +86,4 @@ TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid_"), TORCH_FN(hardsigmoid_));
}

-} // namepsace at::native::metal
+} // namespace at::native::metal
@@ -147,7 +147,7 @@ static void check_shape_forward(const Tensor& input,
// blocked format will propagate between layers. Input, output will be in blocked format.
//
// For inference case, weight can be prepacked into blocked format by
-// (so as to save weight reoder overhead):
+// (so as to save weight reorder overhead):
// model = torch.utils.mkldnn.to_mkldnn(model)
//
// For training case, grad_output can be CPU tensor or MKLDNN tensor,
@@ -540,7 +540,7 @@ static void _mkldnn_matmul_i8i8i32_with_primitive(
args.insert({DNNL_ARG_WEIGHTS, expected_weight});
args.insert({DNNL_ARG_DST, dst});
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
-// Create primitve and execute
+// Create primitive and execute
auto primitive = dnnl::matmul(prim_desc);
primitive.execute(ideep::stream::default_stream(), args);
}
@@ -215,7 +215,7 @@ partition create_sdpa_graph_partition(
// For optional additive mask
std::optional<op> mask_add;

-// For optional implicite causal mask
+// For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;
@@ -556,7 +556,7 @@ partition create_sdpa_backward_graph_partition(
// For optional additive mask
std::optional<op> mask_add;

-// For optional implicite causal mask
+// For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;
@@ -34,7 +34,7 @@ namespace at::native::onednn {

/*
oneDNN postops usage:
-Currently, oneDNN supports 5 kinds of post ops. More details can be refered
+Currently, oneDNN supports 5 kinds of post ops. More details can be referred
to oneDNN doc.
https://oneapi-src.github.io/oneDNN/dev_guide_attributes_post_ops.html#doxid-dev-guide-attributes-post-ops-1dev-guide-attributes-post-ops-eltwise

@@ -345,7 +345,7 @@ class Attr {
dnnl::memory binary_m;
auto binary = ops_params_[i].binary_;
auto md = ops_params_[i].meta_;
-// qeury expected_md to achieve peak performance
+// query expected_md to achieve peak performance
auto expected_md = pd.query_md(
dnnl::query::exec_arg_md,
DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1);
@@ -399,7 +399,7 @@ static inline void construct_attr_for_unary(
} else {
TORCH_CHECK(
unary_post_op == "none",
-"onednn qlinear: unspported unary post op",
+"onednn qlinear: unsupported unary post op",
unary_post_op);
}
}
@@ -301,7 +301,7 @@ bool is_onednn_matmul_strides(const at::Tensor& tensor) {
return false;
}

-// the overlaped cases are not supported
+// the overlapped cases are not supported
dnnl::memory::dims strides = get_onednn_strides(tensor);
int64_t storage_size = 1;
for (size_t dim = 0; dim < tensor_dim; ++dim)
@@ -29,7 +29,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
-// agruments results in
+// arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {
@@ -42,7 +42,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
-// agruments results in
+// arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {
@@ -539,7 +539,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,

static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
// Use gather kernel to solve strides for macOS < 15.0
-// Starting with macOS 15.0, MPS supports native strides direclty in the kernels
+// Starting with macOS 15.0, MPS supports native strides directly in the kernels
if (!is_macOS_15_0_or_newer || !useMPSStridedAPI) {
if ((!src.is_contiguous() || src.storage_offset()) && gatherTensorData) {
Tensor emptyShell = Tensor();
@@ -856,7 +856,7 @@ id<MTLLibrary> MetalShaderLibrary::getLibrary(const std::initializer_list<std::s
break;
}
default:
-TORCH_INTERNAL_ASSERT(false, "Unsupported number of paramaters ", nparams);
+TORCH_INTERNAL_ASSERT(false, "Unsupported number of parameters ", nparams);
}
return libMap[key] = lib;
}
@@ -1184,9 +1184,9 @@ void MetalKernelFunction::dispatch(uint64_t length, std::optional<uint64_t> grou
}

void MetalKernelFunction::dispatch(c10::ArrayRef<uint64_t> length, c10::OptionalArrayRef<uint64_t> group_size) {
-TORCH_CHECK(!length.empty() && length.size() < 4, "Dispatch dimentions must be less than 3 and non-empty");
+TORCH_CHECK(!length.empty() && length.size() < 4, "Dispatch dimensions must be less than 3 and non-empty");
TORCH_CHECK(!group_size.has_value() || group_size->size() == length.size(),
-"size and group_size must have same number of dimentions");
+"size and group_size must have same number of dimensions");
const auto max_tg_size = getMaxThreadsPerThreadgroup();
const auto group_size_length = group_size.has_value() ? group_size->size() : 0;
auto tg_size = MTLSizeMake(group_size_length > 0 ? group_size->at(0) : max_tg_size,
@@ -59,7 +59,7 @@ static GridSamplerOffsets find_grid_sampler_offsets(
return offsets;
}

-// Mod function which gives postive output when `a` is negative
+// Mod function which gives positive output when `a` is negative
static int32_t mod(int32_t a, int32_t b) {
auto r = a % b;
return r + (r < 0 ? b : 0);
@@ -191,9 +191,9 @@ void grid_sampler_single_element(
int32_t right_indices[3];
opmath_t<T> scales[3];

-// For each dimension, find the pair of indices in the cooresponding dimension
+// For each dimension, find the pair of indices in the corresponding dimension
// of `input` which surround the grid coordinate in that dimension. We'll do
-// this by mapping different coordiante spaces onto each other. There are
+// this by mapping different coordinate spaces onto each other. There are
// basically three different coordinate spaces to keep in mind:
//
// * aligned grid space
@@ -137,7 +137,7 @@ kernel void index_put_serial(
constant int64_t* index_strides,
constant uint4& ndim_nindices_numel,
uint thread_index [[thread_position_in_grid]]) {
-(void)thread_index; // Suppress unused vairable varning
+(void)thread_index; // Suppress unused variable warning
for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) {
index_put_impl(
output,
@@ -112,7 +112,7 @@ kernel void int4pack_mm(constant T *A [[buffer(0)]],
constant uchar *B_ptr = B + ((n * K) / k_pack_factor);

thread float4 result = float4(0.0);
-// We multipy group of 4 channels with these scales.
+// We multiply group of 4 channels with these scales.
// Because corresponding values from weight matrix are effectively left
// shifted. This is to avoid doing right shift on those values which ends up
// affecting performance. This is the trick applied in MLX kernels.
@@ -372,7 +372,7 @@ struct log1p_functor {
}
template <typename T>
inline enable_if_t<is_complex_v<T>, T> operator()(const T x) {
-// TODO: Implement proper log1p algoirthm
+// TODO: Implement proper log1p algorithm
auto magnitude = ::precise::sqrt((1.0f + x.x) * (1.0f + x.x) + x.y * x.y);
auto real = ::precise::log(magnitude);
auto imag = (x.x == -1 && x.y == 0) ? 0 : ::precise::atan2(x.y, 1.0 + x.x);
@@ -448,7 +448,7 @@ kernel void upsample_trilinear_backward(

// See Note [ Weights computation for uint8_t and multiplication trick ]
// Essentially fall back to fixed floating point arithmetic during uint8
-// interpolation, which is not necesserily more accurate (see example below),
+// interpolation, which is not necessarily more accurate (see example below),
// but matches closes to what CPU can deliver
// I.e. mid-point 152+249+172+35 is 152, but algorithm yields 153 as horizontal
// and vertical interpolation is done in separate steps and results are rounded
@@ -1282,7 +1282,7 @@ static void all_any_common_impl_mps(const Tensor& input_t,
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);

auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
-// reductionOrWithTensor:axis: will throw an internal assert if number of dimentions is more than 4
+// reductionOrWithTensor:axis: will throw an internal assert if number of dimensions is more than 4
// See https://github.com/pytorch/pytorch/issues/95538
MPSGraphTensor* outputTensor = nil;
if (input_t.ndimension() > 4) {
@@ -1352,7 +1352,7 @@ TORCH_IMPL_FUNC(any_all_out_mps)(const Tensor& input_t, const Tensor& output_t)
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
-// reductionOrWithTensor:axes: will throw an internal assert if number of dimentions is more than 4
+// reductionOrWithTensor:axes: will throw an internal assert if number of dimensions is more than 4
// See https://github.com/pytorch/pytorch/issues/95538
if (input_t.dim() > 4) {
castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil];
@@ -1400,7 +1400,7 @@ TORCH_IMPL_FUNC(all_all_out_mps)(const Tensor& input_t, const Tensor& output_t)
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
-// reductionAndWithTensor:axes: will throw an internal assert if number of dimentions is more than 4
+// reductionAndWithTensor:axes: will throw an internal assert if number of dimensions is more than 4
// See https://github.com/pytorch/pytorch/issues/95538
if (input_t.ndimension() > 4) {
castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil];
|
@ -41,7 +41,7 @@ Tensor pad_tensor_to_shape(
|
||||
const Tensor& t,
|
||||
IntArrayRef goal_shape,
|
||||
double value = 0) {
|
||||
std::vector<int64_t> padd;
|
||||
std::vector<int64_t> padding;
|
||||
auto tup = t.sizes();
|
||||
TORCH_CHECK(
|
||||
t.dim() == (int64_t)(goal_shape.size()),
|
||||
@ -51,10 +51,10 @@ Tensor pad_tensor_to_shape(
|
||||
goal_shape.size(),
|
||||
" of goal shape.");
|
||||
for (int64_t i = static_cast<int64_t>(tup.size()) - 1; i >= 0; i--) {
|
||||
padd.push_back(0);
|
||||
padd.push_back(goal_shape[i] - tup[i]);
|
||||
padding.push_back(0);
|
||||
padding.push_back(goal_shape[i] - tup[i]);
|
||||
}
|
||||
Tensor new_tensor = at::constant_pad_nd(t, IntArrayRef(padd), value);
|
||||
Tensor new_tensor = at::constant_pad_nd(t, IntArrayRef(padding), value);
|
||||
new_tensor = new_tensor.reshape(goal_shape);
|
||||
return new_tensor;
|
||||
}
|
||||
|
@ -53,7 +53,7 @@ C10_ALWAYS_INLINE std::pair<int64_t, int64_t> _check_nested_layer_norm_inputs(
|
||||
normalized_shape);
|
||||
|
||||
// Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input
|
||||
// Also, compute M and N considering the idiosyncracies of NestedTensors
|
||||
// Also, compute M and N considering the idiosyncrasies of NestedTensors
|
||||
int64_t N = 1;
|
||||
for (const auto i: c10::irange(normalized_ndim)) {
|
||||
TORCH_CHECK(
|
||||
|
@ -95,7 +95,7 @@ std::vector<Tensor> chunk_nested_tensor(const Tensor& self, int64_t chunks, int6
|
||||
for (const auto split_idx : c10::irange(chunks)) {
|
||||
auto new_sizes = sizes.clone();
|
||||
auto new_strides = strides.clone();
|
||||
// This copys offsets so we are safe to move
|
||||
// This copies offsets so we are safe to move
|
||||
auto new_offsets = offsets.clone();
|
||||
int64_t *size_ptr = new_sizes.data_ptr<int64_t>();
|
||||
int64_t *new_offsets_ptr = new_offsets.data_ptr<int64_t>();
|
||||
|
@@ -37,6 +37,7 @@ NotIn
nout
NowNs
numer
OffsetT
oH
optins
ot