Fix typos in multiple files (#152254)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/152254
Approved by: https://github.com/Skylion007
@@ -2652,7 +2652,7 @@ Tensor mexp_impl(
   // `norm_cpu` is used to decide which Tensors require which approximation
   // based on their norm. This decision takes place on CPU.
   // It requires moving data back and forth between devices when `a` is on CUDA,
-  // but at the cost of only one sigle CPU-CUDA synchronization (instead of 6),
+  // but at the cost of only one single CPU-CUDA synchronization (instead of 6),
   // and better performance overall (benchmarked).
   const auto norm_cpu = (a.device().type() == at::kCUDA)
     ? norm.to(at::kCPU) : norm;
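The comment in this hunk describes a pattern worth spelling out: compute the per-matrix norms on the tensor's own device, then copy the small norm tensor to CPU once so that all of the which-approximation branching happens host-side with a single CPU-CUDA synchronization. Below is a minimal libtorch sketch of that pattern; the helper name, threshold, and mask-based selection are hypothetical, not the actual mexp_impl code.

#include <torch/torch.h>

// Minimal sketch (hypothetical helper/threshold): compute one norm per matrix
// on the tensor's own device, then copy the small norm tensor to CPU once so
// that subsequent branching happens host-side without extra device syncs.
torch::Tensor select_small_norm_matrices(const torch::Tensor& a, double thresh) {
  // `a` is a batch of matrices [batch, n, n]; one norm value per matrix.
  auto norm = a.abs().amax({-2, -1});
  // Single CPU-CUDA synchronization instead of one per comparison.
  const auto norm_cpu =
      (a.device().type() == at::kCUDA) ? norm.to(at::kCPU) : norm;
  // Host-side decision; the boolean mask is moved back to `a`'s device only
  // once, to gather the matrices that qualify for the cheap approximation.
  auto mask_cpu = norm_cpu.le(thresh);
  return a.index({mask_cpu.to(a.device())});
}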
@@ -296,7 +296,7 @@ void slow_conv_transpose3d_out_cpu_template(
   int64_t elt;
   // For each elt in batch, do:
   for (elt = 0; elt < batch_size; ++elt) {
-    // Matrix mulitply per output:
+    // Matrix multiply per output:
     input_n = input.select(0, elt);
     output_n = output.select(0, elt);
@@ -520,7 +520,7 @@ void slow_conv_transpose3d_backward_out_cpu_template(
   int64_t elt;
   // For each elt in batch, do:
   for (elt = 0; elt < batch_size; ++elt) {
-    // Matrix mulitply per sample:
+    // Matrix multiply per sample:
     grad_input_n = grad_input.select(0, elt);
     grad_output_n = grad_output.select(0, elt);
@@ -736,12 +736,12 @@ void slow_conv_transpose3d_acc_grad_parameters_cpu(
   int64_t elt;
   // For each elt in batch, do:
   for (elt = 0; elt < batch_size; ++elt) {
-    // Matrix mulitply per output:
+    // Matrix multiply per output:
     grad_output_n = grad_output.select(0, elt);

     // Do Weight:
     if (grad_weight.defined()) {
-      // Matrix mulitply per output:
+      // Matrix multiply per output:
       input_n = input.select(0, elt);

       if (need_columns) {
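The three hunks above share the same batch loop: select the per-sample slices with select(0, elt) and run one matrix multiply per batch element. A minimal libtorch sketch of that pattern follows; the shapes and the plain matmul are hypothetical stand-ins for the real column/GEMM bookkeeping in the transposed-convolution kernels.

#include <torch/torch.h>

// Sketch of the per-batch-element GEMM pattern (hypothetical shapes).
void per_sample_matmul(const torch::Tensor& input,   // [N, K, M]
                       const torch::Tensor& weight,  // [K, K]
                       torch::Tensor& output) {      // [N, K, M]
  const int64_t batch_size = input.size(0);
  for (int64_t elt = 0; elt < batch_size; ++elt) {
    auto input_n = input.select(0, elt);    // view: [K, M]
    auto output_n = output.select(0, elt);  // view: [K, M]
    output_n.copy_(weight.matmul(input_n)); // one GEMM per batch element
  }
}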
@@ -31,7 +31,7 @@ is present in the working directory). For additional details see [1].
     # read in as dataframe, explicitly use zero values
     df = pd.DataFrame(rows).fillna(0).astype(int)

-    # peform conversion according to Section 2.1 of [1]
+    # perform conversion according to Section 2.1 of [1]
     df["poly"] = 2 * df["a"] + 2 ** df["s"] + 1

     # ensure columns are properly ordered
@@ -559,7 +559,7 @@ Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const std::
     TORCH_CHECK((input_.sizes()[0] == mask.sizes()[0]) && (input_.sizes()[2] == mask.sizes()[1]),
         "For mask_type == 1 mask shape should be (B, L)");
     if (dim_ != input_.dim() - 1) {
-      // We only process padding mask in the optimized way if softmax is applied along the last dimesion,
+      // We only process padding mask in the optimized way if softmax is applied along the last dimension,
       // otherwise we need to expand the mask into a generic 4D one
       mask = mask_.view({input_.sizes()[0], 1, 1, input_.sizes()[2]});
       mask = mask.expand(input_.sizes()).contiguous();
@@ -570,7 +570,7 @@ Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const std::
     TORCH_CHECK((mask.dim() == 2) && (input_.sizes()[2] == mask.sizes()[0]) && (input_.sizes()[2] == mask.sizes()[1]),
         "For mask_type == 0 mask shape should be (L, L)");
     if (dim_ != input_.dim() - 1) {
-      // We only process attention mask in a optimized way if softmax is applied along the last dimesion,
+      // We only process attention mask in a optimized way if softmax is applied along the last dimension,
       // otherwise we need to expand the mask into a generic 4D one
       mask = mask.view({1, 1, input_.sizes()[2], input_.sizes()[2]});
       mask = mask.expand(input_.sizes()).contiguous();
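Both masked_softmax hunks rely on the same reshaping trick: a 2-D padding mask (B, L) or attention mask (L, L) is viewed as a 4-D tensor and broadcast-expanded to the input's shape whenever softmax is not applied along the last dimension. A small libtorch sketch of that expansion, assuming a 4-D input of shape (B, H, L, L); the helper itself is hypothetical.

#include <torch/torch.h>

// Hypothetical sketch of the 4-D mask expansion used above: a (B, L) padding
// mask is viewed as (B, 1, 1, L) and then broadcast to the full input shape
// so a generic masked softmax can consume it.
torch::Tensor expand_padding_mask(const torch::Tensor& mask_2d,  // bool (B, L)
                                  const torch::Tensor& input) {  // (B, H, L, L)
  auto mask = mask_2d.view({input.size(0), 1, 1, input.size(2)});
  // expand() creates a broadcast view; contiguous() materializes it.
  return mask.expand(input.sizes()).contiguous();
}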
@@ -1693,7 +1693,7 @@ static Tensor sparse_compressed_to_flipped(

   // Step 4:
   // Convert the COO indices to the CSC/BSC indices and form the output.
-  // We need to sort COO indices along the "tranposed" dim to satisfy the
+  // We need to sort COO indices along the "transposed" dim to satisfy the
   // invariant of sorted plain indices.
   // Hash coo indices by converting 2d indices to linear offsets with
   // more "weight" (aka stride) placed on the "transposed" dimension.
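The hashing idea in this hunk is easy to miss: mapping 2-D indices to linear offsets, with the larger stride ("weight") on the dimension that must end up sorted, lets a single sort produce indices ordered by that dimension first. A standalone C++ sketch of that idea, using a hypothetical helper and plain std::sort rather than the tensor-based implementation:

#include <algorithm>
#include <cstdint>
#include <vector>

// Sort COO (row, col) pairs so that the "transposed" dim (here: col) becomes
// the primary sort key, by linearizing each pair with the larger stride on col.
void sort_coo_by_transposed_dim(std::vector<int64_t>& rows,
                                std::vector<int64_t>& cols,
                                int64_t nrows) {
  const size_t nnz = rows.size();
  std::vector<size_t> perm(nnz);
  for (size_t i = 0; i < nnz; ++i) perm[i] = i;
  // Key = col * nrows + row: col carries more "weight", so sorting by the key
  // orders entries by column first, then by row within each column.
  std::sort(perm.begin(), perm.end(), [&](size_t a, size_t b) {
    return cols[a] * nrows + rows[a] < cols[b] * nrows + rows[b];
  });
  std::vector<int64_t> r(nnz), c(nnz);
  for (size_t i = 0; i < nnz; ++i) { r[i] = rows[perm[i]]; c[i] = cols[perm[i]]; }
  rows = std::move(r);
  cols = std::move(c);
}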
@@ -572,7 +572,7 @@ Tensor sparse_broadcast_to(const Tensor& self, IntArrayRef size) {
   // }

   // Then define for each sparse dim the number of reps for each nnz index/value
-  // due to broadcasting. Repetitions do not take into accout the current value
+  // due to broadcasting. Repetitions do not take into account the current value
   // of nnz - this will be taken care of later {
   auto nnz_repeats = c10::DimVector(res_sparse_dim);
   nnz_repeats.back() = res_sparse_dim_broadcast_mask.back();
@@ -3601,7 +3601,7 @@ Tensor& transpose_(Tensor& self, int64_t dim0, int64_t dim1) {
   // in-place operations. For other sparse formats, the in-place
   // transpose would not be possible without shuffling the specified
   // values. So we don't support this as it would defeat the purpose
-  // of in-place opreations of being memory-efficient.
+  // of in-place operations of being memory-efficient.
   if (self.is_sparse()) {
     return sparse_transpose_(self, dim0, dim1);
   }
@@ -341,7 +341,7 @@ inline int64_t nearest_idx(
     int64_t input_size,
     int64_t output_size,
     std::optional<double> scales) {
-  // This method specificly treats cases: output_size == input_size or
+  // This method specifically treats cases: output_size == input_size or
   // output_size == 2 * input_size, that we would like to get rid of
   // We keep this method for BC and consider as deprecated.
   // See nearest_exact_idx as replacement
@@ -585,7 +585,7 @@ cpu_adaptive_max_pool3d_channels_last(
   using Vec = vec::Vectorized<scalar_t>;
   using integer_t = vec::int_same_size_t<scalar_t>;
   using iVec = vec::Vectorized<integer_t>;
-  // for the convience of vectorization, use integer of the same size of scalar_t,
+  // for the convenience of vectorization, use integer of the same size of scalar_t,
   // e.g. int32_t for float, int64_t for double
   // need to make sure doesn't overflow
   TORCH_CHECK(input_height * input_width <= std::numeric_limits<integer_t>::max());
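The "integer of the same size as scalar_t" trick is what vec::int_same_size_t provides; a self-contained sketch of the same idea (a hypothetical alias, not the ATen implementation) shows why the overflow check above is phrased against that integer type rather than int64_t.

#include <cstdint>
#include <type_traits>

// Hypothetical stand-in for vec::int_same_size_t: pick an integer whose width
// matches the floating-point type so vectorized index arithmetic uses lanes of
// the same size (int32_t alongside float, int64_t alongside double). Narrower
// index types are why the product of spatial sizes must be checked for overflow.
template <typename scalar_t>
using int_same_size = std::conditional_t<sizeof(scalar_t) == 4, int32_t, int64_t>;

static_assert(std::is_same_v<int_same_size<float>, int32_t>, "float pairs with int32_t");
static_assert(std::is_same_v<int_same_size<double>, int64_t>, "double pairs with int64_t");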
@@ -386,7 +386,7 @@ void cpu_flash_attention(
   int64_t thresh_size = (dtype == at::ScalarType::BFloat16) ? 64 : 16;
   need_pack = kvSize >= thresh_size && qSize >= thresh_size;
   // When the number of gemm is greater than the number of pack,
-  // the pack overhead can be overlaped.
+  // the pack overhead can be overlapped.
   if (need_pack) {
     double pack_size = batchSize * num_head * kvSize * headSize;
     double qs_per_thread = (batchSize * num_head * qSlice + num_thread - 1) / num_thread;
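The visible part of the packing heuristic is simple: packing is only considered once both sequence lengths clear a dtype-dependent threshold, so there is enough GEMM work to hide the one-time pack cost. A minimal sketch of just that check, with names taken from the hunk; the cost comparison that follows in the real kernel is not reproduced here.

#include <cstdint>

// Sketch of the threshold check above: BFloat16 packing has a higher
// break-even point (64) than the other dtypes (16), and both the key/value
// length and the query length must reach it before packing is considered.
bool consider_packing(int64_t kvSize, int64_t qSize, bool is_bf16) {
  const int64_t thresh_size = is_bf16 ? 64 : 16;
  return kvSize >= thresh_size && qSize >= thresh_size;
}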
@@ -41,7 +41,7 @@
 */
 #endif /* VULKAN_DEBUG */

-// Note: Do not try to use C10 convenience macors here, as this header is
+// Note: Do not try to use C10 convenience macros here, as this header is
 // included from ExecuTorch that does not want to have dependency on C10
 #ifdef __clang__
 #pragma clang diagnostic push