Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
Fix typos, via a Levenshtein-type corrector (#31523)
Summary: Should be non-semantic. Uses https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines to find likely typos, with https://github.com/bwignall/typochecker to help automate the checking. Uses an updated version of the tool used in https://github.com/pytorch/pytorch/pull/30606.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/31523
Differential Revision: D19216749
Pulled By: mrshenli
fbshipit-source-id: 7fd489cb9a77cd7e4950c1046f925d57524960ea
Committed by: Facebook Github Bot
Parent: c8ca70e39d
Commit: f326045b37
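The summary describes the workflow only in prose: look up each word against a machine-readable list of common misspellings and substitute the accepted spelling, with a human reviewing the suggestions. The sketch below illustrates just the lookup-and-substitute step in Python; it is not the typochecker tool itself, it omits the Levenshtein-style ranking of candidate corrections, and its `MISSPELLINGS` table, whole-word regex, and restriction to `*.py` files are illustrative assumptions.

```python
import pathlib
import re

# Illustrative subset of the Wikipedia "common misspellings" list; a real run
# would load the full machine-readable list rather than hard-coding entries.
MISSPELLINGS = {
    "libaries": "libraries",
    "seperate": "separate",
    "optimiation": "optimization",
}

TOKEN = re.compile(r"[A-Za-z]+")

def fix_typos(text: str) -> str:
    """Replace whole-word matches of known misspellings, preserving a leading capital."""
    def repl(match: re.Match) -> str:
        word = match.group(0)
        fix = MISSPELLINGS.get(word.lower())
        if fix is None:
            return word
        return fix.capitalize() if word[0].isupper() else fix
    return TOKEN.sub(repl, text)

if __name__ == "__main__":
    # Walk a source tree and rewrite files in place when a known typo is found.
    for path in pathlib.Path(".").rglob("*.py"):
        original = path.read_text(encoding="utf-8", errors="ignore")
        fixed = fix_typos(original)
        if fixed != original:
            path.write_text(fixed, encoding="utf-8")
            print(f"fixed typos in {path}")
```

Because the hunks below touch only comments, docstrings, and message strings, the result is non-semantic, as the summary notes.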
@@ -270,7 +270,7 @@ if (MSVC)
endif()

# /bigobj increases number of sections in .obj file, which is needed to link
-# against libaries in Python 2.7 under Windows
+# against libraries in Python 2.7 under Windows
set(${flag_var} "${${flag_var}} /MP /bigobj")
endforeach(flag_var)

@@ -10,7 +10,7 @@
/test/test_c10d.py @pietern @mrshenli @zhaojuanmao
/torch/utils/cpp_extension.py @goldsborough @fmassa @soumith @ezyang

-# Not there to stricly require the approval, but to be tagged as a reviewer
+# Not there to strictly require the approval, but to be tagged as a reviewer
# on the PRs to push them into a high priority inbox.
/torch/csrc/api/data/ @apaszke
/torch/csrc/autograd/ @apaszke

@@ -24,7 +24,7 @@ else()
set(CAFFE2_STATIC_LINK_CUDA_INT 0)
endif()
CONFIGURE_FILE(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
-# TODO: Don't unconditionally generate CUDAConfig.h.in. Unfortuantely,
+# TODO: Don't unconditionally generate CUDAConfig.h.in. Unfortunately,
# this file generates AT_ROCM_ENABLED() which is required by the miopen
# files, which are compiled even if we are doing a vanilla CUDA build.
# Once we properly split CUDA and HIP in ATen, we can remove this code.

@@ -8,7 +8,7 @@ namespace c10 {
namespace detail {
// WrapRuntimeKernelFunctor: Wraps any runtime functor into a functor that
// inherits from c10::OperatorKernel, so it can be used as a c10 kernel.
-// This can, for example, be used for lamdas, functors or even function pointers.
+// This can, for example, be used for lambdas, functors or even function pointers.
// In the case of function pointers, since it is a runtime function pointer,
// there is an overhead for calling it whenever the kernel is invoked.
template<class FuncType, class ReturnType, class ParameterList> class WrapRuntimeKernelFunctor_ {};

@@ -184,7 +184,7 @@ struct FunctionSchema {
std::vector<Argument> returns_;
// if true then this schema takes an arbitrary number of additional arguments
// after the argument specified in arguments
-// currently this is used primarily to represent 'primtive' operators whose
+// currently this is used primarily to represent 'primitive' operators whose
// arguments are not checked by schema
bool is_vararg_;
bool is_varret_;

@@ -1366,7 +1366,7 @@ struct getTypePtr_<at::optional<T>> final {
} // namespace detail
template <class T>
inline TypePtr getTypePtr() {
-// TODO: static_assert that a templated function exists, and throw a friendy
+// TODO: static_assert that a templated function exists, and throw a friendly
// error message if not
return detail::getTypePtr_<T>::call();
}

@@ -84,7 +84,7 @@ public:
// a constexpr variable if we never odr-use it. But it seems that some
// versions GCC/Clang have buggy determinations on whether or not an
// identifier is odr-used or not, and in any case it's hard to tell if
-// a variable is odr-used or not. So best to just cut the probem at the root.
+// a variable is odr-used or not. So best to just cut the problem at the root.
static constexpr int size() {
return 32 / sizeof(T);
}

@@ -94,7 +94,7 @@ uint64_t CUDAGenerator::current_seed() const {
}

/**
-* Gets a nondeterminstic random number from /dev/urandom or time,
+* Gets a nondeterministic random number from /dev/urandom or time,
* seeds the CPUGenerator with it and then returns that number.
*
* FIXME: You can move this function to Generator.cpp if the algorithm

@@ -53,7 +53,7 @@ namespace at { namespace cuda {
// NOTE [ ATen NVRTC Stub and HIP ]
//
// ATen's NVRTC stub library, caffe2_nvrtc, provides dynamic loading of both
-// NVRTC and driver APIs. While the former is not yet suppoted for HIP, the
+// NVRTC and driver APIs. While the former is not yet supported for HIP, the
// later is supported and needed (e.g., in CUDAHooks::getDeviceWithPrimaryContext()
// used by tensor.pin_memory()).
//

@@ -76,7 +76,7 @@ public:
T* desc() const { return desc_.get(); }
T* desc() { return desc_.get(); }

-// Use mut_desc() to access the underlying desciptor pointer
+// Use mut_desc() to access the underlying descriptor pointer
// if you intend to modify what it points to (e.g., using
// cudnnSetFooDescriptor). This will ensure that the descriptor
// is initialized. Code in this file will use this function.

@@ -27,7 +27,7 @@ namespace c10 { namespace hip {
// HIP occurs; instead, anywhere we see "CUDA", it actually means "HIP".
// For example, when you use HIPified PyTorch, you say x.cuda() to
// move a tensor onto ROCm device. We call this situation "HIP
-// maquerading as CUDA".
+// masquerading as CUDA".
//
// This leads to a very awkward situation when we want to call c10_hip
// code from PyTorch, since c10_hip is expecting things to be called

@@ -61,7 +61,7 @@ public:
T* desc() const { return desc_.get(); }
T* desc() { return desc_.get(); }

-// Use mut_desc() to access the underlying desciptor pointer
+// Use mut_desc() to access the underlying descriptor pointer
// if you intend to modify what it points to (e.g., using
// miopenSetFooDescriptor). This will ensure that the descriptor
// is initialized. Code in this file will use this function.

@@ -1104,7 +1104,7 @@ Tensor _lu_solve_helper_cpu(const Tensor& self, const Tensor& LU_data, const Ten
return self_working_copy;
}

-// Supports arbitrary batch dimensions for self and LU_data (implicity LU_pivots also)
+// Supports arbitrary batch dimensions for self and LU_data (implicitly LU_pivots also)
Tensor lu_solve(const Tensor& self, const Tensor& LU_data, const Tensor& LU_pivots) {
TORCH_CHECK(self.dim() >= 2,
"b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");

@@ -59,7 +59,7 @@ static inline void multi_margin_loss_cpu_kernel(
using accscalar_t = at::acc_type<scalar_t, false>;

// dim() != 0 check is for 1d input which produces a scalar output (that
-// cannot be handeld by TensorAccessor)
+// cannot be handled by TensorAccessor)
if (reduction == Reduction::None && output.dim() > 0) {
auto output_acc = output.accessor<scalar_t, 1>();
for (int64_t t = 0; t < nframe; t++) {

@@ -295,7 +295,7 @@ static std::vector<QuantizedCellParamsDynamic> gather_quantized_params_dynamic(
}
return result;
#else // USE_FBGEMM
-TORCH_INTERNAL_ASSERT(false, "Tried to use quantized RNN wihtout FBGEMM!")
+TORCH_INTERNAL_ASSERT(false, "Tried to use quantized RNN without FBGEMM!")
#endif // USE_FBGEMM
}

@@ -276,7 +276,7 @@ std::tuple<Tensor, Tensor> kthvalue(
return at::kthvalue(self, k, dimname_to_position(self, dim), keepdim);
}

-// this does not reduce to median with dim beause we don't want to copy twice
+// this does not reduce to median with dim because we don't want to copy twice
Tensor median_cpu(const Tensor& self) {
NoNamesGuard guard;
TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor");

@@ -618,7 +618,7 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index)
self - sparse tensor, its shape is sizes = sparse_shape + dense_shape
indices - 2-D tensor of indices, shape is (sparse_dims, nnz)
values - (1+len(dense_shape))-D tensor of values, shape is (nnz,) + dense_shape
-index_select(dim, index) returns a sparse tensor with the follwing data
+index_select(dim, index) returns a sparse tensor with the following data
new_sizes = sizes[:dim] + (n,) + sizes[dim+1:]
new_indices - shape is (sparse_dims, new_nnz)
new_values - shape is (new_nnz,) + dense_shape

@@ -85,7 +85,7 @@ static void unfolded3d_copy(
const int64_t input_hw = input_height * input_width;
const int64_t input_dhw = input_hw * input_depth;

-// the following variables are updated ouside the most inner loop
+// the following variables are updated outside the most inner loop
int64_t d = d_out * dT - pT + i;
int64_t h = h_out * dH - pH + j;
int64_t ofs = nip * input_dhw + d * input_hw + h * input_width;

@@ -28,7 +28,7 @@
* are computed from the input and the output size;
*
*
-* When the scales are infered from the input and output sizes,
+* When the scales are inferred from the input and output sizes,
* we view each pixel as an area, idx + 0.5 as its center index.
* Here is an example formula in 1D case.
* if align_corners: center of two corner pixel areas are preserved,

@@ -26,7 +26,7 @@ struct Dist {
// map : This tells how to modify (a - b) to form the component that
// gets summed.
// red : This tells how to sum the result of map up. This is
-// separate because the inf norm actuall uses max instead of
+// separate because the inf norm actually uses max instead of
// sum.
// finish : This tells what to do with the aggregated value to compute
// the norm. Generally this is the result of val ^ (1 / p).

@@ -158,7 +158,7 @@ namespace at { namespace native { namespace {
* `apply_fn` will be called multiple times, and together cover the entire
* output spatial space.
*
-* Now you should be able tp understand everything about the implementaion of
+* Now you should be able tp understand everything about the implementation of
* 2D forward kernel shown at the beginning of this note.
*
**/

@@ -117,7 +117,7 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) {
Device dst_device = iter.device(0);
Device src_device = iter.device(1);

-// Enable p2p access between devices. (No-op if it invovles the CPU)
+// Enable p2p access between devices. (No-op if it involves the CPU)
bool p2p_enabled = maybe_enable_p2p_access(dst_device, src_device);

if (copy_requires_temporaries(iter, p2p_enabled)) {

@@ -364,7 +364,7 @@ namespace {

// assuming grad_grid is contiguous
// thus we can
-// 1. use index with gGrid_sW to diectly compute gGrid_ptr_NHW
+// 1. use index with gGrid_sW to directly compute gGrid_ptr_NHW
// 2. directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1]
scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW;
gGrid_ptr_NHW[0] = gix_mult * gix;

@@ -383,7 +383,7 @@ namespace {

// assuming grad_grid is contiguous
// thus we can
-// 1. use index with gGrid_sW to diectly compute gGrid_ptr_NHW
+// 1. use index with gGrid_sW to directly compute gGrid_ptr_NHW
// 2. directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1]
scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW;
gGrid_ptr_NHW[0] = static_cast<scalar_t>(0);

@@ -569,7 +569,7 @@ namespace {

// assuming grad_grid is contiguous
// thus we can
-// 1. use index with gGrid_sW to diectly compute gGrid_ptr_NDHW
+// 1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW
// 2. directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2]
scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW;
gGrid_ptr_NDHW[0] = gix_mult * gix;

@@ -591,7 +591,7 @@ namespace {

// assuming grad_grid is contiguous
// thus we can
-// 1. use index with gGrid_sW to diectly compute gGrid_ptr_NDHW
+// 1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW
// 2. directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2]
scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW;
gGrid_ptr_NDHW[0] = static_cast<scalar_t>(0);

@@ -108,7 +108,7 @@ static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size,
}

static std::vector<int64_t> computeLinearStride(const Tensor & tensor) {
-// computes the stride as if tensor were contigous
+// computes the stride as if tensor were contiguous
auto sizes = tensor.sizes();
std::vector<int64_t> stride(tensor.dim());
stride[tensor.dim() - 1] = 1;

@@ -7,7 +7,7 @@
//
// The gpu_kernel_with_scalars generates specializations that support a
// single scalar CPU argument, such as from `cuda_tensor + 5`. The CPU scalar
-// is lifted to a kernel paramter instead of copying to device memory.
+// is lifted to a kernel parameter instead of copying to device memory.
// This should be used in conjunction with TensorIterator::allow_cpu_scalars_,
// which is the default for TensorIterator::binary_op. Otherwise, all inputs
// and the output must be on the GPU.

@@ -51,7 +51,7 @@ __device__ __forceinline__ void warp_reduce(acc_t* sum) {
// A "WARP" contains "C10_WARPS_SIZE" threads, these treads are guaranteed to belong to the same warp.
// This is important because it means only __shfl_ instructions are required for reductions.
// Note that this means WARP_SIZE must be a power of two and <= architecture warp size.
-// CUDA warp size is 32 for all existing GPU architecures, but there is no guarantee this will not change for future arch.
+// CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch.
// ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs.
// is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed.
// The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t.

@@ -200,7 +200,7 @@ __global__ void cunn_SpatialSoftMaxForward(
for (uint32_t inner_index = blockIdx.y * blockDim.y + threadIdx.y; inner_index < inner_size; inner_index += blockDim.y * gridDim.y) {
const uint32_t data_offset = outer_offset + inner_index;
////////////////////////////////////////////////////////////
-// These two blocks are really eqivalent, but specializing on
+// These two blocks are really equivalent, but specializing on
// blockDim.x == 1 makes the kernel faster when it's unused.
// I didn't want to thread an extra template parameter, and nvcc
// seems to be smart enough to hoist the if outside of the loops.

@@ -177,7 +177,7 @@ void kthvalue_cuda_template(
AT_CUDA_CHECK(cudaGetLastError());
}

-// this does not reduce to median with dim beause we don't want to copy twice
+// this does not reduce to median with dim because we don't want to copy twice
template <typename scalar_t>
Tensor median_cuda_template(const Tensor& self) {
TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor");

@@ -211,7 +211,7 @@ inline int64_t resolve_root_int(
// (row + 2f - 1)row <= 2x
// row^2 + (2f-1)row - 2x <= 0. [3]
//
-// Based on ineuqality [3], we have the following coefficients for formula of
+// Based on inequality [3], we have the following coefficients for formula of
// root:
// a = 1
// b = 2f - 1

@@ -254,7 +254,7 @@ inline void get_coordinate_in_tril_trapezoid(
// (-row + 2f + 1)row <= 2x
// row^2 - (2f+1)row + 2x >= 0. [3]
//
-// Based on ineuqality [3], we have the following coefficients for formula of
+// Based on inequality [3], we have the following coefficients for formula of
// root:
// a = 1
// b = -1 - 2f

@@ -213,7 +213,7 @@ __device__ __forceinline__ static void upsample_increment_value_bounded(
accscalar_t value) {
int access_y = max(min(y, height - 1), 0);
int access_x = max(min(x, width - 1), 0);
-/* TODO: result here is trucated to scalar_t,
+/* TODO: result here is truncated to scalar_t,
check: https://github.com/pytorch/pytorch/pull/19630#discussion_r281426912
*/
gpuAtomicAdd(

@@ -1119,7 +1119,7 @@ std::tuple<Tensor, Tensor> pack_hidden<std::tuple<Tensor, Tensor>>(const Tensor&
struct DropoutState {
// Both buffer and event are lazily instantiated when a dropout state is needed
// for the first time. Note that in this case needed != used, as we don't need
-// a bufer to e.g. run RNNs in test mode.
+// a buffer to e.g. run RNNs in test mode.
at::Tensor buffer;
c10::optional<cuda::CUDAEvent> event;
std::mutex mutex;

@@ -99,7 +99,7 @@ static inline void _fft_fill_with_conjugate_symmetry_slice(Tensor& output,
// 1. if this dim idx becomes 1, will need to add (size - 1) * stride
// 2. otherwise, will need to subtract stride
if (from_slice_indices[d] == 0) {
-// Substract. Carries over to previous dimension
+// Subtract. Carries over to previous dimension
from_slice_data -= output.stride(d);
} else if (from_slice_indices[d] == 1) {
// Dimension index becomes 1

@@ -107,7 +107,7 @@ static inline void _fft_fill_with_conjugate_symmetry_slice(Tensor& output,
from_slice_data += (output.size(d) - 1) * output.stride(d);
break;
} else {
-// Substract. Doesn't carry over to previous dimension
+// Subtract. Doesn't carry over to previous dimension
from_slice_data -= output.stride(d);
break;
}

@@ -43,7 +43,7 @@ using namespace mkldnn;

namespace {
// Helper function for getting an ideep tensor out of an aten Tensor.
-// Note in case the aten Tensor is a dense tensor, the retured ideep
+// Note in case the aten Tensor is a dense tensor, the returned ideep
// tensor is just a view of the storage of the aten dense tensor, so
// caller needs to make sure the aten dense tensor's lifetime is
// longer than the ideep tensor.

@@ -23,7 +23,7 @@ inline int start_index(int out_idx, int out_len, int in_len) {
* in_len: the dimension_size of input matrix
* Basically, in_len / out_len gives the number of
* elements in each average computation.
-* This functin computes the start index on input matrix.
+* This function computes the start index on input matrix.
*/
return (int)std::floor((float)(out_idx * in_len) / out_len);
}

@@ -23,7 +23,7 @@ Tensor quantized_clamp_impl(
qclamp_stub(qx.device().type(), qx, *min, *max, qy);
} else {
TORCH_CHECK(
-false, "Both min and max should be specifed for quantized clamp!");
+false, "Both min and max should be specified for quantized clamp!");
}
return qy;
}

@@ -15,7 +15,7 @@ inline void check_inputs(const Tensor& qa, const Tensor& qb) {
TORCH_CHECK(qa.qscheme() == kPerTensorAffine,
"Only per tensor quantization is supported in Mul.");
TORCH_CHECK(qa.qscheme() == qb.qscheme(),
-"Both inputs to Mul must have the same quantization shceme.");
+"Both inputs to Mul must have the same quantization scheme.");
TORCH_CHECK(qa.numel() == qb.numel(),
"Mul operands must be the same size!");
TORCH_CHECK(qa.scalar_type() == qb.scalar_type(),

@@ -63,7 +63,7 @@ void pytorch_qnnp_requantize_fp32__neon(

#ifdef __aarch64__
/*
-* Leverage "Floating-point Convert to Signed integer, rouding to nearest
+* Leverage "Floating-point Convert to Signed integer, rounding to nearest
* with ties to even" instruction. This is an ARMv8 instruction (always
* available in AArch64), which saturates result on overflow. We don't need
* to specifically consider saturated results, they will be clamped at the

@@ -46,7 +46,7 @@ void pytorch_qnnp_requantize_fp32__psimd(
* - Large int32_t values can't be exactly represented as FP32. We expect
* that conversion instruction would round it to nearest FP32 value with
* ties to even, but Clang documentation for __builtin_convertvector does
-* not guaratee that.
+* not guarantee that.
* - Product of two FP32 values is generally not exactly representation as
* an FP32 value, and will be rounded to nearest FP32 value with ties to
* even.

@@ -91,7 +91,7 @@ void pytorch_qnnp_requantize_precise__scalar_unsigned32(
*
* To avoid full 64-bit shift, we leverage the fact that shift >= 32, and do
* it in two steps:
-* - Shift by 32, which can be implemented by extacting the high 32-bit word
+* - Shift by 32, which can be implemented by extracting the high 32-bit word
* on 32-bit systems.
* - Shift by (shift - 32), which can be implemented as a 32-bit shift of
* high word of addition result.

@@ -11,7 +11,7 @@ struct QnnpackOperatorDeleter {
};

// PackedWeight struct for QNNPACK stores the original Weight and Bias as
-// QNNPACK currently does not support an unpack function. Possible optimiation -
+// QNNPACK currently does not support an unpack function. Possible optimization -
// For PyTorch Mobile, once the model is scripted and serialized we don't need
// to call unpack, so we can save some memory by checking for this case.
// Input scale is set to null in pre-pack step. QNNPACK needs bias quantized with

@@ -61,7 +61,7 @@ Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseT
TORCH_CHECK(cuda::check_device({sparse_, r_, t, dense}));

TORCH_CHECK(dense.dim() == 2, "addmm: 2D tensor expected, got ", dense.dim(), "D tensor");
-TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: expected first two dims to be sparse (indices has size 2 at first dim), but got ", sparse_.sparse_dim(), " spase dims");
+TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: expected first two dims to be sparse (indices has size 2 at first dim), but got ", sparse_.sparse_dim(), " sparse dims");
// no need to check dense_dim because dense_dim + sparse_dim = dim

// mxk * kxn = mxn

@@ -33,7 +33,7 @@
// we should merge macros.
#ifdef _WIN32
#if !defined(AT_CORE_STATIC_WINDOWS)
-// TODO: unfiy the controlling macros.
+// TODO: unify the controlling macros.
#if defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
#define TH_CPP_API __declspec(dllexport)
#else // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)

@@ -1,4 +1,4 @@
#pragma once
#include <TH/THStorageFunctions.h>

-// Compatability header. Use THStorageFunctions.h instead if you need this.
+// Compatibility header. Use THStorageFunctions.h instead if you need this.

@@ -42,7 +42,7 @@ inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) {

// [NOTE: nDimension vs nDimensionLegacyNoScalars vs nDimensionLegacyAll]
// nDimension corresponds to the "true" ATen dimension.
-// nDimensionLegacyNoScalars correpsonds to the ATen dimension, except scalars are viewed as 1-dimensional tensors.
+// nDimensionLegacyNoScalars corresponds to the ATen dimension, except scalars are viewed as 1-dimensional tensors.
// nDimensionLegacyAll corresponds to the ATen dimension, except scalars are viewed as 1-dimensional tensors
// and tensors with a dimension of size zero are collapsed to 0-dimensional tensors.
//

@@ -197,7 +197,7 @@ void THVector_(normal_fill)(scalar_t *data,
}

/*
-* This struct's constructor initalizes the dispatch tables. It simply checks
+* This struct's constructor initializes the dispatch tables. It simply checks
* what SIMD extensions are available, and then walks the dispatch table
* to choose the best function.
* NOTE: As implemented, it will initialize the dispatch pointer to the first supported function.

@@ -6,7 +6,7 @@
#include <cuda_runtime.h>
#endif

-// A utility class to implement integer division by muliplication, given a fixed
+// A utility class to implement integer division by multiplication, given a fixed
// divisor.
//
// WARNING: The fast divider algorithm is only implemented for unsigned int;

@@ -41,7 +41,7 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)(
weight = THCTensor_(newContiguous)(state, weight);
bias = bias ? THCTensor_(newContiguous)(state, bias) : bias;

-// Following the behvaior of other THCUNN functions, we shape the output
+// Following the behavior of other THCUNN functions, we shape the output
// Tensor ourselves

int batchSize = input->size(0);

@@ -14,7 +14,7 @@ class C2SimpleNet(object):
"""
This module constructs a net with 'op_name' operator. The net consist
a series of such operator.
-It intializes the workspace with input blob equal to the number of parameters
+It initializes the workspace with input blob equal to the number of parameters
needed for the op.
Provides forward method to run the net niter times.
"""

@@ -37,7 +37,7 @@ List all the supported tests:
$ python -m pt.add_test --list_tests
```

-Filter and run a test (use `add_M8_N16_K32` as an exapmle):
+Filter and run a test (use `add_M8_N16_K32` as an example):
```
$ python -m pt.add_test --test_name add_K32_M8_N1
--omp_num_threads 1 --mkl_num_threads 1

@@ -145,7 +145,7 @@ OpMeta = namedtuple("OpMeta", "op_type num_inputs input_dims input_types \

def generate_c2_test_from_ops(ops_metadata, bench_op, tags):
"""
-This function is used to generate Caffe2 tests based on the meatdata
+This function is used to generate Caffe2 tests based on the metadata
of operators. The metadata includes seven fields which are 1) op_type:
the name of the operator. 2) num_inputs: the number of input blobs.
3) input_dims: a dictionary which includes the shapes of the input blobs.

@@ -93,7 +93,7 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
tags = attr["tags"]
continue

-# if 'cuda' is sepcified in input shape but the testing machines doesn't
+# if 'cuda' is specified in input shape but the testing machines doesn't
# support, we will skip this input
if 'cuda' in attr.values():
if not torch.cuda.is_available():

@@ -52,7 +52,7 @@ class LSTMBenchmark(op_bench.TorchBenchmarkBase):

self.x = torch.randn(sequence_len, # sequence length
batch_size, # batch size
-I) # Number of featues in X
+I) # Number of features in X
self.h = torch.randn(NL * (D + 1), # layer_num * dir_num
batch_size, # batch size
H) # hidden size

@@ -62,7 +62,7 @@ C10_DEFINE_string(
"Report the conversion stage time to screen. "
"The format of the string is <type>|<identifier>. "
"The valid type is 'json'. "
-"The valid identifier is nothing or an identifer that prefix every line");
+"The valid identifier is nothing or an identifier that prefix every line");
C10_DEFINE_string(
scale,
"-1,-1",

@@ -63,7 +63,7 @@ C10_DEFINE_string(
"Report the conversion stage time to screen. "
"The format of the string is <type>|<identifier>. "
"The valid type is 'json'. "
-"The valid identifier is nothing or an identifer that prefix every line");
+"The valid identifier is nothing or an identifier that prefix every line");
C10_DEFINE_string(
scale,
"-1,-1",

@@ -203,7 +203,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
data_ptr_ = std::move(data_ptr);
// NOTE: data_type might change and so it's also possible that capacity
// might not be divisible by itemsize. There is no way for us to keep track
-// of the exact capacity if we're not explicity storing is. More conrectely
+// of the exact capacity if we're not explicitly storing is. More concretely
// capacity() might not return the value that was set here, if itemsize does
// not evenly divide it.
numel_ = capacity / data_type_.itemsize();

@@ -5,7 +5,7 @@
#include <c10/util/Registry.h>

// Note: we use a different namespace to test if the macros defined in
-// Registry.h actuall works with a different namespace from c10.
+// Registry.h actually works with a different namespace from c10.
namespace c10_test {

class Foo {

@@ -10,7 +10,7 @@

namespace {
/**
-* This is a helper function which attemtps to get a base value depending on the
+* This is a helper function which attempts to get a base value depending on the
* # of nodes. Larger the base the better performance (up to 4) is what we have
* observed in gloo benchmarks. At the moment bcube works only if # nodes = base
* ^ x. Where x is some constant. So, if # node don't match our expectation

@@ -63,7 +63,7 @@ class AllreduceOp final : public Operator<Context> {
// Store which inputs/outputs this instance initialized with
update(init_);

-// Verify inputs == ouputs
+// Verify inputs == outputs
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
for (auto i = 0; i < init_.inputs.size(); i++) {
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);

@@ -37,7 +37,7 @@ std::unique_ptr<::gloo::Algorithm> initializeAlgorithm(
}

/**
-* This is a helper function which attemtps to get a base value depending on the
+* This is a helper function which attempts to get a base value depending on the
* # of nodes. Larger the base the better performance (up to 4) is what we have
* observed in gloo benchmarks. At the moment bcube works only if # nodes = base
* ^ x. Where x is some constant. So, if # node don't match our expectation

@@ -58,7 +58,7 @@ class BroadcastOp final : public Operator<Context> {
// Store which inputs/outputs this instance initialized with
update(init_);

-// Verify inputs == ouputs
+// Verify inputs == outputs
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
for (auto i = 0; i < init_.inputs.size(); i++) {
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);

@@ -73,7 +73,7 @@ class ReduceScatterOp final : public Operator<Context> {
// Store which inputs/outputs this instance initialized with
update(init_);

-// Verify inputs == ouputs
+// Verify inputs == outputs
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
for (auto i = 0; i < init_.inputs.size(); i++) {
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);

@@ -68,7 +68,7 @@
* The following example shows a general use case for the C++
* bindings, including support for the optional exception feature and
* also the supplied vector and string classes, see following sections for
-* decriptions of these features.
+* descriptions of these features.
*
* \code
* #define __CL_ENABLE_EXCEPTIONS

@@ -108,7 +108,7 @@ def broadcast_parameters(opts, model, num_xpus, broadcast_computed_param=False):
else caffe2_pb2.CPU
for params in all_params:
assert len(params) % num_xpus == 0, \
-"Current model dosen't match device number when loading checkpoint"
+"Current model doesn't match device number when loading checkpoint"
params_per_xpu = int(len(params) / num_xpus)
for idx in range(params_per_xpu):
blobs = [param for param in params[idx::params_per_xpu]]

@@ -507,7 +507,7 @@ void TensorRTTransformer::Transform(
return true;
};

-// function to convert runnbale subgraph into a trt op. Note that to keep the
+// function to convert runnable subgraph into a trt op. Note that to keep the
// interface clean, we do the double conversion from C2 op to Onnx ops here
// but it should be OK as the cost is really small. We also need to keep the
// same exporter throughout the process to avoid duplicated dummy name

@@ -38,7 +38,7 @@ CAFFE2_API void SerializeBlob(
/**
* @brief Convenience function to serialize a blob to a string.
*
-* This is a conveinence function to serialize small Blobs that produce
+* This is a convenience function to serialize small Blobs that produce
* manageable serialized strings. To serialize big blobs such as
* large sparse tensors, use the fully-functional interface in
* blob_serializer_base.h.

@@ -92,7 +92,7 @@ inline Dst dynamic_cast_if_rtti(Src ptr) {
}

// SkipIndices are used in operator_fallback_gpu.h and operator_fallback_mkl.h
-// as utilty functions that marks input / output indices to skip when we use a
+// as utility functions that marks input / output indices to skip when we use a
// CPU operator as the fallback of GPU/MKL operator option.
template <int... values>
class SkipIndices {

@@ -174,7 +174,7 @@ std::unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;
static std::unordered_map<void*, uint8_t> g_cuda_device_affiliation;

// Data structures for optional memory tracking. Access to these structures
-// is garded by the CUDAContext::mutex.
+// is guarded by the CUDAContext::mutex.
static std::unordered_map<void*, long> g_size_map;
static std::vector<long> g_total_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0);
static std::vector<long> g_max_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0);

@@ -471,7 +471,7 @@ class ComputeBlobRecyclingForDag {
}
}

-// Rturns true if the op that generates that blob acquires all tokens.
+// Returns true if the op that generates that blob acquires all tokens.
inline bool can_use_blob(
const string& blob_name,
std::unordered_set<int>* tokens,

@@ -76,7 +76,7 @@ class CAFFE2_API NetBase : public Observable<NetBase> {
* seconds spent during the benchmark. The 0-th item is the time spent per
* each network run, and if a net instantiation supports run_individual,
* the remainder of the vector returns the number of milliseconds spent per
-* opeartor.
+* operator.
*/
virtual vector<float> TEST_Benchmark(
const int /*warmup_runs*/,

@@ -461,7 +461,7 @@ std::shared_ptr<Tracer> create(
const std::string& net_name) {
// Enable the tracer if the net has the "enable_tracing" argument set OR
// if the command line option includes the net name option in the list of
-// tracable nets.
+// traceable nets.
bool trace_net = hasEnableTracingFlag(net) || isTraceableNetName(net_name);
return trace_net
? std::make_shared<Tracer>(net, net_name, getTracingConfigFromNet(net))

@@ -24,7 +24,7 @@ SimpleRefCountNet::SimpleRefCountNet(

std::map<string, int> last_consumed_at;
std::set<string> created_by_me;
-// For each opeartor
+// For each operator
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto& op_def = net_def->op(idx);
for (const string& in_name : op_def.input()) {

@@ -254,7 +254,7 @@ struct CAFFE2_API NNModule {
NNModule(NNModule&&) = default;
NNModule() {}

-/* Repalce subgraph sg by node, using the order of
+/* Replace subgraph sg by node, using the order of
* node_inputs and node_outputs to determine how to link
* them to the node. node_inputs *must* enumerate all the
* inputs to the subgraph (NeuralNetData that do not

@@ -645,7 +645,7 @@ class CAFFE2_API OperatorBase : public Observable<OperatorBase> {
std::string type_;
vector<const Blob*> inputs_;
vector<Blob*> outputs_;
-// Preferrably use c10::optional, but nvcc doesn't work
+// Preferably use c10::optional, but nvcc doesn't work
#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
std::unique_ptr<const c10::FunctionSchema> fn_schema_;
vector<c10::IValue> newstyle_inputs_;

@@ -131,7 +131,7 @@ class CAFFE2_API OpSchema {
OpSchema& AllowInplace(std::function<bool(int, int)> inplace);
OpSchema& AllowInplace(set<std::pair<int, int>> inplace);
OpSchema& AllowOneToOneInplace();
-// Sets the rule to enforce in-place opeartion.
+// Sets the rule to enforce in-place operation.
OpSchema& EnforceInplace(std::function<bool(int, int)> inplace);
OpSchema& EnforceInplace(set<std::pair<int, int>> inplace);
OpSchema& EnforceOneToOneInplace();

@@ -112,7 +112,7 @@ using ScopeGuardImplDecay = ScopeGuardImpl<typename std::decay<F>::type>;
/**
* ScopeGuard is a general implementation of the "Initialization is
* Resource Acquisition" idiom. Basically, it guarantees that a function
-* is executed upon leaving the currrent scope unless otherwise told.
+* is executed upon leaving the current scope unless otherwise told.
*
* The MakeGuard() function is used to create a new ScopeGuard object.
* It can be instantiated with a lambda function, a std::function<void()>,

@@ -32,7 +32,7 @@
#define CAFFE_SDT_ARGSIZE(x) (CAFFE_SDT_ISARRAY(x) ? sizeof(void*) : sizeof(x))

// Format of each probe arguments as operand.
-// Size of the arugment tagged with CAFFE_SDT_Sn, with "n" constraint.
+// Size of the argument tagged with CAFFE_SDT_Sn, with "n" constraint.
// Value of the argument tagged with CAFFE_SDT_An, with configured constraint.
#define CAFFE_SDT_ARG(n, x) \
[CAFFE_SDT_S##n] "n" ((size_t)CAFFE_SDT_ARGSIZE(x)), \

@@ -278,7 +278,7 @@ class CAFFE2_API Workspace {
ShouldContinue should_continue = StopOnSignal{});

/*
-* Returns a CPU threadpool instace for parallel execution of
+* Returns a CPU threadpool instance for parallel execution of
* work. The threadpool is created lazily; if no operators use it,
* then no threadpool will be created.
*/

@@ -31,7 +31,7 @@ import caffe2.python.models.resnet as resnet

'''
Simple benchmark that creates a data-parallel resnet-50 model
-and measurs the time.
+and measures the time.
'''

@@ -1023,7 +1023,7 @@ void TransformImage(
ColorNormalization<Context>(image_data, crop, channels, mean, std);
}

-// Only crop / transose the image
+// Only crop / transpose the image
// leave in uint8_t dataType
template <class Context>
void CropTransposeImage(

@@ -68,7 +68,7 @@
* The following example shows a general use case for the C++
* bindings, including support for the optional exception feature and
* also the supplied vector and string classes, see following sections for
-* decriptions of these features.
+* descriptions of these features.
*
* \code
* #define __CL_ENABLE_EXCEPTIONS

@@ -56,7 +56,7 @@ bool NNApi::run(const TensorVector& inputs, TensorVector* outputs) {
try {
init(inputs, outputs);
} catch (const std::exception& e) {
-LOG(ERROR) << "Error duing model initialization: " << e.what();
+LOG(ERROR) << "Error during model initialization: " << e.what();
return false;
}

@@ -1657,7 +1657,7 @@ Caffe2BackendRep* Caffe2Backend::Prepare(
}
}

-// TODO: avoid extra copy by directly feed initialiers to backend blobs
+// TODO: avoid extra copy by directly feed initializers to backend blobs
OnnxToCaffe2(
&rep->init_net(),
&rep->pred_net(),

@@ -185,7 +185,7 @@ void ssaRewriteForIfOp(
OperatorDef* op,
std::unordered_map<std::string, int>* blob_versions,
std::set<std::string>* is_initialized_tensor) {
-// Get all the "external" inputs and outpus of the subnet
+// Get all the "external" inputs and outputs of the subnet
// Since then_net and else_net has same external input/output, we only collect
// external input/output from one of its subnet And perform the rewrite to
// both then_net and else_net

@@ -111,7 +111,7 @@ class CAFFE2_API OnnxExporter {
const caffe2::OperatorDef& def,
const std::unordered_map<std::string, caffe2::TensorShape>& shapes);

-// \brief Check black listed arguemnts where we won't pass down when
+// \brief Check black listed arguments where we won't pass down when
// converting to ONNX node
bool IsBlackListed(const caffe2::Argument& arg);

@@ -138,7 +138,7 @@ ONNX_PYTORCH_OPERATOR_SET_SCHEMA(
OpSchema()
.SetDoc("Mirror Caffe2 BatchMatMul operator")
.Input(0, "X", "tensor of shape (dim0, dim1 ... M, K)", "T")
-.Input(1, "Y", "tensor of shpae (dim0, dim2 ... K, N)", "T")
+.Input(1, "Y", "tensor of shape (dim0, dim2 ... K, N)", "T")
.Output(0, "Z", "tensor of shape (dim0, dim1 ... M, N)", "T")
.TypeConstraint(
"T",

@@ -31,7 +31,7 @@ class CuDNNActivationOpBase : public Operator<CUDAContext> {
const cudnnDataType_t data_type,
const int data_size) {
if (data_size != input_size_) {
-// Since the best performance is obtained when the tesor is HW-packed, we
+// Since the best performance is obtained when the tensor is HW-packed, we
// put X.size() to W.
input_size_ = data_size;
CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(

@@ -69,7 +69,7 @@ The lengths is a 1D tensor that splits the following 'boundaries' argument.
The boundaries is a 1D tensor containing the border list for each feature.

With in each batch, `indices` should not have duplicate number,
-and the number of elements in `indices` should be less than or euqal to `D`.
+and the number of elements in `indices` should be less than or equal to `D`.
Each element in `lengths` vector (lengths[`i`]) represents
the number of boundaries in the sub border list.
The sum of all elements in `lengths` must be equal to the size of `boundaries`.

@@ -126,7 +126,7 @@ OPERATOR_SCHEMA(BatchMatMul)
Batch Matrix multiplication Yi = Ai * Bi, where A has shape (dim0, dim1, ... M, K),
B has shape (dim0, dim1, ... K, N), Y has shape (dim0, dim1, ... M, N) and i ranges
from 0 to (dim0 * dim1 ...) - 1. rank(A) == rank(B) >= 2. In case of A and B being
-two diemnsional, it behaves like normal matrix multiplication.
+two dimensional, it behaves like normal matrix multiplication.
)DOC")
.Input(0, "A", "tensor of shape (dim0, dim1 ... M, K)")
.Input(1, "B", "tensor of shape (dim0, dim1 ... K, N)")

@@ -46,7 +46,7 @@ OPERATOR_SCHEMA(BisectPercentile)
R_2 = [0.3, 1.2];
We will build R = [0.1, 0.4, 0.5, 0.3, 1.2]; besides, we have
lengths = [3, 2]
-to indicate the boundries of the percentile information.
+to indicate the boundaries of the percentile information.

)DOC")
.Arg(

@@ -51,7 +51,7 @@ class BoxWithNMSLimitOp final : public Operator<Context> {
"Unexpected soft_nms_method");
soft_nms_method_ = (soft_nms_method_str_ == "linear") ? 1 : 2;

-// When input `boxes` doesn't inlcude background class, the score will skip
+// When input `boxes` doesn't include background class, the score will skip
// background class and start with foreground classes directly, and put the
// background class in the end, i.e. score[:, 0:NUM_CLASSES-1] represents
// foreground classes and score[:,NUM_CLASSES] represents background class.

@@ -97,7 +97,7 @@ class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
}

protected:
-// A helper function to set up the tensor Nd desriptor, depending on the order
+// A helper function to set up the tensor Nd descriptor, depending on the order
// the group and the type given.
template <typename T>
void SetTensorNdDescriptorWithGroup(

@@ -209,7 +209,7 @@ OPERATOR_SCHEMA(SwapBestPath)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
-Given a sequence of idices and a matrix, enforce that these indices have the
+Given a sequence of indices and a matrix, enforce that these indices have the
best columnwise scores
score
)DOC")

@@ -170,7 +170,7 @@ In Advances in Neural Information Processing Systems, pp. 1508-1518. 2017.
)DOC")
.Input(0, "input", "Float32 input data")
.Output(0, "output", "Fused bitwidth, tail, min, max and quantized data")
-.Arg("bitwidth", "How many bits to quantiz per data (defaults to 8).")
+.Arg("bitwidth", "How many bits to quantize per data (defaults to 8).")
.Arg("random", "random or not (True). False is set up for unittest.");
NO_GRADIENT(FloatToFusedRandRowwiseQuantized);

@@ -184,7 +184,7 @@ class GatherOp : public Operator<Context> {
// an error.
// Right now, we apply index wrapping by default only to axis == 0,
// since we have ONNX conversion code that uses it. For other ops it
-// needs to be speified explicitly with argument or you don't get it.
+// needs to be specified explicitly with argument or you don't get it.
if (OperatorBase::HasArgument("wrap_indices")) {
wrap_indices_ = Operator<Context>::template GetSingleArgument<bool>(
"wrap_indices", (false));

@@ -69,7 +69,7 @@ CAFFE2_API ERArrXXf ComputeSortedAnchors(
} // namespace utils

// C++ implementation of GenerateProposalsOp
-// Generate bounding box proposals for Faster RCNN. The propoasls are generated
+// Generate bounding box proposals for Faster RCNN. The proposals are generated
// for a list of images based on image score 'score', bounding box
// regression result 'deltas' as well as predefined bounding box shapes
// 'anchors'. Greedy non-maximum suppression is applied to generate the

@@ -632,7 +632,7 @@ search tree.
.Arg("topN", "Number of nodes in outputs")
.Input(0, "X", "Input data from previous layer")
.Input(1, "W", "The matrix trained from Softmax Ops")
-.Input(2, "b", "The bias traiend from Softmax Ops")
+.Input(2, "b", "The bias trained from Softmax Ops")
.Output(
0,
"Y_names",

@@ -140,7 +140,7 @@ bool HeatmapMaxKeypointOp<float, CPUContext>::RunOnDevice() {
}
assert(std::abs(delta(0)) <= MAX_DELTA);
assert(std::abs(delta(1)) <= MAX_DELTA);
-// find maximum of detla scores
+// find maximum of delta scores
keypoints(k, 0 * keypoint_count + j) =
x0 + (0.5 + maxX + delta(0)) * xLen / heatmap_size;
keypoints(k, 1 * keypoint_count + j) =

@@ -74,7 +74,7 @@ class SparseLengths8BitsRowwiseOp : public Operator<Context> {
in_block_size,
outputSize,
indices_size,
-N, // embeding table length
+N, // embedding table length
input_data,
indices,
lengths,

@@ -27,7 +27,7 @@ void ProcessBlob(
auto& blob_states = *blob_states_ptr;
if (blob_states.count(key) == 0) {
// We reset the blob so that any existing content is destroyed. This
-// is to guaranee correct device placement: if we are deserializing
+// is to guarantee correct device placement: if we are deserializing
// into a TensorCUDA, without explicit Reset we might be loading data
// into an existing TensorCUDA that has pre-allocated memory on a
// different GPU.

@@ -46,7 +46,7 @@ inline void LogCuDNNPerfStats(

// Easier indexing into force_algo_ vector,
// shared by CudnnConvTransposeOpBase and CudnnConvOpBase to force
-// usage of a particular algortihm instead of searching
+// usage of a particular algorithm instead of searching
enum { ALGO_FWD = 0, ALGO_WGRAD = 1, ALGO_DGRAD = 2 };

} // namespace caffe2
Some files were not shown because too many files have changed in this diff.