Convert all tabs to spaces, add CI. (#18959)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18959
ghimport-source-id: a934163fa34cb2019732d5f49dc7290c376bf156

Differential Revision: D14831246

Pulled By: ezyang

fbshipit-source-id: beb92dc4ee8c82f4c8259c081dd72e477fe7a9d0

Author: Edward Yang
Date: 2019-04-09 08:02:30 -07:00
Committed by: Facebook Github Bot
Parent: 544783fa1d
Commit: 48a35135fb
80 changed files with 1509 additions and 1505 deletions

View File

@ -16,6 +16,10 @@ matrix:
python: "3.6"
dist: xenial
script: cd .circleci && ./ensure-consistency.py
- name: "Ensure no tabs"
python: "2.7"
script:
- (! git grep -I -l $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above files have tabs; please convert them to spaces"; false))
- name: "Python 2.7 Lint"
python: "2.7"
install: pip install flake8
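
For reference, the check added to the Travis job can be reproduced locally with the same command; the exclude pathspecs below simply mirror the CI entry above, and the tab width of four passed to expand is an assumption chosen for illustration, not something the patch prescribes:

    # list tracked text files that still contain a tab, using the CI job's excludes
    git grep -I -l $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' \
        ':(exclude)**/contrib/**' ':(exclude)third_party' \
        ':(exclude).gitattributes' ':(exclude).gitmodules'

    # one possible way to convert a flagged file in place (GNU coreutils expand, tab stop 4)
    expand -t 4 path/to/file > path/to/file.tmp && mv path/to/file.tmp path/to/file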

View File

@ -252,21 +252,21 @@ IF(USE_CUDA AND NOT USE_ROCM)
EXECUTE_PROCESS(COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc)
if(${CUDA_VERSION_MAJOR} EQUAL "8")
SET(CUFFT_FAKELINK_OPTIONS
--generate-code arch=compute_35,code=sm_35
--generate-code arch=compute_50,code=sm_50
--generate-code arch=compute_60,code=sm_60)
--generate-code arch=compute_35,code=sm_35
--generate-code arch=compute_50,code=sm_50
--generate-code arch=compute_60,code=sm_60)
elseif(${CUDA_VERSION_MAJOR} EQUAL "9")
SET(CUFFT_FAKELINK_OPTIONS
--generate-code arch=compute_35,code=sm_35
--generate-code arch=compute_50,code=sm_50
--generate-code arch=compute_60,code=sm_60
--generate-code arch=compute_70,code=sm_70)
--generate-code arch=compute_35,code=sm_35
--generate-code arch=compute_50,code=sm_50
--generate-code arch=compute_60,code=sm_60
--generate-code arch=compute_70,code=sm_70)
elseif(${CUDA_VERSION_MAJOR} EQUAL "10")
SET(CUFFT_FAKELINK_OPTIONS
--generate-code arch=compute_35,code=sm_35
--generate-code arch=compute_50,code=sm_50
--generate-code arch=compute_60,code=sm_60
--generate-code arch=compute_70,code=sm_70)
--generate-code arch=compute_35,code=sm_35
--generate-code arch=compute_50,code=sm_50
--generate-code arch=compute_60,code=sm_60
--generate-code arch=compute_70,code=sm_70)
else()
MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}")
endif()

View File

@ -19,7 +19,7 @@
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif (defined(__GNUC__) || defined(__xlC__)) && \
(defined(__VEC__) || defined(__ALTIVEC__))
(defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
#elif defined(__GNUC__) && defined(__SPE__)

View File

@ -46,12 +46,12 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
auto sr = right.size(i)>1;
if (sum_dims[i]) { // first dimensions that will be summed over after multiplication
if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size
AT_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match");
sum_size *= left.size(i);
AT_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match");
sum_size *= left.size(i);
} else if (sl) { // if it is only in one of left and right, we can sum right away
left = left.sum(i, true);
left = left.sum(i, true);
} else if (sr) {
right = right.sum(i, true);
right = right.sum(i, true);
}
} else if (sl && sr) { // now deal with dimensions that will be in the output
// dimensions nontrivially in both left and right must be of the same size
@ -117,7 +117,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
if (! keepdim) {
for (int i = dim-1; i>=0; i--)
if (sum_dims[i])
result.squeeze_(i);
result.squeeze_(i);
}
return result;
}
@ -183,7 +183,7 @@ Tensor einsum(std::string eqn, TensorList tensors) {
}
else { // we have seen an ellipsis before, so we check compatibility
AT_CHECK(candidate_num_ell_idxes == num_ell_idxes,
"ellipsis must represent ", num_ell_idxes, " dimensions in all terms");
"ellipsis must represent ", num_ell_idxes, " dimensions in all terms");
}
for (int64_t i = 0; i < num_ell_idxes; ++i) { // map ellipsis dimensions in operand to indices
current_op_idxes.push_back(first_ell_idx + i);
@ -360,8 +360,8 @@ Tensor einsum(std::string eqn, TensorList tensors) {
// the computation is unrolled in the unroll_dim dimension
// its main purpose is to unify the computations in bilinear and bilinear_backward
Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_,
IntArrayRef expand1_, IntArrayRef expand2_, IntArrayRef expand3_,
IntArrayRef sumdim_, int64_t unroll_dim) {
IntArrayRef expand1_, IntArrayRef expand2_, IntArrayRef expand3_,
IntArrayRef sumdim_, int64_t unroll_dim) {
int64_t total_dim = i1_.dim()+expand1_.size();
AT_CHECK((unroll_dim >= 0) && (unroll_dim < total_dim), "unroll_dim must be in [0,", total_dim-1, "]");
auto expand1 = at::dim_list_to_bitset(expand1_, total_dim);
@ -390,11 +390,11 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_,
if (expand3[i]) {
i3 = i3.unsqueeze(i);
if (sumdim[i] && (i != unroll_dim))
sum_dims_12.push_back(i);
sum_dims_12.push_back(i);
} else {
s = i3.size(i);
if (sumdim[i] && (i != unroll_dim))
sum_dims_23.push_back(i);
sum_dims_23.push_back(i);
}
output_size.push_back(sumdim[i] ? 1 : s);
if (i == unroll_dim)
@ -408,8 +408,8 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_,
if (! sumdim[unroll_dim]) {
for (int64_t k = 0; k < unroll_size; k++) {
Tensor buf = at::native::sumproduct_pair(i1.narrow(unroll_dim, k * slicemul1, 1),
i2.narrow(unroll_dim, k * slicemul2, 1),
sum_dims_12, true);
i2.narrow(unroll_dim, k * slicemul2, 1),
sum_dims_12, true);
buf = at::native::sumproduct_pair(buf, i3.narrow(unroll_dim, k * slicemul3, 1), sum_dims_23, true);
output.narrow(unroll_dim, k, 1).add_(buf);
}
@ -417,7 +417,7 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_,
else {
for (int64_t k = 0; k < unroll_size; k++) {
Tensor buf = at::native::sumproduct_pair(i1.narrow(unroll_dim, k*slicemul1, 1),
i2.narrow(unroll_dim, k*slicemul2, 1), sum_dims_12, true);
i2.narrow(unroll_dim, k*slicemul2, 1), sum_dims_12, true);
buf = at::native::sumproduct_pair(buf, i3.narrow(unroll_dim, k*slicemul3, 1), sum_dims_23, true);
output.add_(buf);
}
@ -473,7 +473,7 @@ Tensor tensordot(const Tensor& input1, const Tensor& input2, IntArrayRef dims1,
t2 = t2.sum(dims2[i], true);
} else {
AT_CHECK(s1 == s2, "contracted dimensions need to match, but first has size ", s1, " in dim ", dims1[i],
" and second has size ", s2, " in dim ", dims2[i]);
" and second has size ", s2, " in dim ", dims2[i]);
csize *= s1;
}
}

View File

@ -61,7 +61,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
tg_batch_offsets[i] = pos;
pos += target_lengths[i];
if (max_target_length < target_lengths[i])
max_target_length = target_lengths[i];
max_target_length = target_lengths[i];
}
tg_target_stride = targets.stride(0);
checkSize(c, targets_arg, 0, pos);
@ -83,8 +83,8 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
int64_t max_input_length = log_probs.size(0);
for (int64_t b = 0; b < batch_size; b++) {
AT_CHECK(input_lengths[b] <= max_input_length,
"Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", input_lengths[b], " for ", log_probs_arg,
" (while checking arguments for ", c, ")");
"Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", input_lengths[b], " for ", log_probs_arg,
" (while checking arguments for ", c, ")");
}
Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options());
@ -115,11 +115,11 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
// now the loop over the inputs
for (int64_t t=1; t<input_length; t++) {
for (int64_t s=0; s<2*target_length+1; s++) {
auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK);
// this loop over s could be parallel/vectorized, too, but the required items are one index apart
// alternatively, one might consider moving s to the outer loop to cache current_target_prime more (but then it needs to be descending)
// for the cuda implementation, that gave a speed boost.
// This is eq (6) and (7), la1,2,3 are the three summands. We keep track of the maximum for the logsumexp calculation.
auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK);
// this loop over s could be parallel/vectorized, too, but the required items are one index apart
// alternatively, one might consider moving s to the outer loop to cache current_target_prime more (but then it needs to be descending)
// for the cuda implementation, that gave a speed boost.
// This is eq (6) and (7), la1,2,3 are the three summands. We keep track of the maximum for the logsumexp calculation.
scalar_t la1 = log_alpha_a[t-1][s];
scalar_t lamax = la1;
@ -141,7 +141,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
}
if (lamax == neginf) // cannot do neginf-neginf
lamax = 0;
// this is the assignment of eq (6)
// this is the assignment of eq (6)
log_alpha_a[t][s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax + log_probs_a[t][current_target_prime];
}
}
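
As a side note (a standard identity, not text from the patch), the lamax bookkeeping in the loop above is the log-sum-exp stabilization behind the assignment to log_alpha_a[t][s]:

    \log\left(e^{la_1} + e^{la_2} + e^{la_3}\right) = \mathrm{lamax} + \log\left(e^{la_1 - \mathrm{lamax}} + e^{la_2 - \mathrm{lamax}} + e^{la_3 - \mathrm{lamax}}\right), \qquad \mathrm{lamax} = \max(la_1, la_2, la_3)

The identity holds for any finite lamax; taking the maximum keeps every exponent non-positive so nothing overflows, and the lamax == neginf special case above avoids evaluating neginf - neginf.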
@ -182,7 +182,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
tg_batch_offsets[i] = pos;
pos += target_lengths[i];
if (max_target_length < target_lengths[i])
max_target_length = target_lengths[i];
max_target_length = target_lengths[i];
}
tg_target_stride = targets.stride(0);
}
@ -268,9 +268,9 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
log_beta_a[t][s] = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax + log_probs_a[t][current_target_prime];
// one might check whether one can vectorize this better when done after the t-loop...
// now that we have beta, we fill in the sum of alpha*beta in eq (16)
// in contrast to the cuda implementation, we only parallelize over the batch, so we don't have a concurrency
// issue (several s can map to the same target character)
// now that we have beta, we fill in the sum of alpha*beta in eq (16)
// in contrast to the cuda implementation, we only parallelize over the batch, so we don't have a concurrency
// issue (several s can map to the same target character)
// collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s]
scalar_t log_alpha_beta = log_alpha_a[t][s] + log_beta_a[t][s];
scalar_t &lcab = grad_a[t][current_target_prime];
@ -309,9 +309,9 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu(const Tensor& log_probs, const Tensor& t
(void)zero_infinity; // only used for backwards
return AT_DISPATCH_FLOATING_TYPES(log_probs.scalar_type(), "ctc_loss_cpu", [&] {
if (targets.scalar_type() == kLong) {
return ctc_loss_cpu_template<scalar_t, kLong>(log_probs, targets, input_lengths, target_lengths, BLANK);
return ctc_loss_cpu_template<scalar_t, kLong>(log_probs, targets, input_lengths, target_lengths, BLANK);
} else {
return ctc_loss_cpu_template<scalar_t, kInt>(log_probs, targets, input_lengths, target_lengths, BLANK);
return ctc_loss_cpu_template<scalar_t, kInt>(log_probs, targets, input_lengths, target_lengths, BLANK);
}
});
}
@ -320,9 +320,9 @@ Tensor ctc_loss_backward_cpu(const Tensor& grad, const Tensor& log_probs, const
const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK, bool zero_infinity) {
return AT_DISPATCH_FLOATING_TYPES(log_probs.scalar_type(), "ctc_loss_backward_cpu", [&] {
if (targets.scalar_type() == kLong) {
return ctc_loss_backward_cpu_template<scalar_t,kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
return ctc_loss_backward_cpu_template<scalar_t,kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
} else {
return ctc_loss_backward_cpu_template<scalar_t,kInt>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
return ctc_loss_backward_cpu_template<scalar_t,kInt>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
}
});
}

View File

@ -76,11 +76,11 @@ pthreadpool_t nnpack_threadpool() {
enum nnp_status nnpack_status = nnp_initialize();
if (nnpack_status != nnp_status_success) {
if (nnpack_status == nnp_status_out_of_memory) {
throw std::runtime_error("could not initialize NNPack (out of memory)");
throw std::runtime_error("could not initialize NNPack (out of memory)");
} else if (nnpack_status == nnp_status_unsupported_hardware) {
throw std::runtime_error("could not initialize NNPack (unsupported hardware)");
throw std::runtime_error("could not initialize NNPack (unsupported hardware)");
} else {
throw std::runtime_error("could not initialize NNPack (unknown error)");
throw std::runtime_error("could not initialize NNPack (unknown error)");
}
}
unsigned int threads;

View File

@ -614,7 +614,7 @@ std::tuple<Tensor, Tensor> NAME( \
num_layers, dropout_p, train, bidirectional, batch_first); \
return std::make_tuple(output, hy); \
} \
check_device(_input, _params, hx); \
check_device(_input, _params, hx); \
auto input = batch_first ? _input.transpose(0, 1) : _input; \
auto params = gather_params(_params, has_biases); \
auto results = _rnn_impl_with_concat<CELL, FullLayer, FullBidirectionalLayer>( \

View File

@ -126,10 +126,10 @@ Tensor& arange_cpu_out(Tensor& result, Scalar start, Scalar end, Scalar step) {
double size_d;
if (std::is_same<scalar_t, int64_t>::value) {
size_d = std::ceil(static_cast<double>(end.to<accscalar_t>() - start.to<accscalar_t>())
/ step.to<accscalar_t>());
/ step.to<accscalar_t>());
} else {
size_d = std::ceil(static_cast<double>(end.to<double>() - start.to<double>())
/ step.to<double>());
/ step.to<double>());
}
AT_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
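
Both branches compute the same element count; as a worked example (illustrative values, not taken from the patch):

    \text{size} = \left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil, \qquad \left\lceil \frac{10 - 0}{3} \right\rceil = 4 \;\Rightarrow\; \{0, 3, 6, 9\}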

View File

@ -100,7 +100,7 @@ typedef union imm_xmm_union {
#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \
imm_xmm_union u __attribute__((aligned(32))); \
u.imm = imm_; \
u.imm = imm_; \
xmm0_ = u.xmm[0]; \
xmm1_ = u.xmm[1]; \
}
@ -228,8 +228,8 @@ inline v8sf log256_ps(v8sf x) {
return x;
}
_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);
_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);
_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);

View File

@ -266,7 +266,7 @@ public:
CUFFT_CHECK(hipfftMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
/* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
/* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1,
exec_type, batch, &ws_size_t));
exec_type, batch, &ws_size_t));
#else
CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
/* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,

View File

@ -87,10 +87,10 @@ __global__ void embedding_backward_feature_kernel
match_found_this_thread = 0;
#ifdef __HIP_PLATFORM_HCC__
unsigned long long int matchmask = WARP_BALLOT(match_found_this_thread);
int first_remaining_peer = __ffsll(matchmask) - 1;
int first_remaining_peer = __ffsll(matchmask) - 1;
#else
unsigned int matchmask = WARP_BALLOT(match_found_this_thread);
int first_remaining_peer = __ffs(matchmask) - 1;
int first_remaining_peer = __ffs(matchmask) - 1;
#endif
if(threadIdx.y == first_remaining_peer) // Nominate lowest-indexed warp as the leader
@ -103,7 +103,7 @@ __global__ void embedding_backward_feature_kernel
#else
first_remaining_peer = __ffs(matchmask) - 1;
#endif
my_s[threadIdx.x] += smem[threadIdx.x + WARP_SIZE*first_remaining_peer];
my_s[threadIdx.x] += smem[threadIdx.x + WARP_SIZE*first_remaining_peer];
matchmask ^= (1 << first_remaining_peer);
}
if(f < s)

View File

@ -110,8 +110,8 @@ ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data,
for (int64_t t=1; t < max_input_length; t++) {
__syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch
if ((t < input_length) && (target_length > 0) && (s < 2*target_length+1)) {
// only for valid t, s. This is equation (6) and (7), la1, la2, la3 are the three summands,
// lamax is the maximum for the logsumexp trick.
// only for valid t, s. This is equation (6) and (7), la1, la2, la3 are the three summands,
// lamax is the maximum for the logsumexp trick.
scalar_t la1 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * s];
scalar_t lamax = la1;
scalar_t la2, la3;
@ -135,7 +135,7 @@ ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data,
log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax
+ log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_char];
} else {
// otherwise we just set to neginf
// otherwise we just set to neginf
if (s < 2*max_target_length+1)
log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = neginf;
}
@ -218,8 +218,8 @@ std::tuple<Tensor, Tensor> ctc_loss_gpu_template(const Tensor& log_probs, const
int64_t max_input_length = log_probs.size(0);
for (int64_t b = 0; b < batch_size; b++) {
AT_CHECK(input_lengths[b] <= max_input_length,
"Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg,
" (while checking arguments for ", c, ")");
"Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg,
" (while checking arguments for ", c, ")");
}
auto target_lengths_t = at::tensor(target_lengths, targets.options().dtype(kLong));
@ -242,7 +242,7 @@ std::tuple<Tensor, Tensor> ctc_loss_gpu_template(const Tensor& log_probs, const
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
ctc_loss_log_alpha_gpu_kernel<scalar_t, target_t><<<grid, block, 0, stream>>>(
log_alpha.data<scalar_t>(),
log_alpha.data<scalar_t>(),
log_probs.data<scalar_t>(), input_lengths_t.data<int64_t>(), log_probs.size(0),
targets.data<target_t>(), target_lengths_t.data<int64_t>(), max_target_length,
neg_log_likelihood.data<scalar_t>(),
@ -304,8 +304,8 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
if (s < 2*target_length+1) {
current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK);
have_three = ((s < 2*target_length-1) &&
(get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) !=
current_target_prime));
(get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) !=
current_target_prime));
} else {
current_target_prime = BLANK;
have_three = false;
@ -377,7 +377,7 @@ ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_da
int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride,
int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride,
const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride,
int64_t batch_size, int64_t num_labels, int64_t BLANK, bool zero_infinity) {
int64_t batch_size, int64_t num_labels, int64_t BLANK, bool zero_infinity) {
int64_t b = threadIdx.y + blockIdx.y * blockDim.y;
int64_t s = threadIdx.x + blockIdx.x * blockDim.y; // note, this directly indexes into targets, no targets prime!
@ -405,9 +405,9 @@ ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_da
for (int64_t t = 0; t < input_length; t++) {
scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * target];
atomicAdd(&gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * target],
-std::exp(log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * (s*2+1)]
+ log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * (s*2+1)]
+ nll - lp) * gr);
-std::exp(log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * (s*2+1)]
+ log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * (s*2+1)]
+ nll - lp) * gr);
}
}
@ -429,7 +429,7 @@ ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data,
int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride,
int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride,
const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride,
int64_t batch_size, int64_t num_labels, int64_t BLANK, bool zero_infinity) {
int64_t batch_size, int64_t num_labels, int64_t BLANK, bool zero_infinity) {
constexpr scalar_t neginf = -INFINITY;
int64_t b = threadIdx.y + blockIdx.y * blockDim.y;
@ -481,7 +481,7 @@ ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data,
// We don't do a lot of checking as we envision this to be called only when backpropagating through a (well-checked) forward.
template<typename scalar_t, ScalarType target_scalar_type>
Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets, IntArrayRef input_lengths, IntArrayRef target_lengths,
const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK, bool zero_infinity) {
const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK, bool zero_infinity) {
constexpr scalar_t neginf = -INFINITY;
using target_t = typename std::conditional<target_scalar_type == kInt, int, int64_t>::type;
int64_t batch_size = log_probs.size(1);
@ -500,7 +500,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_
tg_batch_offsets_data[i] = pos;
pos += target_lengths[i];
if (max_target_length < target_lengths[i])
max_target_length = target_lengths[i];
max_target_length = target_lengths[i];
}
tg_target_stride = targets.stride(0);
}
@ -558,15 +558,15 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_
// maybe we should kernelize this, too.
auto grad_blank = grad.narrow(2, BLANK, 1);
grad_blank -= (at::logsumexp(log_alpha.as_strided({batch_size, log_alpha.size(1), max_target_length+1},
{log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2)*2})
+ log_beta.as_strided({batch_size, log_beta.size(1), max_target_length+1},
{log_beta.stride(0), log_beta.stride(1), log_beta.stride(2)*2}),
2, true)
.permute({1, 0, 2})
.add_(neg_log_likelihood.view({1, batch_size, 1}))
.sub_(log_probs.narrow(2, BLANK, 1))
.exp_()
);
{log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2)*2})
+ log_beta.as_strided({batch_size, log_beta.size(1), max_target_length+1},
{log_beta.stride(0), log_beta.stride(1), log_beta.stride(2)*2}),
2, true)
.permute({1, 0, 2})
.add_(neg_log_likelihood.view({1, batch_size, 1}))
.sub_(log_probs.narrow(2, BLANK, 1))
.exp_()
);
// scale by output gradient (blanks and first summand of non-blanks)
grad *= grad_out.view({1, batch_size, 1});
if (zero_infinity) {
@ -630,9 +630,9 @@ std::tuple<Tensor, Tensor> ctc_loss_gpu(const Tensor& log_probs, const Tensor& t
(void)zero_infinity; // only used for backward
return AT_DISPATCH_FLOATING_TYPES(log_probs.scalar_type(), "ctc_loss_cuda", [&] {
if (targets.scalar_type() == kLong) {
return ctc_loss_gpu_template<scalar_t, kLong>(log_probs, targets, input_lengths, target_lengths, BLANK);
return ctc_loss_gpu_template<scalar_t, kLong>(log_probs, targets, input_lengths, target_lengths, BLANK);
} else {
return ctc_loss_gpu_template<scalar_t, kInt>(log_probs, targets, input_lengths, target_lengths, BLANK);
return ctc_loss_gpu_template<scalar_t, kInt>(log_probs, targets, input_lengths, target_lengths, BLANK);
}
});
}
@ -641,9 +641,9 @@ Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const
const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK, bool zero_infinity) {
return AT_DISPATCH_FLOATING_TYPES(log_probs.scalar_type(), "ctc_loss_backward_cuda", [&] {
if (targets.scalar_type() == kLong) {
return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
} else {
return ctc_loss_backward_gpu_template<scalar_t, kInt>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
return ctc_loss_backward_gpu_template<scalar_t, kInt>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
}
});
}

View File

@ -146,10 +146,10 @@ Tensor& arange_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) {
double size_d;
if (std::is_same<scalar_t, int64_t>::value) {
size_d = std::ceil(static_cast<double>(end.to<accscalar_t>() - start.to<accscalar_t>())
/ step.to<accscalar_t>());
/ step.to<accscalar_t>());
} else {
size_d = std::ceil(static_cast<double>(end.to<double>() - start.to<double>())
/ step.to<double>());
/ step.to<double>());
}
AT_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");

View File

@ -441,18 +441,18 @@ std::tuple<Tensor, Tensor> weight_norm_cuda_backward
{
using accscalar_t = acc_type<scalar_t, true>;
weight_norm_bwd_first_dim_kernel<scalar_t, accscalar_t>
<<<grad_w.size(0),
BLOCK,
BLOCK*sizeof(accscalar_t),
weight_norm_bwd_first_dim_kernel<scalar_t, accscalar_t>
<<<grad_w.size(0),
BLOCK,
BLOCK*sizeof(accscalar_t),
stream>>>
(grad_v.data<scalar_t>(),
grad_g.data<scalar_t>(),
grad_w.data<scalar_t>(),
saved_v.data<scalar_t>(),
saved_g.data<scalar_t>(),
saved_norms.data<accscalar_t>(),
rowSize);
(grad_v.data<scalar_t>(),
grad_g.data<scalar_t>(),
grad_w.data<scalar_t>(),
saved_v.data<scalar_t>(),
saved_g.data<scalar_t>(),
saved_norms.data<accscalar_t>(),
rowSize);
});
}
else if(dim == ndims - 1)

View File

@ -72,17 +72,17 @@ std::tuple<Tensor, Tensor> _cudnn_ctc_loss(const Tensor& log_probs_t, const Tens
size_t workspace_size;
AT_CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(handle, probs_desc.desc(), grad_desc.desc(),
targets->data<int>(), target_lengths.data(), input_lengths.data(),
algo, ctc_loss_desc.desc(), &workspace_size));
targets->data<int>(), target_lengths.data(), input_lengths.data(),
algo, ctc_loss_desc.desc(), &workspace_size));
Tensor workspace = at::empty(workspace_size, log_probs->options().dtype(kByte));
Tensor costs = at::empty({log_probs->size(1)}, log_probs->options());
AT_CUDNN_CHECK(cudnnCTCLoss(handle, probs_desc.desc(), probs.data_ptr(),
targets->data<int>(), target_lengths.data(), input_lengths.data(),
costs.data_ptr(), grad_desc.desc(), grad.data_ptr(), algo,
ctc_loss_desc.desc(), workspace.data_ptr(), workspace_size));
targets->data<int>(), target_lengths.data(), input_lengths.data(),
costs.data_ptr(), grad_desc.desc(), grad.data_ptr(), algo,
ctc_loss_desc.desc(), workspace.data_ptr(), workspace_size));
return std::make_tuple(costs, grad);
}

View File

@ -456,7 +456,7 @@ struct algorithm_search<miopenConvFwdAlgorithm_t> {
args.wdesc.desc(), args.weight.data_ptr(),
args.cdesc.desc(),
args.odesc.desc(), args.output.data_ptr(),
1, // just return the fastest
1, // just return the fastest
&perf_count,
&perf_results,
ws.data,

View File

@ -5,12 +5,12 @@
#define THLapack_(NAME) TH_CONCAT_4(TH,Real,Lapack_,NAME)
#define THLapackCheck(fmt, func, info , ...) \
if (info < 0) { \
#define THLapackCheck(fmt, func, info , ...) \
if (info < 0) { \
THError("Lapack Error in %s : Illegal Argument %d", func, -info); \
} else if(info > 0) { \
THError(fmt, func, info, ##__VA_ARGS__); \
} \
} else if(info > 0) { \
THError(fmt, func, info, ##__VA_ARGS__); \
} \
#define THLapackCheckWithCleanup(fmt, cleanup, func, info , ...) \
if (info < 0) { \

View File

@ -14,7 +14,7 @@ typedef struct THMemoryFile__
THCharStorage *storage;
ssize_t size;
ssize_t position;
int longSize;
int longSize;
} THMemoryFile;

View File

@ -149,13 +149,13 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
/* get optimal workspace size */
THLapack_(gels)('N', m, n, nrhs, ra__->data<scalar_t>(), lda,
rb__->data<scalar_t>(), ldb,
&wkopt, -1, &info);
rb__->data<scalar_t>(), ldb,
&wkopt, -1, &info);
lwork = (int)wkopt;
work = THTensor_(newWithSize1d)(lwork);
THLapack_(gels)('N', m, n, nrhs, ra__->data<scalar_t>(), lda,
rb__->data<scalar_t>(), ldb,
work->data<scalar_t>(), lwork, &info);
rb__->data<scalar_t>(), ldb,
work->data<scalar_t>(), lwork, &info);
THLapackCheckWithCleanup("Lapack Error in %s : The %d-th diagonal element of the triangular factor of A is zero",
THCleanup(c10::raw::intrusive_ptr::decref(ra__);
@ -378,21 +378,21 @@ void THTensor_(gesdd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra
}
THLapack_(gesdd)(jobz,
m,n,ra__->data<scalar_t>(),lda,
rs__data,
ru__data,
ldu,
rv__data, ldvt,
&wkopt, -1, THIntTensor_data(iwork), &info);
m,n,ra__->data<scalar_t>(),lda,
rs__data,
ru__data,
ldu,
rv__data, ldvt,
&wkopt, -1, THIntTensor_data(iwork), &info);
lwork = (int)wkopt;
work = THTensor_(newWithSize1d)(lwork);
THLapack_(gesdd)(jobz,
m,n,ra__->data<scalar_t>(),lda,
rs__data,
ru__data,
ldu,
rv__data, ldvt,
work->data<scalar_t>(),lwork, THIntTensor_data(iwork), &info);
m,n,ra__->data<scalar_t>(),lda,
rs__data,
ru__data,
ldu,
rv__data, ldvt,
work->data<scalar_t>(),lwork, THIntTensor_data(iwork), &info);
if (jobz != 'N') {
THLapackCheckWithCleanup("Lapack Error %s : %d superdiagonals failed to converge.",

View File

@ -999,31 +999,31 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb)
return equal;
}
#define TENSOR_IMPLEMENT_LOGICAL(NAME,OP) \
#define TENSOR_IMPLEMENT_LOGICAL(NAME,OP) \
void THTensor_(NAME##Value)(THByteTensor *r_, THTensor* t, scalar_t value) \
{ \
THByteTensor_resizeNd(r_, t->dim(), THTensor_getSizePtr(t), NULL); \
TH_TENSOR_APPLY2(unsigned char, r_, scalar_t, t, \
*r__data = (*t_data OP value) ? 1 : 0;); \
} \
void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, scalar_t value) \
{ \
THTensor_(resizeNd)(r_, t->dim(), THTensor_getSizePtr(t), NULL); \
TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, \
*r__data = (*t_data OP value) ? 1 : 0;); \
} \
{ \
THByteTensor_resizeNd(r_, t->dim(), THTensor_getSizePtr(t), NULL); \
TH_TENSOR_APPLY2(unsigned char, r_, scalar_t, t, \
*r__data = (*t_data OP value) ? 1 : 0;); \
} \
void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, scalar_t value) \
{ \
THTensor_(resizeNd)(r_, t->dim(), THTensor_getSizePtr(t), NULL); \
TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, \
*r__data = (*t_data OP value) ? 1 : 0;); \
} \
void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \
{ \
THByteTensor_resizeNd(r_, ta->dim(), THTensor_getSizePtr(ta), NULL); \
TH_TENSOR_APPLY3(unsigned char, r_, scalar_t, ta, scalar_t, tb, \
*r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
} \
{ \
THByteTensor_resizeNd(r_, ta->dim(), THTensor_getSizePtr(ta), NULL); \
TH_TENSOR_APPLY3(unsigned char, r_, scalar_t, ta, scalar_t, tb, \
*r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
} \
void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \
{ \
THTensor_(resizeNd)(r_, ta->dim(), THTensor_getSizePtr(ta), NULL); \
TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, ta, scalar_t, tb, \
*r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
} \
{ \
THTensor_(resizeNd)(r_, ta->dim(), THTensor_getSizePtr(ta), NULL); \
TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, ta, scalar_t, tb, \
*r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
} \
TENSOR_IMPLEMENT_LOGICAL(lt,<)
@ -1302,10 +1302,10 @@ void THTensor_(norm)(THTensor *r_, THTensor *t, scalar_t value, int dimension, i
*r__data = TH_MATH_NAME(pow)(sum, 1.0/3), 0);
} else if (value == INFINITY) {
DIM_REDUCE(sum = THMax(sum, TH_MATH_NAME(fabs)(t_data[i*t_stride])),
*r__data = sum, 0);
*r__data = sum, 0);
} else if (value == -INFINITY) {
DIM_REDUCE(sum = THMin(sum, TH_MATH_NAME(fabs)(t_data[i*t_stride])),
*r__data = sum, INFINITY);
*r__data = sum, INFINITY);
} else {
DIM_REDUCE(sum += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(t_data[i*t_stride]), value),
*r__data = TH_MATH_NAME(pow)(sum, 1.0/value), 0);

View File

@ -17,10 +17,10 @@ TH_API void THVector_(cdiv)(scalar_t *z, const scalar_t *x, const scalar_t *y, c
TH_API void THVector_(divs)(scalar_t *y, const scalar_t *x, const scalar_t c, const ptrdiff_t n);
TH_API void THVector_(neg)(scalar_t *y, const scalar_t *x, const ptrdiff_t n);
TH_API void THVector_(normal_fill)(scalar_t *data,
const int64_t size,
struct THGenerator *generator,
const scalar_t mean,
const scalar_t stddev);
const int64_t size,
struct THGenerator *generator,
const scalar_t mean,
const scalar_t stddev);
#endif /* non bool only part */

View File

@ -1342,7 +1342,7 @@ static void THFloatVector_divs_VSX(float *y, const float*x, const float c, const
// $ gcc VSX.c -O2 -D RUN_VSX_TESTS -o vsxtest
// $ ./vsxtest
//
// TODO
// TODO
//
//
// Finished running all tests. All tests PASSED.

View File

@ -119,7 +119,7 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *
#else
uint32_t a = *eax, b, c = *ecx, d;
asm volatile ( "cpuid\n\t"
: "+a"(a), "=b"(b), "+c"(c), "=d"(d) );
: "+a"(a), "=b"(b), "+c"(c), "=d"(d) );
*eax = a;
*ebx = b;
*ecx = c;

View File

@ -308,12 +308,12 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int6
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
if (prop->major >= 5){
THCublasCheck(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
THCublasCheck(cublasGemmEx(handle, opa, opb,
THCublasCheck(cublasGemmEx(handle, opa, opb,
i_m, i_n, i_k, &fAlpha,
a, CUDA_R_16F, i_lda, b, CUDA_R_16F,
i_ldb, &fBeta, c, CUDA_R_16F, i_ldc,
CUDA_R_32F, CUBLAS_GEMM_DFALT_TENSOR_OP));
THCublasCheck(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
THCublasCheck(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
}else{
THCublasCheck(cublasSgemmEx(handle, opa, opb,
i_m, i_n, i_k, &fAlpha,

View File

@ -48,7 +48,7 @@ inline bool getCatGrid(THCState* state, ptrdiff_t nTensors, dim3& grid) {
//X dim of grid for cat array cooperates on a single tensor in the cat.
//Given half of the GPU, full utilization will always occur.
grid = dim3( 2LL * numSM, (long long) nTensors );
return true;
}
@ -131,7 +131,7 @@ __global__ void CatArrayBatchedCopy(
while( tid < nElements){
IndexType elementOffset = CatArrIndexToOffset<IndexType, Dims>::compute(
os.outputSize, os.outputStride, dimSize, concatDim, tid);
os.outputSize, os.outputStride, dimSize, concatDim, tid);
output[dataOffset + elementOffset] = data[tid];
tid += stride;

View File

@ -79,7 +79,7 @@ condDiv(T *q, int64_t *J, int64_t inputsize, T q_max) {
q[idx] = one;
} else {
if (THCNumerics<T>::gt(q_max, one)) {
q[idx] = THCNumerics<T>::div(q[idx], q_max);
q[idx] = THCNumerics<T>::div(q[idx], q_max);
}
}
}
@ -236,7 +236,7 @@ sampleMultinomialOnce(int64_t* dest,
THCNumerics<AccT>::div(
ScalarConvert<T, AccT>::to(dist[curDist * stride_dist + cat * stride_categories]),
sum) :
accZero);
accZero);
smem[threadIdx.x] = dist_val;
__syncthreads();

View File

@ -42,7 +42,7 @@ THCTensor_(numel)(THCState *state, THCTensor *t)
}
void THCTensor_(cat)(THCState *state, THCTensor *result,
THCTensor *ta, THCTensor *tb, int dimension)
THCTensor *ta, THCTensor *tb, int dimension)
{
THCTensor* inputs[2];
inputs[0] = ta;
@ -73,7 +73,7 @@ inline void THCTensor_(check_shape_except_dim)(THCState *state,
}
void THCTensor_(catArray)(THCState *state, THCTensor *result,
THCTensor **inputs, int numInputs, int dimension)
THCTensor **inputs, int numInputs, int dimension)
{
// previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible
// to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors

View File

@ -34,9 +34,9 @@ THC_API scalar_t THCTensor_(maxall)(THCState *state, THCTensor *self);
THC_API scalar_t THCTensor_(medianall)(THCState *state, THCTensor *self);
THC_API void THCTensor_(median)(THCState *state,
THCTensor *values,
THCudaLongTensor *indices,
THCTensor *src, int dim, int keepdim);
THCTensor *values,
THCudaLongTensor *indices,
THCTensor *src, int dim, int keepdim);
THC_API accreal THCTensor_(dist)(THCState *state, THCTensor *self, THCTensor *src,
scalar_t value);

View File

@ -249,7 +249,7 @@ void THCTensor_(multinomial)(struct THCState *state,
THCudaLongTensor_data(state, self),
numDist, numCategories,
THCTensor_(data)(state, prefixSum),
THCTensor_(data)(state, normDist));
THCTensor_(data)(state, normDist));
} else {
// Sample without replacement

View File

@ -7,12 +7,12 @@
#define ZERO_MACRO zero<T>()
template <typename T>
inline __device__ typename std::enable_if<std::is_same<T, double>::value, T>::type zero() {
return 0.;
return 0.;
}
template <typename T>
inline __device__ typename std::enable_if<!std::is_same<T, double>::value, T>::type zero() {
return 0.f;
return 0.f;
}
#else
#define ZERO_MACRO 0.f

View File

@ -88,7 +88,7 @@ __global__ void cunn_LookupTable_accGradParametersKernelByFeature
#else
first_remaining_peer = __ffs(matchmask) - 1;
#endif
my_s[threadIdx.x] += smem[threadIdx.x + WARP_SIZE*first_remaining_peer];
my_s[threadIdx.x] += smem[threadIdx.x + WARP_SIZE*first_remaining_peer];
matchmask ^= (1 << first_remaining_peer);
}
if(f < s)

View File

@ -49,14 +49,14 @@ __global__ void cunn_LookupTableBag_updateOutputKernel(
for (int64_t emb = begin; emb < end; emb++) {
const int weightRow = ((int) input[emb]) * stride;
weightFeatSum += ScalarConvert<Dtype, Acctype>::to(weightFeat[weightRow]);
bag_size_ ++;
bag_size_ ++;
if (featureDim == 0) {
offset2bag[emb] = bag;
}
}
if (mode == MODE_MEAN) {
weightFeatSum = weightFeatSum / ScalarConvert<int64_t, Acctype>::to(bag_size_);
bag_size[bag] = bag_size_;
weightFeatSum = weightFeatSum / ScalarConvert<int64_t, Acctype>::to(bag_size_);
bag_size[bag] = bag_size_;
}
(void) MODE_SUM; //silence warnings about unused MODE_SUM;
output[bag * stride + featureDim] = ScalarConvert<Acctype, Dtype>::to(weightFeatSum);
@ -114,9 +114,9 @@ __global__ void cunn_LookupTableBag_accGradParametersKernel(
if (featureDim < stride)
{
gradient[ii] = ScalarConvert<Dtype, Acctype>::to(gradOutput[gradOutputRow + featureDim]);
if (mode == MODE_MEAN) {
gradient[ii] /= bag_size[seq_number];
}
if (mode == MODE_MEAN) {
gradient[ii] /= bag_size[seq_number];
}
weight[ii] = ScalarConvert<Dtype, Acctype>::to(gradWeight[weightRow + featureDim]);
}
}

View File

@ -16,9 +16,9 @@ template<typename Dtype, typename Acctype>
C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void nearest_neighbor_4d_kernel(
const int n,
const THCDeviceTensor<Dtype, 4> data1,
THCDeviceTensor<Dtype, 4> data2) {
const int n,
const THCDeviceTensor<Dtype, 4> data1,
THCDeviceTensor<Dtype, 4> data2) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
const int batchsize = data1.getSize(0);
const int channels = data1.getSize(1);
@ -37,10 +37,10 @@ __global__ void nearest_neighbor_4d_kernel(
const int h1 = h2;
const int w1 = w2;
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; ++c) {
const Dtype val = data1[n][c][h1][w1];
data2[n][c][h2][w2] = val;
}
for (int c = 0; c < channels; ++c) {
const Dtype val = data1[n][c][h1][w1];
data2[n][c][h2][w2] = val;
}
}
return;
}
@ -49,8 +49,8 @@ __global__ void nearest_neighbor_4d_kernel(
const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1);
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; ++c) {
const Dtype val = data1[n][c][h1][w1];
data2[n][c][h2][w2] = val;
const Dtype val = data1[n][c][h1][w1];
data2[n][c][h2][w2] = val;
}
}
}
@ -62,9 +62,9 @@ template <typename Dtype, typename Acctype>
C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void nearest_neighbor_4d_kernel_backward(
const int n,
THCDeviceTensor<Dtype, 4> data1,
const THCDeviceTensor<Dtype, 4> data2) {
const int n,
THCDeviceTensor<Dtype, 4> data1,
const THCDeviceTensor<Dtype, 4> data2) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
const int batchsize = data1.getSize(0);
const int channels = data1.getSize(1);
@ -83,10 +83,10 @@ __global__ void nearest_neighbor_4d_kernel_backward(
const int h1 = h2;
const int w1 = w2;
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; ++c) {
const Dtype val = data2[n][c][h2][w2];
data1[n][c][h1][w1] = val;
}
for (int c = 0; c < channels; ++c) {
const Dtype val = data2[n][c][h2][w2];
data1[n][c][h1][w1] = val;
}
}
return;
}

View File

@ -16,9 +16,9 @@ template<typename Dtype, typename Acctype>
C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void nearest_neighbor_3d_kernel(
const int n,
const THCDeviceTensor<Dtype, 3> data1,
THCDeviceTensor<Dtype, 3> data2) {
const int n,
const THCDeviceTensor<Dtype, 3> data1,
THCDeviceTensor<Dtype, 3> data2) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
const int batchsize = data1.getSize(0);
const int channels = data1.getSize(1);
@ -32,10 +32,10 @@ __global__ void nearest_neighbor_3d_kernel(
if (width1 == width2) {
const int w1 = w2;
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; ++c) {
const Dtype val = data1[n][c][w1];
data2[n][c][w2] = val;
}
for (int c = 0; c < channels; ++c) {
const Dtype val = data1[n][c][w1];
data2[n][c][w2] = val;
}
}
return;
}
@ -43,8 +43,8 @@ __global__ void nearest_neighbor_3d_kernel(
const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1);
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; ++c) {
const Dtype val = data1[n][c][w1];
data2[n][c][w2] = val;
const Dtype val = data1[n][c][w1];
data2[n][c][w2] = val;
}
}
}
@ -56,9 +56,9 @@ template <typename Dtype, typename Acctype>
C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void nearest_neighbor_3d_kernel_backward(
const int n,
THCDeviceTensor<Dtype, 3> data1,
const THCDeviceTensor<Dtype, 3> data2) {
const int n,
THCDeviceTensor<Dtype, 3> data1,
const THCDeviceTensor<Dtype, 3> data2) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
const int batchsize = data1.getSize(0);
const int channels = data1.getSize(1);
@ -72,10 +72,10 @@ __global__ void nearest_neighbor_3d_kernel_backward(
if (width1 == width2) {
const int w1 = w2;
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; ++c) {
const Dtype val = data2[n][c][w1];
data1[n][c][w2] = val;
}
for (int c = 0; c < channels; ++c) {
const Dtype val = data2[n][c][w1];
data1[n][c][w2] = val;
}
}
return;
}

View File

@ -16,9 +16,9 @@ template<typename Dtype, typename Acctype>
C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void nearest_neighbor_5d_kernel(
const int n,
const THCDeviceTensor<Dtype, 5> data1,
THCDeviceTensor<Dtype, 5> data2) {
const int n,
const THCDeviceTensor<Dtype, 5> data1,
THCDeviceTensor<Dtype, 5> data2) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
const int batchsize = data1.getSize(0);
const int channels = data1.getSize(1);
@ -55,8 +55,8 @@ __global__ void nearest_neighbor_5d_kernel(
const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, depth1);
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; ++c) {
const Dtype val = data1[n][c][d1][h1][w1];
data2[n][c][d2][h2][w2] = val;
const Dtype val = data1[n][c][d1][h1][w1];
data2[n][c][d2][h2][w2] = val;
}
}
}
@ -68,9 +68,9 @@ template <typename Dtype, typename Acctype>
C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void nearest_neighbor_5d_kernel_backward(
const int n,
THCDeviceTensor<Dtype, 5> data1,
const THCDeviceTensor<Dtype, 5> data2) {
const int n,
THCDeviceTensor<Dtype, 5> data1,
const THCDeviceTensor<Dtype, 5> data2) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
const int batchsize = data1.getSize(0);
const int channels = data1.getSize(1);
@ -108,8 +108,8 @@ __global__ void nearest_neighbor_5d_kernel_backward(
const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, depth1);
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; ++c) {
const Dtype val = data2[n][c][d2][h2][w2];
atomicAdd(data1[n][c][d1][h1][w1].data(), val);
const Dtype val = data2[n][c][d2][h2][w2];
atomicAdd(data1[n][c][d1][h1][w1].data(), val);
}
}
}

View File

@ -24,7 +24,7 @@ inline int GET_BLOCKS(const int N)
}
#define THCUNN_check_shape(STATE, I1, I2) \
if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2)) \
if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2)) \
{ \
THCDescBuff s1 = THCTensor_(sizeDesc)(STATE, I1); \
THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2); \
@ -47,20 +47,20 @@ inline int GET_BLOCKS(const int N)
ptrdiff_t n1 = THCTensor_(nElement)(STATE, I1); \
ptrdiff_t n2 = THCTensor_(nElement)(STATE, I2); \
if (n1 != n2) \
{ \
{ \
THCDescBuff s1 = THCTensor_(sizeDesc)(state, I1); \
THCDescBuff s2 = THCTensor_(sizeDesc)(state, I2); \
THError(#I1 " and " #I2 " have different number of elements: " \
THError(#I1 " and " #I2 " have different number of elements: " \
#I1 "%s has %ld elements, while " \
#I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
} \
} \
}
#define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \
if (THCTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \
THCTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \
THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
" but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
}

View File

@ -10,7 +10,7 @@ void THNN_(LookupTableBag_updateOutput)(
THCTensor *weight,
THCTensor *output,
THCIndexTensor *offset2bag,
int mode,
int mode,
THCIndexTensor *bag_size)
{
THCUNN_assertSameGPU(state, 5, input, offsets, weight, output, offset2bag);
@ -65,8 +65,8 @@ void THNN_(LookupTableBag_accGradParameters)(
THCIndexTensor *sortedIndices,
THCIndexTensor *origIndices,
bool scaleGradByFreq,
int mode,
THCIndexTensor *bag_size,
int mode,
THCIndexTensor *bag_size,
accreal scale_)
{
scalar_t scale = ScalarConvert<accreal, scalar_t>::to(scale_);

View File

@ -88,7 +88,7 @@ static THCTensor* THNN_(newViewWeightMM2d)(THCState *state, THCTensor *weight) {
int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3);
THCTensor *old_weight = weight;
weight = THCTensor_(newWithStorage2d)(state, THTensor_getStoragePtr(weight), weight->storage_offset(),
s1, -1, s2, -1);
s1, -1, s2, -1);
THCTensor_(free)(state, old_weight);
}
return weight;

View File

@ -11,7 +11,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
int kH, int kW, int dH, int dW, int padH, int padW,
int dilationH, int dilationW, int weight_nullable) {
THArgCheck(kW > 0 && kH > 0, 9,
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
THArgCheck(dW > 0 && dH > 0, 11,
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
THArgCheck(dilationW > 0 && dilationH > 0, 14,

View File

@ -34,7 +34,7 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
int outputHeight,
int outputHeight,
int outputWidth)
{
THCUNN_assertSameGPU(state, 2, input, output);
@ -44,14 +44,14 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
int inputWidth = THCTensor_(size)(state, input, 3);
THNN_(SpatialUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels,
inputHeight, inputWidth,
outputHeight, outputWidth);
inputHeight, inputWidth,
outputHeight, outputWidth);
THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0);
THCTensor_(resize4d)(state, output,
THCTensor_(size)(state, input, 0),
THCTensor_(size)(state, input, 1),
outputHeight,
outputHeight,
outputWidth);
THCTensor_(zero)(state, output);
@ -62,7 +62,7 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
cudaStream_t stream = THCState_getCurrentStream(state);
nearest_neighbor_4d_kernel<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads,
0, stream>>>(num_kernels, idata, odata);
0, stream>>>(num_kernels, idata, odata);
THCudaCheck(cudaGetLastError());
}
@ -73,15 +73,15 @@ void THNN_(SpatialUpSamplingNearest_updateGradInput)(
THCTensor *gradOutput,
THCTensor *gradInput,
int nbatch,
int nchannels,
int inputHeight,
int inputWidth,
int outputHeight,
int outputWidth)
int nchannels,
int inputHeight,
int inputWidth,
int outputHeight,
int outputWidth)
{
THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
THNN_(SpatialUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels,
inputHeight, inputWidth, outputHeight, outputWidth);
inputHeight, inputWidth, outputHeight, outputWidth);
gradOutput = THCTensor_(newContiguous)(state, gradOutput);
THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth);
@ -94,7 +94,7 @@ void THNN_(SpatialUpSamplingNearest_updateGradInput)(
cudaStream_t stream = THCState_getCurrentStream(state);
nearest_neighbor_4d_kernel_backward<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads),
num_threads, 0, stream>>>(num_kernels, data1, data2);
num_threads, 0, stream>>>(num_kernels, data1, data2);
THCudaCheck(cudaGetLastError());
THCTensor_(free)(state, gradOutput);
}

View File

@ -243,7 +243,7 @@ THC_API void THNN_(LookupTableBag_updateOutput)(
THCTensor *weight,
THCTensor *output,
THCIndexTensor *offset2bag,
int mode,
int mode,
THCIndexTensor *seq_length); // [OPTIONAL]
THC_API void THNN_(LookupTableBag_accGradParameters)(
@ -256,8 +256,8 @@ THC_API void THNN_(LookupTableBag_accGradParameters)(
THCIndexTensor *sortedIndices,
THCIndexTensor *origIndices,
bool scaleGradByFreq,
int mode,
THCIndexTensor *seq_length, // [OPTIONAL]
int mode,
THCIndexTensor *seq_length, // [OPTIONAL]
accreal scale_);
THC_API void THNN_(L1Cost_updateOutput)(

View File

@ -54,7 +54,7 @@ void THNN_(TemporalUpSamplingNearest_updateOutput)(
const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
cudaStream_t stream = THCState_getCurrentStream(state);
nearest_neighbor_3d_kernel<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads,
0, stream>>>(num_kernels, idata, odata);
0, stream>>>(num_kernels, idata, odata);
THCudaCheck(cudaGetLastError());
}
@ -82,7 +82,7 @@ void THNN_(TemporalUpSamplingNearest_updateGradInput)(
cudaStream_t stream = THCState_getCurrentStream(state);
nearest_neighbor_3d_kernel_backward<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads),
num_threads, 0, stream>>>(num_kernels, data1, data2);
num_threads, 0, stream>>>(num_kernels, data1, data2);
THCudaCheck(cudaGetLastError());
THCTensor_(free)(state, gradOutput);

View File

@ -47,10 +47,10 @@ void THNN_(VolumetricUpSamplingNearest_updateOutput)(
int inputWidth = THCTensor_(size)(state, input, 4);
THNN_(VolumetricUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels,
inputDepth, inputHeight, inputWidth,
outputDepth, outputHeight, outputWidth);
inputDepth, inputHeight, inputWidth,
outputDepth, outputHeight, outputWidth);
THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 &&
outputDepth > 0 && outputHeight > 0 && outputWidth > 0);
outputDepth > 0 && outputHeight > 0 && outputWidth > 0);
THCTensor_(resize5d)(state, output,
THCTensor_(size)(state, input, 0),
@ -67,7 +67,7 @@ void THNN_(VolumetricUpSamplingNearest_updateOutput)(
const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
cudaStream_t stream = THCState_getCurrentStream(state);
nearest_neighbor_5d_kernel<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads,
0, stream>>>(num_kernels, idata, odata);
0, stream>>>(num_kernels, idata, odata);
THCudaCheck(cudaGetLastError());
}
@ -88,8 +88,8 @@ void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
{
THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
THNN_(VolumetricUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels,
inputDepth, inputHeight, inputWidth,
outputDepth, outputHeight, outputWidth);
inputDepth, inputHeight, inputWidth,
outputDepth, outputHeight, outputWidth);
gradOutput = THCTensor_(newContiguous)(state, gradOutput);
THCTensor_(resize5d)(state, gradInput, nbatch, nchannels, inputDepth, inputHeight, inputWidth);
@ -100,7 +100,7 @@ void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
cudaStream_t stream = THCState_getCurrentStream(state);
nearest_neighbor_5d_kernel_backward<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads),
num_threads, 0, stream>>>(num_kernels, data1, data2);
num_threads, 0, stream>>>(num_kernels, data1, data2);
THCudaCheck(cudaGetLastError());
THCTensor_(free)(state, gradOutput);
}

View File

@ -36,7 +36,7 @@ static Acctype linear_upsampling_compute_source_index(
__device__ __forceinline__
static int nearest_neighbor_compute_source_index(
const float scale, int dst_index, int inputSize) {
const float scale, int dst_index, int inputSize) {
const int src_index = MIN(floor(dst_index * scale), inputSize - 1);
return src_index;
}

View File

@ -29,16 +29,16 @@ void THNN_(BCECriterion_updateOutput)(
scalar_t y = *target_data;
THAssertMsg(x >= 0. && x <= 1.,
"input value should be between 0~1, but got %f",
(double) x);
*output_data = -(safe_log(x) * y + safe_log(1. - x) * (1. - y));
(double) x);
*output_data = -(safe_log(x) * y + safe_log(1. - x) * (1. - y));
);
if (weights) {
if (weights) {
THTensor_(cmul)(output, output, weights);
}
return;
}
THTensor_(resize0d)(output);
THTensor_(resize0d)(output);
scalar_t sum = 0;
if (weights) {
@ -48,7 +48,7 @@ void THNN_(BCECriterion_updateOutput)(
scalar_t w = *weights_data;
THAssertMsg(x >= 0. && x <= 1.,
"input value should be between 0~1, but got %f",
(double) x);
(double) x);
sum -= (safe_log(x) * y + safe_log(1. - x) * (1. - y)) * w;
);
} else {
@ -57,7 +57,7 @@ void THNN_(BCECriterion_updateOutput)(
scalar_t y = *target_data;
THAssertMsg(x >= 0. && x <= 1.,
"input value should be between 0~1, but got %f",
(double) x);
(double) x);
sum -= safe_log(x) * y + safe_log(1. - x) * (1. - y);
);
}

View File

@ -25,7 +25,7 @@ void THNN_(ClassNLLCriterion_updateOutput)(
if (weights && THTensor_(nElement)(weights) != n_classes) {
THDescBuff s1 = THTensor_(sizeDesc)(weights);
THError("weight tensor should be defined either for all %d classes or no classes"
" but got weight tensor of shape: %s", n_classes, s1.str);
" but got weight tensor of shape: %s", n_classes, s1.str);
}
if (reduction == Reduction::None && n_dims == 2) {
@ -39,8 +39,8 @@ void THNN_(ClassNLLCriterion_updateOutput)(
int cur_target = THLongTensor_fastGetLegacy1dNoScalars(target, i);
if (cur_target == ignore_index) {
THTensor_(fastSet1d)(output, i, 0.0f);
continue;
THTensor_(fastSet1d)(output, i, 0.0f);
continue;
}
if (cur_target >= 0 && cur_target < n_classes) {
scalar_t cur_weight = weights ? THTensor_(fastGetLegacy1dNoScalars)(weights, cur_target) : 1.0f;

View File

@ -40,7 +40,7 @@ void THNN_(MultiMarginCriterion_updateOutput)(
{
THIndex_t idx = THIndexTensor_(get1d)(target, t);
THArgCheck((idx >= 0) && (idx < dim), 3,
"target out of range");
"target out of range");
}
input = THTensor_(newContiguous)(input);

View File

@ -98,7 +98,7 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input,
"non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s");
"non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s");
if (input->dim() == 4)
{

View File

@ -6,9 +6,9 @@
#include <algorithm>
static inline void THNN_(SpatialAveragePooling_shapeCheck)(
THTensor *input, THTensor *gradOutput,
int kH, int kW, int dH, int dW, int padH, int padW,
bool ceil_mode) {
THTensor *input, THTensor *gradOutput,
int kH, int kW, int dH, int dW, int padH, int padW,
bool ceil_mode) {
THArgCheck(kW > 0 && kH > 0, 5,
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
@ -27,12 +27,12 @@ static inline void THNN_(SpatialAveragePooling_shapeCheck)(
}
THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input,
"non-empty 3D or 4D input tensor expected but got: %s");
"non-empty 3D or 4D input tensor expected but got: %s");
THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
"pad should be smaller than half of kernel size, but got "
"padW = %d, padH = %d, kW = %d, kH = %d",
padW, padH, kW, kH);
"pad should be smaller than half of kernel size, but got "
"padW = %d, padH = %d, kW = %d, kH = %d",
padW, padH, kW, kH);
int64_t nInputPlane = input->size(dimh-1);
int64_t inputHeight = input->size(dimh);
@ -44,7 +44,7 @@ static inline void THNN_(SpatialAveragePooling_shapeCheck)(
if (outputWidth < 1 || outputHeight < 1)
THError("Given input size: (%dx%dx%d). "
"Calculated output size: (%dx%dx%d). Output size is too small",
"Calculated output size: (%dx%dx%d). Output size is too small",
nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth);
if (gradOutput != NULL) {

View File

@ -4,12 +4,12 @@
#define INITIAL_CHECK \
THArgCheck(THIndexTensor_(nDimensionLegacyAll)(target) == 3, 3, \
"only batches of spatial targets supported (3D tensors)" \
" but got targets of dimension: %d", \
THIndexTensor_(nDimensionLegacyAll)(target)); \
THArgCheck(THTensor_(nDimensionLegacyAll)(input) == 4, 2, \
"only batches of spatial inputs supported (4D tensors), " \
"but got input of dimension: %d", THTensor_(nDimensionLegacyAll)(input)); \
"only batches of spatial targets supported (3D tensors)" \
" but got targets of dimension: %d", \
THIndexTensor_(nDimensionLegacyAll)(target)); \
THArgCheck(THTensor_(nDimensionLegacyAll)(input) == 4, 2, \
"only batches of spatial inputs supported (4D tensors), " \
"but got input of dimension: %d", THTensor_(nDimensionLegacyAll)(input)); \
if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) { \
THError("weight tensor should be defined either for all or no classes"); \
} \
@ -30,8 +30,8 @@
#define GRADOUTPUT_SHAPE_CHECK \
THArgCheck(THTensor_(nDimensionLegacyAll)(gradOutput) == 3, 3, \
"gradOutput must have same dimension as target (3)" \
" but got dimension: %d", \
THTensor_(nDimensionLegacyAll)(gradOutput)); \
" but got dimension: %d", \
THTensor_(nDimensionLegacyAll)(gradOutput)); \
{ \
int64_t gradOutput0 = THTensor_(size)(gradOutput, 0); \
int64_t gradOutput1 = THTensor_(size)(gradOutput, 1); \

View File

@ -5,14 +5,14 @@
#include <ATen/div_rtn.h>
static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
THTensor *input, THTensor *gradOutput,
THTensor *weight, THTensor *bias,
int kH, int kW, int dH, int dW, int padH, int padW, int weight_nullable) {
THTensor *input, THTensor *gradOutput,
THTensor *weight, THTensor *bias,
int kH, int kW, int dH, int dW, int padH, int padW, int weight_nullable) {
THArgCheck(kW > 0 && kH > 0, 9,
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
THArgCheck(dW > 0 && dH > 0, 11,
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
if (weight != NULL) {
THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight,
@ -36,7 +36,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
}
THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input,
"non-empty 3D or 4D input tensor expected but got: %s");
"non-empty 3D or 4D input tensor expected but got: %s");
int64_t inputHeight = input->size(dimh);
int64_t inputWidth = input->size(dimw);
@ -87,8 +87,8 @@ static THTensor* THNN_(newViewWeightMM2d)(THTensor *weight) {
int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3);
THTensor *old_weight = weight;
weight = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(weight), weight->storage_offset(),
s1, -1, s2, -1);
c10::raw::intrusive_ptr::decref(old_weight);
s1, -1, s2, -1);
c10::raw::intrusive_ptr::decref(old_weight);
}
return weight;
}
@ -116,8 +116,8 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
THTensor *output2d;
THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
nInputPlane, inputWidth, inputHeight,
outputWidth, outputHeight);
nInputPlane, inputWidth, inputHeight,
outputWidth, outputHeight);
output2d = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(output), output->storage_offset(),
nOutputPlane, -1,
@ -125,8 +125,8 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
if (bias) {
for(i = 0; i < nOutputPlane; i++)
THVector_(fill)
(THStorage_(data)(THTensor_getStoragePtr(output)) + output->storage_offset() + output->stride(0) * i,
THTensor_(get1d)(bias, i), outputHeight*outputWidth);
(THStorage_(data)(THTensor_getStoragePtr(output)) + output->storage_offset() + output->stride(0) * i,
THTensor_(get1d)(bias, i), outputHeight*outputWidth);
} else {
THTensor_(zero)(output);
}
@ -202,10 +202,10 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
THNN_(SpatialConvolutionMM_updateOutput_frame)
(input_t, output_t, weight, bias, finput_t,
kW, kH, dW, dH, padW, padH,
nInputPlane, inputWidth, inputHeight,
nOutputPlane, outputWidth, outputHeight);
(input_t, output_t, weight, bias, finput_t,
kW, kH, dW, dH, padW, padH,
nInputPlane, inputWidth, inputHeight,
nOutputPlane, outputWidth, outputHeight);
c10::raw::intrusive_ptr::decref(input_t);
c10::raw::intrusive_ptr::decref(output_t);
@ -239,9 +239,9 @@ static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
THTensor_(zero)(gradInput);
THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH,
padW, padH,
gradInput->size(0), gradInput->size(2), gradInput->size(1),
gradOutput->size(2), gradOutput->size(1));
padW, padH,
gradInput->size(0), gradInput->size(2), gradInput->size(1),
gradOutput->size(2), gradOutput->size(1));
}
void THNN_(SpatialConvolutionMM_updateGradInput)(
@ -280,8 +280,8 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
if(input->dim() == 3)
{
THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput,
tweight, fgradInput,
kW, kH, dW, dH, padW, padH);
tweight, fgradInput,
kW, kH, dW, dH, padW, padH);
}
else
{
@ -296,8 +296,8 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t,
tweight, fgradInput_t,
kW, kH, dW, dH, padW, padH);
tweight, fgradInput_t,
kW, kH, dW, dH, padW, padH);
c10::raw::intrusive_ptr::decref(gradInput_t);
c10::raw::intrusive_ptr::decref(gradOutput_t);
@ -380,7 +380,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
if(input->dim() == 3)
{
THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight,
gradBias, finput, scale);
gradBias, finput, scale);
}
else
{
@ -396,7 +396,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
}
THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight,
gradBias, finput_t, scale);
gradBias, finput_t, scale);
c10::raw::intrusive_ptr::decref(gradOutput_t);
if (gradWeight) {
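
The updateOutput path above unfolds the input into a columns buffer (unfolded_copy) and then reduces it with a matrix multiply. A minimal single-plane C sketch of that im2col-plus-GEMM idea, under assumed toy sizes (2x2 kernel, stride 1, no padding):

/* Minimal im2col + matrix-multiply sketch of the unfolded-copy/GEMM scheme above.
 * Single input plane, single output plane, 2x2 kernel, stride 1, no padding. */
#include <stdio.h>

#define IH 3
#define IW 3
#define KH 2
#define KW 2
#define OH (IH - KH + 1)
#define OW (IW - KW + 1)

int main(void) {
  const double input[IH][IW] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
  const double weight[KH * KW] = {1, 0, 0, -1};   /* flattened 2x2 kernel */
  double columns[KH * KW][OH * OW];               /* unfolded (im2col) buffer */
  double output[OH * OW] = {0};

  /* unfolded_copy step: each output location becomes one column of kernel-sized patches */
  for (int kh = 0; kh < KH; ++kh)
    for (int kw = 0; kw < KW; ++kw)
      for (int oh = 0; oh < OH; ++oh)
        for (int ow = 0; ow < OW; ++ow)
          columns[kh * KW + kw][oh * OW + ow] = input[oh + kh][ow + kw];

  /* GEMM step: output(1 x OH*OW) = weight(1 x KH*KW) * columns(KH*KW x OH*OW) */
  for (int col = 0; col < OH * OW; ++col)
    for (int k = 0; k < KH * KW; ++k)
      output[col] += weight[k] * columns[k][col];

  for (int oh = 0; oh < OH; ++oh) {
    for (int ow = 0; ow < OW; ++ow)
      printf("%6.1f", output[oh * OW + ow]);
    printf("\n");
  }
  return 0;
}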

View File

@ -5,10 +5,10 @@
#include <ATen/div_rtn.h>
static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
THTensor *input, THTensor *gradOutput,
THTensor *weight, THTensor *bias,
int kH, int kW, int dH, int dW, int padH, int padW,
int dilationH, int dilationW, int weight_nullable) {
THTensor *input, THTensor *gradOutput,
THTensor *weight, THTensor *bias,
int kH, int kW, int dH, int dW, int padH, int padW,
int dilationH, int dilationW, int weight_nullable) {
THArgCheck(kW > 0 && kH > 0, 9,
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
THArgCheck(dW > 0 && dH > 0, 11,
@ -40,7 +40,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
}
THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input,
"non-empty 3D or 4D input tensor expected but got: %s");
"non-empty 3D or 4D input tensor expected but got: %s");
int64_t inputHeight = input->size(dimh);
int64_t inputWidth = input->size(dimw);
@ -235,7 +235,7 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)(
is_batch = 0;
THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2));
THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1),
gradOutput->size(2));
gradOutput->size(2));
}
int64_t inputWidth = input->size(3);
@ -342,7 +342,7 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)(
is_batch = 0;
THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2));
THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0),
gradOutput->size(1), gradOutput->size(2));
gradOutput->size(1), gradOutput->size(2));
}
int64_t nInputPlane = input->size(1);

View File

@ -6,9 +6,9 @@
#include <algorithm>
static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
THTensor *input, THTensor *gradOutput, THIndexTensor *indices,
int kH, int kW, int dH, int dW, int padH, int padW,
int dilationH, int dilationW, bool ceil_mode) {
THTensor *input, THTensor *gradOutput, THIndexTensor *indices,
int kH, int kW, int dH, int dW, int padH, int padW,
int dilationH, int dilationW, bool ceil_mode) {
THArgCheck(kW > 0 && kH > 0, 5,
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
@ -30,12 +30,12 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
}
THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input,
"non-empty 3D or 4D input tensor expected but got: %s");
"non-empty 3D or 4D input tensor expected but got: %s");
THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
"pad should be smaller than half of kernel size, but got "
"padW = %d, padH = %d, kW = %d, kH = %d",
padW, padH, kW, kH);
"pad should be smaller than half of kernel size, but got "
"padW = %d, padH = %d, kW = %d, kH = %d",
padW, padH, kW, kH);
int64_t nInputPlane = input->size(dimh-1);
int64_t inputHeight = input->size(dimh);
@ -47,7 +47,7 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
if (outputWidth < 1 || outputHeight < 1)
THError("Given input size: (%dx%dx%d). "
"Calculated output size: (%dx%dx%d). Output size is too small",
"Calculated output size: (%dx%dx%d). Output size is too small",
nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth);
if (gradOutput != NULL) {
@ -221,16 +221,16 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)(
for (p = 0; p < nbatch; p++)
{
THNN_(SpatialDilatedMaxPooling_updateOutput_frame)
(input_data+p*nInputPlane*inputWidth*inputHeight,
output_data+p*nInputPlane*outputWidth*outputHeight,
indices_data+p*nInputPlane*outputWidth*outputHeight,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH, dW, dH,
padW, padH,
dilationW, dilationH
);
(input_data+p*nInputPlane*inputWidth*inputHeight,
output_data+p*nInputPlane*outputWidth*outputHeight,
indices_data+p*nInputPlane*outputWidth*outputHeight,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH, dW, dH,
padW, padH,
dilationW, dilationH
);
}
}
@ -266,10 +266,10 @@ static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(
{
/* retrieve position of max */
int64_t maxp = ind_p_k[i*outputWidth + j];
if (maxp != -1) {
/* update gradient */
gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j];
}
if (maxp != -1) {
/* update gradient */
gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j];
}
}
}
}
@ -350,13 +350,13 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
for (p = 0; p < nbatch; p++)
{
THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)
(gradInput_data+p*nInputPlane*inputWidth*inputHeight,
gradOutput_data+p*nInputPlane*outputWidth*outputHeight,
indices_data+p*nInputPlane*outputWidth*outputHeight,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
dW, dH);
(gradInput_data+p*nInputPlane*inputWidth*inputHeight,
gradOutput_data+p*nInputPlane*outputWidth*outputHeight,
indices_data+p*nInputPlane*outputWidth*outputHeight,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
dW, dH);
}
}
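
The updateGradInput frame above routes each output gradient back to the input position that was stored in indices during the forward pass. A minimal C sketch of that scatter, with made-up indices and gradients:

/* Sketch of max-pooling backward via stored argmax indices, as in the loop above. */
#include <stdio.h>

int main(void) {
  /* The forward pass (assumed done elsewhere) recorded, for each pooled output,
   * the flat input index of its maximum; -1 would mean "no valid max". */
  const long indices[4] = {0, 3, 9, 11};       /* argmax per output element */
  const double grad_output[4] = {1.0, 2.0, 3.0, 4.0};
  double grad_input[12] = {0};                 /* 3x4 input plane, flattened */

  for (int j = 0; j < 4; ++j) {
    long maxp = indices[j];
    if (maxp != -1)
      grad_input[maxp] += grad_output[j];      /* route gradient to the max location */
  }

  for (int i = 0; i < 12; ++i)
    printf("%4.1f%s", grad_input[i], (i % 4 == 3) ? "\n" : " ");
  return 0;
}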

View File

@ -3,15 +3,15 @@
#else
static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)(
THTensor *input, THTensor *gradOutput,
THTensor *weight, THTensor *bias,
int kH, int kW, int dH, int dW, int padH, int padW,
int dilationH, int dilationW, int adjH, int adjW, int weight_nullable) {
THTensor *input, THTensor *gradOutput,
THTensor *weight, THTensor *bias,
int kH, int kW, int dH, int dW, int padH, int padW,
int dilationH, int dilationW, int adjH, int adjW, int weight_nullable) {
THArgCheck(kW > 0 && kH > 0, 9,
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
THArgCheck(dW > 0 && dH > 0, 11,
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
THArgCheck(dilationW > 0 && dilationH > 0, 15,
"dilation should be greater than zero, but got dilationH: %d, dilationW: %d",
dilationH, dilationW);
@ -41,7 +41,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)(
}
THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input,
"non-empty 3D or 4D input tensor expected but got: %s");
"non-empty 3D or 4D input tensor expected but got: %s");
int64_t inputHeight = input->size(dimh);
int64_t inputWidth = input->size(dimw);
@ -50,8 +50,8 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)(
if (outputWidth < 1 || outputHeight < 1) {
THError("Given input size per channel: (%ld x %ld). "
"Calculated output size per channel: (%ld x %ld). Output size is too small",
inputHeight, inputWidth, outputHeight, outputWidth);
"Calculated output size per channel: (%ld x %ld). Output size is too small",
inputHeight, inputWidth, outputHeight, outputWidth);
}
if (weight != NULL) {

View File

@ -111,12 +111,12 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
for (p = 0; p < nbatch; p++)
{
THNN_(SpatialMaxUnpooling_updateOutput_frame)(
input_data+p*nslices*iwidth*iheight,
output_data+p*nslices*owidth*oheight,
indices_data+p*nslices*iwidth*iheight,
nslices,
iwidth, iheight,
owidth, oheight);
input_data+p*nslices*iwidth*iheight,
output_data+p*nslices*owidth*oheight,
indices_data+p*nslices*iwidth*iheight,
nslices,
iwidth, iheight,
owidth, oheight);
}
}
@ -196,7 +196,7 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)(
if(owidth!=gradOutput->size(dimw) || oheight!=gradOutput->size(dimh)){
THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d",
oheight, owidth, gradOutput->size(dimh), gradOutput->size(dimw));
oheight, owidth, gradOutput->size(dimh), gradOutput->size(dimw));
}
/* get raw pointers */

View File

@ -3,467 +3,467 @@
#else
static inline void THNN_(TemporalRowConvolution_shapeCheck)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *weight,
THTensor *bias,
int kW,
int dW,
int padW) {
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *weight,
THTensor *bias,
int kW,
int dW,
int padW) {
THArgCheck(kW > 0, 5,
"kernel size should be greater than zero, but got kW: %d", kW);
THArgCheck(dW > 0, 6,
"stride should be greater than zero, but got dW: %d", dW);
THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 3, 3, weight,
"non-empty 3D weight tensor expected, but got: %s");
THArgCheck(kW > 0, 5,
"kernel size should be greater than zero, but got kW: %d", kW);
THArgCheck(dW > 0, 6,
"stride should be greater than zero, but got dW: %d", dW);
THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 3, 3, weight,
"non-empty 3D weight tensor expected, but got: %s");
THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
if (bias != NULL) {
THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0));
}
if (bias != NULL) {
THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0));
}
// we're always looking at (possibly batch) x feats x seq
int ndim = input->dim();
int dimF = 0;
int dimS = 1;
// we're always looking at (possibly batch) x feats x seq
int ndim = input->dim();
int dimF = 0;
int dimS = 1;
if (ndim == 3) {
++dimS;
++dimF;
}
if (ndim == 3) {
++dimS;
++dimF;
}
THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 1, input,
"non-empty 2D or 3D (batch mode) input tensor expected, but got :%s");
THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 1, input,
"non-empty 2D or 3D (batch mode) input tensor expected, but got :%s");
int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0);
int64_t nInputFrame = input->size(dimS);
int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0);
int64_t nInputFrame = input->size(dimS);
int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
if (nOutputFrame < 1) {
THError("Given input size: (%d x %d). "
"Calculated output size: (%d x %d). Output size is too small",
inputFrameSize, nInputFrame, inputFrameSize, nOutputFrame);
}
if (nOutputFrame < 1) {
THError("Given input size: (%d x %d). "
"Calculated output size: (%d x %d). Output size is too small",
inputFrameSize, nInputFrame, inputFrameSize, nOutputFrame);
}
THNN_CHECK_DIM_SIZE(input, ndim, dimF, inputFrameSize);
THNN_CHECK_DIM_SIZE(input, ndim, dimF, inputFrameSize);
if (gradOutput != NULL) {
THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimF, inputFrameSize);
THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimS, nOutputFrame);
}
if (gradOutput != NULL) {
THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimF, inputFrameSize);
THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimS, nOutputFrame);
}
}
static void THNN_(unfolded_acc_row)(
THTensor *finput,
THTensor *input,
int kW,
int dW,
int padW,
int64_t inputFrameSize,
int64_t nInputFrame,
int64_t nOutputFrame) {
THTensor *finput,
THTensor *input,
int kW,
int dW,
int padW,
int64_t inputFrameSize,
int64_t nInputFrame,
int64_t nOutputFrame) {
int64_t c;
scalar_t *input_data = input->data<scalar_t>();
scalar_t *finput_data = finput->data<scalar_t>();
int64_t c;
scalar_t *input_data = input->data<scalar_t>();
scalar_t *finput_data = finput->data<scalar_t>();
// #pragma omp parallel for private(c)
for (c = 0; c < inputFrameSize; c++) {
int64_t kw, x;
int64_t ix = 0;
for (c = 0; c < inputFrameSize; c++) {
int64_t kw, x;
int64_t ix = 0;
for (kw = 0; kw < kW; kw++) {
scalar_t *src = finput_data
+ c * (kW * nOutputFrame)
+ kw * (nOutputFrame);
scalar_t *dst = input_data + c * (nInputFrame);
for (kw = 0; kw < kW; kw++) {
scalar_t *src = finput_data
+ c * (kW * nOutputFrame)
+ kw * (nOutputFrame);
scalar_t *dst = input_data + c * (nInputFrame);
ix = (size_t)(kw);
if (dW == 1) {
scalar_t *dst_slice = dst + (size_t)(ix);
THVector_(cadd)(dst_slice, dst_slice, src, 1, nOutputFrame);
} else {
for (x = 0; x < nOutputFrame; x++) {
scalar_t *dst_slice = dst + (size_t)(ix + x * dW);
THVector_(cadd)(dst_slice, dst_slice,
src + (size_t)(x), 1, 1);
}
}
}
}
ix = (size_t)(kw);
if (dW == 1) {
scalar_t *dst_slice = dst + (size_t)(ix);
THVector_(cadd)(dst_slice, dst_slice, src, 1, nOutputFrame);
} else {
for (x = 0; x < nOutputFrame; x++) {
scalar_t *dst_slice = dst + (size_t)(ix + x * dW);
THVector_(cadd)(dst_slice, dst_slice,
src + (size_t)(x), 1, 1);
}
}
}
}
}
static void THNN_(unfolded_copy_row)(
THTensor *finput,
THTensor *input,
int kW,
int dW,
int padW,
int64_t inputFrameSize,
int64_t nInputFrame,
int64_t nOutputFrame) {
THTensor *finput,
THTensor *input,
int kW,
int dW,
int padW,
int64_t inputFrameSize,
int64_t nInputFrame,
int64_t nOutputFrame) {
int64_t k;
scalar_t *input_data = input->data<scalar_t>();
scalar_t *finput_data = finput->data<scalar_t>();
int64_t k;
scalar_t *input_data = input->data<scalar_t>();
scalar_t *finput_data = finput->data<scalar_t>();
// #pragma omp parallel for private(k)
for (k = 0; k < inputFrameSize * kW; k++) {
int64_t c = k / kW;
int64_t rest = k % kW;
int64_t kw = rest % kW;
int64_t x;
int64_t ix;
scalar_t *dst = finput_data + c * (kW * nOutputFrame) + kw * (nOutputFrame);
scalar_t *src = input_data + c * (nInputFrame);
for (k = 0; k < inputFrameSize * kW; k++) {
int64_t c = k / kW;
int64_t rest = k % kW;
int64_t kw = rest % kW;
int64_t x;
int64_t ix;
scalar_t *dst = finput_data + c * (kW * nOutputFrame) + kw * (nOutputFrame);
scalar_t *src = input_data + c * (nInputFrame);
ix = (size_t)(kw);
if (dW == 1) {
memcpy(dst, src+(size_t)(ix), sizeof(scalar_t) * (nOutputFrame));
} else {
for (x = 0; x < nOutputFrame; x++) {
memcpy(dst + (size_t)(x), src + (size_t)(ix + x * dW),
sizeof(scalar_t) * 1);
}
}
}
ix = (size_t)(kw);
if (dW == 1) {
memcpy(dst, src+(size_t)(ix), sizeof(scalar_t) * (nOutputFrame));
} else {
for (x = 0; x < nOutputFrame; x++) {
memcpy(dst + (size_t)(x), src + (size_t)(ix + x * dW),
sizeof(scalar_t) * 1);
}
}
}
}
static void THNN_(TemporalRowConvolution_updateOutput_frame)(
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *finput,
int kW,
int dW,
int padW,
int64_t inputFrameSize,
int64_t nInputFrame,
int64_t nOutputFrame) {
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *finput,
int kW,
int dW,
int padW,
int64_t inputFrameSize,
int64_t nInputFrame,
int64_t nOutputFrame) {
int64_t i;
int64_t i;
THTensor *output3d = THTensor_(newWithStorage3d)(
THTensor_getStoragePtr(output), output->storage_offset(),
inputFrameSize, -1,
1, -1,
nOutputFrame, -1);
THTensor *output3d = THTensor_(newWithStorage3d)(
THTensor_getStoragePtr(output), output->storage_offset(),
inputFrameSize, -1,
1, -1,
nOutputFrame, -1);
THNN_(unfolded_copy_row)(finput, input, kW, dW, padW,
inputFrameSize, nInputFrame, nOutputFrame);
THNN_(unfolded_copy_row)(finput, input, kW, dW, padW,
inputFrameSize, nInputFrame, nOutputFrame);
THTensor_(zero)(output);
THTensor_(zero)(output);
if (bias != NULL) {
for (i = 0; i < inputFrameSize; i++)
THVector_(fill)
(THStorage_(data)(THTensor_getStoragePtr(output)) + output->storage_offset()
+ output->stride(0) * i,
THTensor_(get1d)(bias, i), nOutputFrame);
}
if (bias != NULL) {
for (i = 0; i < inputFrameSize; i++)
THVector_(fill)
(THStorage_(data)(THTensor_getStoragePtr(output)) + output->storage_offset()
+ output->stride(0) * i,
THTensor_(get1d)(bias, i), nOutputFrame);
}
THTensor_(baddbmm)(output3d, 1, output3d, 1, weight, finput);
THTensor_(baddbmm)(output3d, 1, output3d, 1, weight, finput);
c10::raw::intrusive_ptr::decref(output3d);
c10::raw::intrusive_ptr::decref(output3d);
}
void THNN_(TemporalRowConvolution_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *finput,
THTensor *fgradInput, // unused here but needed for Cuda
int kW,
int dW,
int padW,
bool featFirst) {
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *finput,
THTensor *fgradInput, // unused here but needed for Cuda
int kW,
int dW,
int padW,
bool featFirst) {
int ndim = input->dim();
int ndim = input->dim();
THTensor *tinput = NULL;
if (!featFirst) {
tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
input = THTensor_(newContiguous)(tinput);
} else {
input = THTensor_(newContiguous)(input);
}
THTensor *tinput = NULL;
if (!featFirst) {
tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
input = THTensor_(newContiguous)(tinput);
} else {
input = THTensor_(newContiguous)(input);
}
THNN_(TemporalRowConvolution_shapeCheck)(
state, input, NULL, weight, bias, kW, dW, padW);
THNN_(TemporalRowConvolution_shapeCheck)(
state, input, NULL, weight, bias, kW, dW, padW);
int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0);
int64_t nInputFrame = input->size(ndim - 1);
int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0);
int64_t nInputFrame = input->size(ndim - 1);
int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
if (ndim == 2) { /* non-batch mode */
if (ndim == 2) { /* non-batch mode */
THTensor_(resize3d)(finput, inputFrameSize, kW, nOutputFrame);
THTensor_(resize2d)(output, inputFrameSize, nOutputFrame);
THTensor_(resize3d)(finput, inputFrameSize, kW, nOutputFrame);
THTensor_(resize2d)(output, inputFrameSize, nOutputFrame);
THTensor_(zero)(finput);
THTensor_(zero)(output);
THTensor_(zero)(finput);
THTensor_(zero)(output);
THNN_(TemporalRowConvolution_updateOutput_frame)
(input, output, weight, bias, finput,
kW, dW, padW,
inputFrameSize, nInputFrame, nOutputFrame);
THNN_(TemporalRowConvolution_updateOutput_frame)
(input, output, weight, bias, finput,
kW, dW, padW,
inputFrameSize, nInputFrame, nOutputFrame);
} else {
int64_t T = input->size(0);
int64_t t;
} else {
int64_t T = input->size(0);
int64_t t;
THTensor_(resize4d)(finput, T, inputFrameSize, kW, nOutputFrame);
THTensor_(resize3d)(output, T, inputFrameSize, nOutputFrame);
THTensor_(resize4d)(finput, T, inputFrameSize, kW, nOutputFrame);
THTensor_(resize3d)(output, T, inputFrameSize, nOutputFrame);
THTensor_(zero)(finput);
THTensor_(zero)(output);
THTensor_(zero)(finput);
THTensor_(zero)(output);
#pragma omp parallel for private(t)
for (t = 0; t < T; t++) {
THTensor *input_t = THTensor_(newSelect)(input, 0, t);
THTensor *output_t = THTensor_(newSelect)(output, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
for (t = 0; t < T; t++) {
THTensor *input_t = THTensor_(newSelect)(input, 0, t);
THTensor *output_t = THTensor_(newSelect)(output, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
THNN_(TemporalRowConvolution_updateOutput_frame)
(input_t, output_t, weight, bias, finput_t,
kW, dW, padW, inputFrameSize, nInputFrame, nOutputFrame);
THNN_(TemporalRowConvolution_updateOutput_frame)
(input_t, output_t, weight, bias, finput_t,
kW, dW, padW, inputFrameSize, nInputFrame, nOutputFrame);
c10::raw::intrusive_ptr::decref(input_t);
c10::raw::intrusive_ptr::decref(output_t);
c10::raw::intrusive_ptr::decref(finput_t);
}
}
c10::raw::intrusive_ptr::decref(input_t);
c10::raw::intrusive_ptr::decref(output_t);
c10::raw::intrusive_ptr::decref(finput_t);
}
}
if (!featFirst) { // NOTE: output will NOT be contiguous in this case
THTensor_(transpose)(output, output, ndim - 1, ndim - 2);
c10::raw::intrusive_ptr::decref(tinput);
}
if (!featFirst) { // NOTE: output will NOT be contiguous in this case
THTensor_(transpose)(output, output, ndim - 1, ndim - 2);
c10::raw::intrusive_ptr::decref(tinput);
}
c10::raw::intrusive_ptr::decref(input);
c10::raw::intrusive_ptr::decref(input);
}
static void THNN_(TemporalRowConvolution_updateGradInput_frame)(
THTensor *gradInput,
THTensor *gradOutput,
THTensor *weight,
THTensor *fgradInput,
int kW,
int dW,
int padW,
int64_t inputFrameSize,
int64_t nInputFrame,
int64_t nOutputFrame) {
THTensor *gradInput,
THTensor *gradOutput,
THTensor *weight,
THTensor *fgradInput,
int kW,
int dW,
int padW,
int64_t inputFrameSize,
int64_t nInputFrame,
int64_t nOutputFrame) {
THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(),
inputFrameSize, -1,
1, -1,
nOutputFrame, -1);
THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(),
inputFrameSize, -1,
1, -1,
nOutputFrame, -1);
// weight: inputFrameSize x kW x 1
// gradOutput3d: inputFrameSize x 1 x nOutputFrame
THTensor_(baddbmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput3d);
// fgradInput: inputFrameSize x kW x nOutputFrame
c10::raw::intrusive_ptr::decref(gradOutput3d);
// weight: inputFrameSize x kW x 1
// gradOutput3d: inputFrameSize x 1 x nOutputFrame
THTensor_(baddbmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput3d);
// fgradInput: inputFrameSize x kW x nOutputFrame
c10::raw::intrusive_ptr::decref(gradOutput3d);
THTensor_(zero)(gradInput);
THTensor_(zero)(gradInput);
THNN_(unfolded_acc_row)(fgradInput, gradInput,
kW, dW, padW,
inputFrameSize, nInputFrame, nOutputFrame);
THNN_(unfolded_acc_row)(fgradInput, gradInput,
kW, dW, padW,
inputFrameSize, nInputFrame, nOutputFrame);
}
void THNN_(TemporalRowConvolution_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
THTensor *finput,
THTensor *fgradInput,
int kW,
int dW,
int padW,
bool featFirst) {
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
THTensor *finput,
THTensor *fgradInput,
int kW,
int dW,
int padW,
bool featFirst) {
int ndim = input->dim();
int ndim = input->dim();
THTensor *tinput, *tgradOutput;
THTensor *tinput, *tgradOutput;
if (!featFirst) {
tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);
if (!featFirst) {
tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);
input = THTensor_(newContiguous)(tinput);
gradOutput = THTensor_(newContiguous)(tgradOutput);
input = THTensor_(newContiguous)(tinput);
gradOutput = THTensor_(newContiguous)(tgradOutput);
} else {
input = THTensor_(newContiguous)(input);
gradOutput = THTensor_(newContiguous)(gradOutput);
}
} else {
input = THTensor_(newContiguous)(input);
gradOutput = THTensor_(newContiguous)(gradOutput);
}
THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight,
NULL, kW, dW, padW);
THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight,
NULL, kW, dW, padW);
int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0);
int64_t nInputFrame = input->size(ndim - 1);
int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0);
int64_t nInputFrame = input->size(ndim - 1);
int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
THTensor_(resizeAs)(fgradInput, finput);
THTensor_(resizeAs)(gradInput, input);
THTensor_(resizeAs)(fgradInput, finput);
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(fgradInput);
THTensor_(zero)(gradInput);
THTensor_(zero)(fgradInput);
THTensor_(zero)(gradInput);
THTensor *tweight = THTensor_(new)();
THTensor_(transpose)(tweight, weight, 1, 2);
if (ndim == 2) {
THNN_(TemporalRowConvolution_updateGradInput_frame)
(gradInput, gradOutput, tweight, fgradInput,
kW, dW, padW,
inputFrameSize, nInputFrame, nOutputFrame);
} else {
int64_t T = input->size(0);
int64_t t;
if (ndim == 2) {
THNN_(TemporalRowConvolution_updateGradInput_frame)
(gradInput, gradOutput, tweight, fgradInput,
kW, dW, padW,
inputFrameSize, nInputFrame, nOutputFrame);
} else {
int64_t T = input->size(0);
int64_t t;
#pragma omp parallel for private(t)
for (t = 0; t < T; t++) {
for (t = 0; t < T; t++) {
THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
THNN_(TemporalRowConvolution_updateGradInput_frame)
(gradInput_t, gradOutput_t, tweight, fgradInput_t,
kW, dW, padW,
inputFrameSize, nInputFrame, nOutputFrame);
THNN_(TemporalRowConvolution_updateGradInput_frame)
(gradInput_t, gradOutput_t, tweight, fgradInput_t,
kW, dW, padW,
inputFrameSize, nInputFrame, nOutputFrame);
c10::raw::intrusive_ptr::decref(gradInput_t);
c10::raw::intrusive_ptr::decref(gradOutput_t);
c10::raw::intrusive_ptr::decref(fgradInput_t);
}
}
c10::raw::intrusive_ptr::decref(gradInput_t);
c10::raw::intrusive_ptr::decref(gradOutput_t);
c10::raw::intrusive_ptr::decref(fgradInput_t);
}
}
c10::raw::intrusive_ptr::decref(tweight);
if (!featFirst) { // NOTE: gradInput will NOT be contiguous in this case
if (!featFirst) { // NOTE: gradInput will NOT be contiguous in this case
c10::raw::intrusive_ptr::decref(tinput);
c10::raw::intrusive_ptr::decref(tgradOutput);
c10::raw::intrusive_ptr::decref(tinput);
c10::raw::intrusive_ptr::decref(tgradOutput);
THTensor_(transpose)(gradInput, gradInput, ndim - 1, ndim - 2);
}
THTensor_(transpose)(gradInput, gradInput, ndim - 1, ndim - 2);
}
c10::raw::intrusive_ptr::decref(input);
c10::raw::intrusive_ptr::decref(gradOutput);
c10::raw::intrusive_ptr::decref(input);
c10::raw::intrusive_ptr::decref(gradOutput);
}
static void THNN_(TemporalRowConvolution_accGradParameters_frame)(
THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
THTensor *finput, scalar_t scale) {
THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
THTensor *finput, scalar_t scale) {
int64_t i;
THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(),
gradOutput->size(0), -1,
1, -1,
gradOutput->size(1), -1);
int64_t i;
THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(),
gradOutput->size(0), -1,
1, -1,
gradOutput->size(1), -1);
THTensor *tfinput = THTensor_(new)();
THTensor_(transpose)(tfinput, finput, 1, 2);
// gradOutput3d: inputFrameSize x 1 x nOutputFrame
// finput: inputFrameSize x nOutputFrame x kW
THTensor_(baddbmm)(gradWeight, 1, gradWeight, scale, gradOutput3d, tfinput);
// gradWeight: inputFrameSize x 1 x kW
THTensor_(transpose)(tfinput, finput, 1, 2);
// gradOutput3d: inputFrameSize x 1 x nOutputFrame
// finput: inputFrameSize x nOutputFrame x kW
THTensor_(baddbmm)(gradWeight, 1, gradWeight, scale, gradOutput3d, tfinput);
// gradWeight: inputFrameSize x 1 x kW
c10::raw::intrusive_ptr::decref(tfinput);
if (gradBias != NULL) {
for (i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) {
int64_t k;
scalar_t sum = 0;
scalar_t *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput3d))
+ gradOutput3d->storage_offset()
+ i * gradOutput3d->stride(0);
for (k = 0; k < gradOutput3d->size(2); k++) {
sum += data[k];
}
(THStorage_(data)(THTensor_getStoragePtr(gradBias)) + gradBias->storage_offset())[i]
+= scale * sum;
}
}
if (gradBias != NULL) {
for (i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) {
int64_t k;
scalar_t sum = 0;
scalar_t *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput3d))
+ gradOutput3d->storage_offset()
+ i * gradOutput3d->stride(0);
for (k = 0; k < gradOutput3d->size(2); k++) {
sum += data[k];
}
(THStorage_(data)(THTensor_getStoragePtr(gradBias)) + gradBias->storage_offset())[i]
+= scale * sum;
}
}
c10::raw::intrusive_ptr::decref(gradOutput3d);
c10::raw::intrusive_ptr::decref(gradOutput3d);
}
void THNN_(TemporalRowConvolution_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *finput,
THTensor *fgradInput,
int kW,
int dW,
int padW,
bool featFirst,
accreal scale_) {
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *finput,
THTensor *fgradInput,
int kW,
int dW,
int padW,
bool featFirst,
accreal scale_) {
scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
int ndim = input->dim();
int ndim = input->dim();
THTensor *tinput = NULL;
THTensor *tgradOutput = NULL;
THTensor *tinput = NULL;
THTensor *tgradOutput = NULL;
if (!featFirst) {
tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);
if (!featFirst) {
tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);
input = THTensor_(newContiguous)(tinput);
gradOutput = THTensor_(newContiguous)(tgradOutput);
} else {
input = THTensor_(newContiguous)(input);
gradOutput = THTensor_(newContiguous)(gradOutput);
}
input = THTensor_(newContiguous)(tinput);
gradOutput = THTensor_(newContiguous)(tgradOutput);
} else {
input = THTensor_(newContiguous)(input);
gradOutput = THTensor_(newContiguous)(gradOutput);
}
THNN_(TemporalRowConvolution_shapeCheck)
(state, input, gradOutput, gradWeight, gradBias, kW, dW, padW);
THNN_(TemporalRowConvolution_shapeCheck)
(state, input, gradOutput, gradWeight, gradBias, kW, dW, padW);
if (ndim == 2) {
THNN_(TemporalRowConvolution_accGradParameters_frame)(
gradOutput, gradWeight, gradBias, finput, scale);
} else {
int64_t T = input->size(0);
int64_t t;
if (ndim == 2) {
THNN_(TemporalRowConvolution_accGradParameters_frame)(
gradOutput, gradWeight, gradBias, finput, scale);
} else {
int64_t T = input->size(0);
int64_t t;
for (t = 0; t < T; t++) {
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
for (t = 0; t < T; t++) {
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
THNN_(TemporalRowConvolution_accGradParameters_frame)(
gradOutput_t, gradWeight, gradBias, finput_t, scale);
THNN_(TemporalRowConvolution_accGradParameters_frame)(
gradOutput_t, gradWeight, gradBias, finput_t, scale);
c10::raw::intrusive_ptr::decref(gradOutput_t);
c10::raw::intrusive_ptr::decref(finput_t);
}
}
c10::raw::intrusive_ptr::decref(gradOutput_t);
c10::raw::intrusive_ptr::decref(finput_t);
}
}
if (!featFirst) {
c10::raw::intrusive_ptr::decref(tinput);
c10::raw::intrusive_ptr::decref(tgradOutput);
}
if (!featFirst) {
c10::raw::intrusive_ptr::decref(tinput);
c10::raw::intrusive_ptr::decref(tgradOutput);
}
c10::raw::intrusive_ptr::decref(input);
c10::raw::intrusive_ptr::decref(gradOutput);
c10::raw::intrusive_ptr::decref(input);
c10::raw::intrusive_ptr::decref(gradOutput);
}
#endif
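
The shape check above computes nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1 with integer division. A tiny C check of that arithmetic for a couple of illustrative configurations:

/* Quick check of the output-length formula used in the shape check above. */
#include <stdio.h>

static long n_output_frames(long nInputFrame, int kW, int dW, int padW) {
  return (nInputFrame + 2L * padW - kW) / dW + 1;
}

int main(void) {
  /* Example values are illustrative only. */
  printf("%ld\n", n_output_frames(100, 3, 1, 1));  /* (100 + 2 - 3)/1 + 1 = 100 */
  printf("%ld\n", n_output_frames(100, 5, 2, 0));  /* 95/2 = 47 by integer division, + 1 = 48 */
  return 0;
}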

View File

@ -105,7 +105,7 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)(
THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input,
"non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s");
"non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s");
if (input->dim() == 5)
{

View File

@ -75,7 +75,7 @@ static inline void THNN_(VolumetricAveragePooling_shapeCheck)(
if (otime < 1 || owidth < 1 || oheight < 1)
THError("Given input size: (%dx%dx%dx%d). "
"Calculated output size: (%dx%dx%dx%d). Output size is too small",
"Calculated output size: (%dx%dx%dx%d). Output size is too small",
nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth);
if (gradOutput != NULL) {

View File

@ -119,7 +119,7 @@ static THTensor* THNN_(newViewWeight)(THTensor *weight)
int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4);
THTensor *old_weight = weight;
weight = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(weight), weight->storage_offset(),
s1, -1, s2, -1);
s1, -1, s2, -1);
c10::raw::intrusive_ptr::decref(old_weight);
}
return weight;

View File

@ -274,7 +274,7 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)(
const int64_t k_ = 1;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
if (bias) {
if (bias) {
THBlas_(gemm)(
't', 'n',
n_, m_, k_,

View File

@ -7,13 +7,13 @@
#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)
#define THNN_CHECK_SHAPE(I1, I2) \
if (I1 != NULL && I2 != NULL && !THTensor_(isSameSizeAs)(I1, I2)) \
{ \
THDescBuff s1 = THTensor_(sizeDesc)(I1); \
THDescBuff s2 = THTensor_(sizeDesc)(I2); \
THError(#I1 " and " #I2 " shapes do not match: " \
#I1 " %s, " #I2 " %s", s1.str, s2.str); \
#define THNN_CHECK_SHAPE(I1, I2) \
if (I1 != NULL && I2 != NULL && !THTensor_(isSameSizeAs)(I1, I2)) \
{ \
THDescBuff s1 = THTensor_(sizeDesc)(I1); \
THDescBuff s2 = THTensor_(sizeDesc)(I2); \
THError(#I1 " and " #I2 " shapes do not match: " \
#I1 " %s, " #I2 " %s", s1.str, s2.str); \
}
#define THNN_CHECK_SHAPE_INDICES(I1, I2) \
@ -26,39 +26,39 @@
}
#define THNN_CHECK_NELEMENT(I1, I2) \
if (I1 != NULL && I2 != NULL ) { \
ptrdiff_t n1 = THTensor_(nElement)(I1); \
ptrdiff_t n2 = THTensor_(nElement)(I2); \
if (n1 != n2) \
{ \
THDescBuff s1 = THTensor_(sizeDesc)(I1); \
THDescBuff s2 = THTensor_(sizeDesc)(I2); \
THError(#I1 " and " #I2 " have different number of elements: " \
#I1 "%s has %ld elements, while " \
#I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
} \
if (I1 != NULL && I2 != NULL ) { \
ptrdiff_t n1 = THTensor_(nElement)(I1); \
ptrdiff_t n2 = THTensor_(nElement)(I2); \
if (n1 != n2) \
{ \
THDescBuff s1 = THTensor_(sizeDesc)(I1); \
THDescBuff s2 = THTensor_(sizeDesc)(I2); \
THError(#I1 " and " #I2 " have different number of elements: " \
#I1 "%s has %ld elements, while " \
#I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
} \
}
#define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
if (THTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
THDescBuff s1 = THTensor_(sizeDesc)(T); \
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
" but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
#define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
if (THTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
THDescBuff s1 = THTensor_(sizeDesc)(T); \
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
" but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
}
#define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \
if (THIndexTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
#define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \
if (THIndexTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
" but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
}
#define THNN_ARGCHECK(COND, ARG, T, FORMAT) \
if (!(COND)) { \
THDescBuff s1 = THTensor_(sizeDesc)(T); \
THArgCheck(COND, ARG, FORMAT, s1.str); \
#define THNN_ARGCHECK(COND, ARG, T, FORMAT) \
if (!(COND)) { \
THDescBuff s1 = THTensor_(sizeDesc)(T); \
THArgCheck(COND, ARG, FORMAT, s1.str); \
}
#include <THNN/generic/AbsCriterion.c>

View File

@ -206,7 +206,7 @@ TEST(LeftRightTest, givenInt_whenWriteThrowsExceptionOnSecondCall_thenKeepsNewSt
write_called = true;
}
}),
MyException
MyException
);
// check reading it returns new value

View File

@ -85,41 +85,41 @@ namespace detail {
* @note The implementation doesn't use any floating-point operations.
*/
inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) {
/*
* Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
* +---+-----+------------+-------------------+
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
* +---+-----+------------+-------------------+
* Bits 31 26-30 16-25 0-15
*
* S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
*/
const uint32_t w = (uint32_t) h << 16;
/*
* Extract the sign of the input number into the high bit of the 32-bit word:
*
* +---+----------------------------------+
* | S |0000000 00000000 00000000 00000000|
* +---+----------------------------------+
* Bits 31 0-31
*/
const uint32_t sign = w & UINT32_C(0x80000000);
/*
* Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word:
*
* +---+-----+------------+-------------------+
* | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
* +---+-----+------------+-------------------+
* Bits 30 27-31 17-26 0-16
*/
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
/*
* Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized.
* If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equal one.
* In this case renorm_shift == 0. If the number is denormalized, renorm_shift > 0. Note that if we shift
* denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the
* biased exponent into 1, and making mantissa normalized (i.e. without leading 1).
*/
/*
* Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
* +---+-----+------------+-------------------+
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
* +---+-----+------------+-------------------+
* Bits 31 26-30 16-25 0-15
*
* S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
*/
const uint32_t w = (uint32_t) h << 16;
/*
* Extract the sign of the input number into the high bit of the 32-bit word:
*
* +---+----------------------------------+
* | S |0000000 00000000 00000000 00000000|
* +---+----------------------------------+
* Bits 31 0-31
*/
const uint32_t sign = w & UINT32_C(0x80000000);
/*
* Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word:
*
* +---+-----+------------+-------------------+
* | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
* +---+-----+------------+-------------------+
* Bits 30 27-31 17-26 0-16
*/
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
/*
* Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized.
* If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equal one.
* In this case renorm_shift == 0. If the number is denormalized, renorm_shift > 0. Note that if we shift
* denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the
* biased exponent into 1, and making mantissa normalized (i.e. without leading 1).
*/
#ifdef _MSC_VER
unsigned long nonsign_bsr;
_BitScanReverse(&nonsign_bsr, (unsigned long)nonsign);
@ -176,62 +176,62 @@ namespace detail {
* floating-point operations and bitcasts between integer and floating-point variables.
*/
inline float fp16_ieee_to_fp32_value(uint16_t h) {
/*
* Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
* +---+-----+------------+-------------------+
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
* +---+-----+------------+-------------------+
* Bits 31 26-30 16-25 0-15
*
* S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
*/
const uint32_t w = (uint32_t) h << 16;
/*
* Extract the sign of the input number into the high bit of the 32-bit word:
*
* +---+----------------------------------+
* | S |0000000 00000000 00000000 00000000|
* +---+----------------------------------+
* Bits 31 0-31
*/
const uint32_t sign = w & UINT32_C(0x80000000);
/*
* Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
*
* +-----+------------+---------------------+
* |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
* +-----+------------+---------------------+
* Bits 27-31 17-26 0-16
*/
const uint32_t two_w = w + w;
/*
* Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
* +---+-----+------------+-------------------+
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
* +---+-----+------------+-------------------+
* Bits 31 26-30 16-25 0-15
*
* S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
*/
const uint32_t w = (uint32_t) h << 16;
/*
* Extract the sign of the input number into the high bit of the 32-bit word:
*
* +---+----------------------------------+
* | S |0000000 00000000 00000000 00000000|
* +---+----------------------------------+
* Bits 31 0-31
*/
const uint32_t sign = w & UINT32_C(0x80000000);
/*
* Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
*
* +-----+------------+---------------------+
* |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
* +-----+------------+---------------------+
* Bits 27-31 17-26 0-16
*/
const uint32_t two_w = w + w;
/*
* Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
* of a single-precision floating-point number:
*
* S|Exponent | Mantissa
* +-+---+-----+------------+----------------+
* |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
* +-+---+-----+------------+----------------+
* Bits | 23-31 | 0-22
*
* Next, there are some adjustments to the exponent:
* - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision
* formats (0x7F - 0xF = 0x70)
* - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number.
* Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent
* of the single-precision output must be 0xFF (max possible value). We do this correction in two steps:
* - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested
* by the difference in the exponent bias (see above).
* - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of
* exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias.
* The floating-point multiplication hardware would ensure that Inf and NaN would retain their value on at least
* partially IEEE754-compliant implementations.
*
* Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not
* operate on denormal inputs, and do not produce denormal results.
*/
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
/*
* Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
* of a single-precision floating-point number:
*
* S|Exponent | Mantissa
* +-+---+-----+------------+----------------+
* |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
* +-+---+-----+------------+----------------+
* Bits | 23-31 | 0-22
*
* Next, there are some adjustments to the exponent:
* - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision
* formats (0x7F - 0xF = 0x70)
* - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number.
* Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent
* of the single-precision output must be 0xFF (max possible value). We do this correction in two steps:
* - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested
* by the difference in the exponent bias (see above).
* - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of
* exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias.
* The floating-point multiplication hardware would ensure that Inf and NaN would retain their value on at least
* partially IEEE754-compliant implementations.
*
* Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not
* operate on denormal inputs, and do not produce denormal results.
*/
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
// const float exp_scale = 0x1.0p-112f;
uint32_t scale_bits = (uint32_t) 15 << 23;
float exp_scale_val;
@ -239,48 +239,48 @@ namespace detail {
const float exp_scale = exp_scale_val;
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
/*
* Convert denormalized half-precision inputs into single-precision results (always normalized).
* Zero inputs are also handled here.
*
* In a denormalized number the biased exponent is zero, and the mantissa has non-zero bits.
* First, we shift mantissa into bits 0-9 of the 32-bit word.
*
* zeros | mantissa
* +---------------------------+------------+
* |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
* +---------------------------+------------+
* Bits 10-31 0-9
*
* Now, remember that denormalized half-precision numbers are represented as:
* FP16 = mantissa * 2**(-24).
* The trick is to construct a normalized single-precision number with the same mantissa as the half-precision input
* and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
* A normalized single-precision floating-point number is represented as:
* FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
* Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
* number causes a change of the constructed single-precision number by 2**(-24), i.e. the same amount.
*
* The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
* is zero, the constructed single-precision number has the value of
* FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
* Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
* the input half-precision number.
*/
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
/*
* Convert denormalized half-precision inputs into single-precision results (always normalized).
* Zero inputs are also handled here.
*
* In a denormalized number the biased exponent is zero, and the mantissa has non-zero bits.
* First, we shift mantissa into bits 0-9 of the 32-bit word.
*
* zeros | mantissa
* +---------------------------+------------+
* |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
* +---------------------------+------------+
* Bits 10-31 0-9
*
* Now, remember that denormalized half-precision numbers are represented as:
* FP16 = mantissa * 2**(-24).
* The trick is to construct a normalized single-precision number with the same mantissa as the half-precision input
* and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
* A normalized single-precision floating-point number is represented as:
* FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
* Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
* number causes a change of the constructed single-precision number by 2**(-24), i.e. the same amount.
*
* The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
* is zero, the constructed single-precision number has the value of
* FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
* Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
* the input half-precision number.
*/
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
/*
* - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
* input exponent. The variable two_w contains the input exponent in bits 27-31, therefore if it is smaller than 2**27, the
* input is either a denormal number, or zero.
* - Combine the result of conversion of exponent and mantissa with the sign of the input number.
*/
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
/*
* - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
* input exponent. The variable two_w contains the input exponent in bits 27-31, therefore if it is smaller than 2**27, the
* input is either a denormal number, or zero.
* - Combine the result of conversion of exponent and mantissa with the sign of the input number.
*/
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}
/*
@ -301,22 +301,22 @@ namespace detail {
const float scale_to_inf = scale_to_inf_val;
const float scale_to_zero = scale_to_zero_val;
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
} // namespace detail
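
The comments above walk through reassembling sign, exponent, and mantissa bits. For readers who want to sanity-check the branch-free conversion, a plain field-extraction decoder (a reference sketch using ldexpf, not the library code) gives the same values for ordinary inputs:

/* Reference decode of IEEE half precision by plain field extraction,
 * useful for cross-checking the bit-manipulation route above.
 * This is a verification sketch, not the library implementation. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

static float fp16_decode_reference(uint16_t h) {
  const int sign = (h >> 15) & 0x1;
  const int exponent = (h >> 10) & 0x1F;   /* 5-bit biased exponent */
  const int mantissa = h & 0x3FF;          /* 10-bit mantissa */
  float value;

  if (exponent == 0) {
    value = ldexpf((float) mantissa, -24);                    /* denormal: m * 2^-24 */
  } else if (exponent == 31) {
    value = mantissa ? NAN : INFINITY;                        /* Inf / NaN */
  } else {
    value = ldexpf((float)(1024 + mantissa), exponent - 25);  /* (1 + m/1024) * 2^(e-15) */
  }
  return sign ? -value : value;
}

int main(void) {
  const uint16_t samples[] = {0x3C00, 0x4000, 0x3555, 0xC400, 0x0001};
  for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); ++i)
    printf("0x%04X -> %.8g\n", samples[i], (double) fp16_decode_reference(samples[i]));
  return 0;
}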

View File

@ -55,7 +55,7 @@ Assertion Passed!
</details>
)DOC")
)DOC")
.Arg(
"error_msg",
"(*string*): custom error message to be thrown when the input does not pass assertion",

View File

@ -107,17 +107,17 @@ Testing CountUp operator...
'count' value after CountUp test: 10
Testing CountDown operator...
'count' value after CountDown: 9 'done' value: False
'count' value after CountDown: 8 'done' value: False
'count' value after CountDown: 7 'done' value: False
'count' value after CountDown: 6 'done' value: False
'count' value after CountDown: 5 'done' value: False
'count' value after CountDown: 4 'done' value: False
'count' value after CountDown: 3 'done' value: False
'count' value after CountDown: 2 'done' value: False
'count' value after CountDown: 1 'done' value: False
'count' value after CountDown: 0 'done' value: False
'count' value after CountDown: -1 'done' value: True
'count' value after CountDown: 9 'done' value: False
'count' value after CountDown: 8 'done' value: False
'count' value after CountDown: 7 'done' value: False
'count' value after CountDown: 6 'done' value: False
'count' value after CountDown: 5 'done' value: False
'count' value after CountDown: 4 'done' value: False
'count' value after CountDown: 3 'done' value: False
'count' value after CountDown: 2 'done' value: False
'count' value after CountDown: 1 'done' value: False
'count' value after CountDown: 0 'done' value: False
'count' value after CountDown: -1 'done' value: True
```
</details>

View File

@ -24,11 +24,11 @@ OPERATOR_SCHEMA(Expand)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Broadcast the input tensor to a materialized new tensor using the given shape.
The broadcast rule is similar to "numpy.array(input) * numpy.ones(shape)":
dimensions are right-aligned, and two corresponding dimensions must either
have the same value or one of them must be equal to 1.
Broadcast the input tensor to a materialized new tensor using the given shape.
The broadcast rule is similar to "numpy.array(input) * numpy.ones(shape)":
dimensions are right-aligned, and two corresponding dimensions must either
have the same value or one of them must be equal to 1.
In order to align with PyTorch's `expand`, `shape` is allowed to have entries
equal to -1, which means the size of the corresponding dimension in `X` is
preserved (so it is effectively equivalent to an entry of 1).
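For illustration, a minimal NumPy sketch of the broadcast rule described above; the -1 handling is specific to this operator and is not simulated here:

```python
import numpy as np

X = np.array([[1.0], [2.0]])          # shape (2, 1)
shape = (2, 3)

# Dimensions are right-aligned: (2, 1) against (2, 3); the size-1 dimension broadcasts.
expanded = X * np.ones(shape)
print(expanded.shape)                 # (2, 3)
```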

View File

@ -758,10 +758,10 @@ if(USE_CUDA)
endif()
if(CAFFE2_USE_CUDNN)
IF(CUDNN_STATIC_LINKAGE)
LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl")
LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl")
ELSE()
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
ENDIF()
else()
caffe2_update_option(USE_CUDNN OFF)
@ -1166,7 +1166,7 @@ if (NOT BUILD_ATEN_MOBILE)
CHECK_C_SOURCE_COMPILES("#include <stdint.h>
static inline void cpuid(uint32_t *eax, uint32_t *ebx,
uint32_t *ecx, uint32_t *edx)
uint32_t *ecx, uint32_t *edx)
{
uint32_t a = *eax, b, c = *ecx, d;
asm volatile ( \"cpuid\" : \"+a\"(a), \"=b\"(b), \"+c\"(c), \"=d\"(d) );

View File

@ -3,16 +3,16 @@
# CUB_INCLUDE_DIRS - the CUB include directory
find_path(CUB_INCLUDE_DIR
NAMES cub/cub.cuh
DOC "The directory where CUB includes reside"
NAMES cub/cub.cuh
DOC "The directory where CUB includes reside"
)
set(CUB_INCLUDE_DIRS ${CUB_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(CUB
FOUND_VAR CUB_FOUND
REQUIRED_VARS CUB_INCLUDE_DIR
FOUND_VAR CUB_FOUND
REQUIRED_VARS CUB_INCLUDE_DIR
)
mark_as_advanced(CUB_FOUND)

View File

@ -35,20 +35,20 @@ find_package_handle_standard_args(
MIOPEN DEFAULT_MSG MIOPEN_INCLUDE_DIR MIOPEN_LIBRARY)
if(MIOPEN_FOUND)
# get MIOpen version
# get MIOpen version
file(READ ${MIOPEN_INCLUDE_DIR}/version.h MIOPEN_HEADER_CONTENTS)
string(REGEX MATCH "define MIOPEN_VERSION_MAJOR * +([0-9]+)"
MIOPEN_VERSION_MAJOR "${MIOPEN_HEADER_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR * +([0-9]+)" "\\1"
MIOPEN_VERSION_MAJOR "${MIOPEN_VERSION_MAJOR}")
string(REGEX MATCH "define MIOPEN_VERSION_MINOR * +([0-9]+)"
MIOPEN_VERSION_MINOR "${MIOPEN_HEADER_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_MINOR * +([0-9]+)" "\\1"
MIOPEN_VERSION_MINOR "${MIOPEN_VERSION_MINOR}")
string(REGEX MATCH "define MIOPEN_VERSION_PATCH * +([0-9]+)"
MIOPEN_VERSION_PATCH "${MIOPEN_HEADER_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_PATCH * +([0-9]+)" "\\1"
MIOPEN_VERSION_PATCH "${MIOPEN_VERSION_PATCH}")
string(REGEX MATCH "define MIOPEN_VERSION_MAJOR * +([0-9]+)"
MIOPEN_VERSION_MAJOR "${MIOPEN_HEADER_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR * +([0-9]+)" "\\1"
MIOPEN_VERSION_MAJOR "${MIOPEN_VERSION_MAJOR}")
string(REGEX MATCH "define MIOPEN_VERSION_MINOR * +([0-9]+)"
MIOPEN_VERSION_MINOR "${MIOPEN_HEADER_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_MINOR * +([0-9]+)" "\\1"
MIOPEN_VERSION_MINOR "${MIOPEN_VERSION_MINOR}")
string(REGEX MATCH "define MIOPEN_VERSION_PATCH * +([0-9]+)"
MIOPEN_VERSION_PATCH "${MIOPEN_HEADER_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_PATCH * +([0-9]+)" "\\1"
MIOPEN_VERSION_PATCH "${MIOPEN_VERSION_PATCH}")
# Assemble MIOpen version
if(NOT MIOPEN_VERSION_MAJOR)
set(MIOPEN_VERSION "?")

View File

@ -3,16 +3,16 @@
# pybind11_INCLUDE_DIRS - the pybind11 include directory
find_path(pybind11_INCLUDE_DIR
NAMES pybind11/pybind11.h
DOC "The directory where pybind11 includes reside"
NAMES pybind11/pybind11.h
DOC "The directory where pybind11 includes reside"
)
set(pybind11_INCLUDE_DIRS ${pybind11_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(pybind11
FOUND_VAR pybind11_FOUND
REQUIRED_VARS pybind11_INCLUDE_DIR
FOUND_VAR pybind11_FOUND
REQUIRED_VARS pybind11_INCLUDE_DIR
)
mark_as_advanced(pybind11_FOUND)

File diff suppressed because it is too large

View File

@ -145,10 +145,10 @@ allowed values for these axes at the moment are:
.. tip::
There exist "Rust-style" shorthands for dtypes, like ``kF32`` instead of
``kFloat32``. See `here
<https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/types.h>`_
for the full list.
There exist "Rust-style" shorthands for dtypes, like ``kF32`` instead of
``kFloat32``. See `here
<https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/types.h>`_
for the full list.
An instance of ``TensorOptions`` stores a concrete value for each of these
@ -314,8 +314,8 @@ we can convert it from ``int64`` to ``float32``:
.. attention::
The result of the conversion, ``float_tensor``, is a new tensor pointing to
new memory, unrelated to the source ``source_tensor``.
The result of the conversion, ``float_tensor``, is a new tensor pointing to
new memory, unrelated to the source ``source_tensor``.
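The same behaviour can be checked from the Python API (a small sketch, not part of the C++ snippet this note refers to):

```python
import torch

source_tensor = torch.randint(0, 10, (2, 3), dtype=torch.int64)
float_tensor = source_tensor.to(torch.float32)

# The conversion allocates new storage; the two tensors do not share memory.
assert float_tensor.dtype == torch.float32
assert source_tensor.data_ptr() != float_tensor.data_ptr()
```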
We can then move it from CPU memory to GPU memory:

View File

@ -5,7 +5,7 @@ pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
@ -15,15 +15,15 @@ if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

View File

@ -770,34 +770,34 @@ Interpreting Graphs
The example script above produces the graph::
graph(%len : int) {
%15 : int = prim::Constant[value=1]()
%9 : bool = prim::Constant[value=1]()
%7 : Device = prim::Constant[value="cpu"]()
%6 : int = prim::Constant[value=0]()
%5 : int = prim::Constant[value=6]()
%1 : int = prim::Constant[value=3]()
%2 : int = prim::Constant[value=4]()
%11 : int = prim::Constant[value=10]()
%14 : float = prim::Constant[value=1]()
%4 : int[] = prim::ListConstruct(%1, %2)
%rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)
%rv : Tensor = prim::Loop(%len, %9, %rv.1)
block0(%i : int, %13 : Tensor) {
%12 : bool = aten::lt(%i, %11)
%rv.4 : Tensor = prim::If(%12)
block0() {
%rv.2 : Tensor = aten::sub(%13, %14, %15)
-> (%rv.2)
}
block1() {
%rv.3 : Tensor = aten::add(%13, %14, %15)
-> (%rv.3)
}
-> (%9, %rv.4)
}
return (%rv);
}
graph(%len : int) {
%15 : int = prim::Constant[value=1]()
%9 : bool = prim::Constant[value=1]()
%7 : Device = prim::Constant[value="cpu"]()
%6 : int = prim::Constant[value=0]()
%5 : int = prim::Constant[value=6]()
%1 : int = prim::Constant[value=3]()
%2 : int = prim::Constant[value=4]()
%11 : int = prim::Constant[value=10]()
%14 : float = prim::Constant[value=1]()
%4 : int[] = prim::ListConstruct(%1, %2)
%rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)
%rv : Tensor = prim::Loop(%len, %9, %rv.1)
block0(%i : int, %13 : Tensor) {
%12 : bool = aten::lt(%i, %11)
%rv.4 : Tensor = prim::If(%12)
block0() {
%rv.2 : Tensor = aten::sub(%13, %14, %15)
-> (%rv.2)
}
block1() {
%rv.3 : Tensor = aten::add(%13, %14, %15)
-> (%rv.3)
}
-> (%9, %rv.4)
}
return (%rv);
}
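For orientation, a hedged reconstruction of the kind of scripted function whose IR dump looks like the listing above, and how to print it; the function body is inferred from the IR and is only illustrative:

```python
import torch

@torch.jit.script
def foo(len: int):
    rv = torch.zeros(3, 4)
    for i in range(len):
        if i < 10:
            rv = rv - 1.0
        else:
            rv = rv + 1.0
    return rv

print(foo.graph)  # prints an IR listing similar to the one above
```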
Take the instruction ``%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)`` for
@ -850,39 +850,39 @@ Automatic Trace Checking
traced = torch.jit.trace(loop_in_traced_fn, inputs, check_inputs=check_inputs)
Gives us the following diagnostic information::
ERROR: Graphs differed across invocations!
Graph diff::
ERROR: Graphs differed across invocations!
Graph diff::
graph(%x : Tensor) {
%1 : int = prim::Constant[value=0]()
%2 : int = prim::Constant[value=0]()
%result.1 : Tensor = aten::select(%x, %1, %2)
%4 : int = prim::Constant[value=0]()
%5 : int = prim::Constant[value=0]()
%6 : Tensor = aten::select(%x, %4, %5)
%result.2 : Tensor = aten::mul(%result.1, %6)
%8 : int = prim::Constant[value=0]()
%9 : int = prim::Constant[value=1]()
%10 : Tensor = aten::select(%x, %8, %9)
- %result : Tensor = aten::mul(%result.2, %10)
+ %result.3 : Tensor = aten::mul(%result.2, %10)
? ++
%12 : int = prim::Constant[value=0]()
%13 : int = prim::Constant[value=2]()
%14 : Tensor = aten::select(%x, %12, %13)
+ %result : Tensor = aten::mul(%result.3, %14)
+ %16 : int = prim::Constant[value=0]()
+ %17 : int = prim::Constant[value=3]()
+ %18 : Tensor = aten::select(%x, %16, %17)
- %15 : Tensor = aten::mul(%result, %14)
? ^ ^
+ %19 : Tensor = aten::mul(%result, %18)
? ^ ^
- return (%15);
? ^
+ return (%19);
? ^
}
graph(%x : Tensor) {
%1 : int = prim::Constant[value=0]()
%2 : int = prim::Constant[value=0]()
%result.1 : Tensor = aten::select(%x, %1, %2)
%4 : int = prim::Constant[value=0]()
%5 : int = prim::Constant[value=0]()
%6 : Tensor = aten::select(%x, %4, %5)
%result.2 : Tensor = aten::mul(%result.1, %6)
%8 : int = prim::Constant[value=0]()
%9 : int = prim::Constant[value=1]()
%10 : Tensor = aten::select(%x, %8, %9)
- %result : Tensor = aten::mul(%result.2, %10)
+ %result.3 : Tensor = aten::mul(%result.2, %10)
? ++
%12 : int = prim::Constant[value=0]()
%13 : int = prim::Constant[value=2]()
%14 : Tensor = aten::select(%x, %12, %13)
+ %result : Tensor = aten::mul(%result.3, %14)
+ %16 : int = prim::Constant[value=0]()
+ %17 : int = prim::Constant[value=3]()
+ %18 : Tensor = aten::select(%x, %16, %17)
- %15 : Tensor = aten::mul(%result, %14)
? ^ ^
+ %19 : Tensor = aten::mul(%result, %18)
? ^ ^
- return (%15);
? ^
+ return (%19);
? ^
}
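For context, a hedged sketch of the kind of `torch.jit.trace` call with `check_inputs` that yields a graph diff like the one above; the function name and shapes are illustrative:

```python
import torch

def loop_fn(x):
    result = x[0]
    for i in range(x.size(0)):
        result = result * x[i]
    return result

inputs = (torch.rand(3, 4, 5),)
check_inputs = [(torch.rand(4, 5, 6),), (torch.rand(2, 3, 4),)]

# Tracing unrolls the data-dependent loop for the first input's length, so
# re-checking against inputs with a different first dimension makes the
# tracer checker report a graph diff like the output above.
traced = torch.jit.trace(loop_fn, inputs, check_inputs=check_inputs)
```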
This message indicates to us that the computation differed between when
@ -912,19 +912,19 @@ Automatic Trace Checking
Which produces::
graph(%x : Tensor) {
%5 : bool = prim::Constant[value=1]()
%1 : int = prim::Constant[value=0]()
%result.1 : Tensor = aten::select(%x, %1, %1)
%4 : int = aten::size(%x, %1)
%result : Tensor = prim::Loop(%4, %5, %result.1)
block0(%i : int, %7 : Tensor) {
%10 : Tensor = aten::select(%x, %1, %i)
%result.2 : Tensor = aten::mul(%7, %10)
-> (%5, %result.2)
}
return (%result);
}
graph(%x : Tensor) {
%5 : bool = prim::Constant[value=1]()
%1 : int = prim::Constant[value=0]()
%result.1 : Tensor = aten::select(%x, %1, %1)
%4 : int = aten::size(%x, %1)
%result : Tensor = prim::Loop(%4, %5, %result.1)
block0(%i : int, %7 : Tensor) {
%10 : Tensor = aten::select(%x, %1, %i)
%result.2 : Tensor = aten::mul(%7, %10)
-> (%5, %result.2)
}
return (%result);
}
Tracer Warnings
^^^^^^^^^^^^^^^

View File

@ -213,8 +213,8 @@ Multiprocessing error without if-clause protection
.. code-block:: py3tb
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
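The "proper idiom" the message refers to is the standard ``if __name__ == '__main__'`` guard around the code that starts worker processes; a minimal sketch (the DataLoader settings are illustrative):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

def main():
    dataset = TensorDataset(torch.randn(100, 3), torch.randint(0, 2, (100,)))
    # num_workers > 0 starts worker processes; on platforms that spawn rather
    # than fork, the main module is re-imported, so the guard below is required.
    loader = DataLoader(dataset, batch_size=10, num_workers=2)
    for batch, target in loader:
        pass

if __name__ == '__main__':
    main()
```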

View File

@ -8,24 +8,24 @@
PyInit*;
init*;
state;
_ZGVZN2at*;
_ZGVZN2at*;
_ZN2at*;
_ZNK2at*Type*;
_ZNK2at*Tensor*;
_ZNK2at*Storage*;
_ZNK2at*Scalar*;
_ZNK2at*CUDA*;
*2at7Context*;
_ZTIN2at*;
_ZTIZN2at*;
_ZTSN2at*;
_ZTSPN2at*;
_ZTSZN2at*;
_ZTVN2at*;
_ZZN2at*;
_Z*torch*;
_Z*Tensor*;
_Z*tensor*;
_ZNK2at*Type*;
_ZNK2at*Tensor*;
_ZNK2at*Storage*;
_ZNK2at*Scalar*;
_ZNK2at*CUDA*;
*2at7Context*;
_ZTIN2at*;
_ZTIZN2at*;
_ZTSN2at*;
_ZTSPN2at*;
_ZTSZN2at*;
_ZTVN2at*;
_ZZN2at*;
_Z*torch*;
_Z*Tensor*;
_Z*tensor*;
local:
*;
};

View File

@ -18,9 +18,9 @@ struct ConvOptions {
int64_t input_channels,
int64_t output_channels,
ExpandingArray<D> kernel_size) :
input_channels_(input_channels),
output_channels_(output_channels),
kernel_size_(std::move(kernel_size)) {}
input_channels_(input_channels),
output_channels_(output_channels),
kernel_size_(std::move(kernel_size)) {}
/// The number of channels the input volumes will have.
/// Changing this parameter after construction __has no effect__.

View File

@ -370,21 +370,21 @@ As the trace runs, individual operators create Nodes in the Graph being traced t
torch::jit::Node* node = nullptr;
std::shared_ptr<jit::tracer::TracingState> tracer_state;
if (jit::tracer::isTracing()) {
tracer_state = jit::tracer::getTracingState();
at::Symbol op_name;
op_name = jit::Symbol::fromQualString("aten::__ilshift__");
node = tracer_state->graph->create(op_name, /*num_outputs=*/0);
jit::tracer::recordSourceLocation(node);
jit::tracer::addInputs(node, "self", self);
jit::tracer::addInputs(node, "other", other);
tracer_state->graph->insertNode(node);
tracer_state = jit::tracer::getTracingState();
at::Symbol op_name;
op_name = jit::Symbol::fromQualString("aten::__ilshift__");
node = tracer_state->graph->create(op_name, /*num_outputs=*/0);
jit::tracer::recordSourceLocation(node);
jit::tracer::addInputs(node, "self", self);
jit::tracer::addInputs(node, "other", other);
tracer_state->graph->insertNode(node);
jit::tracer::setTracingState(nullptr);
jit::tracer::setTracingState(nullptr);
}
TypeDefault::__ilshift__(self, other);
if (tracer_state) {
jit::tracer::setTracingState(std::move(tracer_state));
jit::tracer::addOutput(node, self);
jit::tracer::setTracingState(std::move(tracer_state));
jit::tracer::addOutput(node, self);
}
```
@ -412,15 +412,15 @@ Our frontends produce ASTs in the form of Tree objects. Trees are similar to [s-
```
(-
(+
(variable (ident x))
(variable (ident y)))
(apply
(.
(variable (ident z))
(ident sigmoid))
(list)
(list))))
(+
(variable (ident x))
(variable (ident y)))
(apply
(.
(variable (ident z))
(ident sigmoid))
(list)
(list))))
```
This is printed in s-expression style with `(kind ...)` representing compound trees and `string_value` representing strings.
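Reading the tree back, a roughly corresponding TorchScript expression would be the following (the function is illustrative, not taken from the surrounding text):

```python
import torch

@torch.jit.script
def f(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
    # (- (+ x y) (apply (. z sigmoid) (list) (list)))
    return x + y - z.sigmoid()
```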
@ -454,16 +454,16 @@ The typical way to traverse a tree is to `switch` on the kind and then construct
```cpp
switch (tree.kind()) {
case TK_VAR:
auto var = Var(tree); // construct tree-view
return environment_stack->getSugaredVar(var.name());
auto var = Var(tree); // construct tree-view
return environment_stack->getSugaredVar(var.name());
case '.': {
auto select = Select(tree); // construct tree-view
auto sv = emitSugaredExpr(select.value(), 1);
return sv->attr(select.range(), method, select.selector().name());
auto select = Select(tree); // construct tree-view
auto sv = emitSugaredExpr(select.value(), 1);
return sv->attr(select.range(), method, select.selector().name());
}
case TK_APPLY: {
auto apply = Apply(tree); // construct tree-view
return emitApplyExpr(apply, n_binders);
auto apply = Apply(tree); // construct tree-view
return emitApplyExpr(apply, n_binders);
} break;
```
@ -507,7 +507,7 @@ Tokens are either keywords (`def`), operators (`+`), literals (`3.4`), or identi
```cpp
if (lexer.nextIf('+')) {
// handle + ...
// handle + ...
}
```
@ -650,10 +650,10 @@ using Operation = std::function<int(Stack&)>;
// schema: example_add(Tensor a, Tensor b) -> Tensor
int example_add(Stack& stack) {
Tensor a, b;
// stack before: ? ? ? a b <- back
pop(stack, a, b); //Templated helper function
// that pops a, b and converts them to tensor
Tensor a, b;
// stack before: ? ? ? a b <- back
pop(stack, a, b); //Templated helper function
// that pops a, b and converts them to tensor
push(stack, a + b);
// stack after:
// ? ? ? c <- back
@ -1126,7 +1126,7 @@ As a more involved example, the following TorchScript snippet:
```python
@torch.jit.script
def foo(a : Tensor, b : Tensor):
c = 2 * b
c = 2 * b
a += 1
if a.max() > 4:
r = a[0]

View File

@ -71,15 +71,15 @@ public:
auto size = tuple ? PyTuple_GET_SIZE(source) : PyList_GET_SIZE(source);
v_value.resize(size);
for (int idx = 0; idx < size; idx++) {
PyObject* obj = tuple ? PyTuple_GET_ITEM(source, idx) : PyList_GET_ITEM(source, idx);
if (THPVariable_Check(obj)) {
v_value[idx] = THPVariable_Unpack(obj).item<int64_t>();
} else if (PyLong_Check(obj)) {
// use THPUtils_unpackLong after it is safe to include python_numbers.h
v_value[idx] = THPUtils_unpackLong(obj);
} else {
return false;
}
PyObject* obj = tuple ? PyTuple_GET_ITEM(source, idx) : PyList_GET_ITEM(source, idx);
if (THPVariable_Check(obj)) {
v_value[idx] = THPVariable_Unpack(obj).item<int64_t>();
} else if (PyLong_Check(obj)) {
// use THPUtils_unpackLong after it is safe to include python_numbers.h
v_value[idx] = THPUtils_unpackLong(obj);
} else {
return false;
}
}
value = v_value;
return true;

View File

@ -195,7 +195,7 @@ ScalarType numpy_dtype_to_aten(int dtype) {
bool is_numpy_scalar(PyObject* obj) {
return (PyArray_IsIntegerScalar(obj) ||
PyArray_IsScalar(obj, Floating));
PyArray_IsScalar(obj, Floating));
}
}} // namespace torch::utils