Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 12:54:11 +08:00)
Convert all tabs to spaces, add CI. (#18959)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18959
ghimport-source-id: a934163fa34cb2019732d5f49dc7290c376bf156
Differential Revision: D14831246
Pulled By: ezyang
fbshipit-source-id: beb92dc4ee8c82f4c8259c081dd72e477fe7a9d0
Committed by: Facebook Github Bot
Parent: 544783fa1d
Commit: 48a35135fb
Travis CI configuration — a new "Ensure no tabs" job is added (four new lines; the surrounding lines are unchanged context):

@@ -16,6 +16,10 @@ matrix:
       python: "3.6"
       dist: xenial
       script: cd .circleci && ./ensure-consistency.py
+    - name: "Ensure no tabs"
+      python: "2.7"
+      script:
+        - (! git grep -I -l $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above files have tabs; please convert them to spaces"; false))
     - name: "Python 2.7 Lint"
       python: "2.7"
       install: pip install flake8
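The new job is a single `git grep` invocation, so the same check can be run locally before pushing. The Python sketch below mirrors that idea; it is illustrative only — the script name, the exclusion lists, and the crude binary-file test are assumptions, not part of this commit:

    #!/usr/bin/env python
    # check_tabs.py -- illustrative local helper, not part of this commit.
    # Lists tracked files that still contain tab characters, skipping roughly
    # the same paths the Travis `git grep` command excludes.
    import subprocess
    import sys

    # Assumed mapping of the CI pathspec excludes onto simple string tests.
    EXCLUDED_PARTS = ('third_party', '/contrib/')
    EXCLUDED_SUFFIXES = ('.svg', 'Makefile', '.gitattributes', '.gitmodules')

    def tracked_files():
        # `git ls-files` enumerates tracked paths, like the `--` pathspec in CI.
        out = subprocess.check_output(['git', 'ls-files']).decode('utf-8')
        return out.splitlines()

    def has_tabs(path):
        try:
            with open(path, 'rb') as f:
                data = f.read()
        except (IOError, OSError):
            return False
        if b'\0' in data:  # crude binary-file check, standing in for git grep's -I flag
            return False
        return b'\t' in data

    def main():
        offenders = [p for p in tracked_files()
                     if not p.endswith(EXCLUDED_SUFFIXES)
                     and not any(part in p for part in EXCLUDED_PARTS)
                     and has_tabs(p)]
        for p in offenders:
            print(p)
        if offenders:
            print('The above files have tabs; please convert them to spaces')
            return 1
        return 0

    if __name__ == '__main__':
        sys.exit(main())

Run it from the repository root; a non-zero exit status means at least one tracked file still contains tabs.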
CMake CUDA/cuFFT configuration — whitespace-only change; the --generate-code lines inside each branch had their tab indentation replaced with spaces, the code itself is unchanged:

@@ -252,21 +252,21 @@ IF(USE_CUDA AND NOT USE_ROCM)
  EXECUTE_PROCESS(COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc)
  if(${CUDA_VERSION_MAJOR} EQUAL "8")
    SET(CUFFT_FAKELINK_OPTIONS
      --generate-code arch=compute_35,code=sm_35
      --generate-code arch=compute_50,code=sm_50
      --generate-code arch=compute_60,code=sm_60)
  elseif(${CUDA_VERSION_MAJOR} EQUAL "9")
    SET(CUFFT_FAKELINK_OPTIONS
      --generate-code arch=compute_35,code=sm_35
      --generate-code arch=compute_50,code=sm_50
      --generate-code arch=compute_60,code=sm_60
      --generate-code arch=compute_70,code=sm_70)
  elseif(${CUDA_VERSION_MAJOR} EQUAL "10")
    SET(CUFFT_FAKELINK_OPTIONS
      --generate-code arch=compute_35,code=sm_35
      --generate-code arch=compute_50,code=sm_50
      --generate-code arch=compute_60,code=sm_60
      --generate-code arch=compute_70,code=sm_70)
  else()
    MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}")
  endif()
The remaining hunks in this commit are whitespace-only: tab indentation is converted to spaces and the code is otherwise unchanged. They touch the following locations:

@@ -19,7 +19,7 @@ (intrinsics header: mmintrin/altivec includes)
@@ -46,12 +46,12 @@ static Tensor sumproduct_pair(...)
@@ -117,7 +117,7 @@ static Tensor sumproduct_pair(...)
@@ -183,7 +183,7 @@ Tensor einsum(std::string eqn, TensorList tensors)
@@ -360,8 +360,8 @@ Tensor einsum(std::string eqn, TensorList tensors)
@@ -390,11 +390,11 @@ Tensor _trilinear(...)
@@ -408,8 +408,8 @@ Tensor _trilinear(...)
@@ -417,7 +417,7 @@ Tensor _trilinear(...)
@@ -473,7 +473,7 @@ Tensor tensordot(const Tensor& input1, const Tensor& input2, ...)
@@ -61,7 +61,7 @@ ctc_loss_cpu_template(...)
@@ -83,8 +83,8 @@ ctc_loss_cpu_template(...)
@@ -115,11 +115,11 @@ ctc_loss_cpu_template(...)
@@ -141,7 +141,7 @@ ctc_loss_cpu_template(...)
@@ -182,7 +182,7 @@ ctc_loss_backward_cpu_template(...)
@@ -268,9 +268,9 @@ ctc_loss_backward_cpu_template(...)
@@ -309,9 +309,9 @@ ctc_loss_cpu(...)
@@ -320,9 +320,9 @@ ctc_loss_backward_cpu(...)
@@ -76,11 +76,11 @@ pthreadpool_t nnpack_threadpool()
@@ -614,7 +614,7 @@ std::tuple<Tensor, Tensor> NAME( \ (RNN dispatch macro)
@@ -126,10 +126,10 @@ Tensor& arange_cpu_out(...)
@@ -100,7 +100,7 @@ typedef union imm_xmm_union
@@ -228,8 +228,8 @@ inline v8sf log256_ps(v8sf x)
@@ -266,7 +266,7 @@ (cuFFT plan creation: hipfftMakePlanMany / cufftXtMakePlanMany)
@@ -87,10 +87,10 @@ __global__ void embedding_backward_feature_kernel
@@ -103,7 +103,7 @@ __global__ void embedding_backward_feature_kernel
@@ -110,8 +110,8 @@ ctc_loss_log_alpha_gpu_kernel(...)
@@ -135,7 +135,7 @@ ctc_loss_log_alpha_gpu_kernel(...)
@@ -218,8 +218,8 @@ ctc_loss_gpu_template(...)
@@ -242,7 +242,7 @@ ctc_loss_gpu_template(...)
@@ -304,8 +304,8 @@ ctc_loss_backward_log_beta_gpu_kernel(...)
@@ -377,7 +377,7 @@ ctc_loss_backward_collect_nonblank_gpu_kernel(...)
@@ -405,9 +405,9 @@ ctc_loss_backward_collect_nonblank_gpu_kernel(...)
@@ -429,7 +429,7 @@ ctc_loss_backward_collect_gpu_kernel(...)
@@ -481,7 +481,7 @@ ctc_loss_backward_collect_gpu_kernel(...)
@@ -500,7 +500,7 @@ ctc_loss_backward_gpu_template(...)
@@ -558,15 +558,15 @@ ctc_loss_backward_gpu_template(...)
@@ -630,9 +630,9 @@ ctc_loss_gpu(...)
@@ -641,9 +641,9 @@ ctc_loss_backward_gpu(...)
@@ -146,10 +146,10 @@ Tensor& arange_cuda_out(...)
@@ -441,18 +441,18 @@ std::tuple<Tensor, Tensor> weight_norm_cuda_backward
@@ -72,17 +72,17 @@ std::tuple<Tensor, Tensor> _cudnn_ctc_loss(...)
@@ -456,7 +456,7 @@ struct algorithm_search<miopenConvFwdAlgorithm_t>
@@ -5,12 +5,12 @@ (THLapack_ / THLapackCheck macros)
@@ -14,7 +14,7 @@ typedef struct THMemoryFile__
@@ -149,13 +149,13 @@ void THTensor_(gels)(...)
@@ -378,21 +378,21 @@ void THTensor_(gesdd2)(...)
@@ -999,31 +999,31 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb) (TENSOR_IMPLEMENT_LOGICAL macro)
@@ -1302,10 +1302,10 @@ void THTensor_(norm)(...)
@@ -17,10 +17,10 @@ TH_API void THVector_(cdiv)(...) (THVector_(normal_fill) declaration)
@@ -1342,7 +1342,7 @@ static void THFloatVector_divs_VSX(...)
@@ -119,7 +119,7 @@ static inline void cpuid(...)
@@ -308,12 +308,12 @@ void THCudaBlas_Hgemm(...)
@@ -48,7 +48,7 @@ inline bool getCatGrid(THCState* state, ptrdiff_t nTensors, dim3& grid)
@@ -131,7 +131,7 @@ __global__ void CatArrayBatchedCopy(
@@ -79,7 +79,7 @@ condDiv(T *q, int64_t *J, int64_t inputsize, T q_max)
@@ -236,7 +236,7 @@ sampleMultinomialOnce(int64_t* dest,
@@ -42,7 +42,7 @@ THCTensor_(numel)(THCState *state, THCTensor *t) (THCTensor_(cat))
@@ -73,7 +73,7 @@ inline void THCTensor_(check_shape_except_dim)(...) (THCTensor_(catArray))
@@ -34,9 +34,9 @@ THC_API scalar_t THCTensor_(maxall)(...) (THCTensor_(median) declaration)
@@ -249,7 +249,7 @@ void THCTensor_(multinomial)(struct THCState *state,
@@ -7,12 +7,12 @@ (ZERO_MACRO / zero<T>() helpers)
@@ -88,7 +88,7 @@ __global__ void cunn_LookupTable_accGradParametersKernelByFeature
@@ -49,14 +49,14 @@ __global__ void cunn_LookupTableBag_updateOutputKernel(
@@ -114,9 +114,9 @@ __global__ void cunn_LookupTableBag_accGradParametersKernel(
@@ -16,9 +16,9 @@ template<typename Dtype, typename Acctype> (nearest_neighbor_4d_kernel)
@@ -37,10 +37,10 @@ __global__ void nearest_neighbor_4d_kernel(
@@ -49,8 +49,8 @@ __global__ void nearest_neighbor_4d_kernel(
@@ -62,9 +62,9 @@ template <typename Dtype, typename Acctype> (nearest_neighbor_4d_kernel_backward)
@@ -83,10 +83,10 @@ __global__ void nearest_neighbor_4d_kernel_backward(
@@ -16,9 +16,9 @@ template<typename Dtype, typename Acctype> (nearest_neighbor_3d_kernel)
@@ -32,10 +32,10 @@ __global__ void nearest_neighbor_3d_kernel(
@@ -43,8 +43,8 @@ __global__ void nearest_neighbor_3d_kernel(
@@ -56,9 +56,9 @@ template <typename Dtype, typename Acctype> (nearest_neighbor_3d_kernel_backward)
@@ -72,10 +72,10 @@ __global__ void nearest_neighbor_3d_kernel_backward(
@@ -16,9 +16,9 @@ template<typename Dtype, typename Acctype> (nearest_neighbor_5d_kernel)
@@ -55,8 +55,8 @@ __global__ void nearest_neighbor_5d_kernel(
@@ -68,9 +68,9 @@ template <typename Dtype, typename Acctype> (nearest_neighbor_5d_kernel_backward)
@@ -108,8 +108,8 @@ __global__ void nearest_neighbor_5d_kernel_backward(
@ -24,7 +24,7 @@ inline int GET_BLOCKS(const int N)
|
||||
}
|
||||
|
||||
#define THCUNN_check_shape(STATE, I1, I2) \
|
||||
if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2)) \
|
||||
if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2)) \
|
||||
{ \
|
||||
THCDescBuff s1 = THCTensor_(sizeDesc)(STATE, I1); \
|
||||
THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2); \
|
||||
@ -47,20 +47,20 @@ inline int GET_BLOCKS(const int N)
|
||||
ptrdiff_t n1 = THCTensor_(nElement)(STATE, I1); \
|
||||
ptrdiff_t n2 = THCTensor_(nElement)(STATE, I2); \
|
||||
if (n1 != n2) \
|
||||
{ \
|
||||
{ \
|
||||
THCDescBuff s1 = THCTensor_(sizeDesc)(state, I1); \
|
||||
THCDescBuff s2 = THCTensor_(sizeDesc)(state, I2); \
|
||||
THError(#I1 " and " #I2 " have different number of elements: " \
|
||||
THError(#I1 " and " #I2 " have different number of elements: " \
|
||||
#I1 "%s has %ld elements, while " \
|
||||
#I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \
|
||||
if (THCTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \
|
||||
THCTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \
|
||||
THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \
|
||||
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
|
||||
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
|
||||
" but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
|
||||
}
|
||||
|
||||
|
@ -10,7 +10,7 @@ void THNN_(LookupTableBag_updateOutput)(
|
||||
THCTensor *weight,
|
||||
THCTensor *output,
|
||||
THCIndexTensor *offset2bag,
|
||||
int mode,
|
||||
int mode,
|
||||
THCIndexTensor *bag_size)
|
||||
{
|
||||
THCUNN_assertSameGPU(state, 5, input, offsets, weight, output, offset2bag);
|
||||
@ -65,8 +65,8 @@ void THNN_(LookupTableBag_accGradParameters)(
|
||||
THCIndexTensor *sortedIndices,
|
||||
THCIndexTensor *origIndices,
|
||||
bool scaleGradByFreq,
|
||||
int mode,
|
||||
THCIndexTensor *bag_size,
|
||||
int mode,
|
||||
THCIndexTensor *bag_size,
|
||||
accreal scale_)
|
||||
{
|
||||
scalar_t scale = ScalarConvert<accreal, scalar_t>::to(scale_);
|
||||
|
@ -88,7 +88,7 @@ static THCTensor* THNN_(newViewWeightMM2d)(THCState *state, THCTensor *weight) {
|
||||
int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3);
|
||||
THCTensor *old_weight = weight;
|
||||
weight = THCTensor_(newWithStorage2d)(state, THTensor_getStoragePtr(weight), weight->storage_offset(),
|
||||
s1, -1, s2, -1);
|
||||
s1, -1, s2, -1);
|
||||
THCTensor_(free)(state, old_weight);
|
||||
}
|
||||
return weight;
|
||||
|
@ -11,7 +11,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
|
||||
int kH, int kW, int dH, int dW, int padH, int padW,
|
||||
int dilationH, int dilationW, int weight_nullable) {
|
||||
THArgCheck(kW > 0 && kH > 0, 9,
|
||||
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
|
||||
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
|
||||
THArgCheck(dW > 0 && dH > 0, 11,
|
||||
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
|
||||
THArgCheck(dilationW > 0 && dilationH > 0, 14,
|
||||
|
@@ -34,7 +34,7 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
@@ -44,14 +44,14 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
@@ -62,7 +62,7 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
@@ -73,15 +73,15 @@ void THNN_(SpatialUpSamplingNearest_updateGradInput)(
@@ -94,7 +94,7 @@ void THNN_(SpatialUpSamplingNearest_updateGradInput)(
@@ -243,7 +243,7 @@ THC_API void THNN_(LookupTableBag_updateOutput)(
          THCTensor *weight,
          THCTensor *output,
          THCIndexTensor *offset2bag,
          int mode,
          THCIndexTensor *seq_length); // [OPTIONAL]

THC_API void THNN_(LookupTableBag_accGradParameters)(
@@ -256,8 +256,8 @@ THC_API void THNN_(LookupTableBag_accGradParameters)(
          THCIndexTensor *sortedIndices,
          THCIndexTensor *origIndices,
          bool scaleGradByFreq,
          int mode,
          THCIndexTensor *seq_length, // [OPTIONAL]
          accreal scale_);

THC_API void THNN_(L1Cost_updateOutput)(
@@ -54,7 +54,7 @@ void THNN_(TemporalUpSamplingNearest_updateOutput)(
  const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
  cudaStream_t stream = THCState_getCurrentStream(state);
  nearest_neighbor_3d_kernel<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads,
      0, stream>>>(num_kernels, idata, odata);
  THCudaCheck(cudaGetLastError());
}

@@ -82,7 +82,7 @@ void THNN_(TemporalUpSamplingNearest_updateGradInput)(
  cudaStream_t stream = THCState_getCurrentStream(state);

  nearest_neighbor_3d_kernel_backward<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads),
      num_threads, 0, stream>>>(num_kernels, data1, data2);

  THCudaCheck(cudaGetLastError());
  THCTensor_(free)(state, gradOutput);
@@ -47,10 +47,10 @@ void THNN_(VolumetricUpSamplingNearest_updateOutput)(
@@ -67,7 +67,7 @@ void THNN_(VolumetricUpSamplingNearest_updateOutput)(
@@ -88,8 +88,8 @@ void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
@@ -100,7 +100,7 @@ void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
@@ -36,7 +36,7 @@ static Acctype linear_upsampling_compute_source_index(

__device__ __forceinline__
static int nearest_neighbor_compute_source_index(
    const float scale, int dst_index, int inputSize) {
  const int src_index = MIN(floor(dst_index * scale), inputSize - 1);
  return src_index;
}
@@ -29,16 +29,16 @@ void THNN_(BCECriterion_updateOutput)(
@@ -48,7 +48,7 @@ void THNN_(BCECriterion_updateOutput)(
@@ -57,7 +57,7 @@ void THNN_(BCECriterion_updateOutput)(
@@ -25,7 +25,7 @@ void THNN_(ClassNLLCriterion_updateOutput)(
  if (weights && THTensor_(nElement)(weights) != n_classes) {
    THDescBuff s1 = THTensor_(sizeDesc)(weights);
    THError("weight tensor should be defined either for all %d classes or no classes"
            " but got weight tensor of shape: %s", n_classes, s1.str);
  }

  if (reduction == Reduction::None && n_dims == 2) {
@@ -39,8 +39,8 @@ void THNN_(ClassNLLCriterion_updateOutput)(
      int cur_target = THLongTensor_fastGetLegacy1dNoScalars(target, i);

      if (cur_target == ignore_index) {
        THTensor_(fastSet1d)(output, i, 0.0f);
        continue;
      }
      if (cur_target >= 0 && cur_target < n_classes) {
        scalar_t cur_weight = weights ? THTensor_(fastGetLegacy1dNoScalars)(weights, cur_target) : 1.0f;
@@ -40,7 +40,7 @@ void THNN_(MultiMarginCriterion_updateOutput)(
  {
    THIndex_t idx = THIndexTensor_(get1d)(target, t);
    THArgCheck((idx >= 0) && (idx < dim), 3,
               "target out of range");
  }

  input = THTensor_(newContiguous)(input);
@@ -98,7 +98,7 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(


  THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input,
                "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s");

  if (input->dim() == 4)
  {
@@ -6,9 +6,9 @@
@@ -27,12 +27,12 @@ static inline void THNN_(SpatialAveragePooling_shapeCheck)(
@@ -44,7 +44,7 @@ static inline void THNN_(SpatialAveragePooling_shapeCheck)(
@@ -4,12 +4,12 @@

#define INITIAL_CHECK \
  THArgCheck(THIndexTensor_(nDimensionLegacyAll)(target) == 3, 3, \
             "only batches of spatial targets supported (3D tensors)" \
             " but got targets of dimension: %d", \
             THIndexTensor_(nDimensionLegacyAll)(target)); \
  THArgCheck(THTensor_(nDimensionLegacyAll)(input) == 4, 2, \
             "only batches of spatial inputs supported (4D tensors), " \
             "but got input of dimension: %d", THTensor_(nDimensionLegacyAll)(input)); \
  if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) { \
    THError("weight tensor should be defined either for all or no classes"); \
  } \
@@ -30,8 +30,8 @@
#define GRADOUTPUT_SHAPE_CHECK \
  THArgCheck(THTensor_(nDimensionLegacyAll)(gradOutput) == 3, 3, \
             "gradOutput must have same dimension as target (3)" \
             " but got dimension: %d", \
             THTensor_(nDimensionLegacyAll)(gradOutput)); \
  { \
    int64_t gradOutput0 = THTensor_(size)(gradOutput, 0); \
    int64_t gradOutput1 = THTensor_(size)(gradOutput, 1); \
@@ -5,14 +5,14 @@
@@ -36,7 +36,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
@@ -87,8 +87,8 @@ static THTensor* THNN_(newViewWeightMM2d)(THTensor *weight) {
@@ -116,8 +116,8 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
@@ -125,8 +125,8 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
@@ -202,10 +202,10 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
@@ -239,9 +239,9 @@ static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
@@ -280,8 +280,8 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
@@ -296,8 +296,8 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
@@ -380,7 +380,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
@@ -396,7 +396,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
@@ -5,10 +5,10 @@
@@ -40,7 +40,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
@@ -235,7 +235,7 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)(
@@ -342,7 +342,7 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)(
@@ -6,9 +6,9 @@
@@ -30,12 +30,12 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
@@ -47,7 +47,7 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
@@ -221,16 +221,16 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)(
@@ -266,10 +266,10 @@ static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(
@@ -350,13 +350,13 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
@@ -3,15 +3,15 @@
@@ -41,7 +41,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)(
@@ -50,8 +50,8 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)(
@@ -111,12 +111,12 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
    for (p = 0; p < nbatch; p++)
    {
      THNN_(SpatialMaxUnpooling_updateOutput_frame)(
          input_data+p*nslices*iwidth*iheight,
          output_data+p*nslices*owidth*oheight,
          indices_data+p*nslices*iwidth*iheight,
          nslices,
          iwidth, iheight,
          owidth, oheight);
    }
  }

@@ -196,7 +196,7 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)(

  if(owidth!=gradOutput->size(dimw) || oheight!=gradOutput->size(dimh)){
    THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d",
            oheight, owidth, gradOutput->size(dimh), gradOutput->size(dimw));
  }

  /* get raw pointers */
@@ -3,467 +3,467 @@
#else

static inline void THNN_(TemporalRowConvolution_shapeCheck)(
static void THNN_(unfolded_acc_row)(
static void THNN_(unfolded_copy_row)(
static void THNN_(TemporalRowConvolution_updateOutput_frame)(
void THNN_(TemporalRowConvolution_updateOutput)(
static void THNN_(TemporalRowConvolution_updateGradInput_frame)(
void THNN_(TemporalRowConvolution_updateGradInput)(
static void THNN_(TemporalRowConvolution_accGradParameters_frame)(
void THNN_(TemporalRowConvolution_accGradParameters)(

#endif
@@ -105,7 +105,7 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)(


  THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input,
                "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s");

  if (input->dim() == 5)
  {
@@ -75,7 +75,7 @@ static inline void THNN_(VolumetricAveragePooling_shapeCheck)(

  if (otime < 1 || owidth < 1 || oheight < 1)
    THError("Given input size: (%dx%dx%dx%d). "
            "Calculated output size: (%dx%dx%dx%d). Output size is too small",
            nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth);

  if (gradOutput != NULL) {
@@ -119,7 +119,7 @@ static THTensor* THNN_(newViewWeight)(THTensor *weight)
    int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4);
    THTensor *old_weight = weight;
    weight = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(weight), weight->storage_offset(),
                                         s1, -1, s2, -1);
    c10::raw::intrusive_ptr::decref(old_weight);
  }
  return weight;
@@ -274,7 +274,7 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)(
      const int64_t k_ = 1;

      // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
      if (bias) {
        THBlas_(gemm)(
            't', 'n',
            n_, m_, k_,
@ -7,13 +7,13 @@
|
||||
#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
|
||||
#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)
|
||||
|
||||
#define THNN_CHECK_SHAPE(I1, I2) \
|
||||
if (I1 != NULL && I2 != NULL && !THTensor_(isSameSizeAs)(I1, I2)) \
|
||||
{ \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(I1); \
|
||||
THDescBuff s2 = THTensor_(sizeDesc)(I2); \
|
||||
THError(#I1 " and " #I2 " shapes do not match: " \
|
||||
#I1 " %s, " #I2 " %s", s1.str, s2.str); \
|
||||
#define THNN_CHECK_SHAPE(I1, I2) \
|
||||
if (I1 != NULL && I2 != NULL && !THTensor_(isSameSizeAs)(I1, I2)) \
|
||||
{ \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(I1); \
|
||||
THDescBuff s2 = THTensor_(sizeDesc)(I2); \
|
||||
THError(#I1 " and " #I2 " shapes do not match: " \
|
||||
#I1 " %s, " #I2 " %s", s1.str, s2.str); \
|
||||
}
|
||||
|
||||
#define THNN_CHECK_SHAPE_INDICES(I1, I2) \
|
||||
@ -26,39 +26,39 @@
|
||||
}
|
||||
|
||||
#define THNN_CHECK_NELEMENT(I1, I2) \
|
||||
if (I1 != NULL && I2 != NULL ) { \
|
||||
ptrdiff_t n1 = THTensor_(nElement)(I1); \
|
||||
ptrdiff_t n2 = THTensor_(nElement)(I2); \
|
||||
if (n1 != n2) \
|
||||
{ \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(I1); \
|
||||
THDescBuff s2 = THTensor_(sizeDesc)(I2); \
|
||||
THError(#I1 " and " #I2 " have different number of elements: " \
|
||||
#I1 "%s has %ld elements, while " \
|
||||
#I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
|
||||
} \
|
||||
if (I1 != NULL && I2 != NULL ) { \
|
||||
ptrdiff_t n1 = THTensor_(nElement)(I1); \
|
||||
ptrdiff_t n2 = THTensor_(nElement)(I2); \
|
||||
if (n1 != n2) \
|
||||
{ \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(I1); \
|
||||
THDescBuff s2 = THTensor_(sizeDesc)(I2); \
|
||||
THError(#I1 " and " #I2 " have different number of elements: " \
|
||||
#I1 "%s has %ld elements, while " \
|
||||
#I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
|
||||
if (THTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
|
||||
THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(T); \
|
||||
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
|
||||
" but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
|
||||
#define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
|
||||
if (THTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
|
||||
THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(T); \
|
||||
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
|
||||
" but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
|
||||
}
|
||||
|
||||
#define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \
|
||||
if (THIndexTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
|
||||
THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
|
||||
THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \
|
||||
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
|
||||
#define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \
|
||||
if (THIndexTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
|
||||
THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
|
||||
THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \
|
||||
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
|
||||
" but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
|
||||
}
|
||||
|
||||
#define THNN_ARGCHECK(COND, ARG, T, FORMAT) \
|
||||
if (!(COND)) { \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(T); \
|
||||
THArgCheck(COND, ARG, FORMAT, s1.str); \
|
||||
#define THNN_ARGCHECK(COND, ARG, T, FORMAT) \
|
||||
if (!(COND)) { \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(T); \
|
||||
THArgCheck(COND, ARG, FORMAT, s1.str); \
|
||||
}
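These helpers all follow the same pattern: test a property of the arguments and, on failure, format the offending tensor sizes into the error message via `THTensor_(sizeDesc)`. Below is a minimal standalone sketch of that check-and-report pattern using plain `std::vector` sizes; it is illustrative only and not the THNN code.

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

// Simple stand-in for THTensor_(sizeDesc): render a size vector as text.
static std::string size_desc(const std::vector<long>& sizes) {
  std::string s = "[";
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    if (i != 0) s += ", ";
    s += std::to_string(sizes[i]);
  }
  return s + "]";
}

// Same check-and-report idea as THNN_CHECK_SHAPE, minus the TH types.
#define CHECK_SHAPE(I1, I2)                                           \
  if ((I1) != (I2)) {                                                 \
    throw std::runtime_error(std::string(#I1 " and " #I2              \
        " shapes do not match: " #I1 " ") + size_desc(I1) +           \
        ", " #I2 " " + size_desc(I2));                                \
  }

int main() {
  std::vector<long> input{4, 3}, target{4, 2};
  try {
    CHECK_SHAPE(input, target);
  } catch (const std::exception& e) {
    // prints: input and target shapes do not match: input [4, 3], target [4, 2]
    std::puts(e.what());
  }
}
```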
|
||||
|
||||
#include <THNN/generic/AbsCriterion.c>
|
||||
|
@ -206,7 +206,7 @@ TEST(LeftRightTest, givenInt_whenWriteThrowsExceptionOnSecondCall_thenKeepsNewSt
|
||||
write_called = true;
|
||||
}
|
||||
}),
|
||||
MyException
|
||||
MyException
|
||||
);
|
||||
|
||||
// check reading it returns new value
|
||||
|
290
c10/util/Half.h
@ -85,41 +85,41 @@ namespace detail {
|
||||
* @note The implementation doesn't use any floating-point operations.
|
||||
*/
|
||||
inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) {
|
||||
/*
|
||||
* Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
|
||||
* +---+-----+------------+-------------------+
|
||||
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
|
||||
* +---+-----+------------+-------------------+
|
||||
* Bits 31 26-30 16-25 0-15
|
||||
*
|
||||
* S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
|
||||
*/
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
/*
|
||||
* Extract the sign of the input number into the high bit of the 32-bit word:
|
||||
*
|
||||
* +---+----------------------------------+
|
||||
* | S |0000000 00000000 00000000 00000000|
|
||||
* +---+----------------------------------+
|
||||
* Bits 31 0-31
|
||||
*/
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
/*
|
||||
* Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word:
|
||||
*
|
||||
* +---+-----+------------+-------------------+
|
||||
* | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
|
||||
* +---+-----+------------+-------------------+
|
||||
* Bits 30 27-31 17-26 0-16
|
||||
*/
|
||||
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
|
||||
/*
|
||||
* Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized.
|
||||
* If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one.
|
||||
* In this case renorm_shift == 0. If the number is denormalized, renorm_shift > 0. Note that if we shift
|
||||
* denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the
|
||||
* biased exponent into 1, and making mantissa normalized (i.e. without leading 1).
|
||||
*/
|
||||
/*
|
||||
* Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
|
||||
* +---+-----+------------+-------------------+
|
||||
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
|
||||
* +---+-----+------------+-------------------+
|
||||
* Bits 31 26-30 16-25 0-15
|
||||
*
|
||||
* S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
|
||||
*/
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
/*
|
||||
* Extract the sign of the input number into the high bit of the 32-bit word:
|
||||
*
|
||||
* +---+----------------------------------+
|
||||
* | S |0000000 00000000 00000000 00000000|
|
||||
* +---+----------------------------------+
|
||||
* Bits 31 0-31
|
||||
*/
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
/*
|
||||
* Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word:
|
||||
*
|
||||
* +---+-----+------------+-------------------+
|
||||
* | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
|
||||
* +---+-----+------------+-------------------+
|
||||
* Bits 30 27-31 17-26 0-16
|
||||
*/
|
||||
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
|
||||
/*
|
||||
* Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized.
|
||||
* If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one.
|
||||
* In this case renorm_shift == 0. If the number is denormalized, renorm_shift > 0. Note that if we shift
|
||||
* denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the
|
||||
* biased exponent into 1, and making mantissa normalized (i.e. without leading 1).
|
||||
*/
|
||||
#ifdef _MSC_VER
|
||||
unsigned long nonsign_bsr;
|
||||
_BitScanReverse(&nonsign_bsr, (unsigned long)nonsign);
|
||||
@ -176,62 +176,62 @@ namespace detail {
|
||||
* floating-point operations and bitcasts between integer and floating-point variables.
|
||||
*/
|
||||
inline float fp16_ieee_to_fp32_value(uint16_t h) {
|
||||
/*
|
||||
* Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
|
||||
* +---+-----+------------+-------------------+
|
||||
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
|
||||
* +---+-----+------------+-------------------+
|
||||
* Bits 31 26-30 16-25 0-15
|
||||
*
|
||||
* S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
|
||||
*/
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
/*
|
||||
* Extract the sign of the input number into the high bit of the 32-bit word:
|
||||
*
|
||||
* +---+----------------------------------+
|
||||
* | S |0000000 00000000 00000000 00000000|
|
||||
* +---+----------------------------------+
|
||||
* Bits 31 0-31
|
||||
*/
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
/*
|
||||
* Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
|
||||
*
|
||||
* +-----+------------+---------------------+
|
||||
* |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
|
||||
* +-----+------------+---------------------+
|
||||
* Bits 27-31 17-26 0-16
|
||||
*/
|
||||
const uint32_t two_w = w + w;
|
||||
/*
|
||||
* Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
|
||||
* +---+-----+------------+-------------------+
|
||||
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
|
||||
* +---+-----+------------+-------------------+
|
||||
* Bits 31 26-30 16-25 0-15
|
||||
*
|
||||
* S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
|
||||
*/
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
/*
|
||||
* Extract the sign of the input number into the high bit of the 32-bit word:
|
||||
*
|
||||
* +---+----------------------------------+
|
||||
* | S |0000000 00000000 00000000 00000000|
|
||||
* +---+----------------------------------+
|
||||
* Bits 31 0-31
|
||||
*/
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
/*
|
||||
* Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
|
||||
*
|
||||
* +-----+------------+---------------------+
|
||||
* |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
|
||||
* +-----+------------+---------------------+
|
||||
* Bits 27-31 17-26 0-16
|
||||
*/
|
||||
const uint32_t two_w = w + w;
|
||||
|
||||
/*
|
||||
* Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
|
||||
* of a single-precision floating-point number:
|
||||
*
|
||||
* S|Exponent | Mantissa
|
||||
* +-+---+-----+------------+----------------+
|
||||
* |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
|
||||
* +-+---+-----+------------+----------------+
|
||||
* Bits | 23-31 | 0-22
|
||||
*
|
||||
* Next, there are some adjustments to the exponent:
|
||||
* - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision
|
||||
* formats (0x7F - 0xF = 0x70)
|
||||
* - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number.
|
||||
* Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent
|
||||
* of the single-precision output must be 0xFF (max possible value). We do this correction in two steps:
|
||||
* - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested
|
||||
* by the difference in the exponent bias (see above).
|
||||
* - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of
|
||||
* exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias.
|
||||
* The floating-point multiplication hardware would ensure that Inf and NaN would retain their value on at least
|
||||
* partially IEEE754-compliant implementations.
|
||||
*
|
||||
* Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not
|
||||
* operate on denormal inputs, and do not produce denormal results.
|
||||
*/
|
||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
||||
/*
|
||||
* Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
|
||||
* of a single-precision floating-point number:
|
||||
*
|
||||
* S|Exponent | Mantissa
|
||||
* +-+---+-----+------------+----------------+
|
||||
* |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
|
||||
* +-+---+-----+------------+----------------+
|
||||
* Bits | 23-31 | 0-22
|
||||
*
|
||||
* Next, there are some adjustments to the exponent:
|
||||
* - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision
|
||||
* formats (0x7F - 0xF = 0x70)
|
||||
* - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number.
|
||||
* Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent
|
||||
* of the single-precision output must be 0xFF (max possible value). We do this correction in two steps:
|
||||
* - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested
|
||||
* by the difference in the exponent bias (see above).
|
||||
* - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of
|
||||
* exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias.
|
||||
* The floating-point multiplication hardware would ensure that Inf and NaN would retain their value on at least
|
||||
* partially IEEE754-compliant implementations.
|
||||
*
|
||||
* Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not
|
||||
* operate on denormal inputs, and do not produce denormal results.
|
||||
*/
|
||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
||||
// const float exp_scale = 0x1.0p-112f;
|
||||
uint32_t scale_bits = (uint32_t) 15 << 23;
|
||||
float exp_scale_val;
|
||||
@ -239,48 +239,48 @@ namespace detail {
|
||||
const float exp_scale = exp_scale_val;
|
||||
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
||||
|
||||
/*
|
||||
* Convert denormalized half-precision inputs into single-precision results (always normalized).
|
||||
* Zero inputs are also handled here.
|
||||
*
|
||||
* In a denormalized number the biased exponent is zero, and mantissa has non-zero bits.
|
||||
* First, we shift mantissa into bits 0-9 of the 32-bit word.
|
||||
*
|
||||
* zeros | mantissa
|
||||
* +---------------------------+------------+
|
||||
* |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
|
||||
* +---------------------------+------------+
|
||||
* Bits 10-31 0-9
|
||||
*
|
||||
* Now, remember that denormalized half-precision numbers are represented as:
|
||||
* FP16 = mantissa * 2**(-24).
|
||||
* The trick is to construct a normalized single-precision number with the same mantissa as the half-precision input
|
||||
* and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
|
||||
* A normalized single-precision floating-point number is represented as:
|
||||
* FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
|
||||
* Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
|
||||
* number causes a change of the constructed single-precision number by 2**(-24), i.e. the same amount.
|
||||
*
|
||||
* The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
|
||||
* is zero, the constructed single-precision number has the value of
|
||||
* FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
|
||||
* Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
|
||||
* the input half-precision number.
|
||||
*/
|
||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
||||
const float magic_bias = 0.5f;
|
||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
||||
/*
|
||||
* Convert denormalized half-precision inputs into single-precision results (always normalized).
|
||||
* Zero inputs are also handled here.
|
||||
*
|
||||
* In a denormalized number the biased exponent is zero, and mantissa has non-zero bits.
|
||||
* First, we shift mantissa into bits 0-9 of the 32-bit word.
|
||||
*
|
||||
* zeros | mantissa
|
||||
* +---------------------------+------------+
|
||||
* |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
|
||||
* +---------------------------+------------+
|
||||
* Bits 10-31 0-9
|
||||
*
|
||||
* Now, remember that denormalized half-precision numbers are represented as:
|
||||
* FP16 = mantissa * 2**(-24).
|
||||
* The trick is to construct a normalized single-precision number with the same mantissa as the half-precision input
|
||||
* and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
|
||||
* A normalized single-precision floating-point number is represented as:
|
||||
* FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
|
||||
* Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
|
||||
* number causes a change of the constructed single-precision number by 2**(-24), i.e. the same amount.
|
||||
*
|
||||
* The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
|
||||
* is zero, the constructed single-precision number has the value of
|
||||
* FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
|
||||
* Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
|
||||
* the input half-precision number.
|
||||
*/
|
||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
||||
const float magic_bias = 0.5f;
|
||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
||||
|
||||
/*
|
||||
* - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
|
||||
* input exponent. The variable two_w contains the input exponent in bits 27-31, therefore if it is smaller than 2**27, the
|
||||
* input is either a denormal number, or zero.
|
||||
* - Combine the result of conversion of exponent and mantissa with the sign of the input number.
|
||||
*/
|
||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
||||
const uint32_t result = sign |
|
||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
||||
return fp32_from_bits(result);
|
||||
/*
|
||||
* - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
|
||||
* input exponent. The variable two_w contains the input exponent in bits 27-31, therefore if it is smaller than 2**27, the
|
||||
* input is either a denormal number, or zero.
|
||||
* - Combine the result of conversion of exponent and mantissa with the sign of the input number.
|
||||
*/
|
||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
||||
const uint32_t result = sign |
|
||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
||||
return fp32_from_bits(result);
|
||||
}
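The bit manipulation described in the comments above can be exercised as a small standalone program. The following is a sketch that re-implements the same steps outside the header (the real implementation is the `fp16_ieee_to_fp32_value` shown here); `fp32_from_bits`/`fp32_to_bits` are spelled out with `memcpy`, and the test values in `main` are illustrative.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

static float fp32_from_bits(uint32_t w) { float f; std::memcpy(&f, &w, sizeof f); return f; }
static uint32_t fp32_to_bits(float f) { uint32_t w; std::memcpy(&w, &f, sizeof w); return w; }

// Sketch of the half -> float conversion described above.
static float half_to_float(uint16_t h) {
  const uint32_t w = (uint32_t) h << 16;
  const uint32_t sign = w & UINT32_C(0x80000000);
  const uint32_t two_w = w + w;

  // Normalized/Inf/NaN path: adjust the exponent by 0xE0, then scale by 2**(-112).
  const uint32_t exp_offset = UINT32_C(0xE0) << 23;
  const float exp_scale = fp32_from_bits(UINT32_C(15) << 23);  // 2**(-112)
  const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

  // Denormal/zero path: build a float with biased exponent 126, then subtract the 0.5 bias.
  const uint32_t magic_mask = UINT32_C(126) << 23;
  const float magic_bias = 0.5f;
  const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

  const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
  const uint32_t result = sign |
      (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value)
                                   : fp32_to_bits(normalized_value));
  return fp32_from_bits(result);
}

int main() {
  std::printf("%g %g %g\n",
              half_to_float(0x3C00),   // 1.0
              half_to_float(0xC000),   // -2.0
              half_to_float(0x0001));  // smallest denormal, 2**(-24)
}
```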
|
||||
|
||||
/*
|
||||
@ -301,22 +301,22 @@ namespace detail {
|
||||
const float scale_to_inf = scale_to_inf_val;
|
||||
const float scale_to_zero = scale_to_zero_val;
|
||||
|
||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||
|
||||
const uint32_t w = fp32_to_bits(f);
|
||||
const uint32_t shl1_w = w + w;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||
if (bias < UINT32_C(0x71000000)) {
|
||||
bias = UINT32_C(0x71000000);
|
||||
}
|
||||
const uint32_t w = fp32_to_bits(f);
|
||||
const uint32_t shl1_w = w + w;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||
if (bias < UINT32_C(0x71000000)) {
|
||||
bias = UINT32_C(0x71000000);
|
||||
}
|
||||
|
||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||
const uint32_t bits = fp32_to_bits(base);
|
||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||
const uint32_t bits = fp32_to_bits(base);
|
||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||
}
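Assuming these helpers are used as exported by this header (the qualified names below, `c10::detail::fp16_ieee_from_fp32_value` and `c10::detail::fp16_ieee_to_fp32_value`, reflect the surrounding `detail` namespace and are an assumption of this sketch), a quick round-trip check might look like this; it needs a PyTorch C++ build to compile.

```cpp
#include <cstdint>
#include <cstdio>

#include <c10/util/Half.h>  // assumed include path for this header

int main() {
  const float x = 0.1f;
  // float -> half bits -> float; half keeps ~11 mantissa bits, so expect only
  // about 3 significant decimal digits to survive the round trip.
  const uint16_t h = c10::detail::fp16_ieee_from_fp32_value(x);
  const float back = c10::detail::fp16_ieee_to_fp32_value(h);
  std::printf("%.8f -> 0x%04x -> %.8f\n", x, h, back);
}
```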
|
||||
|
||||
} // namespace detail
|
||||
|
@ -55,7 +55,7 @@ Assertion Passed!
|
||||
|
||||
</details>
|
||||
|
||||
)DOC")
|
||||
)DOC")
|
||||
.Arg(
|
||||
"error_msg",
|
||||
"(*string*): custom error message to be thrown when the input does not pass assertion",
|
||||
|
@ -107,17 +107,17 @@ Testing CountUp operator...
|
||||
'count' value after CountUp test: 10
|
||||
|
||||
Testing CountDown operator...
|
||||
'count' value after CountDown: 9 'done' value: False
|
||||
'count' value after CountDown: 8 'done' value: False
|
||||
'count' value after CountDown: 7 'done' value: False
|
||||
'count' value after CountDown: 6 'done' value: False
|
||||
'count' value after CountDown: 5 'done' value: False
|
||||
'count' value after CountDown: 4 'done' value: False
|
||||
'count' value after CountDown: 3 'done' value: False
|
||||
'count' value after CountDown: 2 'done' value: False
|
||||
'count' value after CountDown: 1 'done' value: False
|
||||
'count' value after CountDown: 0 'done' value: False
|
||||
'count' value after CountDown: -1 'done' value: True
|
||||
'count' value after CountDown: 9 'done' value: False
|
||||
'count' value after CountDown: 8 'done' value: False
|
||||
'count' value after CountDown: 7 'done' value: False
|
||||
'count' value after CountDown: 6 'done' value: False
|
||||
'count' value after CountDown: 5 'done' value: False
|
||||
'count' value after CountDown: 4 'done' value: False
|
||||
'count' value after CountDown: 3 'done' value: False
|
||||
'count' value after CountDown: 2 'done' value: False
|
||||
'count' value after CountDown: 1 'done' value: False
|
||||
'count' value after CountDown: 0 'done' value: False
|
||||
'count' value after CountDown: -1 'done' value: True
|
||||
```
|
||||
|
||||
</details>
|
||||
|
@ -24,11 +24,11 @@ OPERATOR_SCHEMA(Expand)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
Broadcast the input tensor to a materialized new tensor using given shape.
|
||||
Broadcast rule is similar to "numpy.array(input) * numpy.ones(shape)":
|
||||
Dimensions are right-aligned;
|
||||
Two corresponding dimensions must have the same value, or one of them
|
||||
equals to 1.
|
||||
Broadcast the input tensor to a materialized new tensor using given shape.
|
||||
Broadcast rule is similar to "numpy.array(input) * numpy.ones(shape)":
|
||||
Dimensions are right-aligned;
|
||||
Two corresponding dimensions must have the same value, or one of them
|
||||
equals to 1.
|
||||
In order to align with PyTorch's `expand`, `shape` is allowed to have entries
|
||||
equal to -1, which means to preserve the size of the corresponding dimension
|
||||
in `X` (so it's actually equivalent to specifying 1).
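The broadcast rule stated in this doc string can be made concrete with a small shape-only sketch; the function name and error handling here are illustrative, not the Caffe2 implementation.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Compute the output shape of Expand from the input shape and the `shape`
// argument: align dimensions from the right, allow equal sizes or 1, and let
// -1 preserve the corresponding input dimension.
std::vector<int64_t> expand_shape(const std::vector<int64_t>& in,
                                  const std::vector<int64_t>& shape) {
  std::vector<int64_t> out(shape);
  for (std::size_t i = 0; i < shape.size(); ++i) {
    const std::size_t oi = shape.size() - 1 - i;
    const int64_t target = shape[oi];
    const int64_t src = i < in.size() ? in[in.size() - 1 - i] : 1;
    if (target == -1) {
      out[oi] = src;  // -1 keeps the input dimension
    } else if (src != target && src != 1 && target != 1) {
      throw std::invalid_argument("dimensions are not broadcastable");
    } else {
      out[oi] = std::max(src, target);
    }
  }
  return out;
}

// e.g. expand_shape({3, 1}, {2, -1, 4}) yields {2, 3, 4}
```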
|
||||
|
@ -758,10 +758,10 @@ if(USE_CUDA)
|
||||
endif()
|
||||
if(CAFFE2_USE_CUDNN)
|
||||
IF(CUDNN_STATIC_LINKAGE)
|
||||
LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
|
||||
caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl")
|
||||
LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
|
||||
caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl")
|
||||
ELSE()
|
||||
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
|
||||
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
|
||||
ENDIF()
|
||||
else()
|
||||
caffe2_update_option(USE_CUDNN OFF)
|
||||
@ -1166,7 +1166,7 @@ if (NOT BUILD_ATEN_MOBILE)
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("#include <stdint.h>
|
||||
static inline void cpuid(uint32_t *eax, uint32_t *ebx,
|
||||
uint32_t *ecx, uint32_t *edx)
|
||||
uint32_t *ecx, uint32_t *edx)
|
||||
{
|
||||
uint32_t a = *eax, b, c = *ecx, d;
|
||||
asm volatile ( \"cpuid\" : \"+a\"(a), \"=b\"(b), \"+c\"(c), \"=d\"(d) );
|
||||
|
@ -3,16 +3,16 @@
|
||||
# CUB_INCLUDE_DIRS - the CUB include directory
|
||||
|
||||
find_path(CUB_INCLUDE_DIR
|
||||
NAMES cub/cub.cuh
|
||||
DOC "The directory where CUB includes reside"
|
||||
NAMES cub/cub.cuh
|
||||
DOC "The directory where CUB includes reside"
|
||||
)
|
||||
|
||||
set(CUB_INCLUDE_DIRS ${CUB_INCLUDE_DIR})
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(CUB
|
||||
FOUND_VAR CUB_FOUND
|
||||
REQUIRED_VARS CUB_INCLUDE_DIR
|
||||
FOUND_VAR CUB_FOUND
|
||||
REQUIRED_VARS CUB_INCLUDE_DIR
|
||||
)
|
||||
|
||||
mark_as_advanced(CUB_FOUND)
|
||||
|
@ -35,20 +35,20 @@ find_package_handle_standard_args(
|
||||
MIOPEN DEFAULT_MSG MIOPEN_INCLUDE_DIR MIOPEN_LIBRARY)
|
||||
|
||||
if(MIOPEN_FOUND)
|
||||
# get MIOpen version
|
||||
# get MIOpen version
|
||||
file(READ ${MIOPEN_INCLUDE_DIR}/version.h MIOPEN_HEADER_CONTENTS)
|
||||
string(REGEX MATCH "define MIOPEN_VERSION_MAJOR * +([0-9]+)"
|
||||
MIOPEN_VERSION_MAJOR "${MIOPEN_HEADER_CONTENTS}")
|
||||
string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR * +([0-9]+)" "\\1"
|
||||
MIOPEN_VERSION_MAJOR "${MIOPEN_VERSION_MAJOR}")
|
||||
string(REGEX MATCH "define MIOPEN_VERSION_MINOR * +([0-9]+)"
|
||||
MIOPEN_VERSION_MINOR "${MIOPEN_HEADER_CONTENTS}")
|
||||
string(REGEX REPLACE "define MIOPEN_VERSION_MINOR * +([0-9]+)" "\\1"
|
||||
MIOPEN_VERSION_MINOR "${MIOPEN_VERSION_MINOR}")
|
||||
string(REGEX MATCH "define MIOPEN_VERSION_PATCH * +([0-9]+)"
|
||||
MIOPEN_VERSION_PATCH "${MIOPEN_HEADER_CONTENTS}")
|
||||
string(REGEX REPLACE "define MIOPEN_VERSION_PATCH * +([0-9]+)" "\\1"
|
||||
MIOPEN_VERSION_PATCH "${MIOPEN_VERSION_PATCH}")
|
||||
string(REGEX MATCH "define MIOPEN_VERSION_MAJOR * +([0-9]+)"
|
||||
MIOPEN_VERSION_MAJOR "${MIOPEN_HEADER_CONTENTS}")
|
||||
string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR * +([0-9]+)" "\\1"
|
||||
MIOPEN_VERSION_MAJOR "${MIOPEN_VERSION_MAJOR}")
|
||||
string(REGEX MATCH "define MIOPEN_VERSION_MINOR * +([0-9]+)"
|
||||
MIOPEN_VERSION_MINOR "${MIOPEN_HEADER_CONTENTS}")
|
||||
string(REGEX REPLACE "define MIOPEN_VERSION_MINOR * +([0-9]+)" "\\1"
|
||||
MIOPEN_VERSION_MINOR "${MIOPEN_VERSION_MINOR}")
|
||||
string(REGEX MATCH "define MIOPEN_VERSION_PATCH * +([0-9]+)"
|
||||
MIOPEN_VERSION_PATCH "${MIOPEN_HEADER_CONTENTS}")
|
||||
string(REGEX REPLACE "define MIOPEN_VERSION_PATCH * +([0-9]+)" "\\1"
|
||||
MIOPEN_VERSION_PATCH "${MIOPEN_VERSION_PATCH}")
|
||||
# Assemble MIOpen version
|
||||
if(NOT MIOPEN_VERSION_MAJOR)
|
||||
set(MIOPEN_VERSION "?")
|
||||
|
@ -3,16 +3,16 @@
|
||||
# pybind11_INCLUDE_DIRS - the pybind11 include directory
|
||||
|
||||
find_path(pybind11_INCLUDE_DIR
|
||||
NAMES pybind11/pybind11.h
|
||||
DOC "The directory where pybind11 includes reside"
|
||||
NAMES pybind11/pybind11.h
|
||||
DOC "The directory where pybind11 includes reside"
|
||||
)
|
||||
|
||||
set(pybind11_INCLUDE_DIRS ${pybind11_INCLUDE_DIR})
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(pybind11
|
||||
FOUND_VAR pybind11_FOUND
|
||||
REQUIRED_VARS pybind11_INCLUDE_DIR
|
||||
FOUND_VAR pybind11_FOUND
|
||||
REQUIRED_VARS pybind11_INCLUDE_DIR
|
||||
)
|
||||
|
||||
mark_as_advanced(pybind11_FOUND)
|
||||
|
File diff suppressed because it is too large
@ -145,10 +145,10 @@ allowed values for these axes at the moment are:
|
||||
|
||||
.. tip::
|
||||
|
||||
There exist "Rust-style" shorthands for dtypes, like ``kF32`` instead of
|
||||
``kFloat32``. See `here
|
||||
<https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/types.h>`_
|
||||
for the full list.
|
||||
There exist "Rust-style" shorthands for dtypes, like ``kF32`` instead of
|
||||
``kFloat32``. See `here
|
||||
<https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/types.h>`_
|
||||
for the full list.
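For example, with the C++ frontend (a minimal sketch assuming a libtorch build; ``kF32`` is one of the shorthands listed in the linked ``types.h``):

```cpp
#include <torch/torch.h>

int main() {
  // kF32 is the shorthand spelling of kFloat32; both name the same dtype.
  auto options = torch::TensorOptions().dtype(torch::kF32).device(torch::kCPU);
  auto t = torch::zeros({2, 3}, options);
}
```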
|
||||
|
||||
|
||||
An instance of ``TensorOptions`` stores a concrete value for each of these
|
||||
@ -314,8 +314,8 @@ we can convert it from ``int64`` to ``float32``:
|
||||
|
||||
.. attention::
|
||||
|
||||
The result of the conversion, ``float_tensor``, is a new tensor pointing to
|
||||
new memory, unrelated to the source ``source_tensor``.
|
||||
The result of the conversion, ``float_tensor``, is a new tensor pointing to
|
||||
new memory, unrelated to the source ``source_tensor``.
|
||||
|
||||
We can then move it from CPU memory to GPU memory:
|
||||
|
||||
|
@ -5,7 +5,7 @@ pushd %~dp0
|
||||
REM Command file for Sphinx documentation
|
||||
|
||||
if "%SPHINXBUILD%" == "" (
|
||||
set SPHINXBUILD=sphinx-build
|
||||
set SPHINXBUILD=sphinx-build
|
||||
)
|
||||
set SOURCEDIR=source
|
||||
set BUILDDIR=build
|
||||
@ -15,15 +15,15 @@ if "%1" == "" goto help
|
||||
|
||||
%SPHINXBUILD% >NUL 2>NUL
|
||||
if errorlevel 9009 (
|
||||
echo.
|
||||
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||
echo.may add the Sphinx directory to PATH.
|
||||
echo.
|
||||
echo.If you don't have Sphinx installed, grab it from
|
||||
echo.http://sphinx-doc.org/
|
||||
exit /b 1
|
||||
echo.
|
||||
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||
echo.may add the Sphinx directory to PATH.
|
||||
echo.
|
||||
echo.If you don't have Sphinx installed, grab it from
|
||||
echo.http://sphinx-doc.org/
|
||||
exit /b 1
|
||||
)
|
||||
|
||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
|
||||
|
@ -770,34 +770,34 @@ Interpreting Graphs
|
||||
|
||||
The example script above produces the graph::
|
||||
|
||||
graph(%len : int) {
|
||||
%15 : int = prim::Constant[value=1]()
|
||||
%9 : bool = prim::Constant[value=1]()
|
||||
%7 : Device = prim::Constant[value="cpu"]()
|
||||
%6 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=6]()
|
||||
%1 : int = prim::Constant[value=3]()
|
||||
%2 : int = prim::Constant[value=4]()
|
||||
%11 : int = prim::Constant[value=10]()
|
||||
%14 : float = prim::Constant[value=1]()
|
||||
%4 : int[] = prim::ListConstruct(%1, %2)
|
||||
%rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)
|
||||
%rv : Tensor = prim::Loop(%len, %9, %rv.1)
|
||||
block0(%i : int, %13 : Tensor) {
|
||||
%12 : bool = aten::lt(%i, %11)
|
||||
%rv.4 : Tensor = prim::If(%12)
|
||||
block0() {
|
||||
%rv.2 : Tensor = aten::sub(%13, %14, %15)
|
||||
-> (%rv.2)
|
||||
}
|
||||
block1() {
|
||||
%rv.3 : Tensor = aten::add(%13, %14, %15)
|
||||
-> (%rv.3)
|
||||
}
|
||||
-> (%9, %rv.4)
|
||||
}
|
||||
return (%rv);
|
||||
}
|
||||
graph(%len : int) {
|
||||
%15 : int = prim::Constant[value=1]()
|
||||
%9 : bool = prim::Constant[value=1]()
|
||||
%7 : Device = prim::Constant[value="cpu"]()
|
||||
%6 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=6]()
|
||||
%1 : int = prim::Constant[value=3]()
|
||||
%2 : int = prim::Constant[value=4]()
|
||||
%11 : int = prim::Constant[value=10]()
|
||||
%14 : float = prim::Constant[value=1]()
|
||||
%4 : int[] = prim::ListConstruct(%1, %2)
|
||||
%rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)
|
||||
%rv : Tensor = prim::Loop(%len, %9, %rv.1)
|
||||
block0(%i : int, %13 : Tensor) {
|
||||
%12 : bool = aten::lt(%i, %11)
|
||||
%rv.4 : Tensor = prim::If(%12)
|
||||
block0() {
|
||||
%rv.2 : Tensor = aten::sub(%13, %14, %15)
|
||||
-> (%rv.2)
|
||||
}
|
||||
block1() {
|
||||
%rv.3 : Tensor = aten::add(%13, %14, %15)
|
||||
-> (%rv.3)
|
||||
}
|
||||
-> (%9, %rv.4)
|
||||
}
|
||||
return (%rv);
|
||||
}
|
||||
|
||||
|
||||
Take the instruction ``%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)`` for
|
||||
@ -850,39 +850,39 @@ Automatic Trace Checking
|
||||
traced = torch.jit.trace(loop_in_traced_fn, inputs, check_inputs=check_inputs)
|
||||
|
||||
Gives us the following diagnostic information::
|
||||
ERROR: Graphs differed across invocations!
|
||||
Graph diff::
|
||||
ERROR: Graphs differed across invocations!
|
||||
Graph diff::
|
||||
|
||||
graph(%x : Tensor) {
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%2 : int = prim::Constant[value=0]()
|
||||
%result.1 : Tensor = aten::select(%x, %1, %2)
|
||||
%4 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=0]()
|
||||
%6 : Tensor = aten::select(%x, %4, %5)
|
||||
%result.2 : Tensor = aten::mul(%result.1, %6)
|
||||
%8 : int = prim::Constant[value=0]()
|
||||
%9 : int = prim::Constant[value=1]()
|
||||
%10 : Tensor = aten::select(%x, %8, %9)
|
||||
- %result : Tensor = aten::mul(%result.2, %10)
|
||||
+ %result.3 : Tensor = aten::mul(%result.2, %10)
|
||||
? ++
|
||||
%12 : int = prim::Constant[value=0]()
|
||||
%13 : int = prim::Constant[value=2]()
|
||||
%14 : Tensor = aten::select(%x, %12, %13)
|
||||
+ %result : Tensor = aten::mul(%result.3, %14)
|
||||
+ %16 : int = prim::Constant[value=0]()
|
||||
+ %17 : int = prim::Constant[value=3]()
|
||||
+ %18 : Tensor = aten::select(%x, %16, %17)
|
||||
- %15 : Tensor = aten::mul(%result, %14)
|
||||
? ^ ^
|
||||
+ %19 : Tensor = aten::mul(%result, %18)
|
||||
? ^ ^
|
||||
- return (%15);
|
||||
? ^
|
||||
+ return (%19);
|
||||
? ^
|
||||
}
|
||||
graph(%x : Tensor) {
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%2 : int = prim::Constant[value=0]()
|
||||
%result.1 : Tensor = aten::select(%x, %1, %2)
|
||||
%4 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=0]()
|
||||
%6 : Tensor = aten::select(%x, %4, %5)
|
||||
%result.2 : Tensor = aten::mul(%result.1, %6)
|
||||
%8 : int = prim::Constant[value=0]()
|
||||
%9 : int = prim::Constant[value=1]()
|
||||
%10 : Tensor = aten::select(%x, %8, %9)
|
||||
- %result : Tensor = aten::mul(%result.2, %10)
|
||||
+ %result.3 : Tensor = aten::mul(%result.2, %10)
|
||||
? ++
|
||||
%12 : int = prim::Constant[value=0]()
|
||||
%13 : int = prim::Constant[value=2]()
|
||||
%14 : Tensor = aten::select(%x, %12, %13)
|
||||
+ %result : Tensor = aten::mul(%result.3, %14)
|
||||
+ %16 : int = prim::Constant[value=0]()
|
||||
+ %17 : int = prim::Constant[value=3]()
|
||||
+ %18 : Tensor = aten::select(%x, %16, %17)
|
||||
- %15 : Tensor = aten::mul(%result, %14)
|
||||
? ^ ^
|
||||
+ %19 : Tensor = aten::mul(%result, %18)
|
||||
? ^ ^
|
||||
- return (%15);
|
||||
? ^
|
||||
+ return (%19);
|
||||
? ^
|
||||
}
|
||||
|
||||
|
||||
This message indicates to us that the computation differed between when
|
||||
@ -912,19 +912,19 @@ Automatic Trace Checking
|
||||
|
||||
Which produces::
|
||||
|
||||
graph(%x : Tensor) {
|
||||
%5 : bool = prim::Constant[value=1]()
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%result.1 : Tensor = aten::select(%x, %1, %1)
|
||||
%4 : int = aten::size(%x, %1)
|
||||
%result : Tensor = prim::Loop(%4, %5, %result.1)
|
||||
block0(%i : int, %7 : Tensor) {
|
||||
%10 : Tensor = aten::select(%x, %1, %i)
|
||||
%result.2 : Tensor = aten::mul(%7, %10)
|
||||
-> (%5, %result.2)
|
||||
}
|
||||
return (%result);
|
||||
}
|
||||
graph(%x : Tensor) {
|
||||
%5 : bool = prim::Constant[value=1]()
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%result.1 : Tensor = aten::select(%x, %1, %1)
|
||||
%4 : int = aten::size(%x, %1)
|
||||
%result : Tensor = prim::Loop(%4, %5, %result.1)
|
||||
block0(%i : int, %7 : Tensor) {
|
||||
%10 : Tensor = aten::select(%x, %1, %i)
|
||||
%result.2 : Tensor = aten::mul(%7, %10)
|
||||
-> (%5, %result.2)
|
||||
}
|
||||
return (%result);
|
||||
}
|
||||
|
||||
Tracer Warnings
|
||||
^^^^^^^^^^^^^^^
|
||||
|
@ -213,8 +213,8 @@ Multiprocessing error without if-clause protection
|
||||
.. code-block:: py3tb
|
||||
|
||||
RuntimeError:
|
||||
An attempt has been made to start a new process before the
|
||||
current process has finished its bootstrapping phase.
|
||||
An attempt has been made to start a new process before the
|
||||
current process has finished its bootstrapping phase.
|
||||
|
||||
This probably means that you are not using fork to start your
|
||||
child processes and you have forgotten to use the proper idiom
|
||||
|
@ -8,24 +8,24 @@
|
||||
PyInit*;
|
||||
init*;
|
||||
state;
|
||||
_ZGVZN2at*;
|
||||
_ZGVZN2at*;
|
||||
_ZN2at*;
|
||||
_ZNK2at*Type*;
|
||||
_ZNK2at*Tensor*;
|
||||
_ZNK2at*Storage*;
|
||||
_ZNK2at*Scalar*;
|
||||
_ZNK2at*CUDA*;
|
||||
*2at7Context*;
|
||||
_ZTIN2at*;
|
||||
_ZTIZN2at*;
|
||||
_ZTSN2at*;
|
||||
_ZTSPN2at*;
|
||||
_ZTSZN2at*;
|
||||
_ZTVN2at*;
|
||||
_ZZN2at*;
|
||||
_Z*torch*;
|
||||
_Z*Tensor*;
|
||||
_Z*tensor*;
|
||||
_ZNK2at*Type*;
|
||||
_ZNK2at*Tensor*;
|
||||
_ZNK2at*Storage*;
|
||||
_ZNK2at*Scalar*;
|
||||
_ZNK2at*CUDA*;
|
||||
*2at7Context*;
|
||||
_ZTIN2at*;
|
||||
_ZTIZN2at*;
|
||||
_ZTSN2at*;
|
||||
_ZTSPN2at*;
|
||||
_ZTSZN2at*;
|
||||
_ZTVN2at*;
|
||||
_ZZN2at*;
|
||||
_Z*torch*;
|
||||
_Z*Tensor*;
|
||||
_Z*tensor*;
|
||||
local:
|
||||
*;
|
||||
};
|
||||
|
@ -18,9 +18,9 @@ struct ConvOptions {
|
||||
int64_t input_channels,
|
||||
int64_t output_channels,
|
||||
ExpandingArray<D> kernel_size) :
|
||||
input_channels_(input_channels),
|
||||
output_channels_(output_channels),
|
||||
kernel_size_(std::move(kernel_size)) {}
|
||||
input_channels_(input_channels),
|
||||
output_channels_(output_channels),
|
||||
kernel_size_(std::move(kernel_size)) {}
|
||||
|
||||
/// The number of channels the input volumes will have.
|
||||
/// Changing this parameter after construction __has no effect__.
|
||||
|
@ -370,21 +370,21 @@ As the trace runs, individual operators create Nodes in the Graph being traced t
|
||||
torch::jit::Node* node = nullptr;
|
||||
std::shared_ptr<jit::tracer::TracingState> tracer_state;
|
||||
if (jit::tracer::isTracing()) {
|
||||
tracer_state = jit::tracer::getTracingState();
|
||||
at::Symbol op_name;
|
||||
op_name = jit::Symbol::fromQualString("aten::__ilshift__");
|
||||
node = tracer_state->graph->create(op_name, /*num_outputs=*/0);
|
||||
jit::tracer::recordSourceLocation(node);
|
||||
jit::tracer::addInputs(node, "self", self);
|
||||
jit::tracer::addInputs(node, "other", other);
|
||||
tracer_state->graph->insertNode(node);
|
||||
tracer_state = jit::tracer::getTracingState();
|
||||
at::Symbol op_name;
|
||||
op_name = jit::Symbol::fromQualString("aten::__ilshift__");
|
||||
node = tracer_state->graph->create(op_name, /*num_outputs=*/0);
|
||||
jit::tracer::recordSourceLocation(node);
|
||||
jit::tracer::addInputs(node, "self", self);
|
||||
jit::tracer::addInputs(node, "other", other);
|
||||
tracer_state->graph->insertNode(node);
|
||||
|
||||
jit::tracer::setTracingState(nullptr);
|
||||
jit::tracer::setTracingState(nullptr);
|
||||
}
|
||||
TypeDefault::__ilshift__(self, other);
|
||||
if (tracer_state) {
|
||||
jit::tracer::setTracingState(std::move(tracer_state));
|
||||
jit::tracer::addOutput(node, self);
|
||||
jit::tracer::setTracingState(std::move(tracer_state));
|
||||
jit::tracer::addOutput(node, self);
|
||||
}
|
||||
```
|
||||
|
||||
@ -412,15 +412,15 @@ Our frontends produce ASTs in the form of Tree objects. Trees are similar to [s-
|
||||
|
||||
```
|
||||
(-
|
||||
(+
|
||||
(variable (ident x))
|
||||
(variable (ident y)))
|
||||
(apply
|
||||
(.
|
||||
(variable (ident z))
|
||||
(ident sigmoid))
|
||||
(list)
|
||||
(list))))
|
||||
(+
|
||||
(variable (ident x))
|
||||
(variable (ident y)))
|
||||
(apply
|
||||
(.
|
||||
(variable (ident z))
|
||||
(ident sigmoid))
|
||||
(list)
|
||||
(list))))
|
||||
```
|
||||
|
||||
This is printed in s-expression style with `(kind ...)` representing compound trees and `string_value` representing strings.
|
||||
@ -454,16 +454,16 @@ The typical way to traverse a tree is to `switch` on the kind and then construct
|
||||
```cpp
|
||||
switch (tree.kind()) {
|
||||
case TK_VAR:
|
||||
auto var = Var(tree); // construct tree-view
|
||||
return environment_stack->getSugaredVar(var.name());
|
||||
auto var = Var(tree); // construct tree-view
|
||||
return environment_stack->getSugaredVar(var.name());
|
||||
case '.': {
|
||||
auto select = Select(tree); // construct tree-view
|
||||
auto sv = emitSugaredExpr(select.value(), 1);
|
||||
return sv->attr(select.range(), method, select.selector().name());
|
||||
auto select = Select(tree); // construct tree-view
|
||||
auto sv = emitSugaredExpr(select.value(), 1);
|
||||
return sv->attr(select.range(), method, select.selector().name());
|
||||
}
|
||||
case TK_APPLY: {
|
||||
auto apply = Apply(tree); // construct tree-view
|
||||
return emitApplyExpr(apply, n_binders);
|
||||
auto apply = Apply(tree); // construct tree-view
|
||||
return emitApplyExpr(apply, n_binders);
|
||||
} break;
|
||||
|
||||
```
|
||||
@ -507,7 +507,7 @@ Tokens are either keywords (`def`), operators (`+`), literals (`3.4`), or identi
|
||||
|
||||
```cpp
|
||||
if (lexer.nextIf('+')) {
|
||||
// handle + ...
|
||||
// handle + ...
|
||||
}
|
||||
```
|
||||
|
||||
@ -650,10 +650,10 @@ using Operation = std::function<int(Stack&)>;
|
||||
|
||||
// schema: example_add(Tensor a, Tensor b) -> Tensor
|
||||
int example_add(Stack& stack) {
|
||||
Tensor a, b;
|
||||
// stack before: ? ? ? a b <- back
|
||||
pop(stack, a, b); //Templated helper function
|
||||
// that pops a, b and converts them to tensor
|
||||
Tensor a, b;
|
||||
// stack before: ? ? ? a b <- back
|
||||
pop(stack, a, b); //Templated helper function
|
||||
// that pops a, b and converts them to tensor
|
||||
push(stack, a + b);
|
||||
// stack after:
|
||||
// ? ? ? c <- back
|
||||
@ -1126,7 +1126,7 @@ As a more involved example, the following TorchScript snippet:
|
||||
```python
|
||||
@torch.jit.script
|
||||
def foo(a : Tensor, b : Tensor):
|
||||
c = 2 * b
|
||||
c = 2 * b
|
||||
a += 1
|
||||
if a.max() > 4:
|
||||
r = a[0]
|
||||
|
@ -71,15 +71,15 @@ public:
|
||||
auto size = tuple ? PyTuple_GET_SIZE(source) : PyList_GET_SIZE(source);
|
||||
v_value.resize(size);
|
||||
for (int idx = 0; idx < size; idx++) {
|
||||
PyObject* obj = tuple ? PyTuple_GET_ITEM(source, idx) : PyList_GET_ITEM(source, idx);
|
||||
if (THPVariable_Check(obj)) {
|
||||
v_value[idx] = THPVariable_Unpack(obj).item<int64_t>();
|
||||
} else if (PyLong_Check(obj)) {
|
||||
// use THPUtils_unpackLong after it is safe to include python_numbers.h
|
||||
v_value[idx] = THPUtils_unpackLong(obj);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
PyObject* obj = tuple ? PyTuple_GET_ITEM(source, idx) : PyList_GET_ITEM(source, idx);
|
||||
if (THPVariable_Check(obj)) {
|
||||
v_value[idx] = THPVariable_Unpack(obj).item<int64_t>();
|
||||
} else if (PyLong_Check(obj)) {
|
||||
// use THPUtils_unpackLong after it is safe to include python_numbers.h
|
||||
v_value[idx] = THPUtils_unpackLong(obj);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
value = v_value;
|
||||
return true;
|
||||
|
@ -195,7 +195,7 @@ ScalarType numpy_dtype_to_aten(int dtype) {
|
||||
|
||||
bool is_numpy_scalar(PyObject* obj) {
|
||||
return (PyArray_IsIntegerScalar(obj) ||
|
||||
PyArray_IsScalar(obj, Floating));
|
||||
PyArray_IsScalar(obj, Floating));
|
||||
}
|
||||
|
||||
}} // namespace torch::utils
|
||||
|