Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
Convert all tabs to spaces, add CI. (#18959)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18959
ghimport-source-id: a934163fa34cb2019732d5f49dc7290c376bf156
Differential Revision: D14831246
Pulled By: ezyang
fbshipit-source-id: beb92dc4ee8c82f4c8259c081dd72e477fe7a9d0
Committed by: Facebook Github Bot
Parent: 544783fa1d
Commit: 48a35135fb
@@ -16,6 +16,10 @@ matrix:
       python: "3.6"
       dist: xenial
       script: cd .circleci && ./ensure-consistency.py
+    - name: "Ensure no tabs"
+      python: "2.7"
+      script:
+        - (! git grep -I -l $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above files have tabs; please convert them to spaces"; false))
     - name: "Python 2.7 Lint"
       python: "2.7"
       install: pip install flake8
@@ -252,21 +252,21 @@ IF(USE_CUDA AND NOT USE_ROCM)
  EXECUTE_PROCESS(COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc)
  if(${CUDA_VERSION_MAJOR} EQUAL "8")
    SET(CUFFT_FAKELINK_OPTIONS
      --generate-code arch=compute_35,code=sm_35
      --generate-code arch=compute_50,code=sm_50
      --generate-code arch=compute_60,code=sm_60)
  elseif(${CUDA_VERSION_MAJOR} EQUAL "9")
    SET(CUFFT_FAKELINK_OPTIONS
      --generate-code arch=compute_35,code=sm_35
      --generate-code arch=compute_50,code=sm_50
      --generate-code arch=compute_60,code=sm_60
      --generate-code arch=compute_70,code=sm_70)
  elseif(${CUDA_VERSION_MAJOR} EQUAL "10")
    SET(CUFFT_FAKELINK_OPTIONS
      --generate-code arch=compute_35,code=sm_35
      --generate-code arch=compute_50,code=sm_50
      --generate-code arch=compute_60,code=sm_60
      --generate-code arch=compute_70,code=sm_70)
  else()
    MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}")
  endif()

@@ -19,7 +19,7 @@
 /* GCC-compatible compiler, targeting ARM with WMMX */
 #include <mmintrin.h>
 #elif (defined(__GNUC__) || defined(__xlC__)) && \
     (defined(__VEC__) || defined(__ALTIVEC__))
 /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
 #include <altivec.h>
 #elif defined(__GNUC__) && defined(__SPE__)
@@ -46,12 +46,12 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
    auto sr = right.size(i)>1;
    if (sum_dims[i]) { // first dimensions that will be summed over after multiplication
      if (sl && sr) {  // dimensions nontrivially in both left and right must be of the same size
        AT_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match");
        sum_size *= left.size(i);
      } else if (sl) { // if it is only in one of left and right, we can sum right away
        left = left.sum(i, true);
      } else if (sr) {
        right = right.sum(i, true);
      }
    } else if (sl && sr) { // now deal with dimensions that will be in the output
      // dimensions nontrivially in both left and right must be of the same size
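Read together, the two branches above amount to the following (my summary of the surrounding code, not a statement from the diff): sumproduct_pair forms a broadcast elementwise product and reduces it over the requested dimensions,

    out = \sum_{i \in \text{sum\_dims}} (\text{left} \odot \text{right})_i ,

with a dimension that is non-trivial in only one operand summed out immediately, so the intermediate product never has to hold it.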
@@ -117,7 +117,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
  if (! keepdim) {
    for (int i = dim-1; i>=0; i--)
      if (sum_dims[i])
        result.squeeze_(i);
  }
  return result;
}

@@ -183,7 +183,7 @@ Tensor einsum(std::string eqn, TensorList tensors) {
      }
      else { // we have seen an ellipsis before, so we check compatibility
        AT_CHECK(candidate_num_ell_idxes == num_ell_idxes,
                 "ellipsis must represent ", num_ell_idxes, " dimensions in all terms");
      }
      for (int64_t i = 0; i < num_ell_idxes; ++i) { // map ellipsis dimensions in operand to indices
        current_op_idxes.push_back(first_ell_idx + i);
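A minimal usage sketch of the einsum entry point whose ellipsis bookkeeping is checked above; the shapes and the batched-matmul equation are my own illustration, not part of the diff. The "..." must stand for the same number of dimensions in every term:

    #include <ATen/ATen.h>

    int main() {
      at::Tensor a = at::randn({4, 2, 3});
      at::Tensor b = at::randn({4, 3, 5});
      // "..." covers the shared batch dimension in both operands and the output
      at::Tensor c = at::einsum("...ij,...jk->...ik", {a, b});
      return (c.size(0) == 4 && c.size(1) == 2 && c.size(2) == 5) ? 0 : 1;
    }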
@@ -360,8 +360,8 @@ Tensor einsum(std::string eqn, TensorList tensors) {
// the computation is unrolled in the unroll_dim dimension
// its main purpose is to unify the computations in bilinear and bilinear_backward
Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_,
                  IntArrayRef expand1_, IntArrayRef expand2_, IntArrayRef expand3_,
                  IntArrayRef sumdim_, int64_t unroll_dim) {
  int64_t total_dim = i1_.dim()+expand1_.size();
  AT_CHECK((unroll_dim >= 0) && (unroll_dim < total_dim), "unroll_dim must be in [0,", total_dim-1, "]");
  auto expand1 = at::dim_list_to_bitset(expand1_, total_dim);
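Pieced together from the signature and the loops below (my reading, not stated verbatim in the diff): each input is unsqueezed at its expand*_ dimensions so that all three tensors share a common total_dim-dimensional index space, and the routine evaluates

    out = \sum_{d \in \text{sumdim}} \tilde{i}_1 \odot \tilde{i}_2 \odot \tilde{i}_3 ,

one unroll_dim slice at a time via two sumproduct_pair calls, so the full three-way product is never materialized. The bilinear layer, y_{b,o} = \sum_{i,j} x^{(1)}_{b,i} W_{o,i,j} x^{(2)}_{b,j}, and its backward are both expressible in this form, which is the unification the comment refers to.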
@@ -390,11 +390,11 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_,
    if (expand3[i]) {
      i3 = i3.unsqueeze(i);
      if (sumdim[i] && (i != unroll_dim))
        sum_dims_12.push_back(i);
    } else {
      s = i3.size(i);
      if (sumdim[i] && (i != unroll_dim))
        sum_dims_23.push_back(i);
    }
    output_size.push_back(sumdim[i] ? 1 : s);
    if (i == unroll_dim)

@@ -408,8 +408,8 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_,
  if (! sumdim[unroll_dim]) {
    for (int64_t k = 0; k < unroll_size; k++) {
      Tensor buf = at::native::sumproduct_pair(i1.narrow(unroll_dim, k * slicemul1, 1),
                                               i2.narrow(unroll_dim, k * slicemul2, 1),
                                               sum_dims_12, true);
      buf = at::native::sumproduct_pair(buf, i3.narrow(unroll_dim, k * slicemul3, 1), sum_dims_23, true);
      output.narrow(unroll_dim, k, 1).add_(buf);
    }

@@ -417,7 +417,7 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_,
  else {
    for (int64_t k = 0; k < unroll_size; k++) {
      Tensor buf = at::native::sumproduct_pair(i1.narrow(unroll_dim, k*slicemul1, 1),
                                               i2.narrow(unroll_dim, k*slicemul2, 1), sum_dims_12, true);
      buf = at::native::sumproduct_pair(buf, i3.narrow(unroll_dim, k*slicemul3, 1), sum_dims_23, true);
      output.add_(buf);
    }
@@ -473,7 +473,7 @@ Tensor tensordot(const Tensor& input1, const Tensor& input2, IntArrayRef dims1,
      t2 = t2.sum(dims2[i], true);
    } else {
      AT_CHECK(s1 == s2, "contracted dimensions need to match, but first has size ", s1, " in dim ", dims1[i],
               " and second has size ", s2, " in dim ", dims2[i]);
      csize *= s1;
    }
  }
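For context (the standard tensordot semantics, not restated in this hunk): the loop matches each contracted pair (dims1[i], dims2[i]) and accumulates csize as the product of the matched sizes; up to the reshapes elsewhere in the function, the result is the matrix product

    out_{a,b} = \sum_{k=1}^{\text{csize}} \hat{A}_{a,k}\, \hat{B}_{k,b} ,

where \hat{A} and \hat{B} are input1 and input2 with their non-contracted dimensions flattened into a and b. The t2.sum(dims2[i], true) branch above sums out a contracted dimension that is of size 1 on the other operand (its guarding condition sits outside this hunk).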
@@ -61,7 +61,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
      tg_batch_offsets[i] = pos;
      pos += target_lengths[i];
      if (max_target_length < target_lengths[i])
        max_target_length = target_lengths[i];
    }
    tg_target_stride = targets.stride(0);
    checkSize(c, targets_arg, 0, pos);

@@ -83,8 +83,8 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
  int64_t max_input_length = log_probs.size(0);
  for (int64_t b = 0; b < batch_size; b++) {
    AT_CHECK(input_lengths[b] <= max_input_length,
             "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", input_lengths[b], " for ", log_probs_arg,
             " (while checking arguments for ", c, ")");
  }

  Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options());
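The 2*max_target_length+1 extent of log_alpha comes from CTC's augmented target sequence (the standard construction, not spelled out in this hunk): for a target l = (l_1, ..., l_L) the recursion runs over

    l' = (\varnothing, l_1, \varnothing, l_2, \ldots, \varnothing, l_L, \varnothing), \qquad |l'| = 2L + 1,

with \varnothing the blank label; get_target_prime used below maps an index s into l' (even s gives the blank, odd s gives l_{(s+1)/2}).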
@@ -115,11 +115,11 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
      // now the loop over the inputs
      for (int64_t t=1; t<input_length; t++) {
        for (int64_t s=0; s<2*target_length+1; s++) {
          auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK);
          // this loop over s could be parallel/vectorized, too, but the required items are one index apart
          // alternatively, one might consider moving s to the outer loop to cache current_target_prime more (but then it needs to be descending)
          // for the cuda implementation, that gave a speed boost.
          // This is eq (6) and (7), la1,2,3 are the three summands. We keep track of the maximum for the logsumexp calculation.

          scalar_t la1 = log_alpha_a[t-1][s];
          scalar_t lamax = la1;

@@ -141,7 +141,7 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const
          }
          if (lamax == neginf) // cannot do neginf-neginf
            lamax = 0;
          // this is the assignment of eq (6)
          log_alpha_a[t][s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax + log_probs_a[t][current_target_prime];
        }
      }
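The lamax bookkeeping above is the usual log-sum-exp stabilization: with m = max(a_1, a_2, a_3),

    \log\big(e^{a_1} + e^{a_2} + e^{a_3}\big) = m + \log\big(e^{a_1 - m} + e^{a_2 - m} + e^{a_3 - m}\big),

so none of the exponentials can overflow; the lamax == neginf special case sets m = 0 to avoid evaluating -\infty - (-\infty) when all three summands are -\infty (the result is then -\infty, as desired).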
@@ -182,7 +182,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_
      tg_batch_offsets[i] = pos;
      pos += target_lengths[i];
      if (max_target_length < target_lengths[i])
        max_target_length = target_lengths[i];
    }
    tg_target_stride = targets.stride(0);
  }

@@ -268,9 +268,9 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_

          log_beta_a[t][s] = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax + log_probs_a[t][current_target_prime];
          // one might check whether one can vectorize this better when done after the t-loop...
          // now that we have beta, we fill in the sum of alpha*beta in eq (16)
          // in contrast to the cuda implementation, we only parallelize over the batch, so we don't have a concurrency
          // issue (several s can map to the same target character)
          // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s]
          scalar_t log_alpha_beta = log_alpha_a[t][s] + log_beta_a[t][s];
          scalar_t &lcab = grad_a[t][current_target_prime];
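The "eq (16)" referred to above is the standard CTC gradient (Graves et al., 2006; my restatement, not part of the diff). With alpha_t(s), beta_t(s) the forward/backward variables over l' and p(l|x) the total label probability,

    \frac{\partial(-\log p(l\mid x))}{\partial u_{t,k}} = y_{t,k} - \frac{1}{p(l\mid x)\, y_{t,k}} \sum_{s:\, l'_s = k} \alpha_t(s)\,\beta_t(s),

where y_{t,k} is the probability of label k at time t and u_{t,k} the corresponding pre-softmax score. The lcab accumulation collects the log of the inner sum; the exp(... + nll - lp) expressions in the CUDA kernels further down compute the same second term.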
@@ -309,9 +309,9 @@ std::tuple<Tensor, Tensor> ctc_loss_cpu(const Tensor& log_probs, const Tensor& t
  (void)zero_infinity; // only used for backwards
  return AT_DISPATCH_FLOATING_TYPES(log_probs.scalar_type(), "ctc_loss_cpu", [&] {
    if (targets.scalar_type() == kLong) {
      return ctc_loss_cpu_template<scalar_t, kLong>(log_probs, targets, input_lengths, target_lengths, BLANK);
    } else {
      return ctc_loss_cpu_template<scalar_t, kInt>(log_probs, targets, input_lengths, target_lengths, BLANK);
    }
  });
}
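AT_DISPATCH_FLOATING_TYPES picks the template instantiation from the runtime dtype and runs the lambda with scalar_t bound. A self-contained sketch of that dispatch-by-dtype pattern (a simplified stand-in of my own, not PyTorch's actual macro):

    #include <cstdio>
    #include <stdexcept>

    enum class ScalarType { Float, Double };

    template <typename F>
    double dispatch_floating_types(ScalarType t, F&& body) {
      switch (t) {
        case ScalarType::Float:  return body(float{});   // lambda sees scalar_t = float
        case ScalarType::Double: return body(double{});  // lambda sees scalar_t = double
      }
      throw std::runtime_error("unsupported dtype");
    }

    int main() {
      double bytes = dispatch_floating_types(ScalarType::Float, [](auto tag) {
        using scalar_t = decltype(tag);   // type-dependent work goes here,
        return double(sizeof(scalar_t));  // e.g. calling a <scalar_t, ...> template
      });
      std::printf("%g\n", bytes);  // prints 4
      return 0;
    }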
@@ -320,9 +320,9 @@ Tensor ctc_loss_backward_cpu(const Tensor& grad, const Tensor& log_probs, const
                             const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK, bool zero_infinity) {
  return AT_DISPATCH_FLOATING_TYPES(log_probs.scalar_type(), "ctc_loss_backward_cpu", [&] {
    if (targets.scalar_type() == kLong) {
      return ctc_loss_backward_cpu_template<scalar_t,kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
    } else {
      return ctc_loss_backward_cpu_template<scalar_t,kInt>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
    }
  });
}

@@ -76,11 +76,11 @@ pthreadpool_t nnpack_threadpool() {
  enum nnp_status nnpack_status = nnp_initialize();
  if (nnpack_status != nnp_status_success) {
    if (nnpack_status == nnp_status_out_of_memory) {
      throw std::runtime_error("could not initialize NNPack (out of memory)");
    } else if (nnpack_status == nnp_status_unsupported_hardware) {
      throw std::runtime_error("could not initialize NNPack (unsupported hardware)");
    } else {
      throw std::runtime_error("could not initialize NNPack (unknown error)");
    }
  }
  unsigned int threads;

@@ -614,7 +614,7 @@ std::tuple<Tensor, Tensor> NAME( \
        num_layers, dropout_p, train, bidirectional, batch_first); \
    return std::make_tuple(output, hy); \
  } \
  check_device(_input, _params, hx); \
  auto input = batch_first ? _input.transpose(0, 1) : _input; \
  auto params = gather_params(_params, has_biases); \
  auto results = _rnn_impl_with_concat<CELL, FullLayer, FullBidirectionalLayer>( \
@@ -126,10 +126,10 @@ Tensor& arange_cpu_out(Tensor& result, Scalar start, Scalar end, Scalar step) {
    double size_d;
    if (std::is_same<scalar_t, int64_t>::value) {
      size_d = std::ceil(static_cast<double>(end.to<accscalar_t>() - start.to<accscalar_t>())
                         / step.to<accscalar_t>());
    } else {
      size_d = std::ceil(static_cast<double>(end.to<double>() - start.to<double>())
                         / step.to<double>());
    }

    AT_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
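The computed element count is (worked example mine, not from the diff)

    N = \left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil ,

so arange(0, 5, 2) allocates N = \lceil 5/2 \rceil = 3 elements, namely 0, 2, 4; the AT_CHECK below rejects step = 0, for which the division would be meaningless.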
@@ -100,7 +100,7 @@ typedef union imm_xmm_union {

#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) {         \
    imm_xmm_union u __attribute__((aligned(32)));     \
    u.imm = imm_;                                     \
    xmm0_ = u.xmm[0];                                 \
    xmm1_ = u.xmm[1];                                 \
}

@@ -228,8 +228,8 @@ inline v8sf log256_ps(v8sf x) {
  return x;
}

_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);

_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
@@ -266,7 +266,7 @@ public:
    CUFFT_CHECK(hipfftMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
      /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
      /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1,
      exec_type, batch, &ws_size_t));
#else
    CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
      /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,

@@ -87,10 +87,10 @@ __global__ void embedding_backward_feature_kernel
        match_found_this_thread = 0;
#ifdef __HIP_PLATFORM_HCC__
      unsigned long long int matchmask = WARP_BALLOT(match_found_this_thread);
      int first_remaining_peer = __ffsll(matchmask) - 1;
#else
      unsigned int matchmask = WARP_BALLOT(match_found_this_thread);
      int first_remaining_peer = __ffs(matchmask) - 1;
#endif

      if(threadIdx.y == first_remaining_peer) // Nominate lowest-indexed warp as the leader

@@ -103,7 +103,7 @@ __global__ void embedding_backward_feature_kernel
#else
          first_remaining_peer = __ffs(matchmask) - 1;
#endif
          my_s[threadIdx.x] += smem[threadIdx.x + WARP_SIZE*first_remaining_peer];
          matchmask ^= (1 << first_remaining_peer);
        }
        if(f < s)
@@ -110,8 +110,8 @@ ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data,
  for (int64_t t=1; t < max_input_length; t++) {
    __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch
    if ((t < input_length) && (target_length > 0) && (s < 2*target_length+1)) {
      // only for valid t, s. This is equation (6) and (7), la1, la2, la3 are the three summands,
      // lamax is the maximum for the logsumexp trick.
      scalar_t la1 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * s];
      scalar_t lamax = la1;
      scalar_t la2, la3;

@@ -135,7 +135,7 @@ ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data,
      log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax
                                                                                     + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_char];
    } else {
      // otherwise we just set to neginf
      if (s < 2*max_target_length+1)
        log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = neginf;
    }

@@ -218,8 +218,8 @@ std::tuple<Tensor, Tensor> ctc_loss_gpu_template(const Tensor& log_probs, const
  int64_t max_input_length = log_probs.size(0);
  for (int64_t b = 0; b < batch_size; b++) {
    AT_CHECK(input_lengths[b] <= max_input_length,
             "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg,
             " (while checking arguments for ", c, ")");
  }

  auto target_lengths_t = at::tensor(target_lengths, targets.options().dtype(kLong));
@@ -242,7 +242,7 @@ std::tuple<Tensor, Tensor> ctc_loss_gpu_template(const Tensor& log_probs, const
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  ctc_loss_log_alpha_gpu_kernel<scalar_t, target_t><<<grid, block, 0, stream>>>(
      log_alpha.data<scalar_t>(),
      log_probs.data<scalar_t>(), input_lengths_t.data<int64_t>(), log_probs.size(0),
      targets.data<target_t>(), target_lengths_t.data<int64_t>(), max_target_length,
      neg_log_likelihood.data<scalar_t>(),

@@ -304,8 +304,8 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
      if (s < 2*target_length+1) {
        current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK);
        have_three = ((s < 2*target_length-1) &&
                      (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) !=
                       current_target_prime));
      } else {
        current_target_prime = BLANK;
        have_three = false;

@@ -377,7 +377,7 @@ ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_da
    int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride,
    int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride,
    const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride,
    int64_t batch_size, int64_t num_labels, int64_t BLANK, bool zero_infinity) {
  int64_t b = threadIdx.y + blockIdx.y * blockDim.y;
  int64_t s = threadIdx.x + blockIdx.x * blockDim.y; // note, this directly indexes into targets, no targets prime!

@@ -405,9 +405,9 @@ ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_da
  for (int64_t t = 0; t < input_length; t++) {
    scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * target];
    atomicAdd(&gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * target],
              -std::exp(log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * (s*2+1)]
                        + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * (s*2+1)]
                        + nll - lp) * gr);
  }
}

@@ -429,7 +429,7 @@ ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data,
    int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride,
    int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride,
    const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride,
    int64_t batch_size, int64_t num_labels, int64_t BLANK, bool zero_infinity) {

  constexpr scalar_t neginf = -INFINITY;
  int64_t b = threadIdx.y + blockIdx.y * blockDim.y;

@@ -481,7 +481,7 @@ ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data,
// We don't do a lot of checking as we envision this to be called only when backpropagating through a (well-checked) forward.
template<typename scalar_t, ScalarType target_scalar_type>
Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets, IntArrayRef input_lengths, IntArrayRef target_lengths,
                                      const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK, bool zero_infinity) {
  constexpr scalar_t neginf = -INFINITY;
  using target_t = typename std::conditional<target_scalar_type == kInt, int, int64_t>::type;
  int64_t batch_size = log_probs.size(1);
@@ -500,7 +500,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_
      tg_batch_offsets_data[i] = pos;
      pos += target_lengths[i];
      if (max_target_length < target_lengths[i])
        max_target_length = target_lengths[i];
    }
    tg_target_stride = targets.stride(0);
  }
@@ -558,15 +558,15 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_
    // maybe we should kernelize this, too.
    auto grad_blank = grad.narrow(2, BLANK, 1);
    grad_blank -= (at::logsumexp(log_alpha.as_strided({batch_size, log_alpha.size(1), max_target_length+1},
                                                      {log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2)*2})
                                 + log_beta.as_strided({batch_size, log_beta.size(1), max_target_length+1},
                                                       {log_beta.stride(0), log_beta.stride(1), log_beta.stride(2)*2}),
                                 2, true)
                   .permute({1, 0, 2})
                   .add_(neg_log_likelihood.view({1, batch_size, 1}))
                   .sub_(log_probs.narrow(2, BLANK, 1))
                   .exp_()
                   );
    // scale by output gradient (blanks and first summand of non-blanks)
    grad *= grad_out.view({1, batch_size, 1});
    if (zero_infinity) {

@@ -630,9 +630,9 @@ std::tuple<Tensor, Tensor> ctc_loss_gpu(const Tensor& log_probs, const Tensor& t
  (void)zero_infinity; // only used for backward
  return AT_DISPATCH_FLOATING_TYPES(log_probs.scalar_type(), "ctc_loss_cuda", [&] {
    if (targets.scalar_type() == kLong) {
      return ctc_loss_gpu_template<scalar_t, kLong>(log_probs, targets, input_lengths, target_lengths, BLANK);
    } else {
      return ctc_loss_gpu_template<scalar_t, kInt>(log_probs, targets, input_lengths, target_lengths, BLANK);
    }
  });
}

@@ -641,9 +641,9 @@ Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const
                             const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK, bool zero_infinity) {
  return AT_DISPATCH_FLOATING_TYPES(log_probs.scalar_type(), "ctc_loss_backward_cuda", [&] {
    if (targets.scalar_type() == kLong) {
      return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
    } else {
      return ctc_loss_backward_gpu_template<scalar_t, kInt>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
    }
  });
}

@@ -146,10 +146,10 @@ Tensor& arange_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) {
    double size_d;
    if (std::is_same<scalar_t, int64_t>::value) {
      size_d = std::ceil(static_cast<double>(end.to<accscalar_t>() - start.to<accscalar_t>())
                         / step.to<accscalar_t>());
    } else {
      size_d = std::ceil(static_cast<double>(end.to<double>() - start.to<double>())
                         / step.to<double>());
    }

    AT_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");

@@ -441,18 +441,18 @@ std::tuple<Tensor, Tensor> weight_norm_cuda_backward
    {
      using accscalar_t = acc_type<scalar_t, true>;

      weight_norm_bwd_first_dim_kernel<scalar_t, accscalar_t>
        <<<grad_w.size(0),
           BLOCK,
           BLOCK*sizeof(accscalar_t),
           stream>>>
        (grad_v.data<scalar_t>(),
         grad_g.data<scalar_t>(),
         grad_w.data<scalar_t>(),
         saved_v.data<scalar_t>(),
         saved_g.data<scalar_t>(),
         saved_norms.data<accscalar_t>(),
         rowSize);
    });
  }
  else if(dim == ndims - 1)
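The arguments passed to the kernel line up with the usual weight-normalization gradients (Salimans & Kingma; my summary, not stated in this hunk): with w = g v / ||v|| taken per slice,

    \frac{\partial L}{\partial g} = \frac{\partial L}{\partial w} \cdot \frac{v}{\|v\|}, \qquad
    \frac{\partial L}{\partial v} = \frac{g}{\|v\|}\,\frac{\partial L}{\partial w} - \frac{g\left(\frac{\partial L}{\partial w} \cdot v\right)}{\|v\|^{3}}\, v,

which is what grad_g and grad_v receive from grad_w, saved_v, saved_g, and saved_norms; the launch configuration runs one block per slice of the first dimension, with BLOCK*sizeof(accscalar_t) bytes of dynamic shared memory for the per-slice reduction.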
@@ -72,17 +72,17 @@ std::tuple<Tensor, Tensor> _cudnn_ctc_loss(const Tensor& log_probs_t, const Tens

  size_t workspace_size;
  AT_CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(handle, probs_desc.desc(), grad_desc.desc(),
                                              targets->data<int>(), target_lengths.data(), input_lengths.data(),
                                              algo, ctc_loss_desc.desc(), &workspace_size));


  Tensor workspace = at::empty(workspace_size, log_probs->options().dtype(kByte));
  Tensor costs = at::empty({log_probs->size(1)}, log_probs->options());

  AT_CUDNN_CHECK(cudnnCTCLoss(handle, probs_desc.desc(), probs.data_ptr(),
                              targets->data<int>(), target_lengths.data(), input_lengths.data(),
                              costs.data_ptr(), grad_desc.desc(), grad.data_ptr(), algo,
                              ctc_loss_desc.desc(), workspace.data_ptr(), workspace_size));

  return std::make_tuple(costs, grad);
}
@@ -456,7 +456,7 @@ struct algorithm_search<miopenConvFwdAlgorithm_t> {
        args.wdesc.desc(), args.weight.data_ptr(),
        args.cdesc.desc(),
        args.odesc.desc(), args.output.data_ptr(),
        1,        // just return the fastest
        &perf_count,
        &perf_results,
        ws.data,

@@ -5,12 +5,12 @@

#define THLapack_(NAME) TH_CONCAT_4(TH,Real,Lapack_,NAME)

#define THLapackCheck(fmt, func, info , ...) \
if (info < 0) { \
  THError("Lapack Error in %s : Illegal Argument %d", func, -info); \
} else if(info > 0) { \
  THError(fmt, func, info, ##__VA_ARGS__); \
} \

#define THLapackCheckWithCleanup(fmt, cleanup, func, info , ...) \
if (info < 0) { \
@@ -14,7 +14,7 @@ typedef struct THMemoryFile__
    THCharStorage *storage;
    ssize_t size;
    ssize_t position;
    int longSize;

} THMemoryFile;

@@ -149,13 +149,13 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)

  /* get optimal workspace size */
  THLapack_(gels)('N', m, n, nrhs, ra__->data<scalar_t>(), lda,
                  rb__->data<scalar_t>(), ldb,
                  &wkopt, -1, &info);
  lwork = (int)wkopt;
  work = THTensor_(newWithSize1d)(lwork);
  THLapack_(gels)('N', m, n, nrhs, ra__->data<scalar_t>(), lda,
                  rb__->data<scalar_t>(), ldb,
                  work->data<scalar_t>(), lwork, &info);

  THLapackCheckWithCleanup("Lapack Error in %s : The %d-th diagonal element of the triangular factor of A is zero",
                           THCleanup(c10::raw::intrusive_ptr::decref(ra__);
@@ -378,21 +378,21 @@ void THTensor_(gesdd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra
  }

  THLapack_(gesdd)(jobz,
                   m,n,ra__->data<scalar_t>(),lda,
                   rs__data,
                   ru__data,
                   ldu,
                   rv__data, ldvt,
                   &wkopt, -1, THIntTensor_data(iwork), &info);
  lwork = (int)wkopt;
  work = THTensor_(newWithSize1d)(lwork);
  THLapack_(gesdd)(jobz,
                   m,n,ra__->data<scalar_t>(),lda,
                   rs__data,
                   ru__data,
                   ldu,
                   rv__data, ldvt,
                   work->data<scalar_t>(),lwork, THIntTensor_data(iwork), &info);

  if (jobz != 'N') {
    THLapackCheckWithCleanup("Lapack Error %s : %d superdiagonals failed to converge.",
@@ -999,31 +999,31 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb)
  return equal;
}

#define TENSOR_IMPLEMENT_LOGICAL(NAME,OP) \
  void THTensor_(NAME##Value)(THByteTensor *r_, THTensor* t, scalar_t value) \
  { \
    THByteTensor_resizeNd(r_, t->dim(), THTensor_getSizePtr(t), NULL); \
    TH_TENSOR_APPLY2(unsigned char, r_, scalar_t, t, \
                     *r__data = (*t_data OP value) ? 1 : 0;); \
  } \
  void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, scalar_t value) \
  { \
    THTensor_(resizeNd)(r_, t->dim(), THTensor_getSizePtr(t), NULL); \
    TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, \
                     *r__data = (*t_data OP value) ? 1 : 0;); \
  } \
  void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \
  { \
    THByteTensor_resizeNd(r_, ta->dim(), THTensor_getSizePtr(ta), NULL); \
    TH_TENSOR_APPLY3(unsigned char, r_, scalar_t, ta, scalar_t, tb, \
                     *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
  } \
  void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \
  { \
    THTensor_(resizeNd)(r_, ta->dim(), THTensor_getSizePtr(ta), NULL); \
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, ta, scalar_t, tb, \
                     *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
  } \


TENSOR_IMPLEMENT_LOGICAL(lt,<)
@@ -1302,10 +1302,10 @@ void THTensor_(norm)(THTensor *r_, THTensor *t, scalar_t value, int dimension, i
               *r__data = TH_MATH_NAME(pow)(sum, 1.0/3), 0);
  } else if (value == INFINITY) {
    DIM_REDUCE(sum = THMax(sum, TH_MATH_NAME(fabs)(t_data[i*t_stride])),
               *r__data = sum, 0);
  } else if (value == -INFINITY) {
    DIM_REDUCE(sum = THMin(sum, TH_MATH_NAME(fabs)(t_data[i*t_stride])),
               *r__data = sum, INFINITY);
  } else {
    DIM_REDUCE(sum += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(t_data[i*t_stride]), value),
               *r__data = TH_MATH_NAME(pow)(sum, 1.0/value), 0);
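For reference, the branches above implement the usual vector p-norm and its limiting cases (standard definitions, not spelled out in this hunk):

    \|x\|_p = \Big(\sum_i |x_i|^p\Big)^{1/p}, \qquad \|x\|_{\infty} = \max_i |x_i|, \qquad \|x\|_{-\infty} = \min_i |x_i|,

which is why the +INFINITY branch folds with THMax starting from 0 and the -INFINITY branch folds with THMin starting from INFINITY.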
@@ -17,10 +17,10 @@ TH_API void THVector_(cdiv)(scalar_t *z, const scalar_t *x, const scalar_t *y, c
TH_API void THVector_(divs)(scalar_t *y, const scalar_t *x, const scalar_t c, const ptrdiff_t n);
TH_API void THVector_(neg)(scalar_t *y, const scalar_t *x, const ptrdiff_t n);
TH_API void THVector_(normal_fill)(scalar_t *data,
                                   const int64_t size,
                                   struct THGenerator *generator,
                                   const scalar_t mean,
                                   const scalar_t stddev);

#endif /* non bool only part */

@@ -1342,7 +1342,7 @@ static void THFloatVector_divs_VSX(float *y, const float*x, const float c, const
// $ gcc VSX.c -O2 -D RUN_VSX_TESTS -o vsxtest
// $ ./vsxtest
//
// TODO
//
//
// Finished running all tests. All tests PASSED.

@@ -119,7 +119,7 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *
#else
  uint32_t a = *eax, b, c = *ecx, d;
  asm volatile ( "cpuid\n\t"
                 : "+a"(a), "=b"(b), "+c"(c), "=d"(d) );
  *eax = a;
  *ebx = b;
  *ecx = c;
@@ -308,12 +308,12 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int6
  cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
  if (prop->major >= 5){
    THCublasCheck(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
    THCublasCheck(cublasGemmEx(handle, opa, opb,
                               i_m, i_n, i_k, &fAlpha,
                               a, CUDA_R_16F, i_lda, b, CUDA_R_16F,
                               i_ldb, &fBeta, c, CUDA_R_16F, i_ldc,
                               CUDA_R_32F, CUBLAS_GEMM_DFALT_TENSOR_OP));
    THCublasCheck(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
  }else{
    THCublasCheck(cublasSgemmEx(handle, opa, opb,
                                i_m, i_n, i_k, &fAlpha,
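Both branches above compute the usual GEMM update (standard cuBLAS semantics, not restated in the diff),

    C \leftarrow \alpha\, \mathrm{op}(A)\,\mathrm{op}(B) + \beta\, C,

the difference being that on devices with compute capability 5.0 or newer the half-precision operands (CUDA_R_16F) are accumulated in single precision (compute type CUDA_R_32F) with tensor-op math enabled, while older devices fall back to cublasSgemmEx.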
@@ -48,7 +48,7 @@ inline bool getCatGrid(THCState* state, ptrdiff_t nTensors, dim3& grid) {
  //X dim of grid for cat array cooperates on a single tensor in the cat.
  //Given half of the GPU, full utilization will always occur.
  grid = dim3( 2LL * numSM, (long long) nTensors );

  return true;
}

@@ -131,7 +131,7 @@ __global__ void CatArrayBatchedCopy(

  while( tid < nElements){
    IndexType elementOffset = CatArrIndexToOffset<IndexType, Dims>::compute(
                  os.outputSize, os.outputStride, dimSize, concatDim, tid);
    output[dataOffset + elementOffset] = data[tid];

    tid += stride;

@@ -79,7 +79,7 @@ condDiv(T *q, int64_t *J, int64_t inputsize, T q_max) {
      q[idx] = one;
    } else {
      if (THCNumerics<T>::gt(q_max, one)) {
        q[idx] = THCNumerics<T>::div(q[idx], q_max);
      }
    }
  }

@@ -236,7 +236,7 @@ sampleMultinomialOnce(int64_t* dest,
        THCNumerics<AccT>::div(
            ScalarConvert<T, AccT>::to(dist[curDist * stride_dist + cat * stride_categories]),
            sum) :
        accZero);

    smem[threadIdx.x] = dist_val;
    __syncthreads();
@@ -42,7 +42,7 @@ THCTensor_(numel)(THCState *state, THCTensor *t)
}

void THCTensor_(cat)(THCState *state, THCTensor *result,
                     THCTensor *ta, THCTensor *tb, int dimension)
{
  THCTensor* inputs[2];
  inputs[0] = ta;

@@ -73,7 +73,7 @@ inline void THCTensor_(check_shape_except_dim)(THCState *state,
}

void THCTensor_(catArray)(THCState *state, THCTensor *result,
                          THCTensor **inputs, int numInputs, int dimension)
{
  // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible
  // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors

@@ -34,9 +34,9 @@ THC_API scalar_t THCTensor_(maxall)(THCState *state, THCTensor *self);
THC_API scalar_t THCTensor_(medianall)(THCState *state, THCTensor *self);

THC_API void THCTensor_(median)(THCState *state,
                                THCTensor *values,
                                THCudaLongTensor *indices,
                                THCTensor *src, int dim, int keepdim);

THC_API accreal THCTensor_(dist)(THCState *state, THCTensor *self, THCTensor *src,
                                 scalar_t value);
@ -249,7 +249,7 @@ void THCTensor_(multinomial)(struct THCState *state,
|
|||||||
THCudaLongTensor_data(state, self),
|
THCudaLongTensor_data(state, self),
|
||||||
numDist, numCategories,
|
numDist, numCategories,
|
||||||
THCTensor_(data)(state, prefixSum),
|
THCTensor_(data)(state, prefixSum),
|
||||||
THCTensor_(data)(state, normDist));
|
THCTensor_(data)(state, normDist));
|
||||||
} else {
|
} else {
|
||||||
// Sample without replacement
|
// Sample without replacement
|
||||||
|
|
||||||
@@ -7,12 +7,12 @@
#define ZERO_MACRO zero<T>()
template <typename T>
inline __device__ typename std::enable_if<std::is_same<T, double>::value, T>::type zero() {
  return 0.;
}

template <typename T>
inline __device__ typename std::enable_if<!std::is_same<T, double>::value, T>::type zero() {
  return 0.f;
}
#else
#define ZERO_MACRO 0.f
@@ -88,7 +88,7 @@ __global__ void cunn_LookupTable_accGradParametersKernelByFeature
#else
        first_remaining_peer = __ffs(matchmask) - 1;
#endif
        my_s[threadIdx.x] += smem[threadIdx.x + WARP_SIZE*first_remaining_peer];
        matchmask ^= (1 << first_remaining_peer);
      }
      if(f < s)
@@ -49,14 +49,14 @@ __global__ void cunn_LookupTableBag_updateOutputKernel(
    for (int64_t emb = begin; emb < end; emb++) {
      const int weightRow = ((int) input[emb]) * stride;
      weightFeatSum += ScalarConvert<Dtype, Acctype>::to(weightFeat[weightRow]);
      bag_size_ ++;
      if (featureDim == 0) {
        offset2bag[emb] = bag;
      }
    }
    if (mode == MODE_MEAN) {
      weightFeatSum = weightFeatSum / ScalarConvert<int64_t, Acctype>::to(bag_size_);
      bag_size[bag] = bag_size_;
    }
    (void) MODE_SUM; //silence warnings about unused MODE_SUM;
    output[bag * stride + featureDim] = ScalarConvert<Acctype, Dtype>::to(weightFeatSum);
@@ -114,9 +114,9 @@ __global__ void cunn_LookupTableBag_accGradParametersKernel(
      if (featureDim < stride)
      {
        gradient[ii] = ScalarConvert<Dtype, Acctype>::to(gradOutput[gradOutputRow + featureDim]);
        if (mode == MODE_MEAN) {
          gradient[ii] /= bag_size[seq_number];
        }
        weight[ii] = ScalarConvert<Dtype, Acctype>::to(gradWeight[weightRow + featureDim]);
      }
    }
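The update kernel above performs the usual embedding-bag reduction: sum the selected weight rows per bag and, in mean mode, divide by the bag size. A single-threaded sketch of that reduction, separate from this diff, with illustrative names:

#include <cstdint>
#include <vector>

// Sketch only: sum the weight rows selected for one bag and, in mean mode,
// divide by the bag size, as the CUDA kernel above does for each feature.
static float bag_reduce(const std::vector<float>& weight, int stride, int feature,
                        const std::vector<int64_t>& indices,
                        int64_t begin, int64_t end, bool mean_mode) {
  float sum = 0.0f;
  int64_t bag_size = 0;
  for (int64_t emb = begin; emb < end; ++emb) {
    sum += weight[indices[emb] * stride + feature];
    ++bag_size;
  }
  if (mean_mode && bag_size > 0) {
    sum /= static_cast<float>(bag_size);
  }
  return sum;
}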
@@ -16,9 +16,9 @@ template<typename Dtype, typename Acctype>
C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void nearest_neighbor_4d_kernel(
    const int n,
    const THCDeviceTensor<Dtype, 4> data1,
    THCDeviceTensor<Dtype, 4> data2) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  const int batchsize = data1.getSize(0);
  const int channels = data1.getSize(1);
@@ -37,10 +37,10 @@ __global__ void nearest_neighbor_4d_kernel(
      const int h1 = h2;
      const int w1 = w2;
      for (int n = 0; n < batchsize; n++) {
        for (int c = 0; c < channels; ++c) {
          const Dtype val = data1[n][c][h1][w1];
          data2[n][c][h2][w2] = val;
        }
      }
      return;
    }
@@ -49,8 +49,8 @@ __global__ void nearest_neighbor_4d_kernel(
    const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1);
    for (int n = 0; n < batchsize; n++) {
      for (int c = 0; c < channels; ++c) {
        const Dtype val = data1[n][c][h1][w1];
        data2[n][c][h2][w2] = val;
      }
    }
  }
@@ -62,9 +62,9 @@ template <typename Dtype, typename Acctype>
C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void nearest_neighbor_4d_kernel_backward(
    const int n,
    THCDeviceTensor<Dtype, 4> data1,
    const THCDeviceTensor<Dtype, 4> data2) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  const int batchsize = data1.getSize(0);
  const int channels = data1.getSize(1);
@@ -83,10 +83,10 @@ __global__ void nearest_neighbor_4d_kernel_backward(
      const int h1 = h2;
      const int w1 = w2;
      for (int n = 0; n < batchsize; n++) {
        for (int c = 0; c < channels; ++c) {
          const Dtype val = data2[n][c][h2][w2];
          data1[n][c][h1][w1] = val;
        }
      }
      return;
    }
@@ -16,9 +16,9 @@ template<typename Dtype, typename Acctype>
C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void nearest_neighbor_3d_kernel(
    const int n,
    const THCDeviceTensor<Dtype, 3> data1,
    THCDeviceTensor<Dtype, 3> data2) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  const int batchsize = data1.getSize(0);
  const int channels = data1.getSize(1);
@@ -32,10 +32,10 @@ __global__ void nearest_neighbor_3d_kernel(
    if (width1 == width2) {
      const int w1 = w2;
      for (int n = 0; n < batchsize; n++) {
        for (int c = 0; c < channels; ++c) {
          const Dtype val = data1[n][c][w1];
          data2[n][c][w2] = val;
        }
      }
      return;
    }
@@ -43,8 +43,8 @@ __global__ void nearest_neighbor_3d_kernel(
    const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1);
    for (int n = 0; n < batchsize; n++) {
      for (int c = 0; c < channels; ++c) {
        const Dtype val = data1[n][c][w1];
        data2[n][c][w2] = val;
      }
    }
  }
@@ -56,9 +56,9 @@ template <typename Dtype, typename Acctype>
C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void nearest_neighbor_3d_kernel_backward(
    const int n,
    THCDeviceTensor<Dtype, 3> data1,
    const THCDeviceTensor<Dtype, 3> data2) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  const int batchsize = data1.getSize(0);
  const int channels = data1.getSize(1);
@@ -72,10 +72,10 @@ __global__ void nearest_neighbor_3d_kernel_backward(
    if (width1 == width2) {
      const int w1 = w2;
      for (int n = 0; n < batchsize; n++) {
        for (int c = 0; c < channels; ++c) {
          const Dtype val = data2[n][c][w1];
          data1[n][c][w2] = val;
        }
      }
      return;
    }
@@ -16,9 +16,9 @@ template<typename Dtype, typename Acctype>
C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void nearest_neighbor_5d_kernel(
    const int n,
    const THCDeviceTensor<Dtype, 5> data1,
    THCDeviceTensor<Dtype, 5> data2) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  const int batchsize = data1.getSize(0);
  const int channels = data1.getSize(1);
@@ -55,8 +55,8 @@ __global__ void nearest_neighbor_5d_kernel(
    const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, depth1);
    for (int n = 0; n < batchsize; n++) {
      for (int c = 0; c < channels; ++c) {
        const Dtype val = data1[n][c][d1][h1][w1];
        data2[n][c][d2][h2][w2] = val;
      }
    }
  }
@@ -68,9 +68,9 @@ template <typename Dtype, typename Acctype>
C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void nearest_neighbor_5d_kernel_backward(
    const int n,
    THCDeviceTensor<Dtype, 5> data1,
    const THCDeviceTensor<Dtype, 5> data2) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  const int batchsize = data1.getSize(0);
  const int channels = data1.getSize(1);
@@ -108,8 +108,8 @@ __global__ void nearest_neighbor_5d_kernel_backward(
    const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, depth1);
    for (int n = 0; n < batchsize; n++) {
      for (int c = 0; c < channels; ++c) {
        const Dtype val = data2[n][c][d2][h2][w2];
        atomicAdd(data1[n][c][d1][h1][w1].data(), val);
      }
    }
  }
@@ -24,7 +24,7 @@ inline int GET_BLOCKS(const int N)
}

#define THCUNN_check_shape(STATE, I1, I2) \
  if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2)) \
  { \
    THCDescBuff s1 = THCTensor_(sizeDesc)(STATE, I1); \
    THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2); \
@@ -47,20 +47,20 @@ inline int GET_BLOCKS(const int N)
    ptrdiff_t n1 = THCTensor_(nElement)(STATE, I1); \
    ptrdiff_t n2 = THCTensor_(nElement)(STATE, I2); \
    if (n1 != n2) \
    { \
      THCDescBuff s1 = THCTensor_(sizeDesc)(state, I1); \
      THCDescBuff s2 = THCTensor_(sizeDesc)(state, I2); \
      THError(#I1 " and " #I2 " have different number of elements: " \
              #I1 "%s has %ld elements, while " \
              #I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
    } \
  }

#define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \
  if (THCTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \
      THCTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \
    THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \
    THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
            " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
  }

@@ -10,7 +10,7 @@ void THNN_(LookupTableBag_updateOutput)(
          THCTensor *weight,
          THCTensor *output,
          THCIndexTensor *offset2bag,
          int mode,
          THCIndexTensor *bag_size)
{
  THCUNN_assertSameGPU(state, 5, input, offsets, weight, output, offset2bag);
@@ -65,8 +65,8 @@ void THNN_(LookupTableBag_accGradParameters)(
          THCIndexTensor *sortedIndices,
          THCIndexTensor *origIndices,
          bool scaleGradByFreq,
          int mode,
          THCIndexTensor *bag_size,
          accreal scale_)
{
  scalar_t scale = ScalarConvert<accreal, scalar_t>::to(scale_);
@@ -88,7 +88,7 @@ static THCTensor* THNN_(newViewWeightMM2d)(THCState *state, THCTensor *weight) {
    int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3);
    THCTensor *old_weight = weight;
    weight = THCTensor_(newWithStorage2d)(state, THTensor_getStoragePtr(weight), weight->storage_offset(),
                                          s1, -1, s2, -1);
    THCTensor_(free)(state, old_weight);
  }
  return weight;
@@ -11,7 +11,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
    int kH, int kW, int dH, int dW, int padH, int padW,
    int dilationH, int dilationW, int weight_nullable) {
  THArgCheck(kW > 0 && kH > 0, 9,
             "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
  THArgCheck(dW > 0 && dH > 0, 11,
             "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
  THArgCheck(dilationW > 0 && dilationH > 0, 14,
@@ -34,7 +34,7 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output,
           int outputHeight,
           int outputWidth)
{
  THCUNN_assertSameGPU(state, 2, input, output);
@@ -44,14 +44,14 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
  int inputWidth = THCTensor_(size)(state, input, 3);

  THNN_(SpatialUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels,
                                             inputHeight, inputWidth,
                                             outputHeight, outputWidth);
  THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0);

  THCTensor_(resize4d)(state, output,
                       THCTensor_(size)(state, input, 0),
                       THCTensor_(size)(state, input, 1),
                       outputHeight,
                       outputWidth);
  THCTensor_(zero)(state, output);

@@ -62,7 +62,7 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
  const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
  cudaStream_t stream = THCState_getCurrentStream(state);
  nearest_neighbor_4d_kernel<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads,
    0, stream>>>(num_kernels, idata, odata);
  THCudaCheck(cudaGetLastError());
}

@@ -73,15 +73,15 @@ void THNN_(SpatialUpSamplingNearest_updateGradInput)(
           THCTensor *gradOutput,
           THCTensor *gradInput,
           int nbatch,
           int nchannels,
           int inputHeight,
           int inputWidth,
           int outputHeight,
           int outputWidth)
{
  THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
  THNN_(SpatialUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels,
                                             inputHeight, inputWidth, outputHeight, outputWidth);
  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
  THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth);

@@ -94,7 +94,7 @@ void THNN_(SpatialUpSamplingNearest_updateGradInput)(
  cudaStream_t stream = THCState_getCurrentStream(state);

  nearest_neighbor_4d_kernel_backward<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads),
    num_threads, 0, stream>>>(num_kernels, data1, data2);
  THCudaCheck(cudaGetLastError());
  THCTensor_(free)(state, gradOutput);
}
@@ -243,7 +243,7 @@ THC_API void THNN_(LookupTableBag_updateOutput)(
          THCTensor *weight,
          THCTensor *output,
          THCIndexTensor *offset2bag,
          int mode,
          THCIndexTensor *seq_length); // [OPTIONAL]

THC_API void THNN_(LookupTableBag_accGradParameters)(
@@ -256,8 +256,8 @@ THC_API void THNN_(LookupTableBag_accGradParameters)(
          THCIndexTensor *sortedIndices,
          THCIndexTensor *origIndices,
          bool scaleGradByFreq,
          int mode,
          THCIndexTensor *seq_length, // [OPTIONAL]
          accreal scale_);

THC_API void THNN_(L1Cost_updateOutput)(
@@ -54,7 +54,7 @@ void THNN_(TemporalUpSamplingNearest_updateOutput)(
  const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
  cudaStream_t stream = THCState_getCurrentStream(state);
  nearest_neighbor_3d_kernel<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads,
    0, stream>>>(num_kernels, idata, odata);
  THCudaCheck(cudaGetLastError());
}

@@ -82,7 +82,7 @@ void THNN_(TemporalUpSamplingNearest_updateGradInput)(
  cudaStream_t stream = THCState_getCurrentStream(state);

  nearest_neighbor_3d_kernel_backward<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads),
    num_threads, 0, stream>>>(num_kernels, data1, data2);

  THCudaCheck(cudaGetLastError());
  THCTensor_(free)(state, gradOutput);
@@ -47,10 +47,10 @@ void THNN_(VolumetricUpSamplingNearest_updateOutput)(
  int inputWidth = THCTensor_(size)(state, input, 4);

  THNN_(VolumetricUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels,
                                                inputDepth, inputHeight, inputWidth,
                                                outputDepth, outputHeight, outputWidth);
  THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 &&
           outputDepth > 0 && outputHeight > 0 && outputWidth > 0);

  THCTensor_(resize5d)(state, output,
                       THCTensor_(size)(state, input, 0),
@@ -67,7 +67,7 @@ void THNN_(VolumetricUpSamplingNearest_updateOutput)(
  const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
  cudaStream_t stream = THCState_getCurrentStream(state);
  nearest_neighbor_5d_kernel<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads,
    0, stream>>>(num_kernels, idata, odata);
  THCudaCheck(cudaGetLastError());
}

@@ -88,8 +88,8 @@ void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
{
  THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
  THNN_(VolumetricUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels,
                                                inputDepth, inputHeight, inputWidth,
                                                outputDepth, outputHeight, outputWidth);
  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
  THCTensor_(resize5d)(state, gradInput, nbatch, nchannels, inputDepth, inputHeight, inputWidth);

@@ -100,7 +100,7 @@ void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
  const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
  cudaStream_t stream = THCState_getCurrentStream(state);
  nearest_neighbor_5d_kernel_backward<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads),
    num_threads, 0, stream>>>(num_kernels, data1, data2);
  THCudaCheck(cudaGetLastError());
  THCTensor_(free)(state, gradOutput);
}
@@ -36,7 +36,7 @@ static Acctype linear_upsampling_compute_source_index(

__device__ __forceinline__
static int nearest_neighbor_compute_source_index(
    const float scale, int dst_index, int inputSize) {
  const int src_index = MIN(floor(dst_index * scale), inputSize - 1);
  return src_index;
}
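The helper above is the entire nearest-neighbor index mapping: scale the destination index back into the source and clamp to the last valid element. A host-side sketch, separate from this diff, that mirrors the same expression:

#include <algorithm>
#include <cmath>

// Sketch only: map an output index back to the nearest source index, clamping
// to the last valid element, exactly as the device helper above does.
static int nearest_source_index(float scale, int dst_index, int input_size) {
  const int src = static_cast<int>(std::floor(dst_index * scale));
  return std::min(src, input_size - 1);
}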
@@ -29,16 +29,16 @@ void THNN_(BCECriterion_updateOutput)(
      scalar_t y = *target_data;
      THAssertMsg(x >= 0. && x <= 1.,
                  "input value should be between 0~1, but got %f",
                  (double) x);
      *output_data = -(safe_log(x) * y + safe_log(1. - x) * (1. - y));
    );
    if (weights) {
      THTensor_(cmul)(output, output, weights);
    }
    return;
  }

  THTensor_(resize0d)(output);
  scalar_t sum = 0;

  if (weights) {
@@ -48,7 +48,7 @@ void THNN_(BCECriterion_updateOutput)(
      scalar_t w = *weights_data;
      THAssertMsg(x >= 0. && x <= 1.,
                  "input value should be between 0~1, but got %f",
                  (double) x);
      sum -= (safe_log(x) * y + safe_log(1. - x) * (1. - y)) * w;
    );
  } else {
@@ -57,7 +57,7 @@ void THNN_(BCECriterion_updateOutput)(
      scalar_t y = *target_data;
      THAssertMsg(x >= 0. && x <= 1.,
                  "input value should be between 0~1, but got %f",
                  (double) x);
      sum -= safe_log(x) * y + safe_log(1. - x) * (1. - y);
    );
  }
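For reference, the loops above accumulate the per-element binary cross-entropy term -(log(x)*y + log(1-x)*(1-y)), optionally scaled by a weight. A scalar sketch, separate from this diff, of the same expression (the epsilon here stands in for the original's safe_log guard):

#include <cmath>

// Sketch only: per-element binary cross-entropy matching the expression above.
// x is the prediction in [0, 1], y the target, w an optional weight.
static double bce_element(double x, double y, double w = 1.0) {
  const double eps = 1e-12;  // stands in for the original safe_log guard
  return -(std::log(x + eps) * y + std::log(1.0 - x + eps) * (1.0 - y)) * w;
}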
@@ -25,7 +25,7 @@ void THNN_(ClassNLLCriterion_updateOutput)(
  if (weights && THTensor_(nElement)(weights) != n_classes) {
    THDescBuff s1 = THTensor_(sizeDesc)(weights);
    THError("weight tensor should be defined either for all %d classes or no classes"
            " but got weight tensor of shape: %s", n_classes, s1.str);
  }

  if (reduction == Reduction::None && n_dims == 2) {
@@ -39,8 +39,8 @@ void THNN_(ClassNLLCriterion_updateOutput)(
      int cur_target = THLongTensor_fastGetLegacy1dNoScalars(target, i);

      if (cur_target == ignore_index) {
        THTensor_(fastSet1d)(output, i, 0.0f);
        continue;
      }
      if (cur_target >= 0 && cur_target < n_classes) {
        scalar_t cur_weight = weights ? THTensor_(fastGetLegacy1dNoScalars)(weights, cur_target) : 1.0f;
@@ -40,7 +40,7 @@ void THNN_(MultiMarginCriterion_updateOutput)(
    {
      THIndex_t idx = THIndexTensor_(get1d)(target, t);
      THArgCheck((idx >= 0) && (idx < dim), 3,
                 "target out of range");
    }

    input = THTensor_(newContiguous)(input);
@@ -98,7 +98,7 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(


  THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input,
                "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s");

  if (input->dim() == 4)
  {
@@ -6,9 +6,9 @@
#include <algorithm>

static inline void THNN_(SpatialAveragePooling_shapeCheck)(
  THTensor *input, THTensor *gradOutput,
  int kH, int kW, int dH, int dW, int padH, int padW,
  bool ceil_mode) {

  THArgCheck(kW > 0 && kH > 0, 5,
             "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
@@ -27,12 +27,12 @@ static inline void THNN_(SpatialAveragePooling_shapeCheck)(
  }

  THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input,
                "non-empty 3D or 4D input tensor expected but got: %s");

  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
             "pad should be smaller than half of kernel size, but got "
             "padW = %d, padH = %d, kW = %d, kH = %d",
             padW, padH, kW, kH);

  int64_t nInputPlane = input->size(dimh-1);
  int64_t inputHeight = input->size(dimh);
@@ -44,7 +44,7 @@ static inline void THNN_(SpatialAveragePooling_shapeCheck)(

  if (outputWidth < 1 || outputHeight < 1)
    THError("Given input size: (%dx%dx%d). "
            "Calculated output size: (%dx%dx%d). Output size is too small",
            nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth);

  if (gradOutput != NULL) {
@@ -4,12 +4,12 @@

#define INITIAL_CHECK \
  THArgCheck(THIndexTensor_(nDimensionLegacyAll)(target) == 3, 3, \
             "only batches of spatial targets supported (3D tensors)" \
             " but got targets of dimension: %d", \
             THIndexTensor_(nDimensionLegacyAll)(target)); \
  THArgCheck(THTensor_(nDimensionLegacyAll)(input) == 4, 2, \
             "only batches of spatial inputs supported (4D tensors), " \
             "but got input of dimension: %d", THTensor_(nDimensionLegacyAll)(input)); \
  if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) { \
    THError("weight tensor should be defined either for all or no classes"); \
  } \
@@ -30,8 +30,8 @@
#define GRADOUTPUT_SHAPE_CHECK \
  THArgCheck(THTensor_(nDimensionLegacyAll)(gradOutput) == 3, 3, \
             "gradOutput must have same dimension as target (3)" \
             " but got dimension: %d", \
             THTensor_(nDimensionLegacyAll)(gradOutput)); \
  { \
    int64_t gradOutput0 = THTensor_(size)(gradOutput, 0); \
    int64_t gradOutput1 = THTensor_(size)(gradOutput, 1); \
@@ -5,14 +5,14 @@
#include <ATen/div_rtn.h>

static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
  THTensor *input, THTensor *gradOutput,
  THTensor *weight, THTensor *bias,
  int kH, int kW, int dH, int dW, int padH, int padW, int weight_nullable) {

  THArgCheck(kW > 0 && kH > 0, 9,
             "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
  THArgCheck(dW > 0 && dH > 0, 11,
             "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);

  if (weight != NULL) {
    THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight,
@@ -36,7 +36,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
  }

  THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input,
                "non-empty 3D or 4D input tensor expected but got: %s");

  int64_t inputHeight = input->size(dimh);
  int64_t inputWidth = input->size(dimw);
@@ -87,8 +87,8 @@ static THTensor* THNN_(newViewWeightMM2d)(THTensor *weight) {
    int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3);
    THTensor *old_weight = weight;
    weight = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(weight), weight->storage_offset(),
                                         s1, -1, s2, -1);
    c10::raw::intrusive_ptr::decref(old_weight);
  }
  return weight;
}
@@ -116,8 +116,8 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
  THTensor *output2d;

  THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
                       nInputPlane, inputWidth, inputHeight,
                       outputWidth, outputHeight);

  output2d = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(output), output->storage_offset(),
                                         nOutputPlane, -1,
@@ -125,8 +125,8 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
  if (bias) {
    for(i = 0; i < nOutputPlane; i++)
      THVector_(fill)
        (THStorage_(data)(THTensor_getStoragePtr(output)) + output->storage_offset() + output->stride(0) * i,
         THTensor_(get1d)(bias, i), outputHeight*outputWidth);
  } else {
    THTensor_(zero)(output);
  }
@@ -202,10 +202,10 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);

      THNN_(SpatialConvolutionMM_updateOutput_frame)
        (input_t, output_t, weight, bias, finput_t,
         kW, kH, dW, dH, padW, padH,
         nInputPlane, inputWidth, inputHeight,
         nOutputPlane, outputWidth, outputHeight);

      c10::raw::intrusive_ptr::decref(input_t);
      c10::raw::intrusive_ptr::decref(output_t);
@@ -239,9 +239,9 @@ static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
  THTensor_(zero)(gradInput);

  THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH,
                      padW, padH,
                      gradInput->size(0), gradInput->size(2), gradInput->size(1),
                      gradOutput->size(2), gradOutput->size(1));
}

void THNN_(SpatialConvolutionMM_updateGradInput)(
@@ -280,8 +280,8 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
  if(input->dim() == 3)
  {
    THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput,
                                                      tweight, fgradInput,
                                                      kW, kH, dW, dH, padW, padH);
  }
  else
  {
@@ -296,8 +296,8 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
      THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);

      THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t,
                                                        tweight, fgradInput_t,
                                                        kW, kH, dW, dH, padW, padH);

      c10::raw::intrusive_ptr::decref(gradInput_t);
      c10::raw::intrusive_ptr::decref(gradOutput_t);
@@ -380,7 +380,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
  if(input->dim() == 3)
  {
    THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight,
                                                        gradBias, finput, scale);
  }
  else
  {
@@ -396,7 +396,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
      }

      THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight,
                                                          gradBias, finput_t, scale);

      c10::raw::intrusive_ptr::decref(gradOutput_t);
      if (gradWeight) {
@@ -5,10 +5,10 @@
#include <ATen/div_rtn.h>

static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
  THTensor *input, THTensor *gradOutput,
  THTensor *weight, THTensor *bias,
  int kH, int kW, int dH, int dW, int padH, int padW,
  int dilationH, int dilationW, int weight_nullable) {
  THArgCheck(kW > 0 && kH > 0, 9,
             "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
  THArgCheck(dW > 0 && dH > 0, 11,
@@ -40,7 +40,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
  }

  THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input,
                "non-empty 3D or 4D input tensor expected but got: %s");

  int64_t inputHeight = input->size(dimh);
  int64_t inputWidth = input->size(dimw);
@@ -235,7 +235,7 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)(
    is_batch = 0;
    THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2));
    THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1),
                        gradOutput->size(2));
  }

  int64_t inputWidth = input->size(3);
@@ -342,7 +342,7 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)(
    is_batch = 0;
    THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2));
    THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0),
                        gradOutput->size(1), gradOutput->size(2));
  }

  int64_t nInputPlane = input->size(1);
@@ -6,9 +6,9 @@
#include <algorithm>

static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
  THTensor *input, THTensor *gradOutput, THIndexTensor *indices,
  int kH, int kW, int dH, int dW, int padH, int padW,
  int dilationH, int dilationW, bool ceil_mode) {

  THArgCheck(kW > 0 && kH > 0, 5,
             "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
@@ -30,12 +30,12 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
  }

  THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input,
                "non-empty 3D or 4D input tensor expected but got: %s");

  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
             "pad should be smaller than half of kernel size, but got "
             "padW = %d, padH = %d, kW = %d, kH = %d",
             padW, padH, kW, kH);

  int64_t nInputPlane = input->size(dimh-1);
  int64_t inputHeight = input->size(dimh);
@@ -47,7 +47,7 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(

  if (outputWidth < 1 || outputHeight < 1)
    THError("Given input size: (%dx%dx%d). "
            "Calculated output size: (%dx%dx%d). Output size is too small",
            nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth);

  if (gradOutput != NULL) {
@@ -221,16 +221,16 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)(
    for (p = 0; p < nbatch; p++)
    {
      THNN_(SpatialDilatedMaxPooling_updateOutput_frame)
        (input_data+p*nInputPlane*inputWidth*inputHeight,
         output_data+p*nInputPlane*outputWidth*outputHeight,
         indices_data+p*nInputPlane*outputWidth*outputHeight,
         nInputPlane,
         inputWidth, inputHeight,
         outputWidth, outputHeight,
         kW, kH, dW, dH,
         padW, padH,
         dilationW, dilationH
         );
    }
  }

@@ -266,10 +266,10 @@ static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(
      {
        /* retrieve position of max */
        int64_t maxp = ind_p_k[i*outputWidth + j];
        if (maxp != -1) {
          /* update gradient */
          gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j];
        }
      }
    }
  }
@@ -350,13 +350,13 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
    for (p = 0; p < nbatch; p++)
    {
      THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)
        (gradInput_data+p*nInputPlane*inputWidth*inputHeight,
         gradOutput_data+p*nInputPlane*outputWidth*outputHeight,
         indices_data+p*nInputPlane*outputWidth*outputHeight,
         nInputPlane,
         inputWidth, inputHeight,
         outputWidth, outputHeight,
         dW, dH);
    }
  }

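The backward frame above routes each output gradient to the input position recorded as the argmax during the forward pass, skipping windows marked -1. A compact CPU sketch of that scatter, separate from this diff, with illustrative names:

#include <cstdint>

// Sketch only: scatter gradients back through max pooling using the saved
// argmax indices; an index of -1 means the window contributed nothing.
static void max_pool_backward_plane(float* grad_input, const float* grad_output,
                                    const int64_t* indices,
                                    int output_h, int output_w) {
  for (int i = 0; i < output_h; ++i) {
    for (int j = 0; j < output_w; ++j) {
      const int64_t maxp = indices[i * output_w + j];
      if (maxp != -1) {
        grad_input[maxp] += grad_output[i * output_w + j];
      }
    }
  }
}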
|
|||||||
@ -3,15 +3,15 @@
|
|||||||
#else
|
#else
|
||||||
|
|
||||||
static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)(
|
static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)(
|
||||||
THTensor *input, THTensor *gradOutput,
|
THTensor *input, THTensor *gradOutput,
|
||||||
THTensor *weight, THTensor *bias,
|
THTensor *weight, THTensor *bias,
|
||||||
int kH, int kW, int dH, int dW, int padH, int padW,
|
int kH, int kW, int dH, int dW, int padH, int padW,
|
||||||
int dilationH, int dilationW, int adjH, int adjW, int weight_nullable) {
|
int dilationH, int dilationW, int adjH, int adjW, int weight_nullable) {
|
||||||
|
|
||||||
THArgCheck(kW > 0 && kH > 0, 9,
|
THArgCheck(kW > 0 && kH > 0, 9,
|
||||||
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
|
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
|
||||||
THArgCheck(dW > 0 && dH > 0, 11,
|
THArgCheck(dW > 0 && dH > 0, 11,
|
||||||
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
|
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
|
||||||
THArgCheck(dilationW > 0 && dilationH > 0, 15,
|
THArgCheck(dilationW > 0 && dilationH > 0, 15,
|
||||||
"dilation should be greater than zero, but got dilationH: %d, dilationW: %d",
|
"dilation should be greater than zero, but got dilationH: %d, dilationW: %d",
|
||||||
dilationH, dilationW);
|
dilationH, dilationW);
|
||||||
@@ -41,7 +41,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)(
  }

  THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input,
                "non-empty 3D or 4D input tensor expected but got: %s");

  int64_t inputHeight = input->size(dimh);
  int64_t inputWidth = input->size(dimw);
@@ -50,8 +50,8 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)(

  if (outputWidth < 1 || outputHeight < 1) {
    THError("Given input size per channel: (%ld x %ld). "
            "Calculated output size per channel: (%ld x %ld). Output size is too small",
            inputHeight, inputWidth, outputHeight, outputWidth);
  }

  if (weight != NULL) {
@@ -111,12 +111,12 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
    for (p = 0; p < nbatch; p++)
    {
      THNN_(SpatialMaxUnpooling_updateOutput_frame)(
        input_data+p*nslices*iwidth*iheight,
        output_data+p*nslices*owidth*oheight,
        indices_data+p*nslices*iwidth*iheight,
        nslices,
        iwidth, iheight,
        owidth, oheight);
    }
  }

@@ -196,7 +196,7 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)(

  if(owidth!=gradOutput->size(dimw) || oheight!=gradOutput->size(dimh)){
    THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d",
            oheight, owidth, gradOutput->size(dimh), gradOutput->size(dimw));
  }

  /* get raw pointers */
@@ -3,467 +3,467 @@
#else

static inline void THNN_(TemporalRowConvolution_shapeCheck)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *weight,
    THTensor *bias,
    int kW,
    int dW,
    int padW) {

  THArgCheck(kW > 0, 5,
             "kernel size should be greater than zero, but got kW: %d", kW);
  THArgCheck(dW > 0, 6,
             "stride should be greater than zero, but got dW: %d", dW);
  THNN_ARGCHECK(!weight->is_empty() && weight->dim() == 3, 3, weight,
                "non-empty 3D weight tensor expected, but got: %s");
  THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
  THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");

  if (bias != NULL) {
    THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0));
  }

  // we're always looking at (possibly batch) x feats x seq
  int ndim = input->dim();
  int dimF = 0;
  int dimS = 1;

  if (ndim == 3) {
    ++dimS;
    ++dimF;
  }

  THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 1, input,
                "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s");

  int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0);
  int64_t nInputFrame = input->size(dimS);
  int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;

  if (nOutputFrame < 1) {
    THError("Given input size: (%d x %d). "
            "Calculated output size: (%d x %d). Output size is too small",
            inputFrameSize, nInputFrame, inputFrameSize, nOutputFrame);
  }

  THNN_CHECK_DIM_SIZE(input, ndim, dimF, inputFrameSize);

  if (gradOutput != NULL) {
    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimF, inputFrameSize);
    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimS, nOutputFrame);
  }
}

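The nOutputFrame expression in the shape check above is the usual 1-D convolution output-length arithmetic, floor((L + 2*pad - k) / stride) + 1. A quick worked check with illustrative values (not taken from the diff):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Example values: a 10-frame sequence, kernel 3, stride 2, padding 1.
  const int64_t nInputFrame = 10, kW = 3, dW = 2, padW = 1;
  const int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;  // (10+2-3)/2+1 = 5
  std::printf("nOutputFrame = %lld\n", (long long)nOutputFrame);
  return 0;
}
```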
static void THNN_(unfolded_acc_row)(
    THTensor *finput,
    THTensor *input,
    int kW,
    int dW,
    int padW,
    int64_t inputFrameSize,
    int64_t nInputFrame,
    int64_t nOutputFrame) {

  int64_t c;
  scalar_t *input_data = input->data<scalar_t>();
  scalar_t *finput_data = finput->data<scalar_t>();

  // #pragma omp parallel for private(c)
  for (c = 0; c < inputFrameSize; c++) {
    int64_t kw, x;
    int64_t ix = 0;

    for (kw = 0; kw < kW; kw++) {
      scalar_t *src = finput_data
                      + c * (kW * nOutputFrame)
                      + kw * (nOutputFrame);
      scalar_t *dst = input_data + c * (nInputFrame);

      ix = (size_t)(kw);
      if (dW == 1) {
        scalar_t *dst_slice = dst + (size_t)(ix);
        THVector_(cadd)(dst_slice, dst_slice, src, 1, nOutputFrame);
      } else {
        for (x = 0; x < nOutputFrame; x++) {
          scalar_t *dst_slice = dst + (size_t)(ix + x * dW);
          THVector_(cadd)(dst_slice, dst_slice,
                          src + (size_t)(x), 1, 1);
        }
      }
    }
  }
}

static void THNN_(unfolded_copy_row)(
    THTensor *finput,
    THTensor *input,
    int kW,
    int dW,
    int padW,
    int64_t inputFrameSize,
    int64_t nInputFrame,
    int64_t nOutputFrame) {

  int64_t k;
  scalar_t *input_data = input->data<scalar_t>();
  scalar_t *finput_data = finput->data<scalar_t>();

  // #pragma omp parallel for private(k)
  for (k = 0; k < inputFrameSize * kW; k++) {
    int64_t c = k / kW;
    int64_t rest = k % kW;
    int64_t kw = rest % kW;
    int64_t x;
    int64_t ix;
    scalar_t *dst = finput_data + c * (kW * nOutputFrame) + kw * (nOutputFrame);
    scalar_t *src = input_data + c * (nInputFrame);

    ix = (size_t)(kw);
    if (dW == 1) {
      memcpy(dst, src+(size_t)(ix), sizeof(scalar_t) * (nOutputFrame));
    } else {
      for (x = 0; x < nOutputFrame; x++) {
        memcpy(dst + (size_t)(x), src + (size_t)(ix + x * dW),
               sizeof(scalar_t) * 1);
      }
    }
  }
}

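unfolded_copy_row above lays the sequence out so that, for each channel, the kW shifted views of the input sit in contiguous rows of finput; the row convolution then becomes the batched matrix multiply in updateOutput_frame below, and unfolded_acc_row is the matching accumulate-back used for the gradient. A standalone sketch of the copy, assuming padW == 0 and using hypothetical buffer names rather than THTensor:

```cpp
#include <vector>

// finput[c][kw][x] = input[c][kw + x*dW], the same addressing as the loop
// above (padding is not applied here, so padW == 0 is assumed).
std::vector<float> unfold_row(const std::vector<float>& input,
                              long channels, long nInputFrame,
                              long kW, long dW, long nOutputFrame) {
  std::vector<float> finput(channels * kW * nOutputFrame, 0.0f);
  for (long c = 0; c < channels; ++c)
    for (long kw = 0; kw < kW; ++kw)
      for (long x = 0; x < nOutputFrame; ++x)
        finput[(c * kW + kw) * nOutputFrame + x] =
            input[c * nInputFrame + kw + x * dW];
  return finput;
}
```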
static void THNN_(TemporalRowConvolution_updateOutput_frame)(
    THTensor *input,
    THTensor *output,
    THTensor *weight,
    THTensor *bias,
    THTensor *finput,
    int kW,
    int dW,
    int padW,
    int64_t inputFrameSize,
    int64_t nInputFrame,
    int64_t nOutputFrame) {

  int64_t i;

  THTensor *output3d = THTensor_(newWithStorage3d)(
      THTensor_getStoragePtr(output), output->storage_offset(),
      inputFrameSize, -1,
      1, -1,
      nOutputFrame, -1);

  THNN_(unfolded_copy_row)(finput, input, kW, dW, padW,
                           inputFrameSize, nInputFrame, nOutputFrame);

  THTensor_(zero)(output);

  if (bias != NULL) {
    for (i = 0; i < inputFrameSize; i++)
      THVector_(fill)
        (THStorage_(data)(THTensor_getStoragePtr(output)) + output->storage_offset()
           + output->stride(0) * i,
         THTensor_(get1d)(bias, i), nOutputFrame);
  }

  THTensor_(baddbmm)(output3d, 1, output3d, 1, weight, finput);

  c10::raw::intrusive_ptr::decref(output3d);
}

void THNN_(TemporalRowConvolution_updateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    THTensor *weight,
    THTensor *bias,
    THTensor *finput,
    THTensor *fgradInput, // unused here but needed for Cuda
    int kW,
    int dW,
    int padW,
    bool featFirst) {

  int ndim = input->dim();

  THTensor *tinput = NULL;
  if (!featFirst) {
    tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
    input = THTensor_(newContiguous)(tinput);
  } else {
    input = THTensor_(newContiguous)(input);
  }

  THNN_(TemporalRowConvolution_shapeCheck)(
    state, input, NULL, weight, bias, kW, dW, padW);

  int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0);
  int64_t nInputFrame = input->size(ndim - 1);
  int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;

  if (ndim == 2) { /* non-batch mode */

    THTensor_(resize3d)(finput, inputFrameSize, kW, nOutputFrame);
    THTensor_(resize2d)(output, inputFrameSize, nOutputFrame);

    THTensor_(zero)(finput);
    THTensor_(zero)(output);

    THNN_(TemporalRowConvolution_updateOutput_frame)
      (input, output, weight, bias, finput,
       kW, dW, padW,
       inputFrameSize, nInputFrame, nOutputFrame);

  } else {
    int64_t T = input->size(0);
    int64_t t;

    THTensor_(resize4d)(finput, T, inputFrameSize, kW, nOutputFrame);
    THTensor_(resize3d)(output, T, inputFrameSize, nOutputFrame);

    THTensor_(zero)(finput);
    THTensor_(zero)(output);

#pragma omp parallel for private(t)
    for (t = 0; t < T; t++) {
      THTensor *input_t = THTensor_(newSelect)(input, 0, t);
      THTensor *output_t = THTensor_(newSelect)(output, 0, t);
      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);

      THNN_(TemporalRowConvolution_updateOutput_frame)
        (input_t, output_t, weight, bias, finput_t,
         kW, dW, padW, inputFrameSize, nInputFrame, nOutputFrame);

      c10::raw::intrusive_ptr::decref(input_t);
      c10::raw::intrusive_ptr::decref(output_t);
      c10::raw::intrusive_ptr::decref(finput_t);
    }
  }

  if (!featFirst) { // NOTE: output will NOT be contiguous in this case
    THTensor_(transpose)(output, output, ndim - 1, ndim - 2);
    c10::raw::intrusive_ptr::decref(tinput);
  }

  c10::raw::intrusive_ptr::decref(input);
}

static void THNN_(TemporalRowConvolution_updateGradInput_frame)(
    THTensor *gradInput,
    THTensor *gradOutput,
    THTensor *weight,
    THTensor *fgradInput,
    int kW,
    int dW,
    int padW,
    int64_t inputFrameSize,
    int64_t nInputFrame,
    int64_t nOutputFrame) {

  THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
      THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(),
      inputFrameSize, -1,
      1, -1,
      nOutputFrame, -1);

  // weight: inputFrameSize x kW x 1
  // gradOutput3d: inputFrameSize x 1 x nOutputFrame
  THTensor_(baddbmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput3d);
  // fgradInput: inputFrameSize x kW x nOutputFrame
  c10::raw::intrusive_ptr::decref(gradOutput3d);

  THTensor_(zero)(gradInput);

  THNN_(unfolded_acc_row)(fgradInput, gradInput,
                          kW, dW, padW,
                          inputFrameSize, nInputFrame, nOutputFrame);
}

void THNN_(TemporalRowConvolution_updateGradInput)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradInput,
    THTensor *weight,
    THTensor *finput,
    THTensor *fgradInput,
    int kW,
    int dW,
    int padW,
    bool featFirst) {

  int ndim = input->dim();

  THTensor *tinput, *tgradOutput;

  if (!featFirst) {
    tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
    tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);

    input = THTensor_(newContiguous)(tinput);
    gradOutput = THTensor_(newContiguous)(tgradOutput);

  } else {
    input = THTensor_(newContiguous)(input);
    gradOutput = THTensor_(newContiguous)(gradOutput);
  }

  THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight,
                                           NULL, kW, dW, padW);

  int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0);
  int64_t nInputFrame = input->size(ndim - 1);
  int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;

  THTensor_(resizeAs)(fgradInput, finput);
  THTensor_(resizeAs)(gradInput, input);

  THTensor_(zero)(fgradInput);
  THTensor_(zero)(gradInput);

  THTensor *tweight = THTensor_(new)();
  THTensor_(transpose)(tweight, weight, 1, 2);

  if (ndim == 2) {
    THNN_(TemporalRowConvolution_updateGradInput_frame)
      (gradInput, gradOutput, tweight, fgradInput,
       kW, dW, padW,
       inputFrameSize, nInputFrame, nOutputFrame);
  } else {
    int64_t T = input->size(0);
    int64_t t;

#pragma omp parallel for private(t)
    for (t = 0; t < T; t++) {

      THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
      THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);

      THNN_(TemporalRowConvolution_updateGradInput_frame)
        (gradInput_t, gradOutput_t, tweight, fgradInput_t,
         kW, dW, padW,
         inputFrameSize, nInputFrame, nOutputFrame);

      c10::raw::intrusive_ptr::decref(gradInput_t);
      c10::raw::intrusive_ptr::decref(gradOutput_t);
      c10::raw::intrusive_ptr::decref(fgradInput_t);
    }
  }

  c10::raw::intrusive_ptr::decref(tweight);

  if (!featFirst) { // NOTE: gradInput will NOT be contiguous in this case

    c10::raw::intrusive_ptr::decref(tinput);
    c10::raw::intrusive_ptr::decref(tgradOutput);

    THTensor_(transpose)(gradInput, gradInput, ndim - 1, ndim - 2);
  }

  c10::raw::intrusive_ptr::decref(input);
  c10::raw::intrusive_ptr::decref(gradOutput);

}

static void THNN_(TemporalRowConvolution_accGradParameters_frame)(
    THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
    THTensor *finput, scalar_t scale) {

  int64_t i;
  THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
      THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(),
      gradOutput->size(0), -1,
      1, -1,
      gradOutput->size(1), -1);

  THTensor *tfinput = THTensor_(new)();
  THTensor_(transpose)(tfinput, finput, 1, 2);
  // gradOutput3d: inputFrameSize x 1 x nOutputFrame
  // finput: inputFrameSize x nOutputFrame x kW
  THTensor_(baddbmm)(gradWeight, 1, gradWeight, scale, gradOutput3d, tfinput);
  // gradWeight: inputFrameSize x 1 x kW
  c10::raw::intrusive_ptr::decref(tfinput);

  if (gradBias != NULL) {
    for (i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) {
      int64_t k;
      scalar_t sum = 0;
      scalar_t *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput3d))
                       + gradOutput3d->storage_offset()
                       + i * gradOutput3d->stride(0);
      for (k = 0; k < gradOutput3d->size(2); k++) {
        sum += data[k];
      }
      (THStorage_(data)(THTensor_getStoragePtr(gradBias)) + gradBias->storage_offset())[i]
        += scale * sum;
    }
  }

  c10::raw::intrusive_ptr::decref(gradOutput3d);

}

void THNN_(TemporalRowConvolution_accGradParameters)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradWeight,
    THTensor *gradBias,
    THTensor *finput,
    THTensor *fgradInput,
    int kW,
    int dW,
    int padW,
    bool featFirst,
    accreal scale_) {

  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  int ndim = input->dim();

  THTensor *tinput = NULL;
  THTensor *tgradOutput = NULL;

  if (!featFirst) {
    tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
    tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);

    input = THTensor_(newContiguous)(tinput);
    gradOutput = THTensor_(newContiguous)(tgradOutput);
  } else {
    input = THTensor_(newContiguous)(input);
    gradOutput = THTensor_(newContiguous)(gradOutput);
  }

  THNN_(TemporalRowConvolution_shapeCheck)
    (state, input, gradOutput, gradWeight, gradBias, kW, dW, padW);

  if (ndim == 2) {
    THNN_(TemporalRowConvolution_accGradParameters_frame)(
      gradOutput, gradWeight, gradBias, finput, scale);
  } else {
    int64_t T = input->size(0);
    int64_t t;

    for (t = 0; t < T; t++) {
      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);

      THNN_(TemporalRowConvolution_accGradParameters_frame)(
        gradOutput_t, gradWeight, gradBias, finput_t, scale);

      c10::raw::intrusive_ptr::decref(gradOutput_t);
      c10::raw::intrusive_ptr::decref(finput_t);
    }
  }

  if (!featFirst) {
    c10::raw::intrusive_ptr::decref(tinput);
    c10::raw::intrusive_ptr::decref(tgradOutput);
  }

  c10::raw::intrusive_ptr::decref(input);
  c10::raw::intrusive_ptr::decref(gradOutput);
}

#endif

@@ -105,7 +105,7 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)(


  THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input,
                "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s");

  if (input->dim() == 5)
  {
@@ -75,7 +75,7 @@ static inline void THNN_(VolumetricAveragePooling_shapeCheck)(

  if (otime < 1 || owidth < 1 || oheight < 1)
    THError("Given input size: (%dx%dx%dx%d). "
            "Calculated output size: (%dx%dx%dx%d). Output size is too small",
            nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth);

  if (gradOutput != NULL) {
@@ -119,7 +119,7 @@ static THTensor* THNN_(newViewWeight)(THTensor *weight)
    int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4);
    THTensor *old_weight = weight;
    weight = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(weight), weight->storage_offset(),
                                         s1, -1, s2, -1);
    c10::raw::intrusive_ptr::decref(old_weight);
  }
  return weight;
@@ -274,7 +274,7 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)(
    const int64_t k_ = 1;

    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    if (bias) {
      THBlas_(gemm)(
        't', 'n',
        n_, m_, k_,
@@ -7,13 +7,13 @@
#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)

#define THNN_CHECK_SHAPE(I1, I2) \
  if (I1 != NULL && I2 != NULL && !THTensor_(isSameSizeAs)(I1, I2)) \
  { \
    THDescBuff s1 = THTensor_(sizeDesc)(I1); \
    THDescBuff s2 = THTensor_(sizeDesc)(I2); \
    THError(#I1 " and " #I2 " shapes do not match: " \
            #I1 " %s, " #I2 " %s", s1.str, s2.str); \
  }

#define THNN_CHECK_SHAPE_INDICES(I1, I2) \
@@ -26,39 +26,39 @@
  }

#define THNN_CHECK_NELEMENT(I1, I2) \
  if (I1 != NULL && I2 != NULL ) { \
    ptrdiff_t n1 = THTensor_(nElement)(I1); \
    ptrdiff_t n2 = THTensor_(nElement)(I2); \
    if (n1 != n2) \
    { \
      THDescBuff s1 = THTensor_(sizeDesc)(I1); \
      THDescBuff s2 = THTensor_(sizeDesc)(I2); \
      THError(#I1 " and " #I2 " have different number of elements: " \
              #I1 "%s has %ld elements, while " \
              #I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
    } \
  }

#define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
  if (THTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
      THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
    THDescBuff s1 = THTensor_(sizeDesc)(T); \
    THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
            " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
  }

#define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \
  if (THIndexTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
      THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
    THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \
    THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
            " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
  }

#define THNN_ARGCHECK(COND, ARG, T, FORMAT) \
  if (!(COND)) { \
    THDescBuff s1 = THTensor_(sizeDesc)(T); \
    THArgCheck(COND, ARG, FORMAT, s1.str); \
  }

#include <THNN/generic/AbsCriterion.c>
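The macros above are the shape-validation layer used throughout the THNN kernels earlier in this diff. As a rough standalone analogue of the pattern they expand to (plain C++ with an exception instead of THError; not the THNN machinery):

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

// Simplified stand-in for THNN_CHECK_DIM_SIZE: require a given rank and a
// given extent along one dimension, otherwise report a descriptive error.
void check_dim_size(const std::vector<int64_t>& sizes,
                    int dim, int dim_index, int64_t size) {
  if (static_cast<int>(sizes.size()) != dim || sizes[dim_index] != size) {
    throw std::invalid_argument(
        "need a " + std::to_string(dim) + "-D tensor with size[" +
        std::to_string(dim_index) + "] == " + std::to_string(size));
  }
}
```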
@@ -206,7 +206,7 @@ TEST(LeftRightTest, givenInt_whenWriteThrowsExceptionOnSecondCall_thenKeepsNewSt
        write_called = true;
      }
    }),
    MyException
  );

  // check reading it returns new value
290  c10/util/Half.h
@@ -85,41 +85,41 @@ namespace detail {
 * @note The implementation doesn't use any floating-point operations.
 */
inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) {
  /*
   * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
   *      +---+-----+------------+-------------------+
   *      | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
   *      +---+-----+------------+-------------------+
   * Bits  31  26-30    16-25            0-15
   *
   * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
   */
  const uint32_t w = (uint32_t) h << 16;
  /*
   * Extract the sign of the input number into the high bit of the 32-bit word:
   *
   *      +---+----------------------------------+
   *      | S |0000000 00000000 00000000 00000000|
   *      +---+----------------------------------+
   * Bits  31                0-31
   */
  const uint32_t sign = w & UINT32_C(0x80000000);
  /*
   * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word:
   *
   *      +---+-----+------------+-------------------+
   *      | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
   *      +---+-----+------------+-------------------+
   * Bits  30  27-31    17-26            0-16
   */
  const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
  /*
   * Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized.
   * If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one.
   * In this case renorm_shift == 0. If the number is denormalized, renorm_shift > 0. Note that if we shift
   * denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the
   * biased exponent into 1, and making mantissa normalized (i.e. without leading 1).
   */
#ifdef _MSC_VER
  unsigned long nonsign_bsr;
  _BitScanReverse(&nonsign_bsr, (unsigned long)nonsign);
@@ -176,62 +176,62 @@ namespace detail {
 * floating-point operations and bitcasts between integer and floating-point variables.
 */
inline float fp16_ieee_to_fp32_value(uint16_t h) {
  /*
   * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
   *      +---+-----+------------+-------------------+
   *      | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
   *      +---+-----+------------+-------------------+
   * Bits  31  26-30    16-25            0-15
   *
   * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
   */
  const uint32_t w = (uint32_t) h << 16;
  /*
   * Extract the sign of the input number into the high bit of the 32-bit word:
   *
   *      +---+----------------------------------+
   *      | S |0000000 00000000 00000000 00000000|
   *      +---+----------------------------------+
   * Bits  31                0-31
   */
  const uint32_t sign = w & UINT32_C(0x80000000);
  /*
   * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
   *
   *      +-----+------------+---------------------+
   *      |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
   *      +-----+------------+---------------------+
   * Bits  27-31    17-26            0-16
   */
  const uint32_t two_w = w + w;

  /*
   * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
   * of a single-precision floating-point number:
   *
   *        S|Exponent |          Mantissa
   *      +-+---+-----+------------+----------------+
   *      |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
   *      +-+---+-----+------------+----------------+
   * Bits   |  23-31  |           0-22
   *
   * Next, there are some adjustments to the exponent:
   * - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision
   *   formats (0x7F - 0xF = 0x70)
   * - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number.
   *   Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent
   *   of the single-precision output must be 0xFF (max possible value). We do this correction in two steps:
   *   - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested
   *     by the difference in the exponent bias (see above).
   *   - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of
   *     exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias.
   *     The floating-point multiplication hardware would ensure that Inf and NaN would retain their value on at least
   *     partially IEEE754-compliant implementations.
   *
   * Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not
   * operate on denormal inputs, and do not produce denormal results.
   */
  const uint32_t exp_offset = UINT32_C(0xE0) << 23;
  // const float exp_scale = 0x1.0p-112f;
  uint32_t scale_bits = (uint32_t) 15 << 23;
  float exp_scale_val;
@@ -239,48 +239,48 @@ namespace detail {
  const float exp_scale = exp_scale_val;
  const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

  /*
   * Convert denormalized half-precision inputs into single-precision results (always normalized).
   * Zero inputs are also handled here.
   *
   * In a denormalized number the biased exponent is zero, and mantissa has non-zero bits.
   * First, we shift mantissa into bits 0-9 of the 32-bit word.
   *
   *                  zeros           |  mantissa
   *      +---------------------------+------------+
   *      |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
   *      +---------------------------+------------+
   * Bits             10-31                0-9
   *
   * Now, remember that denormalized half-precision numbers are represented as:
   *    FP16 = mantissa * 2**(-24).
   * The trick is to construct a normalized single-precision number with the same mantissa as the half-precision input
   * and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
   * A normalized single-precision floating-point number is represented as:
   *    FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
   * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
   * number causes a change of the constructed single-precision number by 2**(-24), i.e. the same amount.
   *
   * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
   * is zero, the constructed single-precision number has the value of
   *    FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
   * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
   * the input half-precision number.
   */
  const uint32_t magic_mask = UINT32_C(126) << 23;
  const float magic_bias = 0.5f;
  const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

  /*
   * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
   *   input exponent. The variable two_w contains input exponent in bits 27-31, therefore if it is smaller than 2**27, the
   *   input is either a denormal number, or zero.
   * - Combine the result of conversion of exponent and mantissa with the sign of the input number.
   */
  const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
  const uint32_t result = sign |
    (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
  return fp32_from_bits(result);
}

/*
|
|||||||
const float scale_to_inf = scale_to_inf_val;
|
const float scale_to_inf = scale_to_inf_val;
|
||||||
const float scale_to_zero = scale_to_zero_val;
|
const float scale_to_zero = scale_to_zero_val;
|
||||||
|
|
||||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||||
|
|
||||||
const uint32_t w = fp32_to_bits(f);
|
const uint32_t w = fp32_to_bits(f);
|
||||||
const uint32_t shl1_w = w + w;
|
const uint32_t shl1_w = w + w;
|
||||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||||
if (bias < UINT32_C(0x71000000)) {
|
if (bias < UINT32_C(0x71000000)) {
|
||||||
bias = UINT32_C(0x71000000);
|
bias = UINT32_C(0x71000000);
|
||||||
}
|
}
|
||||||
|
|
||||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||||
const uint32_t bits = fp32_to_bits(base);
|
const uint32_t bits = fp32_to_bits(base);
|
||||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace detail
|
} // namespace detail
|
||||||
|
|||||||
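The routines above do the fp16/fp32 exponent rebiasing with integer shifts plus a couple of floating-point multiplies so that Inf, NaN and denormals fall out without branching on the exponent. For the plain-field reading of the same IEEE half-precision layout (1 sign, 5 exponent, 10 mantissa bits), a naive reference decoder, not the Half.h implementation, looks like this:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Readable fp16 -> fp32 reference: decode the three fields and apply the
// textbook formulas. exp == 0 is zero/denormal (mantissa * 2^-24), exp == 31
// is Inf/NaN, otherwise (1 + mantissa/1024) * 2^(exp - 15).
float fp16_to_fp32_reference(uint16_t h) {
  const uint32_t sign = (h >> 15) & 0x1;
  const uint32_t exp  = (h >> 10) & 0x1F;
  const uint32_t man  = h & 0x3FF;
  float value;
  if (exp == 0) {
    value = std::ldexp(static_cast<float>(man), -24);
  } else if (exp == 0x1F) {
    value = (man == 0) ? INFINITY : NAN;
  } else {
    value = std::ldexp(1.0f + man / 1024.0f, static_cast<int>(exp) - 15);
  }
  return sign ? -value : value;
}

int main() {
  std::printf("%f\n", fp16_to_fp32_reference(0x3C00));  // 1.0
  std::printf("%f\n", fp16_to_fp32_reference(0xC000));  // -2.0
  return 0;
}
```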
@@ -55,7 +55,7 @@ Assertion Passed!

</details>

)DOC")
    .Arg(
        "error_msg",
        "(*string*): custom error message to be thrown when the input does not pass assertion",
@@ -107,17 +107,17 @@ Testing CountUp operator...
'count' value after CountUp test: 10

Testing CountDown operator...
'count' value after CountDown: 9 'done' value: False
'count' value after CountDown: 8 'done' value: False
'count' value after CountDown: 7 'done' value: False
'count' value after CountDown: 6 'done' value: False
'count' value after CountDown: 5 'done' value: False
'count' value after CountDown: 4 'done' value: False
'count' value after CountDown: 3 'done' value: False
'count' value after CountDown: 2 'done' value: False
'count' value after CountDown: 1 'done' value: False
'count' value after CountDown: 0 'done' value: False
'count' value after CountDown: -1 'done' value: True
```

</details>
@ -24,11 +24,11 @@ OPERATOR_SCHEMA(Expand)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Broadcast the input tensor to a materialized new tensor using the given shape.
The broadcast rule is similar to "numpy.array(input) * numpy.ones(shape)":
dimensions are right-aligned, and two corresponding dimensions must either
have the same value or one of them must be equal to 1.
In order to align with PyTorch's `expand`, `shape` is allowed to have entries
equal to -1, which means to preserve the size of the corresponding dimension
in `X` (so it is effectively equivalent to 1).
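A small numpy illustration of the broadcast rule stated in the doc string (dimensions aligned from the right, size-1 dimensions stretched); this only demonstrates the rule and does not invoke the Expand operator itself:

```python
import numpy as np

# The rule "numpy.array(input) * numpy.ones(shape)" from the doc string:
# shapes are aligned from the right, and a dimension of size 1 is stretched
# to match the other operand.
x = np.arange(6).reshape(3, 1, 2)   # input of shape (3, 1, 2)
shape = (4, 2)                      # right-aligned against (3, 1, 2)

expanded = x * np.ones(shape)
print(expanded.shape)               # (3, 4, 2)

# An entry of -1 in `shape` (as accepted by Expand) simply preserves the size
# of the corresponding input dimension, i.e. it behaves like an entry of 1.
```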
@ -758,10 +758,10 @@ if(USE_CUDA)
endif()
if(CAFFE2_USE_CUDNN)
IF(CUDNN_STATIC_LINKAGE)
LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl")
ELSE()
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
ENDIF()
else()
caffe2_update_option(USE_CUDNN OFF)
@ -1166,7 +1166,7 @@ if (NOT BUILD_ATEN_MOBILE)

CHECK_C_SOURCE_COMPILES("#include <stdint.h>
static inline void cpuid(uint32_t *eax, uint32_t *ebx,
uint32_t *ecx, uint32_t *edx)
{
uint32_t a = *eax, b, c = *ecx, d;
asm volatile ( \"cpuid\" : \"+a\"(a), \"=b\"(b), \"+c\"(c), \"=d\"(d) );
@ -3,16 +3,16 @@
# CUB_INCLUDE_DIRS - the CUB include directory

find_path(CUB_INCLUDE_DIR
NAMES cub/cub.cuh
DOC "The directory where CUB includes reside"
)

set(CUB_INCLUDE_DIRS ${CUB_INCLUDE_DIR})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(CUB
FOUND_VAR CUB_FOUND
REQUIRED_VARS CUB_INCLUDE_DIR
)

mark_as_advanced(CUB_FOUND)
@ -35,20 +35,20 @@ find_package_handle_standard_args(
MIOPEN DEFAULT_MSG MIOPEN_INCLUDE_DIR MIOPEN_LIBRARY)

if(MIOPEN_FOUND)
# get MIOpen version
file(READ ${MIOPEN_INCLUDE_DIR}/version.h MIOPEN_HEADER_CONTENTS)
string(REGEX MATCH "define MIOPEN_VERSION_MAJOR * +([0-9]+)"
MIOPEN_VERSION_MAJOR "${MIOPEN_HEADER_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR * +([0-9]+)" "\\1"
MIOPEN_VERSION_MAJOR "${MIOPEN_VERSION_MAJOR}")
string(REGEX MATCH "define MIOPEN_VERSION_MINOR * +([0-9]+)"
MIOPEN_VERSION_MINOR "${MIOPEN_HEADER_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_MINOR * +([0-9]+)" "\\1"
MIOPEN_VERSION_MINOR "${MIOPEN_VERSION_MINOR}")
string(REGEX MATCH "define MIOPEN_VERSION_PATCH * +([0-9]+)"
MIOPEN_VERSION_PATCH "${MIOPEN_HEADER_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_PATCH * +([0-9]+)" "\\1"
MIOPEN_VERSION_PATCH "${MIOPEN_VERSION_PATCH}")
# Assemble MIOpen version
if(NOT MIOPEN_VERSION_MAJOR)
set(MIOPEN_VERSION "?")
@ -3,16 +3,16 @@
# pybind11_INCLUDE_DIRS - the pybind11 include directory

find_path(pybind11_INCLUDE_DIR
NAMES pybind11/pybind11.h
DOC "The directory where pybind11 includes reside"
)

set(pybind11_INCLUDE_DIRS ${pybind11_INCLUDE_DIR})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(pybind11
FOUND_VAR pybind11_FOUND
REQUIRED_VARS pybind11_INCLUDE_DIR
)

mark_as_advanced(pybind11_FOUND)
File diff suppressed because it is too large
@ -145,10 +145,10 @@ allowed values for these axes at the moment are:

.. tip::

There exist "Rust-style" shorthands for dtypes, like ``kF32`` instead of
``kFloat32``. See `here
<https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/types.h>`_
for the full list.

An instance of ``TensorOptions`` stores a concrete value for each of these
@ -314,8 +314,8 @@ we can convert it from ``int64`` to ``float32``:

.. attention::

The result of the conversion, ``float_tensor``, is a new tensor pointing to
new memory, unrelated to the source ``source_tensor``.

We can then move it from CPU memory to GPU memory:
@ -5,7 +5,7 @@ pushd %~dp0
REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
@ -15,15 +15,15 @@ if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
@ -770,34 +770,34 @@ Interpreting Graphs

The example script above produces the graph::

graph(%len : int) {
%15 : int = prim::Constant[value=1]()
%9 : bool = prim::Constant[value=1]()
%7 : Device = prim::Constant[value="cpu"]()
%6 : int = prim::Constant[value=0]()
%5 : int = prim::Constant[value=6]()
%1 : int = prim::Constant[value=3]()
%2 : int = prim::Constant[value=4]()
%11 : int = prim::Constant[value=10]()
%14 : float = prim::Constant[value=1]()
%4 : int[] = prim::ListConstruct(%1, %2)
%rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)
%rv : Tensor = prim::Loop(%len, %9, %rv.1)
block0(%i : int, %13 : Tensor) {
%12 : bool = aten::lt(%i, %11)
%rv.4 : Tensor = prim::If(%12)
block0() {
%rv.2 : Tensor = aten::sub(%13, %14, %15)
-> (%rv.2)
}
block1() {
%rv.3 : Tensor = aten::add(%13, %14, %15)
-> (%rv.3)
}
-> (%9, %rv.4)
}
return (%rv);
}

Take the instruction ``%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)`` for
@ -850,39 +850,39 @@ Automatic Trace Checking
traced = torch.jit.trace(loop_in_traced_fn, inputs, check_inputs=check_inputs)

Gives us the following diagnostic information::
ERROR: Graphs differed across invocations!
Graph diff::

graph(%x : Tensor) {
%1 : int = prim::Constant[value=0]()
%2 : int = prim::Constant[value=0]()
%result.1 : Tensor = aten::select(%x, %1, %2)
%4 : int = prim::Constant[value=0]()
%5 : int = prim::Constant[value=0]()
%6 : Tensor = aten::select(%x, %4, %5)
%result.2 : Tensor = aten::mul(%result.1, %6)
%8 : int = prim::Constant[value=0]()
%9 : int = prim::Constant[value=1]()
%10 : Tensor = aten::select(%x, %8, %9)
- %result : Tensor = aten::mul(%result.2, %10)
+ %result.3 : Tensor = aten::mul(%result.2, %10)
? ++
%12 : int = prim::Constant[value=0]()
%13 : int = prim::Constant[value=2]()
%14 : Tensor = aten::select(%x, %12, %13)
+ %result : Tensor = aten::mul(%result.3, %14)
+ %16 : int = prim::Constant[value=0]()
+ %17 : int = prim::Constant[value=3]()
+ %18 : Tensor = aten::select(%x, %16, %17)
- %15 : Tensor = aten::mul(%result, %14)
? ^ ^
+ %19 : Tensor = aten::mul(%result, %18)
? ^ ^
- return (%15);
? ^
+ return (%19);
? ^
}

This message indicates to us that the computation differed between when
@ -912,19 +912,19 @@ Automatic Trace Checking

Which produces::

graph(%x : Tensor) {
%5 : bool = prim::Constant[value=1]()
%1 : int = prim::Constant[value=0]()
%result.1 : Tensor = aten::select(%x, %1, %1)
%4 : int = aten::size(%x, %1)
%result : Tensor = prim::Loop(%4, %5, %result.1)
block0(%i : int, %7 : Tensor) {
%10 : Tensor = aten::select(%x, %1, %i)
%result.2 : Tensor = aten::mul(%7, %10)
-> (%5, %result.2)
}
return (%result);
}

Tracer Warnings
^^^^^^^^^^^^^^^
@ -213,8 +213,8 @@ Multiprocessing error without if-clause protection
.. code-block:: py3tb

RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.

This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
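The traceback above is Python's standard multiprocessing bootstrapping error. The usual fix on platforms that spawn rather than fork worker processes is to guard the entry point with an ``if __name__ == '__main__':`` clause; a minimal sketch (with a made-up dataset) follows.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

def main():
    # Anything that starts worker processes (num_workers > 0) must run inside
    # the guarded entry point; otherwise every spawned worker re-imports the
    # module and tries to start workers of its own, producing the error above.
    dataset = TensorDataset(torch.randn(100, 3), torch.randint(0, 2, (100,)))
    loader = DataLoader(dataset, batch_size=10, num_workers=2)
    for batch, labels in loader:
        pass

if __name__ == "__main__":
    main()
```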
@ -8,24 +8,24 @@
PyInit*;
init*;
state;
_ZGVZN2at*;
_ZN2at*;
_ZNK2at*Type*;
_ZNK2at*Tensor*;
_ZNK2at*Storage*;
_ZNK2at*Scalar*;
_ZNK2at*CUDA*;
*2at7Context*;
_ZTIN2at*;
_ZTIZN2at*;
_ZTSN2at*;
_ZTSPN2at*;
_ZTSZN2at*;
_ZTVN2at*;
_ZZN2at*;
_Z*torch*;
_Z*Tensor*;
_Z*tensor*;
local:
*;
};
@ -18,9 +18,9 @@ struct ConvOptions {
int64_t input_channels,
int64_t output_channels,
ExpandingArray<D> kernel_size) :
input_channels_(input_channels),
output_channels_(output_channels),
kernel_size_(std::move(kernel_size)) {}

/// The number of channels the input volumes will have.
/// Changing this parameter after construction __has no effect__.
@ -370,21 +370,21 @@ As the trace runs, individual operators create Nodes in the Graph being traced t
torch::jit::Node* node = nullptr;
std::shared_ptr<jit::tracer::TracingState> tracer_state;
if (jit::tracer::isTracing()) {
tracer_state = jit::tracer::getTracingState();
at::Symbol op_name;
op_name = jit::Symbol::fromQualString("aten::__ilshift__");
node = tracer_state->graph->create(op_name, /*num_outputs=*/0);
jit::tracer::recordSourceLocation(node);
jit::tracer::addInputs(node, "self", self);
jit::tracer::addInputs(node, "other", other);
tracer_state->graph->insertNode(node);

jit::tracer::setTracingState(nullptr);
}
TypeDefault::__ilshift__(self, other);
if (tracer_state) {
jit::tracer::setTracingState(std::move(tracer_state));
jit::tracer::addOutput(node, self);
}
```
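The generated C++ above is what records an ``aten::__ilshift__`` node while a trace is active. From Python, the effect of that recording can be observed by tracing a small function and printing the resulting graph; this is only an illustrative sketch of inspecting a trace, not part of the file being patched.

```python
import torch

def shift_in_place(x):
    # The in-place left shift dispatches to Tensor.__ilshift__, the operator
    # whose generated tracing wrapper is shown above.
    x <<= 1
    return x

traced = torch.jit.trace(shift_in_place, torch.tensor([1, 2, 3]))
print(traced.graph)  # the recorded nodes, including the shift, show up here
```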
@ -412,15 +412,15 @@ Our frontends produce ASTs in the form of Tree objects. Trees are similar to [s-

```
(-
(+
(variable (ident x))
(variable (ident y)))
(apply
(.
(variable (ident z))
(ident sigmoid))
(list)
(list))))
```

This is printed in s-expression style with `(kind ...)` representing compound trees and `string_value` representing strings.
@ -454,16 +454,16 @@ The typical way to traverse a tree is to `switch` on the kind and then construct
```cpp
switch (tree.kind()) {
case TK_VAR:
auto var = Var(tree); // construct tree-view
return environment_stack->getSugaredVar(var.name());
case '.': {
auto select = Select(tree); // construct tree-view
auto sv = emitSugaredExpr(select.value(), 1);
return sv->attr(select.range(), method, select.selector().name());
}
case TK_APPLY: {
auto apply = Apply(tree); // construct tree-view
return emitApplyExpr(apply, n_binders);
} break;

```
@ -507,7 +507,7 @@ Tokens are either keywords (`def`), operators (`+`), literals (`3.4`), or identi

```cpp
if (lexer.nextIf('+')) {
// handle + ...
}
```
@ -650,10 +650,10 @@ using Operation = std::function<int(Stack&)>;

// schema: example_add(Tensor a, Tensor b) -> Tensor
int example_add(Stack& stack) {
Tensor a, b;
// stack before: ? ? ? a b <- back
pop(stack, a, b); //Templated helper function
// that pops a, b and converts them to tensor
push(stack, a + b);
// stack after:
// ? ? ? c <- back
@ -1126,7 +1126,7 @@ As a more involved example, the following TorchScript snippet:
```python
@torch.jit.script
def foo(a : Tensor, b : Tensor):
c = 2 * b
a += 1
if a.max() > 4:
r = a[0]
@ -71,15 +71,15 @@ public:
auto size = tuple ? PyTuple_GET_SIZE(source) : PyList_GET_SIZE(source);
v_value.resize(size);
for (int idx = 0; idx < size; idx++) {
PyObject* obj = tuple ? PyTuple_GET_ITEM(source, idx) : PyList_GET_ITEM(source, idx);
if (THPVariable_Check(obj)) {
v_value[idx] = THPVariable_Unpack(obj).item<int64_t>();
} else if (PyLong_Check(obj)) {
// use THPUtils_unpackLong after it is safe to include python_numbers.h
v_value[idx] = THPUtils_unpackLong(obj);
} else {
return false;
}
}
value = v_value;
return true;
@ -195,7 +195,7 @@ ScalarType numpy_dtype_to_aten(int dtype) {

bool is_numpy_scalar(PyObject* obj) {
return (PyArray_IsIntegerScalar(obj) ||
PyArray_IsScalar(obj, Floating));
}

}} // namespace torch::utils