Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/35556
Pull Request resolved: https://github.com/pytorch/pytorch/pull/35542

Apply explicit vectorization to the lstm_unit operator, enabled by -DENABLE_VECTORIZATION=1. This optimization requires vector library support and was tested with Intel SVML & clang; however, compilers that support OpenMP 4.5 with the omp simd extension should also benefit.

After the code changes:

In file included from caffe2/caffe2/operators/lstm_unit_op.cc:1:
caffe2/caffe2/operators/lstm_unit_op.h:60:1: remark: vectorized loop (vectorization width: 8, interleaved count: 1) [-Rpass=loop-vectorize]
VECTOR_LOOP for (int d = 0; d < D; ++d) {
caffe2/caffe2/operators/lstm_unit_op.h:60:1: remark: vectorized loop (vectorization width: 8, interleaved count: 1) [-Rpass=loop-vectorize]
caffe2/caffe2/operators/lstm_unit_op.h:112:1: remark: vectorized loop (vectorization width: 8, interleaved count: 1) [-Rpass=loop-vectorize]
VECTOR_LOOP for (int d = 0; d < D; ++d) {

Test Plan: Check failures at OSS CI
- No build failures related to this change
- Failing tests are:
  - py3.6-clang7-rocmdeb-ubuntu16.04-test2
    > RuntimeError: fft: ATen not compiled with MKL support
  - caffe2_onnx_ort2_py3_6_clang7_ubuntu16_04_test
    > gradient_check_test.py::TestMakeTwo Exited with code exit status 1
  - pytorch_macos_10_13_py3_test, with test errors like:
    > ERROR [0.014s]: test_boolean_indexing_weirdness_cpu (__main__.NumpyTestsCPU)
    > RuntimeError: shape mismatch: indexing tensors could not be broadcast together with shapes [0], [2]
  - caffe2_onnx_ort1_py3_6_clang7_ubuntu16_04_test - no failure info

Reviewed By: jspark1105

Differential Revision: D20484640

fbshipit-source-id: 8fb82dbd6698c8de3e0bbbc0b48d15b70e36ca94
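The VECTOR_LOOP macro quoted in the remarks above is defined elsewhere in the caffe2 sources; as a rough illustration of the technique, a macro along the following lines (the exact pragma choice is an assumption, based on the OpenMP 4.5 / clang route the commit message describes) lets a single annotation toggle the explicit simd hint:

// Sketch only: the real macro in caffe2 may differ. Built with
// -DENABLE_VECTORIZATION=1 (plus -fopenmp or -fopenmp-simd), the loop
// carries an explicit "omp simd" hint; otherwise it compiles as a
// plain loop and vectorization is left to the compiler's own analysis.
#include <cmath>

#if defined(ENABLE_VECTORIZATION)
#define VECTOR_LOOP _Pragma("omp simd")
#else
#define VECTOR_LOOP
#endif

template <typename T>
void SigmoidInplace(T* x, const int D) {
  VECTOR_LOOP for (int d = 0; d < D; ++d) {
    x[d] = T(1) / (T(1) + std::exp(-x[d]));
  }
}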
74 lines
1.4 KiB
C++
#pragma once

#include <cstdint>

namespace caffe2 {

namespace detail {

// Forward declaration of the templated LSTMUnit implementation.
// N: batch size, D: hidden/cell dimension, t: current timestep.
// H_prev/C_prev are the previous hidden and cell states, X the gate
// pre-activations, and seqLengths the per-example sequence lengths;
// results are written to C and H, with forget_bias added to the
// forget gate. drop_states controls whether states past the end of a
// sequence are zeroed rather than carried forward.
template <typename T>
void LstmUnitCpu(
    const int N,
    const int D,
    const int t,
    const T* H_prev,
    const T* C_prev,
    const T* X,
    const int32_t* seqLengths,
    const bool drop_states,
    T* C,
    T* H,
    const float forget_bias);

// Forward specialization: explicit instantiation declaration for
// float (the definition is compiled in the corresponding .cc file).
extern template void LstmUnitCpu<float>(
    const int N,
    const int D,
    const int t,
    const float* H_prev,
    const float* C_prev,
    const float* X,
    const int32_t* seqLengths,
    const bool drop_states,
    float* C,
    float* H,
    const float forget_bias);

// Backward pass: given the forward tensors and the incoming gradients
// C_diff and H_diff, computes the gradients with respect to the
// previous states and the gate inputs (H_prev_diff, C_prev_diff,
// X_diff).
template <typename T>
void LstmUnitGradientCpu(
    int N,
    int D,
    int t,
    const T* C_prev,
    const T* X,
    const int32_t* seqLengths,
    const T* C,
    const T* H,
    const T* C_diff,
    const T* H_diff,
    bool drop_states,
    T* H_prev_diff,
    T* C_prev_diff,
    T* X_diff,
    const float forget_bias);

extern template void LstmUnitGradientCpu<float>(
    int N,
    int D,
    int t,
    const float* C_prev,
    const float* X,
    const int32_t* seqLengths,
    const float* C,
    const float* H,
    const float* C_diff,
    const float* H_diff,
    bool drop_states,
    float* H_prev_diff,
    float* C_prev_diff,
    float* X_diff,
    const float forget_bias);

} // namespace detail
} // namespace caffe2
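For orientation, here is a hedged usage sketch of LstmUnitCpu<float> as declared above. The buffer shapes follow the caffe2 LSTMUnit convention in which X carries the four gate pre-activations per example (N x 4D); that shape, and the include path, are assumptions from the operator's surrounding code, and the snippet must be built and linked against caffe2:

// Minimal sketch: run one LSTM unit step on a tiny batch.
#include <cstdint>
#include <vector>
#include "caffe2/operators/lstm_unit_op.h"

int main() {
  const int N = 2;  // batch size
  const int D = 4;  // hidden/cell dimension
  const int t = 0;  // current timestep

  // Assumed shapes: H_prev, C_prev, C, H are N x D; X holds the four
  // gate pre-activations (N x 4D) per the caffe2 LSTMUnit convention.
  std::vector<float> H_prev(N * D, 0.f), C_prev(N * D, 0.f);
  std::vector<float> X(N * 4 * D, 0.5f);
  std::vector<float> C(N * D), H(N * D);
  std::vector<int32_t> seqLengths = {3, 3};  // per-example lengths

  caffe2::detail::LstmUnitCpu<float>(
      N, D, t,
      H_prev.data(), C_prev.data(), X.data(), seqLengths.data(),
      /*drop_states=*/false, C.data(), H.data(),
      /*forget_bias=*/0.f);
  return 0;
}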