gpu: add nvidia support

This commit is contained in:
Denis Samoilov
2020-11-23 16:40:22 -08:00
parent e73665ff86
commit 5d63af1b4a
124 changed files with 12918 additions and 30 deletions

View File

@ -180,6 +180,7 @@
Copyright 2016-2020 Intel Corporation
Copyright 2018 YANDEX LLC
Copyright 2020 Arm Limited and affiliates
Copyright 2020 Codeplay Software Limited
Copyright 2019-2020 FUJITSU LIMITED
Licensed under the Apache License, Version 2.0 (the "License");

View File

@ -21,6 +21,7 @@ The library is optimized for Intel Architecture Processors, Intel Processor
Graphics and Xe architecture-based Graphics. oneDNN has experimental support
for the following architectures:
* Arm\* 64-bit Architecture (AArch64)
* NVIDIA\* GPU
* OpenPOWER\* Power ISA (PPC64)
* IBMz\* (s390x)
@ -190,6 +191,18 @@ is enabled:
* [Intel oneAPI DPC++ Compiler](https://software.intel.com/en-us/oneapi/dpc-compiler) Beta
* OpenCL runtime library (OpenCL version 1.2 or later)
* [oneAPI Level Zero](https://github.com/oneapi-src/level-zero)
* DPCPP runtime with NVIDIA GPU support requires
* [oneAPI DPC++ Compiler](https://github.com/intel/llvm)
* OpenCL runtime library (OpenCL version 1.2 or later)
* NVIDIA CUDA\* driver
* cuBLAS 10.1 or later
* cuDNN 7.6 or later
> **WARNING**
>
> NVIDIA GPU support is experimental. General information, build instructions
> and implementation limitations are available in
> [NVIDIA backend readme](https://github.com/oneapi-src/oneDNN/blob/master/src/gpu/nvidia/README.md).
### Runtime Dependencies

View File

@ -178,6 +178,7 @@ Copyright (c) 2015-2017 Martin Hensel
Copyright (c) 2007, Apostolos Syropoulos (<asyropoulos@yahoo.com)
ComputeCPP SDK (cmake/FindComputeCpp.cmake)
Copyright 2016-2018 Codeplay Software Ltd.
Xbyak_aarch64 (src/cpu/aarch64/xbyak_aarch64/)
Copyright 2019-2020 FUJITSU LIMITED

31
cmake/FindPI_CUDA.cmake Normal file
View File

@ -0,0 +1,31 @@
#===============================================================================
# Copyright 2020 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
find_library(PI_CUDA_LIBRARIES
NAMES pi_cuda libpi_cuda.so PATHS
PATH_SUFFIXES lib)
find_package_handle_standard_args(PI_CUDA REQUIRED_VARS PI_CUDA_LIBRARIES)
if(TARGET PI_CUDA::PI_CUDA OR NOT PI_CUDA_FOUND)
return()
endif()
add_library(PI_CUDA::PI_CUDA UNKNOWN IMPORTED)
set_target_properties(PI_CUDA::PI_CUDA PROPERTIES
IMPORTED_LOCATION ${PI_CUDA_LIBRARIES})
mark_as_advanced(PI_CUDA_LIBRARIES)

45
cmake/FindcuBLAS.cmake Normal file
View File

@ -0,0 +1,45 @@
#===============================================================================
# Copyright 2020 Intel Corporation
# Copyright 2020 Codeplay Software Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
find_package(CUDA 10.0 REQUIRED)
find_package(Threads REQUIRED)
find_path(CUBLAS_INCLUDE_DIR "cublas_v2.h"
HINTS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
find_library(CUBLAS_LIBRARY cublas)
find_library(CUDA_DRIVER_LIBRARY cuda)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(cuBLAS
REQUIRED_VARS
CUBLAS_INCLUDE_DIR
CUDA_INCLUDE_DIRS
CUBLAS_LIBRARY
CUDA_LIBRARIES
CUDA_DRIVER_LIBRARY
)
if(NOT TARGET cuBLAS::cuBLAS)
add_library(cuBLAS::cuBLAS SHARED IMPORTED)
set_target_properties(cuBLAS::cuBLAS PROPERTIES
IMPORTED_LOCATION ${CUBLAS_LIBRARY}
INTERFACE_INCLUDE_DIRECTORIES
"${CUBLAS_INCLUDE_DIR};${CUDA_INCLUDE_DIRS}"
INTERFACE_LINK_LIBRARIES
"Threads::Threads;${CUDA_DRIVER_LIBRARY};${CUDA_LIBRARIES}"
INTERFACE_COMPILE_DEFINITIONS CUDA_NO_HALF)
endif()

55
cmake/FindcuDNN.cmake Normal file
View File

@ -0,0 +1,55 @@
#===============================================================================
# Copyright 2020 Intel Corporation
# Copyright 2020 Codeplay Software Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
find_package(CUDA 10.0 REQUIRED)
find_path(CUDNN_INCLUDE_DIR "cudnn.h"
HINTS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
find_library(CUDNN_LIBRARY cudnn)
find_library(CUDA_DRIVER_LIBRARY cuda)
# This is a workaround to avoid duplicated half type creation in both CUDA and SYCL
find_package(Threads REQUIRED)
include(FindPackageHandleStandardArgs)
find_library(
CUDNN_LIBRARY cudnn
HINTS ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES lib lib64 bin)
find_package_handle_standard_args(cuDNN
REQUIRED_VARS
CUDNN_INCLUDE_DIR
CUDA_INCLUDE_DIRS
CUDNN_LIBRARY
CUDA_LIBRARIES
CUDA_DRIVER_LIBRARY
)
if(NOT TARGET cuDNN::cuDNN)
add_library(cuDNN::cuDNN SHARED IMPORTED)
set_target_properties(cuDNN::cuDNN PROPERTIES
IMPORTED_LOCATION
${CUDNN_LIBRARY}
INTERFACE_INCLUDE_DIRECTORIES
"${CUDA_INCLUDE_DIRS};${CUDNN_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES
"Threads::Threads;${CUDA_DRIVER_LIBRARY};${CUDA_LIBRARIES}"
INTERFACE_COMPILE_DEFINITIONS
CUDA_NO_HALF)
endif()

View File

@ -153,6 +153,13 @@ if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL)$")
message(FATAL_ERROR "Unsupported GPU runtime: ${DNNL_GPU_RUNTIME}")
endif()
set(DNNL_GPU_VENDOR "INTEL" CACHE STRING
"specifies target GPU vendor for GPU engines.
Can be INTEL (default) or NVIDIA.")
if(NOT "${DNNL_GPU_VENDOR}" MATCHES "^(INTEL|NVIDIA)$")
message(FATAL_ERROR "Unsupported GPU vendor: ${DNNL_GPU_VENDOR}")
endif()
set(OPENCLROOT "" CACHE STRING
"path to Intel SDK for OpenCL applications.
Use this option to specify custom location for OpenCL.")
@ -167,6 +174,10 @@ endif()
if(DNNL_GPU_RUNTIME STREQUAL "DPCPP" OR DNNL_GPU_RUNTIME STREQUAL "SYCL")
set(DNNL_GPU_SYCL true)
set(DNNL_SYCL_CUDA OFF)
if(DNNL_GPU_VENDOR STREQUAL "NVIDIA")
set(DNNL_SYCL_CUDA ON)
endif()
else()
set(DNNL_GPU_SYCL false)
endif()

View File

@ -61,6 +61,14 @@ if(DNNL_CPU_SYCL)
endforeach()
endif()
# Skip examples for CUDA since USM is the default model for the library, which
# is not yet supported by the Nvidia backend.
if(DNNL_SYCL_CUDA)
foreach(f ${sources})
list(REMOVE_ITEM sources "${f}")
endforeach()
endif()
foreach(src ${sources})
file(RELATIVE_PATH src_rel_path ${CMAKE_CURRENT_SOURCE_DIR} ${src})
string(REGEX REPLACE "[/_\\.]" "-" example_name ${src_rel_path})

View File

@ -130,7 +130,7 @@ inline int dnnl_get_current_num_threads() {
return tbb::this_task_arena::max_concurrency();
#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
using namespace dnnl::impl::threadpool_utils;
dnnl::threadpool_iface *tp = get_active_threadpool();
dnnl::threadpool_interop::threadpool_iface *tp = get_active_threadpool();
return (tp) ? dnnl_get_max_threads() : 1;
#else
return 1;

View File

@ -177,6 +177,11 @@ enum {
key_conv_amx_wsp_buffer,
key_conv_bia_reduction,
key_conv_bias_bf16_convert_wsp,
key_conv_cudnn,
key_conv_cudnn_algo,
key_conv_cudnn_filter,
key_conv_cudnn_temp,
key_conv_dst_bf16_convert_wsp,
key_conv_bwd_w_1st_bia_reorder,
key_conv_bwd_w_1st_wei_reorder,
key_conv_gemm_acc,

View File

@ -55,7 +55,6 @@ struct cpu_stream_t : public stream_t {
threadpool_utils::deactivate_threadpool();
}
#endif
};
} // namespace cpu

View File

@ -33,3 +33,8 @@ set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
add_subdirectory(compute)
add_subdirectory(jit)
add_subdirectory(ocl)
if(DNNL_SYCL_CUDA)
add_subdirectory(nvidia)
# Pass ${LIB_NAME}_INTERFACE to upper level for proper linking
set(${LIB_NAME}_INTERFACE "${${LIB_NAME}_INTERFACE}" PARENT_SCOPE)
endif()

View File

@ -0,0 +1,51 @@
#===============================================================================
# Copyright 2020 Intel Corporation
# Copyright 2020 Codeplay Software Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
file(GLOB_RECURSE SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/*.hpp
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
)
set(OBJ_LIB ${LIB_NAME}_sycl_nvidia)
add_library(${OBJ_LIB} OBJECT ${SOURCES})
find_package(OpenCL REQUIRED)
set_target_properties(
${OBJ_LIB}
PROPERTIES
COMPILE_DEFINITIONS
"$<TARGET_PROPERTY:cuBLAS::cuBLAS,INTERFACE_COMPILE_DEFINITIONS>;$<TARGET_PROPERTY:cuDNN::cuDNN,INTERFACE_COMPILE_DEFINITIONS>"
COMPILE_OPTIONS
"$<TARGET_PROPERTY:cuBLAS::cuBLAS,INTERFACE_COMPILE_OPTIONS>;$<TARGET_PROPERTY:cuDNN::cuDNN,INTERFACE_COMPILE_OPTIONS>;$<TARGET_PROPERTY:OpenCL::OpenCL,INTERFACE_COMPILE_OPTIONS>"
)
target_include_directories(
${OBJ_LIB}
PRIVATE $<TARGET_PROPERTY:OpenCL::OpenCL,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:cuDNN::cuDNN,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:cuBLAS::cuBLAS,INTERFACE_INCLUDE_DIRECTORIES>)
add_library(${OBJ_LIB}_interface INTERFACE)
target_link_libraries(${OBJ_LIB}_interface INTERFACE cuBLAS::cuBLAS
cuDNN::cuDNN
OpenCL::OpenCL)
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
$<TARGET_OBJECTS:${OBJ_LIB}>)
set(${LIB_NAME}_INTERFACE
${${LIB_NAME}_INTERFACE} ${OBJ_LIB}_interface
PARENT_SCOPE)

330
src/gpu/nvidia/README.md Normal file
View File

@ -0,0 +1,330 @@
# Nvidia backend support
## General information
The Nvidia backend for oneDNN is exposed to the user via the
`dnnl::engine::kind::gpu` engine kind. Currently, when the user's system has
both Intel and Nvidia GPUs, the `DNNL_GPU_VENDOR=NVIDIA` CMake flag is required,
since devices are clustered based on the device vendor ID and an index pattern
cannot be used to distinguish between an Intel GPU and an Nvidia GPU. However,
Intel is working on restructuring engine creation so that the engine kind and
vendor kind can be chosen at runtime. It is also possible to create oneDNN
engines using `sycl::device` objects corresponding to Nvidia GPUs. The stream in
the Nvidia backend for oneDNN defines an out-of-order SYCL queue by default. As
with the existing oneDNN API, the user can specify an in-order queue when
creating a stream if needed, as sketched below.
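For illustration, a minimal sketch of creating an engine and stream that map to an Nvidia GPU when the library is built with `DNNL_GPU_VENDOR=NVIDIA` (the zero-based device index is an assumption; the standard oneDNN C++ API is used):
```cpp
#include "dnnl.hpp"

int main() {
    // With DNNL_GPU_VENDOR=NVIDIA, GPU engines enumerate Nvidia devices.
    dnnl::engine eng(dnnl::engine::kind::gpu, 0);

    // The default stream maps to an out-of-order SYCL queue; an in-order
    // queue can be requested explicitly via the stream flags.
    dnnl::stream in_order_stream(eng, dnnl::stream::flags::in_order);
    dnnl::stream default_stream(eng); // out-of-order by default
    return 0;
}
```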
## Build command
```bash
export CC=/path/to/dpcpp/install/bin/clang
export CXX=/path/to/dpcpp/install/bin/clang++
mkdir build
cd build
cmake -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP \
-DDNNL_GPU_VENDOR=NVIDIA -G Ninja \
-DOPENCLROOT=/path/to/the/root/folder/of/libOpenCL.so ..
```
## Memory
Currently, only the buffer-based oneDNN API is supported for the Nvidia backend.
## Supported Data Types
The following table documents the supported data types.
| Data Type | Computation Mode |
|-----------|-----------------------------|
| f32 | Training, Inference |
| f16 | Inference |
| s8 | Inference (when applicable) |
## Supported Primitives and Implementation Limitations
cuDNN functions do not map one-to-one onto oneDNN primitives due to the lack of
a standard DNN API. For each primitive, the equivalent cuDNN function is added
to the Nvidia backend for oneDNN. However, the added backend cannot provide all
of the functionality supported by oneDNN primitives. The detailed limitations of
each cuDNN-backed primitive are explained as follows.
### Batch normalization
The closest equivalents to oneDNN batch normalization are the
`cudnnBatchNormalizationForward` and `cudnnBatchNormalizationBackward`
operations. However, there are some differences between cuDNN and oneDNN batch
normalization.
#### Forward direction
* When the `global_stats` flag is set for batch normalization, the mean and variance are input-only parameters. However, cuDNN does not have an option to accept the mean and variance as inputs in the forward training operation. Therefore, `cudnnBatchNormalizationForwardInference` is used to match the oneDNN behavior, although inference is not supported without the `global_stats` flag set.
* The cuDNN precision is different from that of oneDNN for batch normalization (e.g., `fp:0.0170898 dt:0.0170907 diff:8.27014e-07 rdiff:4.83922e-05`).
* Forward training with no flags accepts the mean and variance as outputs. However, in cuDNN the mean and variance are a running mean and variance respectively, so they are both input and output variables. Therefore, they must hold sensible values (cannot be NaN). Since oneDNN does not set values for the mean and variance when no flag is passed, NaN can be propagated as a result. To avoid NaN propagation, the `cudaMemset` function is used to initialize the mean and variance with zero.
* cuDNN always requires the values for scale and shift. When shift and scale are
not defined in oneDNN, `cudaMemset` is used to initialize scale to 1 and shift
to 0.
* For performance reasons in the backward pass, cuDNN requires the mean and inverse variance to be saved in the forward pass. Therefore, when the Nvidia backend is used for batch normalization, a workspace must be provided to save the mean and inverse variance (a sketch of the resulting workspace size is shown after this list).
* When `dnnl_fuse_norm_relu` flag is set for batch normalization, the
`cudnnActivationForward` operation is called immediately after the batch
normalization, since cuDNN does not have a fused batch normalization with
`RELU`. The implementation for element-wise post operations is the same.
* When `dnnl_fuse_norm_relu` is used, the intermediate output of batch normalization, which is used as an input to the activation function, is saved in the workspace as well. This is required to compute the backward pass for the `dnnl_fuse_norm_relu` flag.
* The forward pass supports f32, f16 and s8 data types, although blocking is not supported for s8.
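A minimal sketch of the workspace size implied by the notes above (the helper name is an assumption for illustration; the workspace holds the saved mean, the saved inverse variance, and, when `dnnl_fuse_norm_relu` is set, the intermediate batch normalization output):
```cpp
#include <cstddef>

// Hypothetical helper: bytes needed for the batch normalization workspace.
std::size_t bnorm_workspace_bytes(std::size_t nelems, std::size_t channels,
        std::size_t dt_size, bool fuse_norm_relu) {
    std::size_t mean_bytes = channels * dt_size; // saved mean
    std::size_t inv_var_bytes = channels * dt_size; // saved inverse variance
    // Pre-ReLU output, kept only when batch normalization is fused with ReLU.
    std::size_t y_prime_bytes = fuse_norm_relu ? nelems * dt_size : 0;
    return mean_bytes + inv_var_bytes + y_prime_bytes;
}
```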
#### Backward direction
* cuDNN uses `alpha` and `beta` parameters to blend the `dy`, `shift` and
`scale`. Since oneDNN does not have this feature, the `alpha` and `beta`
values in the backward direction are set to 1 and 0 respectively to avoid
blending.
* The Nvidia backend requires the workspace as an input to the backward direction, containing the mean and inverse variance computed in the forward pass.
* The Nvidia backend for oneDNN does not support the backward direction for batch normalization when the `global_stats` flag is set. This is due to the fact that oneDNN skips the
<p align="center">
<img src="https://render.githubusercontent.com/render/math?math=$d_{y} -= \left ( \frac{\beta + \left ( \frac{src-mean}{\sqrt{\delta ^{2} + \epsilon }} \right )}{NHW} \right )$" >
</p>
term, since the mean and variance are constant, whereas cuDNN does not have an option to skip this operation.
* When the `dnnl_fuse_norm_relu` flag is set, the Nvidia backend requires the intermediate result of the batch normalization saved in the forward pass. This is used to compute the backward direction of the `RELU` activation function.
### Binary
The `cudnnOpTensor` operation is the equivalent of the oneDNN binary primitive.
* Only the scales attribute is supported; the post-ops attribute is not supported.
* Blocking is only supported for `int8` and only in the C dimension with either
4 or 32 block size (same as other cuDNN primitives).
### Concat
The concat operation uses the reorder primitive to concatenate tensors over the
chosen dimension, so the same limitation as reorder applies here.
### Convolution
The `cudnnConvolutionForward`, `cudnnConvolutionBackwardData` and
`cudnnConvolutionBackwardFilter` functions are used to compute the forward,
backward by data, and backward by weights convolution operations, respectively.
* Blocking is only supported for `int8` and only in the C dimension with block
size of 4. Input and output tensors must have the same data type.
* For int8 (s8s8s8) with post-ops the operations are performed as s8s8f32 (due
to cuDNN limitations) then reordered to `s8` at the end which impacts
performance.
* Direct convolution is not supported, so implicit GEMM is used in those cases.
* "Left" padding must be greater or equal to "right" padding, and the requested
spatial output should match the output formula for two "left" padding used.
* Eltwise post-op limitations are the same as our eltwise limitation as post-ops
are not fused.
* cuDNN requires padding tensors to 4 dimensions, so 1D convolutions are
supported but are performed as 2D.
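The padding rule above can be checked per spatial dimension as in the following sketch (a hypothetical helper; oneDNN's zero-based dilation convention is assumed):
```cpp
// Hypothetical check of the cuDNN-compatible padding rule for one spatial dim.
bool conv_padding_supported(
        int in, int ker, int stride, int dil, int pad_l, int pad_r, int out) {
    if (pad_l < pad_r) return false; // "left" padding must be >= "right"
    int ker_ext = (ker - 1) * (dil + 1) + 1; // dilated kernel extent
    // Output implied by using the "left" padding on both sides.
    int out_with_left_pad = (in + 2 * pad_l - ker_ext) / stride + 1;
    return out == out_with_left_pad;
}
```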
The following table shows the convolution status for the oneDNN Nvidia backend:
#### Forward direction
| Weights Format | Winograd Supported | Supported Input Format | Supported Output Format | Supported Data Type | Limitations |
|----------------|--------------------|------------------------|-------------------------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| 2D NCHW | YES | NCHW, NHWC | NCHW, NHWC | f32, f16 | The Winograd algorithm has limitations: <br> * Filter size must be 3x3 or 5x5. <br> * Dilation must be zero for all dimensions. <br> * Horizontal and vertical filter stride must be 1. |
| 2D NHWC | NO | NHWC | NHWC | f32, f16, int8 | * Dilation must be zero in all dimensions. <br> * Output feature maps must be multiple of 4 for `int8` type. |
| 3D NCHW | NO | NCHW, NHWC | NCHW, NHWC | f32, f16 | |
#### Backward direction
| Weights Format | Winograd Supported | Supported Input Format | Supported Output Format | Supported Data Type | Limitations |
|----------------|--------------------|------------------------|-------------------------|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| 2D NCHW | YES | NCHW, NHWC | NCHW | f32, f16 | 1. Dilation must be zero for all dimensions. <br> 2. The Winograd algorithm has limitations: <br> * Filter size must be 3x3 or 5x5. <br> * Dilation must be zero for all dimensions. <br> * Horizontal and vertical filter stride must be 1. |
| 2D NHWC | NO | NHWC | NHWC | f32, f16 | |
| 3D NCHW | NO | NCHW, NHWC | NCHW | f32, f16 | |
### Deconvolution
The deconvolution primitive is implemented through convolution with swapped
input and output channels.
* Currently, there is a bug, likely in this code, which causes crashes in memory_tracking for 3D backward_weights with bias when backward_weights without bias is also part of the run. Cache interrogation is suspected, since cache-free runs succeed. This case is switched off in benchdnn until further investigation and a fix.
### Eltwise
The `cudnnActivationForward` and `cudnnActivationBackward` operations are the
equivalents of eltwise forward and eltwise backward in oneDNN respectively.
There are some limitations when using the Nvidia backend for the eltwise
primitive:
* cuDNN only supports the following operations: `RELU`, `ELU`, `TANH`, `LOGISTIC` and `BRELU` (a sketch of this mapping is shown after this list).
* `RELU` is only supported with alpha = 0.
* cuDNN expects `x`, `y` and `dy` as inputs to the backward pass, hence, only
`RELU` and `BRELU` operations are supported in the backward pass.
TODO: add `ELU_DST`, `TANH_DST` and `LOGISTIC_DST` support which require `dy`.
* The forward pass supports `f32`, `f16` and `s8` data types, although blocking is not supported for `s8`.
* Backward pass supports `f32` and `f16` data types.
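A sketch of the mapping implied by the list above (an illustrative helper; the oneDNN algorithm kinds and cuDNN activation modes are real enumerators, the mapping itself is an assumption based on this document):
```cpp
#include <stdexcept>
#include <cudnn.h>
#include "dnnl.hpp"

// Hypothetical mapping from supported oneDNN eltwise kinds to cuDNN modes.
inline cudnnActivationMode_t to_cudnn_activation(dnnl::algorithm alg) {
    switch (alg) {
        case dnnl::algorithm::eltwise_relu: return CUDNN_ACTIVATION_RELU;
        case dnnl::algorithm::eltwise_bounded_relu:
            return CUDNN_ACTIVATION_CLIPPED_RELU;
        case dnnl::algorithm::eltwise_elu: return CUDNN_ACTIVATION_ELU;
        case dnnl::algorithm::eltwise_tanh: return CUDNN_ACTIVATION_TANH;
        case dnnl::algorithm::eltwise_logistic:
            return CUDNN_ACTIVATION_SIGMOID;
        default: throw std::runtime_error("unsupported eltwise algorithm");
    }
}
```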
### Inner product
The inner product primitive is an implementation of matrix multiplication plus
bias activation. There are two implementations of inner product in the cuDNN
backend.
#### Using GEMM
The default implementation of inner product is the GEMM backend, which uses
`cublasGemmEx` for forward, backward data, and backward weights, and
`cudnnReduceTensor` for backward bias. The functions `gemm_consitency_check()`
and `dense_check()` are used to see whether the GEMM backend can be used for
inner product, and `reorder_check()` is used when a reorder is required. If none
of these conditions are met, the implementation falls back to the convolution
backend. The `cudnnActivationForward` operation is used for the eltwise
operation and `cudnnAddTensor` is used for the bias operation. The `beta`
parameter in GEMM is used for the sum scale and the `alpha` parameter is used
for the output scale, as sketched below.
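A hedged sketch of how those scales map onto the GEMM call (descriptor setup and data types are simplified to `f32`; the wrapper name is an assumption, only the roles of `alpha` and `beta` follow the text above; the cuBLAS 10.x signature with `cudaDataType` compute type is assumed):
```cpp
#include <cublas_v2.h>

// Hypothetical f32 GEMM: C = alpha * A * B + beta * C, where alpha carries
// the oneDNN output scale and beta carries the sum post-op scale.
cublasStatus_t gemm_with_scales(cublasHandle_t handle, int m, int n, int k,
        const float *A, int lda, const float *B, int ldb, float *C, int ldc,
        float output_scale, float sum_scale) {
    float alpha = output_scale, beta = sum_scale;
    return cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
            A, CUDA_R_32F, lda, B, CUDA_R_32F, ldb, &beta, C, CUDA_R_32F, ldc,
            CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
}
```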
#### Using convolution
For the forward direction, this operation can be implemented by converting the
inner product to a `1x1` convolution and using
`cudnnConvolutionBiasActivationForward`. For the backward direction, the inner
product operation is equivalent to `cudnnConvolutionBackwardData`,
`cudnnConvolutionBackwardFilter` and `cudnnConvolutionBackwardBias` where
applicable. This implementation of inner product has the following restrictions
and performance implications:
* The only blocked layouts are those supported in cuDNN, namely blocking on the C dimension with a block size of 4, and only for `int8` inference. An additional requirement is that both the input and the filter must be blocked.
* `ReLU` and sum are supported as fused post-ops; for other post-ops a separate call to the eltwise primitive is performed, so the eltwise primitive limitations apply here.
* Only `mask = 0` case is supported for output scale.
* The restrictions for the convolution primitive are applied here for input and
filter format. When required, the filter is internally reordered to match the
convolution restriction.
* For `int8` cuDNN requires both input and output feature maps to be a multiple
of 4.
### LRN
The local response normalization primitive in the Nvidia backend is implemented
with the `cudnnLRNCrossChannelForward` and `cudnnLRNCrossChannelBackward`
functions for forward and backward propagation respectively.
* The `WITHIN` algorithm is not supported.
* There is a difference in the LRN algorithm used in oneDNN and cuDNN which causes a mismatch when the local size is even.
* cuDNN supports NCHW tensor formats for all valid dimensions. However, it does not support the NHWC tensor format for more than 5 dimensions.
### Matrix Multiplication
The matrix multiplication primitive in the Nvidia backend is implemented with
`cublasGemmEx` and `cublasGemmStridedBatchedEx` functions.
* Zero points support is not provided by cuBLAS and, hence, not supported by the
Nvidia backend.
* Post-ops and output scale limitations are the same as for the inner product primitive.
### Pooling
The pooling primitive in the Nvidia backend is implemented with the
`cudnnPoolingForward` and `cudnnPoolingBackward` functions for forward and
backward propagation respectively.
* cuDNN only allows the use of symmetric padding, i.e. padding at the beginning
of a dimension must be the same as the padding at the end of that dimension.
oneDNN doesn't have this limitation. Therefore,
- Configurations where padding in the beginning is larger than padding at
the end are supported and work as expected.
- For configurations where padding at the end is larger than padding in the
beginning of any dimension, the primitive returns `status::unimplemented`.
* For backward propagation cuDNN requires the parameters `x`, `y`, `dx` and
`dy`, while oneDNN requires only `dx`, `dy` and workspace when the `MAX`
algorithm is used. Hence, the workspace is used to store the `x` and `y`
parameters in the forward pass for the Nvidia backend. Therefore, the
workspace is always required when the Nvidia backend is used (except for the
forward inference).
### Reorder
The `cudnnTransformTensor` function is the equivalent of the oneDNN reorder
function. However, there are some limitations when using the oneDNN reorder on
Nvidia GPUs:
* Per-dimension scaling is not supported (a single `alpha` and `beta` value is accepted by the transform tensor function, as sketched after this list).
* Blocking is only permitted for the channel dimension in cuDNN. This primitive
currently supports block size of 4.
* Blocking is only supported when channel dimension is a multiple of the block
size and the datatype is `int8`.
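A sketch of the single-scale transform mentioned above (descriptor creation is omitted; the wrapper name is an assumption, while the cuDNN call and its single `alpha`/`beta` pair are the real API):
```cpp
#include <cudnn.h>

// Hypothetical reorder: y = alpha * transform(x) + beta * y. cuDNN accepts a
// single alpha and beta for the whole tensor, so per-dimension scaling cannot
// be expressed here.
cudnnStatus_t reorder_with_scale(cudnnHandle_t handle,
        cudnnTensorDescriptor_t x_desc, const void *x,
        cudnnTensorDescriptor_t y_desc, void *y, float alpha, float beta) {
    return cudnnTransformTensor(handle, &alpha, x_desc, x, &beta, y_desc, y);
}
```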
### Resampling
The `cudnnSpatialTfSamplerForward` and `cudnnSpatialTfSamplerBackward` are used
to implement the resampling primitive.
Nvidia's spatial sampling is based on the
[Spatial Transformer Networks](https://papers.nips.cc/paper/5854-spatial-transformer-networks.pdf)
paper, where all data locations are normalized to `-1 <= (xi, yi) <= 1`.
* The cuDNN backend requires a grid of coordinates that can be sampled up/down based on `theta`. The grid is generated by `cudnnSpatialTfGridGeneratorForward`.
* `theta` is an `MB * 2 * 3` matrix of scaling factors for each coordinate and is used to generate the grid.
* The grid values must be normalized to the range `[-1, 1]`. cuDNN clamps out-of-bounds coordinates to zero, so out-of-bound coordinates must be manually clamped to the edges to avoid incorrect results (see the sketch after this list).
* 3D spatial sampling is not supported in cuDNN.
* `Nearest neighbour` algorithm is not supported in cuDNN.
* Since the cuDNN computation is different from that of oneDNN, the error threshold is smaller than for other oneDNN implementations, so reduced testing accuracy is required for the `fp32` and `fp16` data types.
* The backward pass requires an output parameter for `d_grid` which cannot be
`nullptr`. However, since the grid coordinates are not a tunable parameter in
oneDNN, a dummy memory for `d_grid` is created and is deleted when the
destructor of the primitive is called.
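The manual clamping described above amounts to the following sketch (a hypothetical helper applied to the generated grid before `cudnnSpatialTfSamplerForward` consumes it):
```cpp
#include <algorithm>
#include <cstddef>

// Hypothetical post-processing of the generated grid: clamp out-of-bound
// normalized coordinates to the edges instead of letting cuDNN zero them.
void clamp_grid_to_edges(float *grid, std::size_t n_coords) {
    for (std::size_t i = 0; i < n_coords; ++i)
        grid[i] = std::min(1.f, std::max(-1.f, grid[i]));
}
```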
### Softmax/LogSoftmax
The `cudnnSoftmaxForward` and `cudnnSoftmaxBackward` are used to implement the
softmax primitive. For logsoftmax primitive the same functions will be used and
the algorithm selection in cuDNN for the above mentioned functions will be
changed to `CUDNN_SOFTMAX_LOG`.
* The softmax axis is supported only for the channel dimension (i.e., axis=1).
* There is a bug in cuDNN softmax for 5D tensors with the `NHWC` format: when the channel size is greater than 1, it only applies softmax to a single channel and leaves the others untouched.
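A sketch of the forward call selection described above (descriptor setup is omitted; the wrapper name is an assumption, while the cuDNN enumerators and call are the real API):
```cpp
#include <cudnn.h>

// Hypothetical forward call shared by softmax and logsoftmax (axis = 1).
cudnnStatus_t softmax_fwd(cudnnHandle_t handle, bool is_logsoftmax,
        cudnnTensorDescriptor_t x_desc, const void *x,
        cudnnTensorDescriptor_t y_desc, void *y) {
    float alpha = 1.f, beta = 0.f;
    cudnnSoftmaxAlgorithm_t algo
            = is_logsoftmax ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
    return cudnnSoftmaxForward(handle, algo, CUDNN_SOFTMAX_MODE_CHANNEL,
            &alpha, x_desc, x, &beta, y_desc, y);
}
```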
### Sum
The sum operation uses the reorder primitive to sum tensors, so the same
limitation as reorder applies here.
### Other primitives
The remaining primitives not listed above are not supported by the Nvidia
backend. This is likely due either to missing functionality in cuDNN or cuBLAS,
or to a lack of priority in supporting such functionality.

View File

@ -0,0 +1,38 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/cudnn_batch_normalization.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
status_t cudnn_batch_normalization_fwd_t::execute(const exec_ctx_t &ctx) const {
return cudnn_batch_normalization_common_t::execute(
ctx, ctx.stream()->engine(), pd());
}
status_t cudnn_batch_normalization_bwd_t::execute(const exec_ctx_t &ctx) const {
return cudnn_batch_normalization_common_t::execute(
ctx, ctx.stream()->engine(), pd());
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,198 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_BATCH_NORMALIZATION_HPP
#define GPU_NVIDIA_CUDNN_BATCH_NORMALIZATION_HPP
#include <cudnn.h>
#include <CL/sycl.hpp>
#include "common/batch_normalization_pd.hpp"
#include "common/c_types_map.hpp"
#include "common/primitive.hpp"
#include "common/type_helpers.hpp"
#include "gpu/nvidia/cudnn_batch_normalization_executor.hpp"
#include "gpu/nvidia/cudnn_batch_normalization_impl.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_batch_normalization_common_t {
template <typename pd_t>
static status_t execute(
const exec_ctx_t &ctx, engine_t *engine, const pd_t *pd) {
if (memory_desc_wrapper(pd->src_md()).has_zero_dim())
return status::success;
return pd->executor_->execute(ctx, engine, pd->bnorm_impl_);
}
template <typename pd_t>
static void init_ws(const pd_t *pd, memory_desc_t &ws_md) {
const auto wrap = memory_desc_wrapper(pd->src_md());
const auto y_size = wrap.nelems();
const size_t mean_invvar_size = 2 * pd->C();
const dims_t ws_size
= {(dim_t)(y_size * pd->fuse_norm_relu() + mean_invvar_size)};
dnnl_memory_desc_init_by_tag(
&ws_md, 1, ws_size, wrap.data_type(), format_tag::x);
}
};
struct cudnn_batch_normalization_fwd_t : public primitive_t {
struct pd_t : public batch_normalization_fwd_pd_t {
pd_t(const batch_normalization_desc_t *adesc,
const primitive_attr_t *attr,
const batch_normalization_fwd_pd_t *hint_fwd_pd)
: batch_normalization_fwd_pd_t(adesc, attr, hint_fwd_pd) {}
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_batch_normalization_fwd_t);
status_t init(engine_t *) {
using namespace data_type;
using namespace types;
auto src_dt = src_md()->data_type;
const auto attr_skip_mask = primitive_attr_t::skip_mask_t::post_ops;
bool ok = true && is_fwd() && utils::one_of(src_dt, f16, f32, s8)
&& attr()->has_default_values(attr_skip_mask)
&& IMPLICATION(!attr()->has_default_values(),
attr()->post_ops_.len() == 1 && with_relu_post_op())
&& IMPLICATION(utils::one_of(src_dt, s8, f16),
!is_training() && stats_is_src())
&& src_md()->format_desc.blocking.inner_nblks == 0;
if (!ok) return status::unimplemented;
if (is_training()) {
cudnn_batch_normalization_common_t::init_ws(this, ws_md_);
}
if (use_global_stats()) {
bnorm_impl_.reset(
new cudnn_batch_normalization_fwd_stats_impl_t());
} else {
bnorm_impl_.reset(new cudnn_batch_normalization_fwd_impl_t());
}
if (!is_training() && !use_global_stats() && !use_scaleshift()) {
executor_.reset(new bnorm_exec_fwd_inf_t());
} else if (!is_training() && use_scaleshift()
&& !use_global_stats()) {
executor_.reset(new bnorm_exec_fwd_inf_ss_t());
} else if (!use_scaleshift() && !use_global_stats()) {
executor_.reset(new bnorm_exec_fwd_t());
} else if (use_scaleshift() && !use_global_stats()) {
executor_.reset(new bnorm_exec_fwd_ss_t);
} else if (!use_scaleshift() && use_global_stats()) {
// Same for training and inference
executor_.reset(new bnorm_exec_fwd_inf_stats_t());
} else if (use_scaleshift() && use_global_stats()) {
// Same for training and inference
executor_.reset(new bnorm_exec_fwd_inf_ss_stats_t());
} else {
return status::unimplemented;
}
return bnorm_impl_->init(this);
}
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl_;
std::shared_ptr<bnorm_exec_base_t> executor_;
};
cudnn_batch_normalization_fwd_t(const pd_t *apd) : primitive_t(apd) {}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
struct cudnn_batch_normalization_bwd_t : public primitive_t {
struct pd_t : public batch_normalization_bwd_pd_t {
pd_t(const batch_normalization_desc_t *adesc,
const primitive_attr_t *attr,
const batch_normalization_fwd_pd_t *hint_fwd_pd)
: batch_normalization_bwd_pd_t(adesc, attr, hint_fwd_pd) {}
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_batch_normalization_bwd_t);
status_t init(engine_t *) {
using namespace data_type;
using namespace types;
bool ok = true && is_bwd() && set_default_formats_common()
&& IMPLICATION(
desc()->prop_kind == prop_kind::backward_data,
!use_scaleshift())
&& (utils::everyone_is(
f32, src_md()->data_type, diff_src_md()->data_type))
&& attr()->has_default_values() && !use_global_stats()
&& src_md()->format_desc.blocking.inner_nblks == 0
&& diff_src_md()->format_desc.blocking.inner_nblks == 0;
if (!ok) return status::unimplemented;
cudnn_batch_normalization_common_t::init_ws(this, ws_md_);
if (!compare_ws(hint_fwd_pd_)) return status::unimplemented;
if (fuse_norm_relu()) {
bnorm_impl_.reset(
new cudnn_batch_normalization_bwd_relu_impl_t());
} else {
bnorm_impl_.reset(new cudnn_batch_normalization_bwd_impl_t());
}
bool is_bwd_d = desc()->prop_kind == prop_kind::backward_data;
if (!is_bwd_d && use_scaleshift() && !use_global_stats()) {
executor_.reset(new bnorm_exec_bwd_dw_ss_t);
} else if (is_bwd_d && use_scaleshift() && !use_global_stats()) {
executor_.reset(new bnorm_exec_bwd_d_ss_t);
} else if (!use_scaleshift() && !use_global_stats()) {
// Same for bwd_d and bwd_dw
executor_.reset(new bnorm_exec_bwd_t());
} else {
return status::unimplemented;
}
return bnorm_impl_->init(this);
}
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl_;
std::shared_ptr<bnorm_exec_base_t> executor_;
};
cudnn_batch_normalization_bwd_t(const pd_t *apd) : primitive_t(apd) {}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,549 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_BATCH_NORMALIZATION_EXECUTOR_HPP
#define GPU_NVIDIA_CUDNN_BATCH_NORMALIZATION_EXECUTOR_HPP
#include "common/batch_normalization_pd.hpp"
#include "common/c_types_map.hpp"
#include "common/primitive.hpp"
#include "common/type_helpers.hpp"
#include "gpu/nvidia/cudnn_batch_normalization_impl.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct bnorm_exec_base_t {
virtual status_t execute(const exec_ctx_t &ctx, engine_t *engine,
const std::shared_ptr<cudnn_batch_normalization_impl_base_t>
bnorm_impl) const = 0;
protected:
template <typename T, cl::sycl::access::mode md, typename sc_t>
void *mean_var_ptr(cl::sycl::accessor<T, 1, md> acc, sc_t &sc,
const cl::sycl::interop_handler &ih) const {
return sc.template memory<void *>(ih, acc);
}
template <typename sc_t>
std::nullptr_t mean_var_ptr(std::nullptr_t acc, sc_t &,
const cl::sycl::interop_handler &ih) const {
return acc;
}
template <typename read_acc_t, typename write_acc_t, typename wkspace_st_t,
typename float_acc_t, typename maybe_nullptr_t>
void interop_task_fwd(
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl,
engine_t *engine, cl::sycl::handler &cgh,
nvidia::sycl_cuda_stream_t *cuda_stream, read_acc_t src_acc,
write_acc_t dst_acc, maybe_nullptr_t mean_acc,
maybe_nullptr_t var_acc, float_acc_t scale_acc,
float_acc_t bias_acc, wkspace_st_t wkspace_st, bool init_ss,
bool init_mean_var) const {
std::shared_ptr<
cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write>>
wkspace_acc;
if (!wkspace_st->is_null()) {
wkspace_acc.reset(new cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::write>(
utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
wkspace_st)
->buffer()
.template get_access<cl::sycl::access::mode::write>(
cgh)));
}
maybe_init_mean_var(cuda_stream, mean_acc, var_acc, init_mean_var);
maybe_init_ss(cuda_stream, scale_acc, bias_acc, init_ss);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
auto x = sc.memory<void *>(ih, src_acc);
auto y = sc.memory<void *>(ih, dst_acc);
auto mean = mean_var_ptr(mean_acc, sc, ih);
auto var = mean_var_ptr(var_acc, sc, ih);
auto scale = sc.memory<float *>(ih, scale_acc);
auto bias = sc.memory<float *>(ih, bias_acc) + bnorm_impl->C();
uint8_t *y_prime = nullptr, *save_mean = nullptr,
*save_var = nullptr;
if (!wkspace_st->is_null()) {
save_mean = sc.memory<uint8_t *>(ih, *wkspace_acc);
save_var = save_mean + bnorm_impl->mean_var_size_bytes();
y_prime = save_var + bnorm_impl->mean_var_size_bytes();
}
std::shared_ptr<bnorm_args_t> args(new bnorm_fwd_args_t(x, y, mean,
var, scale, bias, y_prime, save_mean, save_var));
bnorm_impl->execute(handle, args);
});
}
template <typename read_acc_t, typename write_acc_t, typename ss_acc_t,
typename d_ss_acc_t>
void interop_task_bwd(
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl,
engine_t *engine, cl::sycl::handler &cgh,
nvidia::sycl_cuda_stream_t *cuda_stream, read_acc_t src_acc,
read_acc_t diff_dst_acc, write_acc_t diff_src_acc,
ss_acc_t scale_acc, ss_acc_t bias_acc,
d_ss_acc_t diff_scaleshift_acc, read_acc_t wkspace_acc,
std::shared_ptr<cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::global_buffer>>
temp_relu_output,
bool init_ss, bool init_mean_var) const {
maybe_init_ss(cuda_stream, scale_acc, bias_acc, init_ss);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
auto x = sc.memory<void *>(ih, src_acc);
auto dy = sc.memory<void *>(ih, diff_dst_acc);
auto dx = sc.memory<void *>(ih, diff_src_acc);
auto scale = sc.memory<uint8_t *>(ih, scale_acc);
auto bias = sc.memory<uint8_t *>(ih, bias_acc)
+ (bnorm_impl->C() * sizeof(float));
auto diff_scale = sc.memory<uint8_t *>(ih, diff_scaleshift_acc);
auto diff_bias = diff_scale + (bnorm_impl->C() * sizeof(float));
auto save_mean = sc.memory<uint8_t *>(ih, wkspace_acc);
auto save_var = save_mean + bnorm_impl->mean_var_size_bytes();
auto wkspace = save_var + bnorm_impl->mean_var_size_bytes();
auto relu_dy = bnorm_impl->fuse_norm_relu()
? sc.memory<void *>(ih, *temp_relu_output)
: nullptr;
std::shared_ptr<bnorm_args_t> args(
new bnorm_bwd_args_t(x, dx, dy, save_mean, save_var, scale,
bias, diff_scale, diff_bias, wkspace, relu_dy));
bnorm_impl->execute(handle, args);
});
}
template <typename T>
void maybe_init_ss(
nvidia::sycl_cuda_stream_t *cuda_stream, T, T, bool) const {}
template <typename T>
void maybe_init_ss(nvidia::sycl_cuda_stream_t *cuda_stream,
cl::sycl::accessor<T, 1, cl::sycl::access::mode::write> scale_acc,
cl::sycl::accessor<T, 1, cl::sycl::access::mode::write> bias_acc,
bool init_ss) const {
if (init_ss) {
constexpr T scale_val = 1, bias_val = 0;
cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
cgh.fill(scale_acc, scale_val);
});
cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
cgh.fill(bias_acc, bias_val);
});
}
}
// Handle the cases when mean and var are read-only accessors or nullptr
template <typename T>
void maybe_init_mean_var(
nvidia::sycl_cuda_stream_t *cuda_stream, T, T, bool) const {}
template <typename T>
void maybe_init_mean_var(nvidia::sycl_cuda_stream_t *cuda_stream,
cl::sycl::accessor<T, 1, cl::sycl::access::mode::write> mean_acc,
cl::sycl::accessor<T, 1, cl::sycl::access::mode::write> var_acc,
bool init_mean_var) const {
if (init_mean_var) {
constexpr T mean_var_val = 0;
cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
cgh.fill(mean_acc, mean_var_val);
});
cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
cgh.fill(var_acc, mean_var_val);
});
}
}
};
struct bnorm_exec_fwd_inf_t : public bnorm_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl)
const override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
auto wkspace_storage = bnorm_impl->is_training()
? ctx.output(DNNL_ARG_WORKSPACE)->memory_storage()
: &memory_storage_t::empty_storage();
auto n_channels = bnorm_impl->C();
cl::sycl::buffer<float> scaleshift_buff(n_channels * 2);
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto scale_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::write>(
cgh, n_channels, 0);
auto bias_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::write>(
cgh, n_channels, n_channels);
bool init_ss = true, init_mean_var = false;
interop_task_fwd(bnorm_impl, engine, cgh, cuda_stream, src_acc,
dst_acc, nullptr, nullptr, scale_acc, bias_acc,
wkspace_storage, init_ss, init_mean_var);
});
}
};
struct bnorm_exec_fwd_inf_ss_t : public bnorm_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl)
const override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
auto wkspace_storage = bnorm_impl->is_training()
? ctx.output(DNNL_ARG_WORKSPACE)->memory_storage()
: &memory_storage_t::empty_storage();
auto scaleshift_buff
= utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
&CTX_IN_STORAGE(DNNL_ARG_SCALE_SHIFT))
->buffer();
auto n_channels = bnorm_impl->C();
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto scale_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::read>(
cgh, n_channels, 0);
auto bias_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::read>(
cgh, n_channels, n_channels);
bool init_ss = false, init_mean_var = false;
interop_task_fwd(bnorm_impl, engine, cgh, cuda_stream, src_acc,
dst_acc, nullptr, nullptr, scale_acc, bias_acc,
wkspace_storage, init_ss, init_mean_var);
});
}
};
struct bnorm_exec_fwd_inf_stats_t : public bnorm_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl)
const override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
auto wkspace_storage = bnorm_impl->is_training()
? ctx.output(DNNL_ARG_WORKSPACE)->memory_storage()
: &memory_storage_t::empty_storage();
auto n_channels = bnorm_impl->C();
cl::sycl::buffer<float> scaleshift_buff(n_channels * 2);
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto mean_acc = CTX_IN_ACCESSOR(DNNL_ARG_MEAN);
auto var_acc = CTX_IN_ACCESSOR(DNNL_ARG_VARIANCE);
auto scale_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::write>(
cgh, n_channels, 0);
auto bias_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::write>(
cgh, n_channels, n_channels);
bool init_ss = true, init_mean_var = false;
interop_task_fwd(bnorm_impl, engine, cgh, cuda_stream, src_acc,
dst_acc, mean_acc, var_acc, scale_acc, bias_acc,
wkspace_storage, init_ss, init_mean_var);
});
}
};
struct bnorm_exec_fwd_inf_ss_stats_t : public bnorm_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl)
const override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
auto wkspace_storage = bnorm_impl->is_training()
? ctx.output(DNNL_ARG_WORKSPACE)->memory_storage()
: &memory_storage_t::empty_storage();
auto scaleshift_buff
= utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
&CTX_IN_STORAGE(DNNL_ARG_SCALE_SHIFT))
->buffer();
auto n_channels = bnorm_impl->C();
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto mean_acc = CTX_IN_ACCESSOR(DNNL_ARG_MEAN);
auto var_acc = CTX_IN_ACCESSOR(DNNL_ARG_VARIANCE);
auto scale_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::read>(
cgh, n_channels, 0);
auto bias_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::read>(
cgh, n_channels, n_channels);
bool init_ss = false, init_mean_var = false;
interop_task_fwd(bnorm_impl, engine, cgh, cuda_stream, src_acc,
dst_acc, mean_acc, var_acc, scale_acc, bias_acc,
wkspace_storage, init_ss, init_mean_var);
});
}
};
struct bnorm_exec_fwd_t : public bnorm_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl)
const override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
auto wkspace_storage = bnorm_impl->is_training()
? ctx.output(DNNL_ARG_WORKSPACE)->memory_storage()
: &memory_storage_t::empty_storage();
auto n_channels = bnorm_impl->C();
cl::sycl::buffer<float> scaleshift_buff(n_channels * 2);
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto mean_acc = CTX_OUT_ACCESSOR(DNNL_ARG_MEAN);
auto var_acc = CTX_OUT_ACCESSOR(DNNL_ARG_VARIANCE);
auto scale_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::write>(
cgh, n_channels, 0);
auto bias_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::write>(
cgh, n_channels, n_channels);
bool init_ss = true, init_mean_var = true;
interop_task_fwd(bnorm_impl, engine, cgh, cuda_stream, src_acc,
dst_acc, mean_acc, var_acc, scale_acc, bias_acc,
wkspace_storage, init_ss, init_mean_var);
});
}
};
struct bnorm_exec_fwd_ss_t : public bnorm_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl)
const override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
auto wkspace_storage = bnorm_impl->is_training()
? ctx.output(DNNL_ARG_WORKSPACE)->memory_storage()
: &memory_storage_t::empty_storage();
auto scaleshift_buff
= utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
&CTX_IN_STORAGE(DNNL_ARG_SCALE_SHIFT))
->buffer();
auto n_channels = bnorm_impl->C();
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto mean_acc = CTX_OUT_ACCESSOR(DNNL_ARG_MEAN);
auto var_acc = CTX_OUT_ACCESSOR(DNNL_ARG_VARIANCE);
auto scale_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::write>(
cgh, n_channels, 0);
auto bias_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::write>(
cgh, n_channels, n_channels);
bool init_ss = false, init_mean_var = true;
interop_task_fwd(bnorm_impl, engine, cgh, cuda_stream, src_acc,
dst_acc, mean_acc, var_acc, scale_acc, bias_acc,
wkspace_storage, init_ss, init_mean_var);
});
}
};
struct bnorm_exec_bwd_t : public bnorm_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl)
const override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
auto n_channels = bnorm_impl->C();
cl::sycl::buffer<float> scaleshift_buff(n_channels * 2);
cl::sycl::buffer<float> diff_scaleshift_buff(n_channels * 2);
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC);
auto wkspace_acc = CTX_IN_ACCESSOR(DNNL_ARG_WORKSPACE);
auto diff_scaleshift_acc
= diff_scaleshift_buff
.get_access<cl::sycl::access::mode::read>(cgh);
auto scale_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::write>(
cgh, n_channels, 0);
auto bias_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::write>(
cgh, n_channels, n_channels);
bool init_ss = true, init_mean_var = false;
std::shared_ptr<cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::global_buffer>>
temp_relu_output = nullptr;
if (bnorm_impl->fuse_norm_relu()) {
temp_relu_output = std::make_shared<cl::sycl::accessor<uint8_t,
1, cl::sycl::access::mode::read_write,
cl::sycl::access::target::global_buffer>>(
CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none));
}
interop_task_bwd(bnorm_impl, engine, cgh, cuda_stream, src_acc,
diff_dst_acc, diff_src_acc, scale_acc, bias_acc,
diff_scaleshift_acc, wkspace_acc, temp_relu_output, init_ss,
init_mean_var);
});
}
};
struct bnorm_exec_bwd_dw_ss_t : public bnorm_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl)
const override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
auto scaleshift_buff
= utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
&CTX_IN_STORAGE(DNNL_ARG_SCALE_SHIFT))
->buffer();
auto n_channels = bnorm_impl->C();
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC);
auto diff_scaleshift_acc
= CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SCALE_SHIFT);
auto scale_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::read>(
cgh, n_channels, 0);
auto bias_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::read>(
cgh, n_channels, n_channels);
auto wkspace_acc = CTX_IN_ACCESSOR(DNNL_ARG_WORKSPACE);
bool init_ss = false, init_mean_var = false;
std::shared_ptr<cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::global_buffer>>
temp_relu_output = nullptr;
if (bnorm_impl->fuse_norm_relu()) {
temp_relu_output = std::make_shared<cl::sycl::accessor<uint8_t,
1, cl::sycl::access::mode::read_write,
cl::sycl::access::target::global_buffer>>(
CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none));
}
interop_task_bwd(bnorm_impl, engine, cgh, cuda_stream, src_acc,
diff_dst_acc, diff_src_acc, scale_acc, bias_acc,
diff_scaleshift_acc, wkspace_acc, temp_relu_output, init_ss,
init_mean_var);
});
}
};
struct bnorm_exec_bwd_d_ss_t : public bnorm_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
std::shared_ptr<cudnn_batch_normalization_impl_base_t> bnorm_impl)
const override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
auto scaleshift_buff
= utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
&CTX_IN_STORAGE(DNNL_ARG_SCALE_SHIFT))
->buffer();
auto n_channels = bnorm_impl->C();
cl::sycl::buffer<float> diff_scaleshift_buff(n_channels * 2);
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC);
auto scale_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::read>(
cgh, n_channels, 0);
auto bias_acc
= scaleshift_buff.get_access<cl::sycl::access::mode::read>(
cgh, n_channels, n_channels);
auto diff_scaleshift_acc
= diff_scaleshift_buff
.get_access<cl::sycl::access::mode::read>(cgh);
auto wkspace_acc = CTX_IN_ACCESSOR(DNNL_ARG_WORKSPACE);
bool init_ss = false, init_mean_var = false;
std::shared_ptr<cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::global_buffer>>
temp_relu_output = nullptr;
if (bnorm_impl->fuse_norm_relu()) {
temp_relu_output = std::make_shared<cl::sycl::accessor<uint8_t,
1, cl::sycl::access::mode::read_write,
cl::sycl::access::target::global_buffer>>(
CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none));
}
interop_task_bwd(bnorm_impl, engine, cgh, cuda_stream, src_acc,
diff_dst_acc, diff_src_acc, scale_acc, bias_acc,
diff_scaleshift_acc, wkspace_acc, temp_relu_output, init_ss,
init_mean_var);
});
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,347 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_BATCH_NORMALIZATION_IMPL_HPP
#define GPU_NVIDIA_CUDNN_BATCH_NORMALIZATION_IMPL_HPP
#include <cudnn.h>
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct bnorm_args_t {
public:
bnorm_args_t(void *x, void *mean, void *var, void *scale, void *bias)
: x_(x), mean_(mean), var_(var), scale_(scale), bias_(bias) {}
void *x_, *mean_, *var_, *scale_, *bias_;
};
struct bnorm_fwd_args_t : public bnorm_args_t {
bnorm_fwd_args_t(void *x, void *y, void *mean, void *var, void *scale,
void *bias, void *y_prime, void *save_mean, void *save_var)
: bnorm_args_t::bnorm_args_t(x, mean, var, scale, bias)
, y_(y)
, y_prime_(y_prime)
, save_mean_(save_mean)
, save_var_(save_var) {}
void *y_, *y_prime_, *save_mean_, *save_var_;
};
struct bnorm_bwd_args_t : public bnorm_args_t {
bnorm_bwd_args_t(void *x, void *dx, void *dy, void *mean, void *var,
void *scale, void *bias, void *diff_scale, void *diff_bias,
void *wkspace, void *relu_dx)
: bnorm_args_t(x, mean, var, scale, bias)
, dx_(dx)
, dy_(dy)
, diff_scale_(diff_scale)
, diff_bias_(diff_bias)
, wkspace_(wkspace)
, relu_dx_(relu_dx) {}
void *dx_, *dy_, *diff_scale_, *diff_bias_, *wkspace_, *relu_dx_;
};
struct cudnn_batch_normalization_impl_base_t {
virtual ~cudnn_batch_normalization_impl_base_t() {
for (size_t i = 0; i < NUM_IO; ++i) {
if (tensor_descs_[i]) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyTensorDescriptor, tensor_descs_[i]);
}
}
if ((fuse_norm_relu_ || with_relu_postop_) && act_desc_) {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyActivationDescriptor, act_desc_);
}
}
virtual status_t init(batch_normalization_pd_t *pd) = 0;
virtual void execute(
cudnnHandle_t handle, std::shared_ptr<bnorm_args_t> args) const = 0;
bool is_bwd_d() const { return is_bwd_data_; }
bool is_training() const { return is_training_; }
bool fuse_norm_relu() const { return fuse_norm_relu_; }
std::size_t dt_size() const { return dt_size_; }
std::size_t mean_var_size_bytes() { return mean_var_size_bytes_; }
uint8_t default_mean_var() const { return 0; }
int C() const { return nchannels_; }
protected:
status_t init_common(batch_normalization_pd_t *pd) {
ndims_ = pd->ndims() < 4 ? 4 : pd->ndims();
if (ndims_ > 5) { return status::invalid_arguments; }
memory_desc_wrapper wrap(pd->src_md());
fuse_norm_relu_ = pd->fuse_norm_relu();
is_training_ = pd->is_training();
with_global_stats_ = pd->use_global_stats();
is_bwd_data_ = pd->desc()->prop_kind == prop_kind::backward_data;
dt_size_ = types::data_type_size(wrap.data_type());
nchannels_ = pd->C();
mean_var_size_bytes_ = nchannels_ * dt_size_;
eps_ = pd->desc()->batch_norm_epsilon;
y_prime_size_ = wrap.nelems() * dt_size_;
with_relu_postop_ = pd->with_relu_post_op();
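        // cuDNN updates the running variance with the unbiased (n - 1)
        // estimator, whereas oneDNN reports the biased (population) estimate;
        // the (n - 1) / n factor below converts between the two (see
        // to_population_variance()).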
auto n = static_cast<float>(pd->MB() * pd->D() * pd->H() * pd->W());
var_scaling_factor_ = (n - 1.f) / n;
convert_dims(pd->src_md()->padded_dims, dims_[src], pd->ndims());
convert_dims(pd->src_md()->format_desc.blocking.strides, strides_[src],
pd->ndims());
CHECK(convert_data_type(pd->src_md(), &data_types_[src]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[src],
data_types_[src], ndims_, dims_[src], strides_[src]));
CHECK(create_and_set_scaleshift_desc());
if (fuse_norm_relu_ || with_relu_postop_) {
CHECK(create_and_set_activation_desc());
}
return status::success;
}
virtual status_t create_and_set_scaleshift_desc() {
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnCreateTensorDescriptor, &tensor_descs_[scl]));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnDeriveBNTensorDescriptor,
tensor_descs_[scl], tensor_descs_[src], mode_));
return status::success;
}
virtual status_t create_and_set_activation_desc() {
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnCreateActivationDescriptor, &act_desc_));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, act_desc_,
CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, relu_coef_));
return status::success;
}
virtual status_t to_population_variance(
cudnnHandle_t handle, void *var) const {
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnScaleTensor, handle, tensor_descs_[scl],
var, &var_scaling_factor_));
return status::success;
}
enum io { src = 0, dst, scl, NUM_IO };
cudnnDataType_t data_types_[NUM_IO];
cudnnTensorDescriptor_t tensor_descs_[NUM_IO] = {};
cudnnActivationDescriptor_t act_desc_;
cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL;
int dims_[NUM_IO][DNNL_MAX_NDIMS];
int strides_[NUM_IO][DNNL_MAX_NDIMS];
int ndims_, nchannels_;
float alpha_ = 1.f, beta = 0.f;
double relu_coef_ = 0.0;
double factor_ = 1.0;
double eps_ = CUDNN_BN_MIN_EPSILON;
float var_scaling_factor_ = 0.f;
bool fuse_norm_relu_ = false;
bool with_relu_postop_ = false;
bool with_global_stats_ = false;
bool is_training_ = false;
bool is_bwd_data_ = false;
std::size_t y_prime_size_;
std::size_t dt_size_, mean_var_size_bytes_;
};
struct cudnn_batch_normalization_fwd_impl_t
: public cudnn_batch_normalization_impl_base_t {
using cudnn_batch_normalization_impl_base_t::
cudnn_batch_normalization_impl_base_t;
status_t init(batch_normalization_pd_t *pd) override {
init_common(pd);
convert_dims(pd->dst_md()->padded_dims, dims_[dst], pd->ndims());
convert_dims(pd->dst_md()->format_desc.blocking.strides, strides_[dst],
pd->ndims());
CHECK(convert_data_type(pd->dst_md(), &data_types_[dst]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[dst],
data_types_[dst], ndims_, dims_[dst], strides_[dst]));
return status::success;
}
void execute(cudnnHandle_t handle,
std::shared_ptr<bnorm_args_t> args) const override {
auto fwd_args = static_cast<bnorm_fwd_args_t *>(args.get());
CUDNN_EXECUTE_FUNC(cudnnBatchNormalizationForwardTraining, handle,
mode_, &alpha_, &beta, tensor_descs_[src], fwd_args->x_,
tensor_descs_[dst], fwd_args->y_, tensor_descs_[scl],
fwd_args->scale_, fwd_args->bias_, factor_, fwd_args->mean_,
fwd_args->var_, eps_, fwd_args->save_mean_,
fwd_args->save_var_);
if (is_training_) { to_population_variance(handle, fwd_args->var_); }
if (fuse_norm_relu_ || with_relu_postop_) { do_relu(handle, fwd_args); }
}
protected:
void do_relu(cudnnHandle_t handle, bnorm_fwd_args_t *fwd_args) const {
if (is_training_ && fuse_norm_relu_) {
// Copy the result to the workspace
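            // (the backward pass reads these pre-activation values back to
            // compute the ReLU gradient)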
CUDNN_EXECUTE_FUNC(cudnnAddTensor, handle, &alpha_,
tensor_descs_[dst], fwd_args->y_, &beta, tensor_descs_[dst],
fwd_args->y_prime_);
}
CUDNN_EXECUTE_FUNC(cudnnActivationForward, handle, act_desc_, &alpha_,
tensor_descs_[dst], fwd_args->y_, &beta, tensor_descs_[dst],
fwd_args->y_);
}
};
struct cudnn_batch_normalization_fwd_stats_impl_t
: public cudnn_batch_normalization_fwd_impl_t {
status_t init(batch_normalization_pd_t *pd) override {
return cudnn_batch_normalization_fwd_impl_t::init(pd);
}
void execute(cudnnHandle_t handle,
std::shared_ptr<bnorm_args_t> args) const override {
auto fwd_args = static_cast<bnorm_fwd_args_t *>(args.get());
CUDNN_EXECUTE_FUNC(cudnnBatchNormalizationForwardInference, handle,
mode_, &alpha_, &beta, tensor_descs_[src], fwd_args->x_,
tensor_descs_[dst], fwd_args->y_, tensor_descs_[scl],
fwd_args->scale_, fwd_args->bias_, fwd_args->mean_,
fwd_args->var_, eps_);
if (fuse_norm_relu_ || with_relu_postop_) { do_relu(handle, fwd_args); }
}
};
struct cudnn_batch_normalization_bwd_impl_t
: public cudnn_batch_normalization_impl_base_t {
status_t init(batch_normalization_pd_t *pd) override {
init_common(pd);
convert_dims(pd->diff_src_md()->padded_dims, diff_dims_[diff_src],
pd->ndims());
convert_dims(pd->diff_dst_md()->padded_dims, diff_dims_[diff_dst],
pd->ndims());
convert_dims(pd->diff_src_md()->format_desc.blocking.strides,
strides_[diff_src], pd->ndims());
convert_dims(pd->diff_dst_md()->format_desc.blocking.strides,
strides_[diff_dst], pd->ndims());
CHECK(convert_data_type(
pd->diff_src_md(), &diff_data_types_[diff_src]));
CHECK(convert_data_type(
pd->diff_dst_md(), &diff_data_types_[diff_dst]));
CHECK(create_and_set_tensor_descriptor(&diff_tensor_descs_[diff_src],
data_types_[diff_src], ndims_, diff_dims_[diff_src],
strides_[diff_src]));
CHECK(create_and_set_tensor_descriptor(&diff_tensor_descs_[diff_dst],
data_types_[diff_dst], ndims_, diff_dims_[diff_dst],
strides_[diff_dst]));
return status::success;
}
void execute(cudnnHandle_t handle,
std::shared_ptr<bnorm_args_t> args) const override {
auto bwd_args = static_cast<bnorm_bwd_args_t *>(args.get());
CUDNN_EXECUTE_FUNC(cudnnBatchNormalizationBackward, handle, mode_,
&a_data_diff_, &b_data_diff_, &a_param_diff_, &b_param_diff_,
tensor_descs_[src], bwd_args->x_, diff_tensor_descs_[diff_dst],
bwd_args->dy_, diff_tensor_descs_[diff_src], bwd_args->dx_,
tensor_descs_[scl], bwd_args->scale_, bwd_args->diff_scale_,
bwd_args->diff_bias_, eps_, bwd_args->mean_, bwd_args->var_);
}
~cudnn_batch_normalization_bwd_impl_t() {
for (size_t i = 0; i < NUM_DIFF; i++) {
if (diff_tensor_descs_[i]) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyTensorDescriptor, diff_tensor_descs_[i]);
}
}
}
protected:
const float a_data_diff_ = 1.f, b_data_diff_ = 0.f;
const float a_param_diff_ = 1.f, b_param_diff_ = 0.f;
enum diff_tensors { diff_src = 0, diff_dst, NUM_DIFF };
int diff_dims_[NUM_DIFF][DNNL_MAX_NDIMS];
cudnnTensorDescriptor_t diff_tensor_descs_[NUM_DIFF] = {};
cudnnDataType_t diff_data_types_[NUM_DIFF];
};
struct cudnn_batch_normalization_bwd_relu_impl_t
: public cudnn_batch_normalization_bwd_impl_t {
status_t init(batch_normalization_pd_t *pd) override {
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_none,
memory_desc_wrapper(pd->diff_dst_md()).size(), size_t(1));
return cudnn_batch_normalization_bwd_impl_t::init(pd);
}
void execute(cudnnHandle_t handle,
std::shared_ptr<bnorm_args_t> args) const override {
auto bwd_args = static_cast<bnorm_bwd_args_t *>(args.get());
CUDNN_EXECUTE_FUNC(cudnnActivationBackward, handle, act_desc_, &alpha_,
diff_tensor_descs_[dst], bwd_args->wkspace_,
diff_tensor_descs_[dst], bwd_args->dy_, diff_tensor_descs_[dst],
bwd_args->wkspace_, &beta, diff_tensor_descs_[dst],
bwd_args->relu_dx_);
CUDNN_EXECUTE_FUNC(cudnnBatchNormalizationBackward, handle, mode_,
&a_data_diff_, &b_data_diff_, &a_param_diff_, &b_param_diff_,
tensor_descs_[src], bwd_args->x_, diff_tensor_descs_[dst],
bwd_args->relu_dx_, diff_tensor_descs_[src], bwd_args->dx_,
tensor_descs_[scl], bwd_args->scale_, bwd_args->diff_scale_,
bwd_args->diff_bias_, eps_, bwd_args->mean_, bwd_args->var_);
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif
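
A minimal standalone sketch in plain C++ (independent of cuDNN and oneDNN; the values and batch size are illustrative only) of the variance rescaling performed by to_population_variance() above, assuming cuDNN's running variance holds the unbiased (n - 1) estimate: multiplying it by (n - 1) / n reproduces the biased population estimate that oneDNN reports.

#include <cstdio>
#include <vector>

int main() {
    // Illustrative reduction over N*D*H*W for a single channel.
    std::vector<float> x {1.f, 2.f, 3.f, 4.f};
    const float n = static_cast<float>(x.size());

    float mean = 0.f;
    for (float v : x) mean += v;
    mean /= n;

    float sum_sq = 0.f;
    for (float v : x) sum_sq += (v - mean) * (v - mean);

    const float unbiased = sum_sq / (n - 1.f); // assumed cuDNN running variance
    const float biased = sum_sq / n; // population variance expected by oneDNN
    const float rescaled = unbiased * ((n - 1.f) / n); // to_population_variance

    std::printf("unbiased=%f biased=%f rescaled=%f\n", unbiased, biased, rescaled);
    return 0;
}
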

View File

@ -0,0 +1,58 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/cudnn_binary.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "sycl/sycl_buffer_memory_storage.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
status_t cudnn_binary_t::execute(const exec_ctx_t &ctx) const {
if (memory_desc_wrapper(pd()->src_md(0)).has_zero_dim())
return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_0_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC_0);
auto src_1_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC_1);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
auto a = sc.memory<void *>(ih, src_0_acc);
auto b = sc.memory<void *>(ih, src_1_acc);
auto c = sc.memory<void *>(ih, dst_acc);
pd()->binary_impl_->execute(handle, a, b, c);
});
});
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,125 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_BINARY_HPP
#define GPU_NVIDIA_CUDNN_BINARY_HPP
#include "cudnn.h"
#include <CL/sycl.hpp>
#include "common/binary_pd.hpp"
#include "common/c_types_map.hpp"
#include "common/primitive.hpp"
#include "gpu/nvidia/cudnn_binary_impl.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_binary_t : public primitive_t {
struct pd_t : public binary_pd_t {
using binary_pd_t::binary_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_binary_t);
status_t init(engine_t *) {
using namespace data_type;
bool ok = (set_default_params() == status::success)
&& check_data_types() && check_no_blocking()
&& IMPLICATION(
utils::one_of(src_md(0)->data_type, f32, f16),
attr()->has_default_values())
&& IMPLICATION(utils::one_of(src_md(0)->data_type, s8),
attr()->has_default_values(
primitive_attr_t::skip_mask_t::scales))
&& IMPLICATION(!attr()->scales_.has_default_values(),
check_scales_mask());
if (!ok) return status::unimplemented;
if (check_for_zero_dims()) return status::success;
binary_impl_.reset(new cudnn_binary_impl_t());
return binary_impl_->init(this);
}
bool check_for_zero_dims() const {
return has_zero_dims(src_md(0)->dims, src_md(0)->ndims)
|| has_zero_dims(src_md(1)->dims, src_md(1)->ndims)
|| has_zero_dims(dst_md()->dims, dst_md()->ndims);
}
bool check_scales_mask() const {
for (const auto &s : attr()->scales_.scales_) {
if (s.second.mask_ != 0) return false;
}
return true;
}
bool check_no_blocking() const {
            // Blocking is not supported by cudnnOpTensor; return false if any
            // blocks are present.
return src_md(0)->format_desc.blocking.inner_nblks
+ src_md(1)->format_desc.blocking.inner_nblks
+ dst_md()->format_desc.blocking.inner_nblks
== 0;
}
bool check_data_types() const {
using namespace data_type;
bool inputs_same = src_md(0)->data_type == src_md(1)->data_type;
dnnl_data_type_t input_type = src_md(0)->data_type;
dnnl_data_type_t output_type = dst_md()->data_type;
switch (output_type) {
case f32:
return inputs_same
&& (input_type == f32 || input_type == s8
|| input_type == f16);
case f16:
return inputs_same
&& (input_type == f32 || input_type == f16);
case s8:
return inputs_same
&& (input_type == f32 || input_type == s8);
}
return false;
}
std::shared_ptr<cudnn_binary_impl_base_t> binary_impl_;
};
cudnn_binary_t(const pd_t *apd) : primitive_t(apd) {}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,143 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_BINARY_IMPL_HPP
#define GPU_NVIDIA_CUDNN_BINARY_IMPL_HPP
#include "cudnn.h"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_binary_impl_base_t {
enum io { src_0 = 0, src_1, dst_0, NUM_IO };
cudnnDataType_t data_types[NUM_IO];
int ndims;
int dims[NUM_IO][DNNL_MAX_NDIMS];
cudnnOpTensorDescriptor_t op_desc = nullptr;
cudnnTensorDescriptor_t tensor_descs[NUM_IO] = {};
cudnnOpTensorOp_t alg_kind;
float alpha[2];
float beta = 0.0f;
virtual ~cudnn_binary_impl_base_t() {
if (op_desc) {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyOpTensorDescriptor, op_desc);
}
for (size_t i = 0; i < NUM_IO; i++) {
if (tensor_descs[i]) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyTensorDescriptor, tensor_descs[i]);
}
}
}
virtual status_t init(const binary_pd_t *pd) = 0;
void execute(cudnnHandle_t handle, void *a, void *b, void *c) const {
CUDNN_EXECUTE_FUNC(cudnnOpTensor, handle, op_desc, &alpha[0],
tensor_descs[src_0], a, &alpha[1], tensor_descs[src_1], b,
&beta, tensor_descs[dst_0], c);
}
virtual status_t create_and_set_op_descriptor() {
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateOpTensorDescriptor, &op_desc));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetOpTensorDescriptor, op_desc,
alg_kind, cudnnDataType_t::CUDNN_DATA_FLOAT,
cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN));
return status::success;
}
status_t convert_alg_kind(
alg_kind_t alg_kind, cudnnOpTensorOp_t *cuda_alg_kind) const {
switch (alg_kind) {
case alg_kind::binary_add:
*cuda_alg_kind = cudnnOpTensorOp_t::CUDNN_OP_TENSOR_ADD;
break;
case alg_kind::binary_mul:
*cuda_alg_kind = cudnnOpTensorOp_t::CUDNN_OP_TENSOR_MUL;
break;
case alg_kind::binary_min:
*cuda_alg_kind = cudnnOpTensorOp_t::CUDNN_OP_TENSOR_MIN;
break;
case alg_kind::binary_max:
*cuda_alg_kind = cudnnOpTensorOp_t::CUDNN_OP_TENSOR_MAX;
break;
default: return status::unimplemented;
}
return status::success;
}
};
struct cudnn_binary_impl_t : public cudnn_binary_impl_base_t {
int strides[NUM_IO][DNNL_MAX_NDIMS];
status_t init(const binary_pd_t *pd) override {
// If any of the dimensions are 0 we should not continue with creating
// cudnn descriptors
if (has_zero_dims(pd->src_md(0)->dims, pd->ndims())) {
return status::success;
}
if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
ndims = pd->ndims() < 4 ? 4 : pd->ndims();
convert_dims(pd->src_md(0)->padded_dims, dims[src_0], pd->ndims());
convert_dims(pd->src_md(1)->padded_dims, dims[src_1], pd->ndims());
convert_dims(pd->dst_md()->padded_dims, dims[dst_0], pd->ndims());
convert_dims(pd->src_md(0)->format_desc.blocking.strides,
strides[src_0], pd->ndims());
convert_dims(pd->src_md(1)->format_desc.blocking.strides,
strides[src_1], pd->ndims());
convert_dims(pd->dst_md()->format_desc.blocking.strides, strides[dst_0],
pd->ndims());
alg_kind_t alg = pd->desc()->alg_kind;
auto alg_ok = convert_alg_kind(alg, &alg_kind);
if (alg_ok != status::success) { return status::unimplemented; }
CHECK(convert_data_type(pd->src_md(0), &data_types[src_0]));
CHECK(convert_data_type(pd->src_md(1), &data_types[src_1]));
CHECK(convert_data_type(pd->dst_md(), &data_types[dst_0]));
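        // cudnnOpTensor computes C = op(alpha[0] * A, alpha[1] * B) + beta * C,
        // so the per-tensor input scales (s8 only) map directly onto the alpha
        // factors below.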
bool do_scaling = pd->src_md(0)->data_type == dnnl_data_type_t::dnnl_s8;
auto scales_0 = pd->attr()->scales_.get(1).scales_;
auto scales_1 = pd->attr()->scales_.get(2).scales_;
alpha[0] = do_scaling ? scales_0[0] : 1.0f;
alpha[1] = do_scaling ? scales_1[0] : 1.0f;
CHECK(create_and_set_tensor_descriptor(&tensor_descs[src_0],
data_types[src_0], ndims, dims[src_0], strides[src_0]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs[src_1],
data_types[src_1], ndims, dims[src_1], strides[src_1]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs[dst_0],
data_types[dst_0], ndims, dims[dst_0], strides[dst_0]));
CHECK(create_and_set_op_descriptor());
return status::success;
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,42 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/ocl/ref_concat.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
namespace {
using cpd_create_f = dnnl::impl::engine_t::concat_primitive_desc_create_f;
const cpd_create_f cuda_concat_impl_list[]
= {gpu::ocl::ref_concat_t::pd_t::create, nullptr};
} // namespace
const cpd_create_f *
cuda_gpu_engine_impl_list_t::get_concat_implementation_list() {
return cuda_concat_impl_list;
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,169 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_CONV_FILTER_ADJUSTMENT_BASE_HPP
#define GPU_NVIDIA_CUDNN_CONV_FILTER_ADJUSTMENT_BASE_HPP
#include "cublas_v2.h"
#include "cudnn.h"
#include "common/type_helpers.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_conv_filter_adjustment_base_t {
public:
float filter_alpha_ = 1, filter_beta_ = 0;
cudnnTensorDescriptor_t current_filter_desc_, transform_filter_desc_;
    // For convolution filters, cuDNN only supports nchw and nhwc. The hwio
    // and dhwio formats are not supported and should be converted to one of
    // the formats above.
virtual bool supported_filter_format(const memory_desc_t *md) {
const memory_desc_wrapper mem_wrapper(md);
        /// NOTE: the transformation from oidhw to oihwd is disabled until
        // cuDNN fixes the current bug for the oihwd format. The transformation
        // from odhwi to ohwdi has been disabled until cuDNN provides support
        // for 3D convolution in the ohwdi format.
return (!(mem_wrapper.matches_one_of_tag(/*format_tag::oidhw,*/
/*format_tag::odhwi,*/ format_tag::dhwio, format_tag::hwio)));
}
virtual ~cudnn_conv_filter_adjustment_base_t() {
if (current_filter_desc_) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyTensorDescriptor, current_filter_desc_);
}
if (transform_filter_desc_) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyTensorDescriptor, transform_filter_desc_);
}
}
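    // Computes dense strides for the layout described by perm, which lists
    // the dimensions from innermost to outermost. For example, for dims
    // {K, C, R, S} and perm {3, 2, 1, 0} (KCRS) the resulting strides are
    // {C*R*S, R*S, S, 1}.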
void propagate_strides(int *strides, const int *dims,
std::initializer_list<int> perm) const {
int prev_p = -1;
for (auto p : perm) {
strides[p] = prev_p == -1 ? 1 : strides[prev_p] * dims[prev_p];
prev_p = p;
}
}
virtual status_t init_filter_transformation(
cudnnDataType_t filter_data_types, int filter_ndims,
int *filter_dims, int *current_filter_strides,
int *transform_filter_strides) {
// Set a descriptor for the current filter.
CHECK(create_and_set_tensor_descriptor(&current_filter_desc_,
filter_data_types, filter_ndims, filter_dims,
current_filter_strides));
// Set a descriptor for the transform filter.
CHECK(create_and_set_tensor_descriptor(&transform_filter_desc_,
filter_data_types, filter_ndims, filter_dims,
transform_filter_strides));
return status::success;
}
virtual void set_filter_nchw(
int filter_ndims, int *transform_filter_strides, int *filter_dims) {
switch (filter_ndims) {
case 4: // Convert to KCRS
return propagate_strides(
transform_filter_strides, filter_dims, {3, 2, 1, 0});
case 5:
            /// NOTE: cuDNN claims the filter must be in kcrsd. However, the
            // current version (7.6.5) accepts a kcdrs filter, which is the
            // same as an ncdhw tensor. So according to the cuDNN
            // documentation the code should look like:
// propagate_strides(
// transform_filter_strides, filter_dims, {2, 4, 3, 1, 0});
// However, executing the code shows that they actually expect
// the filter format to be kcdrs. Therefore, we convert the
// filter to kcdrs instead:
// propagate_strides(
// transform_filter_strides, filter_dims, {4, 3, 2, 1, 0});
return propagate_strides(
transform_filter_strides, filter_dims, {4, 3, 2, 1, 0});
case 6:
return propagate_strides(transform_filter_strides, filter_dims,
{5, 4, 3, 2, 1, 0});
}
}
virtual void set_filter_nhwc(
int filter_ndims, int *transform_filter_strides, int *filter_dims) {
switch (filter_ndims) {
case 4: // Convert to krsc
return propagate_strides(
transform_filter_strides, filter_dims, {1, 3, 2, 0});
case 5:
            /// NOTE: Convert to krsdc. The current version has no support for
            // krsdc or for 3D convolution, so we convert the filter to ndhwc
            // and then fold the dhwc dimensions for both src and filter to
            // make it a 4D convolution. So according to the cuDNN
            // documentation the code should look like:
// propagate_strides(
// transform_filter_strides, filter_dims, {1, 2, 4, 3,
// 0});
// However, executing the code shows that they actually expect
// the filter format to be kdrsc. Therefore, we convert the
// filter to kdrsc:
// propagate_strides(
// transform_filter_strides, filter_dims, {1, 4, 3, 2, 0});
return propagate_strides(
transform_filter_strides, filter_dims, {1, 4, 3, 2, 0});
case 6:
return propagate_strides(transform_filter_strides, filter_dims,
{1, 5, 4, 3, 2, 0});
}
}
void set_filter_format(int filter_ndims, int *filter_dims,
int *transform_filter_strides, cudnnTensorFormat_t format) {
if (format == CUDNN_TENSOR_NCHW) {
set_filter_nchw(
filter_ndims, transform_filter_strides, filter_dims);
} else {
set_filter_nhwc(
filter_ndims, transform_filter_strides, filter_dims);
}
}
void transform_filter(cudnnHandle_t handle, void *current_filter,
void *transform_filter) const {
CUDNN_EXECUTE_FUNC(cudnnTransformTensor, handle, &filter_alpha_,
current_filter_desc_, current_filter, &filter_beta_,
transform_filter_desc_, transform_filter);
}
void undo_transform_filter(cudnnHandle_t handle, void *transform_filter,
void *current_filter) const {
CUDNN_EXECUTE_FUNC(cudnnTransformTensor, handle, &filter_alpha_,
transform_filter_desc_, transform_filter, &filter_beta_,
current_filter_desc_, current_filter);
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif
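
A minimal standalone sketch in plain C++ (no cuDNN required; the 4D filter dimensions below are illustrative only) of the stride propagation used by set_filter_nchw()/set_filter_nhwc() above: walking the permutation from innermost to outermost dimension produces dense strides for the target layout.

#include <cstdio>
#include <initializer_list>

// Same logic as cudnn_conv_filter_adjustment_base_t::propagate_strides().
static void propagate_strides(
        int *strides, const int *dims, std::initializer_list<int> perm) {
    int prev_p = -1;
    for (auto p : perm) {
        strides[p] = prev_p == -1 ? 1 : strides[prev_p] * dims[prev_p];
        prev_p = p;
    }
}

int main() {
    const int dims[4] = {8, 4, 3, 3}; // K, C, R, S (illustrative)
    int kcrs[4], krsc[4];

    propagate_strides(kcrs, dims, {3, 2, 1, 0}); // NCHW-style filter (KCRS)
    propagate_strides(krsc, dims, {1, 3, 2, 0}); // NHWC-style filter (KRSC)

    std::printf("KCRS strides: %d %d %d %d\n", kcrs[0], kcrs[1], kcrs[2], kcrs[3]);
    std::printf("KRSC strides: %d %d %d %d\n", krsc[0], krsc[1], krsc[2], krsc[3]);
    return 0;
}
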

View File

@ -0,0 +1,396 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_CONV_INNER_PRODUCT_HPP
#define GPU_NVIDIA_CUDNN_CONV_INNER_PRODUCT_HPP
#include "cudnn.h"
#include <CL/sycl.hpp>
#include "common/c_types_map.hpp"
#include "common/inner_product_pd.hpp"
#include "common/primitive.hpp"
#include "gpu/nvidia/cudnn_conv_inner_product_impl.hpp"
#include "gpu/nvidia/cudnn_inner_product.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
namespace {
inline status_t init_mem_by_tag(format_tag_t tag, memory_desc_t &md) {
if (tag == format_tag::undef) { return status::unimplemented; }
CHECK(memory_desc_init_by_tag(md, tag));
return status::success;
}
inline format_tag_t get_tag(const memory_desc_t &md) {
using namespace format_tag;
auto tag = memory_desc_matches_one_of_tag(md, ab, abc, abcd,
abcde, // NCHW derivatives
ba, bca, bcda, bcdea, cba, cdba,
cdeba, // IO and spatial derivatives
acb, acdb, acdeb, // NHWC derivatives
aBcd16b, aBcde16b, aBcd8b, aBcde8b, aBcd4b,
aBcde4b); // blocked layouts
return tag;
}
} // namespace
struct cudnn_conv_inner_product_fwd_t : public cudnn_inner_product_fwd_t {
using cudnn_inner_product_fwd_t::cudnn_inner_product_fwd_t;
using parent_pd_t = cudnn_inner_product_fwd_t::pd_t;
struct pd_t : public parent_pd_t {
using parent_pd_t::parent_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:conv", cudnn_conv_inner_product_fwd_t);
status_t init(engine_t *engine) {
using namespace data_type;
using namespace prop_kind;
const auto attr_skip_mask = primitive_attr_t::skip_mask_t::oscale
| primitive_attr_t::skip_mask_t::post_ops;
// Flag for checking if the fused routine can be used for the
// blocked format case. If set to true, that implies ReLU and
// blocking are used.
bool use_fused_path_for_blocking = false;
bool ok = true && set_default_params() == status::success;
ok = ok
&& utils::one_of(desc()->prop_kind, forward_training,
forward_inference)
&& data_types_ok() && memory_format_ok(src_md())
&& memory_format_ok(weights_md(0))
&& memory_format_ok(dst_md())
&& blocking_ok(with_eltwise(), use_fused_path_for_blocking)
&& IMPLICATION(with_bias(), memory_format_ok(weights_md(1)))
&& attr()->has_default_values(attr_skip_mask)
&& post_ops_ok(attr())
&& IMPLICATION(!attr()->output_scales_.has_default_values(),
utils::one_of(src_md_.data_type, s8)
&& attr()->output_scales_.mask_ == 0);
if (!ok) return status::unimplemented;
if (has_zero_dim_memory()) return status::success;
inner_product_impl_.reset(
new cudnn_conv_inner_product_fwd_impl_t());
auto st = inner_product_impl_->init(engine, this, with_relu(),
with_eltwise(), with_sum(), use_fused_path_for_blocking);
return st;
}
bool post_ops_ok(const primitive_attr_t *attr) const {
const auto &p = attr->post_ops_;
auto is_eltwise
= [&](int idx) { return p.entry_[idx].is_eltwise(false); };
auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(false); };
switch (p.len()) {
case 0: return true; // no post_ops
case 1: return is_eltwise(0) || is_sum(0); // sum OR eltwise
case 2: return is_sum(0) && is_eltwise(1); // sum -> eltwise
default: return false;
}
return false;
}
bool with_eltwise() const {
return attr()->post_ops_.find(primitive_kind::eltwise) != -1;
}
bool with_relu() const {
auto idx = attr()->post_ops_.find(primitive_kind::eltwise);
if (idx != -1) { return attr()->post_ops_.entry_[idx].is_relu(); }
return false;
}
bool with_sum() const {
return attr()->post_ops_.find(primitive_kind::sum) != -1;
}
status_t set_default_params() {
using namespace format_tag;
// Although cuDNN does support arbitrary striding in the src
// and dst tensors, it does not support filters in any format
            // where the N dimension follows the C dimension. So transpose the
            // filter here if that is the case, and the src along with it.
auto set_default = [&]() {
if (ndims() < 5 && src_md_.data_type == data_type::s8) {
CHECK(init_mem_by_tag(
utils::pick(ndims() - 2, ab, acb, acdb, acdeb),
src_md_));
} else {
CHECK(init_mem_by_tag(
utils::pick(ndims() - 2, ab, abc, abcd, abcde),
src_md_));
}
CHECK(init_mem_by_tag(get_tag(src_md_), weights_md_));
return status::success;
};
if ((src_md()->format_kind == format_kind::any)
&& (weights_md(0)->format_kind == format_kind::any)) {
CHECK(set_default());
} else if ((src_md()->format_kind == format_kind::any)
&& (weights_md(0)->format_kind != format_kind::any)) {
CHECK(init_mem_by_tag(get_tag(weights_md_), src_md_));
} else if ((src_md()->format_kind != format_kind::any)
&& (weights_md(0)->format_kind == format_kind::any)) {
CHECK(init_mem_by_tag(get_tag(src_md_), weights_md_));
}
if (dst_md()->format_kind == format_kind::any)
CHECK(memory_desc_init_by_tag(dst_md_, nc));
if (weights_md(1)->format_kind == format_kind::any)
CHECK(memory_desc_init_by_tag(bias_md_, x));
return status::success;
}
bool blocking_ok(
bool with_relu, bool &use_fused_path_for_blocking) const {
// Bias and dst should not be blocked.
if (weights_md(1)->format_desc.blocking.inner_nblks
+ dst_md()->format_desc.blocking.inner_nblks
!= 0)
return false;
// If the src and filter are not blocked, done.
if (src_md()->format_desc.blocking.inner_nblks
+ weights_md(0)->format_desc.blocking.inner_nblks
== 0)
return true;
use_fused_path_for_blocking = with_relu;
// Otherwise check blocking is done on C dimension, that the block
// size is 4, that INT8 is used, that both srcs are blocked, and
// check whether ReLU is used (this enables the fast path).
return memory_desc_matches_nchw_vect_c(src_md())
&& memory_desc_matches_nchw_vect_c(weights_md(0));
}
bool data_types_ok() const {
using namespace data_type;
dnnl_data_type_t src_type = src_md()->data_type;
dnnl_data_type_t weights_type = weights_md(0)->data_type;
dnnl_data_type_t bias_type = weights_md(1)->data_type;
dnnl_data_type_t dst_type = dst_md()->data_type;
dnnl_data_type_t acc_type = desc()->accum_data_type;
bool src_wei_match = src_type == weights_type;
// If no bias used, there is no need to check it
auto bias_may_use_type = with_bias() ? bias_type : src_type;
bool bias_match = IMPLICATION(with_bias(),
bias_type == f32
|| utils::everyone_is(f16, src_type, weights_type,
bias_type, dst_type));
bool acc_match = src_wei_match && src_type == s8
? acc_type == s32
: bias_match && bias_may_use_type == f16 ? acc_type == f16
: acc_type == f32;
switch (dst_type) {
case f32:
return src_wei_match && bias_match && acc_match
&& src_type == f32;
case f16:
return bias_match && acc_match && bias_may_use_type == f16;
case s8:
return src_wei_match && acc_match && weights_type == s8;
}
return false;
}
};
const pd_t *pd() const override {
return (const pd_t *)primitive_t::pd().get();
}
};
struct cudnn_conv_inner_product_bwd_data_t
: public cudnn_inner_product_bwd_data_t {
using cudnn_inner_product_bwd_data_t::cudnn_inner_product_bwd_data_t;
using parent_pd_t = cudnn_inner_product_bwd_data_t::pd_t;
struct pd_t : public parent_pd_t {
using parent_pd_t::parent_pd_t;
DECLARE_COMMON_PD_T(
"cuda:cudnn:conv", cudnn_conv_inner_product_bwd_data_t);
status_t init(engine_t *engine) {
using namespace data_type;
using namespace prop_kind;
bool ok = true && set_default_params() == status::success;
ok = ok && desc()->prop_kind == backward_data && data_types_ok()
&& no_blocking() && attr()->has_default_values()
&& memory_format_ok(diff_src_md())
&& memory_format_ok(weights_md(0))
&& memory_format_ok(diff_dst_md());
if (!ok) return status::unimplemented;
if (has_zero_dim_memory()) return status::success;
inner_product_impl_.reset(
new cudnn_conv_inner_product_bwd_data_impl_t());
return inner_product_impl_->init(
engine, this, false, false, false, false);
}
status_t set_default_params() {
using namespace format_tag;
auto set_default_diff_src = [&]() {
if (weights_md_.format_kind == format_kind::any) {
CHECK(init_mem_by_tag(
utils::pick(ndims() - 2, ab, abc, abcd, abcde),
diff_src_md_));
} else {
CHECK(init_mem_by_tag(get_tag(weights_md_), diff_src_md_));
}
return status::success;
};
auto set_default_weights = [&]() {
CHECK(init_mem_by_tag(get_tag(diff_src_md_), weights_md_));
return status::success;
};
if (diff_src_md_.format_kind == format_kind::any)
CHECK(set_default_diff_src());
if (weights_md_.format_kind == format_kind::any)
CHECK(set_default_weights());
if (diff_dst_md_.format_kind == format_kind::any)
CHECK(memory_desc_init_by_tag(diff_dst_md_, nc));
return status::success;
}
bool no_blocking() const {
return diff_src_md()->format_desc.blocking.inner_nblks
+ weights_md(0)->format_desc.blocking.inner_nblks
+ diff_dst_md()->format_desc.blocking.inner_nblks
== 0;
}
bool data_types_ok() const {
return utils::everyone_is(data_type::f32, diff_src_md()->data_type,
weights_md(0)->data_type, diff_dst_md()->data_type,
desc()->accum_data_type);
}
};
const pd_t *pd() const override {
return (const pd_t *)primitive_t::pd().get();
}
};
struct cudnn_conv_inner_product_bwd_weights_t
: public cudnn_inner_product_bwd_weights_t {
using cudnn_inner_product_bwd_weights_t::cudnn_inner_product_bwd_weights_t;
using parent_pd_t = cudnn_inner_product_bwd_weights_t::pd_t;
struct pd_t : public parent_pd_t {
using parent_pd_t::parent_pd_t;
DECLARE_COMMON_PD_T(
"cuda:cudnn:conv", cudnn_conv_inner_product_bwd_weights_t);
status_t init(engine_t *engine) {
using namespace data_type;
using namespace prop_kind;
bool ok = true && (set_default_params() == status::success);
ok = ok && (desc()->prop_kind == backward_weights)
&& data_types_ok() && no_blocking()
&& attr()->has_default_values()
&& memory_format_ok(src_md())
&& memory_format_ok(diff_weights_md(0))
&& memory_format_ok(diff_dst_md())
&& IMPLICATION(
with_bias(), memory_format_ok(diff_weights_md(1)));
if (!ok) return status::unimplemented;
if (has_zero_dim_memory()) return status::success;
inner_product_impl_.reset(
new cudnn_conv_inner_product_bwd_weights_impl_t());
return inner_product_impl_->init(
engine, this, false, false, false, false);
}
status_t set_default_params() {
using namespace format_tag;
auto set_default_src = [&]() {
if (diff_weights_md_.format_kind == format_kind::any) {
CHECK(init_mem_by_tag(
utils::pick(ndims() - 2, ab, abc, abcd, abcde),
src_md_));
} else {
CHECK(init_mem_by_tag(get_tag(diff_weights_md_), src_md_));
}
return status::success;
};
auto set_default_diff_weights = [&]() {
CHECK(init_mem_by_tag(get_tag(src_md_), diff_weights_md_));
return status::success;
};
if (src_md_.format_kind == format_kind::any)
CHECK(set_default_src());
if (diff_weights_md_.format_kind == format_kind::any)
CHECK(set_default_diff_weights());
if (diff_dst_md_.format_kind == format_kind::any)
CHECK(memory_desc_init_by_tag(diff_dst_md_, nc));
if (diff_bias_md_.format_kind == format_kind::any)
CHECK(memory_desc_init_by_tag(diff_bias_md_, x));
return status::success;
}
bool no_blocking() const {
return src_md()->format_desc.blocking.inner_nblks
+ diff_weights_md(0)->format_desc.blocking.inner_nblks
+ diff_weights_md(1)->format_desc.blocking.inner_nblks
+ diff_dst_md()->format_desc.blocking.inner_nblks
== 0;
}
bool data_types_ok() const {
return IMPLICATION(with_bias(),
diff_weights_md(1)->data_type == data_type::f32)
&& utils::everyone_is(data_type::f32, src_md()->data_type,
diff_weights_md(0)->data_type,
diff_dst_md()->data_type, desc()->accum_data_type);
}
};
const pd_t *pd() const override {
return (const pd_t *)primitive_t::pd().get();
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,701 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_CONV_INNER_PRODUCT_IMPL_HPP
#define GPU_NVIDIA_CUDNN_CONV_INNER_PRODUCT_IMPL_HPP
#include "cublas_v2.h"
#include "cudnn.h"
#include "common/type_helpers.hpp"
#include "gpu/nvidia/cudnn_conv_filter_adjustment_base.hpp"
#include "gpu/nvidia/cudnn_inner_product_impl.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_conv_inner_product_impl_base_t
: public cudnn_inner_product_fwd_base_t,
public cudnn_conv_filter_adjustment_base_t {
bool unfold_dimensions_ = false;
cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
cudnnFilterDescriptor_t filter_desc_;
status_t filter_tag(
const memory_desc_t &md, format_tag_t &weight_tag) const {
using namespace format_tag;
weight_tag = memory_desc_matches_one_of_tag(md, oidhw, odhwi, dhwio,
oihw, ohwi, hwio, oiw, owi, wio, aBcd4b,
any); // blocked layouts
if (weight_tag == undef) return status::unimplemented;
return status::success;
}
status_t source_tag(const memory_desc_t &md, format_tag_t &src_tag) const {
using namespace format_tag;
src_tag = memory_desc_matches_one_of_tag(
md, ncdhw, ndhwc, nchw, nhwc, ncw, nwc, aBcd4b, any);
if (src_tag == undef) return status::unimplemented;
return status::success;
}
virtual ~cudnn_conv_inner_product_impl_base_t() {
if (conv_desc_) {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyConvolutionDescriptor, conv_desc_);
}
if (filter_desc_) {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyFilterDescriptor, filter_desc_);
}
for (size_t i = 0; i < NUM_IO - 1; i++) {
if (tensor_descs_[i]) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyTensorDescriptor, tensor_descs_[i]);
}
}
}
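    // Folds all spatial dimensions into the channel dimension so that a 5D/6D
    // inner product can be expressed as a 4D case: e.g. NCDHW {n, c, d, h, w}
    // becomes {n, c*d*h*w, 1, 1}, with strides adjusted for the given format.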
void unfold_dims(io memory_index, int *folded_dims, int *folded_strides,
cudnnTensorFormat_t format, int ndims) {
folded_dims[0] = dims_[memory_index][0];
folded_dims[1] = dims_[memory_index][1];
for (int i = 2; i < ndims; i++) {
folded_dims[1] *= dims_[memory_index][i];
folded_dims[i] = 1;
}
for (int i = 2; i < ndims; i++) {
folded_strides[i]
= (format == CUDNN_TENSOR_NHWC ? folded_dims[1] : 1);
}
folded_strides[1] = 1;
folded_strides[0] = folded_dims[1];
}
virtual void execute(cudnnHandle_t handle, cublasHandle_t,
const std::vector<void *> &args) const = 0;
};
struct cudnn_conv_inner_product_fwd_impl_t
: public cudnn_conv_inner_product_impl_base_t {
bool use_fused_path_for_blocking_ = false;
bool input_is_blocked_ = false;
bool filter_is_blocked_ = false;
cudnnConvolutionFwdAlgo_t algo_;
cudnnActivationDescriptor_t act_desc_fuse_relu;
cudnnActivationDescriptor_t act_desc_no_relu_;
cudnnTensorFormat_t source_format_;
~cudnn_conv_inner_product_fwd_impl_t() {
if (with_bias_) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyActivationDescriptor, act_desc_fuse_relu);
}
if ((with_eltwise_ && !with_relu_) || (!with_bias_ && with_relu_)) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyActivationDescriptor, act_desc_no_relu_);
}
}
virtual status_t init(engine_t *engine, inner_product_pd_t *pd,
bool with_relu, bool with_eltwise, bool with_sum,
bool use_fuse_path_for_blocking) override {
with_bias_ = pd->with_bias();
with_relu_ = with_relu;
with_eltwise_ = with_eltwise;
use_fused_path_for_blocking_ = use_fuse_path_for_blocking;
output_scales_ = pd->attr()->output_scales_.scales_[0];
with_sum_ = with_sum;
scale_bias_ = (output_scales_ != 1) && with_bias_;
// scaling factor to add the previous destination value to the current
// computation
sum_scale_ = sum_scale(pd);
input_is_blocked_
= pd->src_md()->format_desc.blocking.inner_blks[0] == 4;
filter_is_blocked_
= pd->weights_md(0)->format_desc.blocking.inner_blks[0] == 4;
// Pad out the dimensions to at least 4.
if (pd->ndims() > CUDNN_DIM_MAX || pd->ndims() < 2) {
return status::invalid_arguments;
}
ndims_ = pd->ndims() < 4 ? 4 : pd->ndims();
// Initialise meta-data from the descriptors.
// Convert the padded dimensions to the dimensions expected by cuDNN.
get_4d_tensor_descriptor(
pd->src_md(), dims_[io::src], strides_[io::src]);
get_4d_tensor_descriptor(
pd->weights_md(), dims_[io::wei], strides_[io::wei]);
get_4d_tensor_descriptor(
pd->dst_md(), dims_[io::dst], strides_[io::dst]);
// Convert oneDNN data types to their cuDNN counterparts.
CHECK(convert_data_type(pd->src_md(), &data_types_[io::src]));
CHECK(convert_data_type(pd->weights_md(0), &data_types_[io::wei]));
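        // A blocked (NCHW_VECT_C) int8 destination uses the packed
        // CUDNN_DATA_INT8x4 type.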
if (input_is_blocked_) {
data_types_[io::dst] = CUDNN_DATA_INT8x4;
} else {
CHECK(convert_data_type(pd->dst_md(), &data_types_[io::dst]));
}
// Ensure INT8 types are accumulated with INT32.
if (data_types_[io::src] != CUDNN_DATA_HALF
&& data_types_[io::src] != CUDNN_DATA_FLOAT) {
data_types_[NUM_IO] = CUDNN_DATA_INT32;
}
cudnnTensorFormat_t weights_format;
format_tag_t w_tag, s_tag;
CHECK(filter_tag(*pd->weights_md(0), w_tag));
CHECK(source_tag(*pd->src_md(0), s_tag));
CHECK(get_format(
pd->src_md(), source_format_, pd->src_md()->ndims == 2));
        // Currently cuDNN does not support
        // cudnnConvolutionBiasActivationForward for 5D convolutions, so the
        // dimensions have to be unfolded in that case. cuDNN also does not
        // support the s8 type or the nhwc format for 5D convolutions.
unfold_dimensions_ = ndims_ > 4
&& ((pd->weights_md(0)->data_type == data_type::s8)
|| (source_format_ == CUDNN_TENSOR_NHWC) || with_bias_);
if (!supported_filter_format(pd->weights_md(0))
|| (unfold_dimensions_ && (w_tag != s_tag))
|| ((source_format_ == CUDNN_TENSOR_NCHW)
&& (w_tag != s_tag))) {
set_filter_format(
ndims_, dims_[io::wei], strides_[NUM_IO], source_format_);
CHECK(init_filter_transformation(data_types_[io::wei], ndims_,
dims_[io::wei], strides_[io::wei], strides_[NUM_IO]));
filter_using_spatial_format_ = true;
// we transform the filter based on src format
weights_format = source_format_;
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_none,
memory_desc_wrapper(pd->weights_md(0)).size(), size_t(1));
} else {
CHECK(get_format(pd->weights_md(0), weights_format,
pd->weights_md(0)->ndims == 2));
}
if (scale_bias_) {
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_conv_adjusted_scales,
memory_desc_wrapper(pd->weights_md(1)).size(), size_t(1));
}
// Copy over the strides.
if (with_bias_) {
CHECK(convert_data_type(pd->weights_md(1), &data_types_[io::bia]));
set_bias_dims(weights_format, ndims_, pd->OC());
}
        // cuDNN requires the input and output feature maps to be a multiple
        // of 4 for int8, and only nhwc is supported for int8. cuDNN does not
        // support 5D convolutions for int8.
if ((pd->weights_md(0)->data_type == data_type::s8)
&& ((pd->IC() % 4 != 0) || (pd->OC() % 4 != 0))) {
return status::unimplemented;
}
// source format and weight format are the same at this stage
if (unfold_dimensions_) {
unfold_dims(io::wei, dims_[io::wei], strides_[io::wei],
source_format_, ndims_);
unfold_dims(io::src, dims_[io::src], strides_[io::src],
source_format_, ndims_);
ndims_ = 4;
}
if (input_is_blocked_) {
CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[io::src],
CUDNN_TENSOR_NCHW_VECT_C, data_types_[io::src], ndims_,
dims_[io::src]));
} else {
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::src],
data_types_[io::src], ndims_, dims_[io::src],
strides_[io::src]));
}
if (with_bias_) {
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::bia],
data_types_[io::bia], ndims_, dims_[io::bia],
strides_[io::bia]));
}
// If input is blocked, the output needs to be as well.
if (input_is_blocked_) {
CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[io::dst],
CUDNN_TENSOR_NCHW_VECT_C, data_types_[io::dst], ndims_,
dims_[io::dst]));
} else {
cudnnTensorFormat_t out_format
= filter_is_blocked_ ? CUDNN_TENSOR_NCHW : weights_format;
CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[io::dst],
out_format, data_types_[io::dst], ndims_, dims_[io::dst]));
}
CHECK(create_and_set_filter_descriptor(&filter_desc_, weights_format,
data_types_[io::wei], ndims_, dims_[io::wei],
strides_[io::wei]));
// Set the convolution. For inner product, this means unit strides and
// dilation, no padding, and with cross-correlation as the mode.
int conv_dims = ndims_ - 2;
std::vector<int> unit_strides(conv_dims, 1);
std::vector<int> unit_dilation(conv_dims, 1);
std::vector<int> zero_padding(conv_dims, 0);
CHECK(create_and_set_conv_descriptor(&conv_desc_, conv_dims,
zero_padding.data(), unit_strides.data(), unit_dilation.data(),
CUDNN_CROSS_CORRELATION, data_types_[NUM_IO]));
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
stream_t *service_stream;
CHECK(sycl_engine.get_service_stream(service_stream));
auto cuda_stream
= utils::downcast<sycl_cuda_stream_t *>(service_stream);
auto handle = cuda_stream->get_cudnn_handle();
        // Inner product can choose whatever algorithm it prefers, although
        // for the identity post-op IMPLICIT_PRECOMP_GEMM must be used. There
        // is a cuDNN bug where cudnnGetConvolutionForwardAlgorithm cannot be
        // used for the int8 type.
if (pd->src_md()->data_type != data_type::s8
&& pd->weights_md(0)->data_type != data_type::s8) {
cudnnConvolutionFwdPreference_t algo_pref
= CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardAlgorithm,
handle, tensor_descs_[io::src], filter_desc_, conv_desc_,
tensor_descs_[io::dst], algo_pref, 0, &algo_));
} else {
algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
}
if (!with_relu_) {
algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
}
// Allocate the workspace from the algorithm selection, if applicable.
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardWorkspaceSize,
handle, tensor_descs_[io::src], filter_desc_, conv_desc_,
tensor_descs_[io::dst], algo_, &workspace_size_));
if (workspace_size_ > 0) {
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_iprod_int_dat_in_acc_dt,
workspace_size_, size_t(1));
}
// Add the eltwise op. Note that this only applies to the forward pass.
CHECK(create_and_set_op_descriptor(pd));
return status::success;
}
void execute(cudnnHandle_t handle, cublasHandle_t,
const std::vector<void *> &args) const override {
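        // args layout: {src, weights, bias, dst, conv workspace,
        //         transformed-filter scratch, scaled-bias scratch}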
auto x = args[0], w = args[1], b = args[2], y = args[3],
workspace = args[4];
assert(args.size() == 7);
auto w_arg = w;
if (filter_using_spatial_format_) {
void *transformed_w = args[5];
transform_filter(handle, w, transformed_w);
w_arg = transformed_w;
}
if (with_bias_) {
auto scaled_bias = b;
if (scale_bias_) {
void *output_scale_workspace = args[6];
CUDNN_EXECUTE_FUNC(cudnnAddTensor, handle, &output_scales_,
tensor_descs_[io::bia], b, &beta_,
tensor_descs_[io::bia], output_scale_workspace);
scaled_bias = output_scale_workspace;
}
CUDNN_EXECUTE_FUNC(cudnnConvolutionBiasActivationForward, handle,
&output_scales_, tensor_descs_[io::src], x, filter_desc_,
w_arg, conv_desc_, algo_, workspace, workspace_size_,
&sum_scale_, tensor_descs_[io::dst], y,
tensor_descs_[io::bia], scaled_bias, act_desc_fuse_relu,
tensor_descs_[io::dst], y);
} else {
CUDNN_EXECUTE_FUNC(cudnnConvolutionForward, handle, &output_scales_,
tensor_descs_[io::src], x, filter_desc_, w_arg, conv_desc_,
algo_, workspace, workspace_size_, &sum_scale_,
tensor_descs_[io::dst], y);
}
if ((with_eltwise_ && !with_relu_) || (!with_bias_ && with_relu_)) {
CUDNN_EXECUTE_FUNC(cudnnActivationForward, handle,
act_desc_no_relu_, &alpha_, tensor_descs_[io::dst], y,
&beta_, tensor_descs_[io::dst], y);
}
}
private:
status_t create_and_set_op_descriptor(inner_product_pd_t *pd) {
if (with_bias_) {
auto mode_fuse = with_relu_ ? CUDNN_ACTIVATION_RELU
: CUDNN_ACTIVATION_IDENTITY;
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnCreateActivationDescriptor, &act_desc_fuse_relu));
// For ReLU, a ceiling of 0 means no limit.
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor,
act_desc_fuse_relu, mode_fuse,
cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN,
eltwise_alpha(pd)));
}
if ((with_eltwise_ && !with_relu_) || (!with_bias_ && with_relu_)) {
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnCreateActivationDescriptor, &act_desc_no_relu_));
cudnnActivationMode_t no_relu_mode;
switch (eltwise_algorithm_kind(pd)) {
case alg_kind::eltwise_tanh:
no_relu_mode = CUDNN_ACTIVATION_TANH;
break;
case alg_kind::eltwise_elu:
no_relu_mode = CUDNN_ACTIVATION_ELU;
break;
case alg_kind::eltwise_relu:
no_relu_mode = CUDNN_ACTIVATION_RELU;
break;
case alg_kind::eltwise_logistic:
no_relu_mode = CUDNN_ACTIVATION_SIGMOID;
break;
case alg_kind::eltwise_bounded_relu:
no_relu_mode = CUDNN_ACTIVATION_CLIPPED_RELU;
break;
default: return status::unimplemented;
}
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor,
act_desc_no_relu_, no_relu_mode,
cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN,
eltwise_alpha(pd)));
}
return status::success;
}
};
struct cudnn_conv_inner_product_bwd_data_impl_t
: public cudnn_conv_inner_product_impl_base_t {
cudnnConvolutionBwdDataAlgo_t algo_;
    // The format of the filter depends on dy; however, since dy is nc, for an
    // nhwc filter the source must be nhwc as well. So we use the src format
    // when transforming the filter.
cudnnTensorFormat_t diff_source_format_;
virtual status_t init(engine_t *engine, inner_product_pd_t *pd,
bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
bool /*using_fused_path_for_blocking*/) override {
// Pad out the dimensions to 4
if (pd->ndims() > CUDNN_DIM_MAX || pd->ndims() < 2) {
return status::invalid_arguments;
}
ndims_ = pd->ndims() < 4 ? 4 : pd->ndims();
// Initialise meta-data from the descriptors.
// Convert the padded dimensions to the dimensions expected by cuDNN.
get_4d_tensor_descriptor(
pd->diff_src_md(), dims_[io::src], strides_[io::src]);
get_4d_tensor_descriptor(
pd->weights_md(), dims_[io::wei], strides_[io::wei]);
get_4d_tensor_descriptor(
pd->diff_dst_md(), dims_[io::dst], strides_[io::dst]);
// Convert oneDNN data types to their cuDNN counterparts.
CHECK(convert_data_type(pd->diff_src_md(), &data_types_[io::src]));
CHECK(convert_data_type(pd->weights_md(0), &data_types_[io::wei]));
CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[io::dst]));
format_tag_t w_tag, s_tag;
CHECK(filter_tag(*pd->weights_md(0), w_tag));
CHECK(source_tag(*pd->diff_src_md(0), s_tag));
cudnnTensorFormat_t weights_format;
CHECK(get_format(pd->diff_src_md(), diff_source_format_));
        // Currently cuDNN does not support 5D convolutions when the filter
        // format is nhwc, so the dimensions have to be unfolded in that case.
unfold_dimensions_
= ndims_ > 4 && ((diff_source_format_ == CUDNN_TENSOR_NHWC));
// Copy over the strides.
        // The weight format and the dy format must be the same. Since dx is
        // the result here, we check against diff_src to make sure we get the
        // correct result.
if (!supported_filter_format(pd->weights_md(0)) || (w_tag != s_tag)) {
set_filter_format(ndims_, dims_[io::wei], strides_[NUM_IO],
diff_source_format_);
CHECK(init_filter_transformation(data_types_[io::wei], ndims_,
dims_[io::wei], strides_[io::wei], strides_[NUM_IO]));
filter_using_spatial_format_ = true;
// the type of weight format must match
weights_format = diff_source_format_;
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_none,
memory_desc_wrapper(pd->weights_md(0)).size(), size_t(1));
} else {
CHECK(get_format(pd->weights_md(0), weights_format));
}
// source format and weight format are the same at this stage
if (unfold_dimensions_) {
unfold_dims(io::wei, dims_[io::wei], strides_[io::wei],
diff_source_format_, ndims_);
unfold_dims(io::src, dims_[io::src], strides_[io::src],
diff_source_format_, ndims_);
ndims_ = 4;
}
// Set the tensor descriptors from the dimensions and strides.
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::src],
data_types_[io::src], ndims_, dims_[io::src],
strides_[io::src]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::dst],
data_types_[io::dst], ndims_, dims_[io::dst],
strides_[io::dst]));
CHECK(create_and_set_filter_descriptor(&filter_desc_, weights_format,
data_types_[io::wei], ndims_, dims_[io::wei],
strides_[io::wei]));
// Set the convolution. For inner product, this means unit strides and
// dilation, no padding, and with cross-correlation as the mode.
int conv_dims = ndims_ - 2;
std::vector<int> unit_strides(conv_dims, 1);
std::vector<int> unit_dilation(conv_dims, 1);
std::vector<int> zero_padding(conv_dims, 0);
CHECK(create_and_set_conv_descriptor(&conv_desc_, conv_dims,
zero_padding.data(), unit_strides.data(), unit_dilation.data(),
CUDNN_CROSS_CORRELATION, data_types_[NUM_IO]));
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
stream_t *service_stream;
CHECK(sycl_engine.get_service_stream(service_stream));
auto cuda_stream
= utils::downcast<sycl_cuda_stream_t *>(service_stream);
auto handle = cuda_stream->get_cudnn_handle();
// Inner product can choose whatever algorithm it prefers.
cudnnConvolutionBwdDataPreference_t algo_pref
= CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
CUDNN_EXECUTE_FUNC(cudnnGetConvolutionBackwardDataAlgorithm, handle,
filter_desc_, tensor_descs_[io::dst], conv_desc_,
tensor_descs_[io::src], algo_pref, 0, &algo_);
// Allocate the workspace from the algorithm selection, if applicable.
CUDNN_EXECUTE_FUNC(cudnnGetConvolutionBackwardDataWorkspaceSize, handle,
filter_desc_, tensor_descs_[io::dst], conv_desc_,
tensor_descs_[io::src], algo_, &workspace_size_);
if (workspace_size_ > 0) {
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_iprod_int_dat_in_acc_dt,
workspace_size_, size_t(1));
}
return status::success;
}
void execute(cudnnHandle_t handle, cublasHandle_t,
const std::vector<void *> &args) const override {
assert(args.size() == 5);
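        // args layout: {diff_src, weights, diff_dst, conv workspace,
        //         transformed-filter scratch}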
auto dx = args[0], w = args[1], dy = args[2], workspace = args[3];
auto w_arg = w;
if (filter_using_spatial_format_) {
auto transformed_w = args[4];
transform_filter(handle, w, transformed_w);
w_arg = transformed_w;
}
CUDNN_EXECUTE_FUNC(cudnnConvolutionBackwardData, handle, &alpha_,
filter_desc_, w_arg, tensor_descs_[io::dst], dy, conv_desc_,
algo_, workspace, workspace_size_, &beta_,
tensor_descs_[io::src], dx);
}
};
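// [Illustrative sketch, not part of the library] The "unfold" workaround
// mentioned above relies on the fact that the inner-product filter covers
// the whole spatial extent, so a 5D N x C x D x H x W problem can be
// re-expressed in 4D, e.g. as N x C x (D*H) x W. A hypothetical, dims-only
// version of such a folding (the real unfold_dims() also adjusts strides):
static inline void fold_5d_to_4d_example(int (&dims)[5], int &ndims) {
    // {N, C, D, H, W} -> {N, C, D * H, W}
    dims[2] *= dims[3];
    dims[3] = dims[4];
    ndims = 4;
}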
struct cudnn_conv_inner_product_bwd_weights_impl_t
: public cudnn_conv_inner_product_impl_base_t {
cudnnConvolutionBwdFilterAlgo_t algo_;
cudnnTensorFormat_t source_format_;
virtual status_t init(engine_t *engine, inner_product_pd_t *pd,
bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
bool /*using_fused_path_for_blocking*/) override {
// If any of the dimensions are 0 we should not continue with creating
// cudnn descriptors
with_bias_ = pd->with_bias();
// Pad out the dimensions to 4
if (pd->ndims() > CUDNN_DIM_MAX || pd->ndims() < 2) {
return status::invalid_arguments;
}
ndims_ = pd->ndims() < 4 ? 4 : pd->ndims();
// Initialise meta-data from the descriptors.
// Convert the padded dimensions to the dimensions expected by cuDNN.
get_4d_tensor_descriptor(
pd->src_md(), dims_[io::src], strides_[io::src]);
get_4d_tensor_descriptor(
pd->diff_weights_md(), dims_[io::wei], strides_[io::wei]);
get_4d_tensor_descriptor(
pd->diff_dst_md(), dims_[io::dst], strides_[io::dst]);
format_tag_t w_tag, s_tag;
CHECK(filter_tag(*pd->diff_weights_md(0), w_tag));
CHECK(source_tag(*pd->src_md(0), s_tag));
cudnnTensorFormat_t diff_weights_format;
CHECK(get_format(pd->src_md(0), source_format_));
// Currently cuDNN does not support 5D convolutions when the filter
// format is NHWC, so in the 5D case the dimensions have to be unfolded
// down to 4D.
unfold_dimensions_
= ndims_ > 4 && ((source_format_ == CUDNN_TENSOR_NHWC));
// The weights format and the src format must be the same. We check
// against src to make sure the correct format is used.
if (!supported_filter_format(pd->diff_weights_md(0))
|| (w_tag != s_tag)) {
set_filter_format(
ndims_, dims_[io::wei], strides_[NUM_IO], source_format_);
CHECK(init_filter_transformation(data_types_[io::wei], ndims_,
dims_[io::wei], strides_[NUM_IO], strides_[io::wei]));
filter_using_spatial_format_ = true;
// The weights format must match the src format.
diff_weights_format = source_format_;
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_none,
memory_desc_wrapper(pd->diff_weights_md(0)).size(),
size_t(1));
} else {
CHECK(get_format(pd->diff_weights_md(0), diff_weights_format));
}
// Convert oneDNN data types to their cuDNN counterparts.
CHECK(convert_data_type(pd->src_md(), &data_types_[io::src]));
CHECK(convert_data_type(pd->diff_weights_md(0), &data_types_[io::wei]));
CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[io::dst]));
// source format and weight format are the same at this stage
if (unfold_dimensions_) {
unfold_dims(io::wei, dims_[io::wei], strides_[io::wei],
source_format_, ndims_);
unfold_dims(io::src, dims_[io::src], strides_[io::src],
source_format_, ndims_);
ndims_ = 4;
}
if (with_bias_) {
set_bias_dims(diff_weights_format, ndims_, pd->OC());
CHECK(convert_data_type(
pd->diff_weights_md(1), &data_types_[io::bia]));
}
// Set the tensor descriptors from the dimensions and strides.
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::src],
data_types_[io::src], ndims_, dims_[io::src],
strides_[io::src]));
CHECK(create_and_set_filter_descriptor(&filter_desc_,
diff_weights_format, data_types_[io::wei], ndims_,
dims_[io::wei], strides_[io::wei]));
// oneDNN does not set unused dimensions and strides in the output, so we
// do that here. For an NHWC filter, the N stride is repeated for the
// spatial dimensions.
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::dst],
data_types_[io::dst], ndims_, dims_[io::dst],
strides_[io::dst]));
if (with_bias_) {
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::bia],
data_types_[io::bia], ndims_, dims_[io::bia],
strides_[io::bia]));
}
// Set up the convolution. For inner product this means unit strides and
// dilation, no padding, and cross-correlation as the mode (see the
// illustrative sketch following this struct).
int conv_dims = ndims_ - 2;
std::vector<int> unit_strides(conv_dims, 1);
std::vector<int> unit_dilation(conv_dims, 1);
std::vector<int> zero_padding(conv_dims, 0);
CHECK(create_and_set_conv_descriptor(&conv_desc_, conv_dims,
zero_padding.data(), unit_strides.data(), unit_dilation.data(),
CUDNN_CROSS_CORRELATION, data_types_[NUM_IO]));
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
stream_t *service_stream;
CHECK(sycl_engine.get_service_stream(service_stream));
auto cuda_stream
= utils::downcast<sycl_cuda_stream_t *>(service_stream);
auto handle = cuda_stream->get_cudnn_handle();
// Inner product can choose whatever algorithm it prefers.
cudnnConvolutionBwdFilterPreference_t algo_pref
= CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
CUDNN_EXECUTE_FUNC(cudnnGetConvolutionBackwardFilterAlgorithm, handle,
tensor_descs_[io::src], tensor_descs_[io::dst], conv_desc_,
filter_desc_, algo_pref, 0, &algo_);
// Book the workspace required by the chosen algorithm, if any.
CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionBackwardFilterWorkspaceSize,
handle, tensor_descs_[io::src], tensor_descs_[io::dst],
conv_desc_, filter_desc_, algo_, &workspace_size_);
if (workspace_size_ > 0) {
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_iprod_int_dat_in_acc_dt,
workspace_size_, size_t(1));
}
return status::success;
}
void execute(cudnnHandle_t handle, cublasHandle_t,
const std::vector<void *> &args) const override {
assert(args.size() == 6);
auto x = args[0], dy = args[1], dw = args[2], db = args[3],
workspace = args[4];
auto dw_arg = filter_using_spatial_format_ ? args[5] : dw;
CUDNN_EXECUTE_FUNC(cudnnConvolutionBackwardFilter, handle, &alpha_,
tensor_descs_[io::src], x, tensor_descs_[io::dst], dy,
conv_desc_, algo_, workspace, workspace_size_, &beta_,
filter_desc_, dw_arg);
if (filter_using_spatial_format_) {
// The computed weights are in the cuDNN-specific format, but the user
// expects them in the oneDNN format, so transform them back.
transform_filter(handle, dw_arg, dw);
}
if (with_bias_) {
CUDNN_EXECUTE_FUNC(cudnnConvolutionBackwardBias, handle, &alpha_,
tensor_descs_[io::dst], dy, &beta_, tensor_descs_[io::bia],
db);
}
}
};
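// [Illustrative sketch, not part of the library] As noted in the structs
// above, an inner product is lowered to a convolution whose filter spans the
// whole spatial extent, so the convolution descriptor degenerates to unit
// strides/dilation, zero padding and cross-correlation mode. A hypothetical
// standalone version of that descriptor setup for two spatial dimensions,
// using the raw cuDNN API:
static inline cudnnStatus_t make_inner_product_conv_desc_example(
        cudnnConvolutionDescriptor_t *conv_desc) {
    const int conv_dims = 2;
    int zero_padding[] = {0, 0};
    int unit_strides[] = {1, 1};
    int unit_dilation[] = {1, 1};
    cudnnStatus_t status = cudnnCreateConvolutionDescriptor(conv_desc);
    if (status != CUDNN_STATUS_SUCCESS) return status;
    return cudnnSetConvolutionNdDescriptor(*conv_desc, conv_dims,
            zero_padding, unit_strides, unit_dilation,
            CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
}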
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,256 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/cudnn_convolution.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
status_t cudnn_convolution_fwd_t::execute_convolution(
const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
using scratch_acc_t = cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read_write>;
auto x_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto weights_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
auto y_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
std::shared_ptr<
cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read>>
bias_acc;
std::shared_ptr<scratch_acc_t> scratch_acc;
std::shared_ptr<scratch_acc_t> filter_scratch_acc;
std::shared_ptr<scratch_acc_t> temp_dst_acc;
std::shared_ptr<scratch_acc_t> temp_reorder_acc;
if (with_scratchpad) {
scratch_acc = std::make_shared<scratch_acc_t>(
utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
ctx.get_scratchpad_grantor()
.get_memory_storage(memory_tracking::names::
key_conv_cudnn_algo)
.get())
->buffer()
.get_access<cl::sycl::access::mode::read_write>(
cgh));
}
if (with_bias) {
bias_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read>>(
CTX_IN_ACCESSOR(DNNL_ARG_BIAS));
}
if (pd()->impl_->using_transformed_filter()) {
filter_scratch_acc
= std::make_shared<scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
memory_tracking::names::key_conv_cudnn_filter));
}
if (pd()->use_temp_dst_) {
temp_dst_acc = std::make_shared<scratch_acc_t>(
buffer(scratch_storage.get())
.get_access<cl::sycl::access::mode::read_write>(
cgh));
temp_reorder_acc = std::make_shared<scratch_acc_t>(
buffer(scratch_storage_2.get())
.get_access<cl::sycl::access::mode::read_write>(
cgh));
}
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
std::vector<void *> args;
args.push_back(sc.memory<void *>(ih, x_acc));
args.push_back(sc.memory<void *>(ih, weights_acc));
args.push_back(sc.memory<void *>(ih, y_acc));
args.push_back(
with_bias ? sc.memory<void *>(ih, *bias_acc) : nullptr);
args.push_back(with_scratchpad ? sc.memory<void *>(ih, *scratch_acc)
: nullptr);
args.push_back(pd()->impl_->using_transformed_filter()
? sc.memory<void *>(ih, *filter_scratch_acc)
: nullptr);
args.push_back(pd()->use_temp_dst_
? sc.memory<void *>(ih, *temp_dst_acc)
: nullptr);
args.push_back(pd()->use_temp_dst_
? sc.memory<void *>(ih, *temp_reorder_acc)
: nullptr);
pd()->impl_->execute(handle, args);
});
});
}
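// [Illustrative sketch] The interop task above hands the implementation a
// positional vector of raw device pointers; the indices below (hypothetical
// names, the real code uses bare literals) document that ordering for the
// forward pass. Unused slots are passed as nullptr.
namespace {
enum fwd_conv_arg_index_example : int {
    fwd_arg_src = 0,
    fwd_arg_weights = 1,
    fwd_arg_dst = 2,
    fwd_arg_bias = 3, // nullptr when there is no bias
    fwd_arg_scratchpad = 4, // nullptr when no workspace is required
    fwd_arg_filter_scratch = 5, // nullptr unless the filter is transformed
    fwd_arg_temp_dst = 6, // only used when post-ops need a temporary dst
    fwd_arg_temp_reorder = 7
};
} // namespace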
status_t cudnn_convolution_bwd_data_t::execute_convolution(
const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
using scratch_acc_t = cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read_write>;
auto x_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC);
auto weights_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
auto y_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
std::shared_ptr<
cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read>>
bias_acc;
std::shared_ptr<scratch_acc_t> scratch_acc;
std::shared_ptr<scratch_acc_t> filter_scratch_acc;
if (with_scratchpad) {
scratch_acc = std::make_shared<scratch_acc_t>(
utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
ctx.get_scratchpad_grantor()
.get_memory_storage(memory_tracking::names::
key_conv_cudnn_algo)
.get())
->buffer()
.get_access<cl::sycl::access::mode::read_write>(
cgh));
}
if (with_bias) {
bias_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read>>(
CTX_IN_ACCESSOR(DNNL_ARG_BIAS));
}
if (pd()->impl_->using_transformed_filter()) {
filter_scratch_acc
= std::make_shared<scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
memory_tracking::names::key_conv_cudnn_filter));
}
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
std::vector<void *> args;
args.push_back(sc.memory<void *>(ih, x_acc));
args.push_back(sc.memory<void *>(ih, weights_acc));
args.push_back(sc.memory<void *>(ih, y_acc));
args.push_back(
with_bias ? sc.memory<void *>(ih, *bias_acc) : nullptr);
args.push_back(with_scratchpad ? sc.memory<void *>(ih, *scratch_acc)
: nullptr);
args.push_back(pd()->impl_->using_transformed_filter()
? sc.memory<void *>(ih, *filter_scratch_acc)
: nullptr);
pd()->impl_->execute(handle, args);
});
});
}
status_t cudnn_convolution_bwd_weights_t::execute_zero_dims(
const exec_ctx_t &ctx) const {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto weights_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_WEIGHTS);
std::shared_ptr<
cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write>>
bias_acc;
if (pd()->with_bias()) {
bias_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::write>>(
CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS));
}
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
auto weights = sc.memory<void *>(ih, weights_acc);
void *bias = nullptr;
if (pd()->with_bias()) bias = sc.memory<void *>(ih, *bias_acc);
pd()->impl_->execute_set_weights_bias(handle, weights, bias, 0.f);
});
});
}
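// [Illustrative sketch] For shapes with a zero dimension the backward-weights
// primitive above only needs to zero out its outputs; execute_set_weights_bias()
// boils down to one cudnnSetTensor call per output, roughly:
static inline cudnnStatus_t zero_fill_tensor_example(
        cudnnHandle_t handle, cudnnTensorDescriptor_t desc, void *dev_ptr) {
    const float zero = 0.f;
    return cudnnSetTensor(handle, desc, dev_ptr, &zero);
}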
status_t cudnn_convolution_bwd_weights_t::execute_convolution(
const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
using scratch_acc_t = cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read_write>;
auto x_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto weights_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_WEIGHTS);
auto y_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
std::shared_ptr<
cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write>>
bias_acc;
std::shared_ptr<scratch_acc_t> scratch_acc;
std::shared_ptr<scratch_acc_t> filter_scratch_acc;
if (with_scratchpad) {
scratch_acc = std::make_shared<scratch_acc_t>(
utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
ctx.get_scratchpad_grantor()
.get_memory_storage(memory_tracking::names::
key_conv_cudnn_algo)
.get())
->buffer()
.get_access<cl::sycl::access::mode::read_write>(
cgh));
}
if (with_bias) {
bias_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::write>>(
CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS));
}
if (pd()->impl_->using_transformed_filter()) {
filter_scratch_acc
= std::make_shared<scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
memory_tracking::names::key_conv_cudnn_filter));
}
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
std::vector<void *> args;
args.push_back(sc.memory<void *>(ih, x_acc));
args.push_back(sc.memory<void *>(ih, weights_acc));
args.push_back(sc.memory<void *>(ih, y_acc));
args.push_back(
with_bias ? sc.memory<void *>(ih, *bias_acc) : nullptr);
args.push_back(with_scratchpad ? sc.memory<void *>(ih, *scratch_acc)
: nullptr);
args.push_back(pd()->impl_->using_transformed_filter()
? sc.memory<void *>(ih, *filter_scratch_acc)
: nullptr);
pd()->impl_->execute(handle, args);
});
});
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,333 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_CONVOLUTION_HPP
#define GPU_NVIDIA_CUDNN_CONVOLUTION_HPP
#include "cudnn.h"
#include "common/c_types_map.hpp"
#include "common/primitive.hpp"
#include "common/primitive_desc.hpp"
#include "gpu/nvidia/cudnn_convolution_impl.hpp"
#include "gpu/nvidia/cudnn_convolution_pd.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_convolution_fwd_t : public primitive_t {
struct pd_t : public cudnn_convolution_fwd_pd_t {
using cudnn_convolution_fwd_pd_t::cudnn_convolution_fwd_pd_t;
pd_t(const pd_t &other)
: cudnn_convolution_fwd_pd_t(other)
, impl_(other.impl_)
, use_temp_dst_(other.use_temp_dst_)
, dst_md_temp_(other.dst_md_temp_) {}
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_convolution_fwd_t);
status_t init(engine_t *engine) {
using namespace data_type;
const auto attr_skip_mask = primitive_attr_t::skip_mask_t::oscale
| primitive_attr_t::skip_mask_t::post_ops;
bool ok = utils::one_of(desc()->prop_kind,
prop_kind::forward_training, prop_kind::forward_inference);
ok = ok && attr()->has_default_values(attr_skip_mask);
ok = ok && post_ops_ok(attr());
ok = ok
&& (utils::everyone_is(f32, src_md_.data_type,
weights_md_.data_type, dst_md_.data_type)
|| utils::everyone_is(f16, src_md_.data_type,
weights_md_.data_type, dst_md_.data_type)
|| (utils::everyone_is(s8, src_md_.data_type,
weights_md_.data_type)
&& utils::one_of(
dst_md_.data_type, f32, s8)));
ok = ok && this->set_default_formats();
ok = ok
&& IMPLICATION(
desc()->alg_kind == dnnl_convolution_winograd,
ndims() < 5 && src_md_.data_type != s8);
ok = ok
&& IMPLICATION(!attr()->output_scales_.has_default_values(),
src_md_.data_type == s8
&& attr()->output_scales_.mask_ == 0);
ok = ok
&& IMPLICATION(
src_md_.data_type == s8, check_s8_configuration());
ok = ok && memory_format_ok(&src_md_);
ok = ok && memory_format_ok(&weights_md_);
ok = ok && memory_format_ok(&dst_md_);
if (with_bias()) ok = ok && memory_format_ok(&bias_md_);
if (!ok) return status::unimplemented;
if (check_for_zero_dims()) return status::success;
if (use_temp_dst_) {
dst_md_temp_ = dst_md_;
if (dst_md_.data_type == s8) { dst_md_temp_.data_type = f32; }
}
impl_.reset(new cudnn_convolution_impl_fwd_t());
return impl_->init(engine, this, use_temp_dst_);
}
bool with_scratchpad() const { return impl_->with_scratchpad(); }
std::shared_ptr<cudnn_convolution_impl_base_t> impl_;
bool use_temp_dst_ = attr()->post_ops_.len() > 0;
memory_desc_t dst_md_temp_;
private:
bool set_default_formats() {
using namespace format_tag;
if (src_md_.data_type == dnnl_s8) {
auto dat_tag = utils::pick(ndims() - 3, nwc, nhwc, ndhwc);
auto wei_tag = with_groups()
? utils::pick(ndims() - 3, gowi, gohwi, godhwi)
: utils::pick(ndims() - 3, owi, ohwi, odhwi);
return set_default_formats_common(dat_tag, wei_tag, dat_tag);
} else {
auto dat_tag = utils::pick(ndims() - 3, ncw, nchw, ncdhw);
auto wei_tag = with_groups()
? utils::pick(ndims() - 3, goiw, goihw, goidhw)
: utils::pick(ndims() - 3, oiw, oihw, oidhw);
return set_default_formats_common(dat_tag, wei_tag, dat_tag);
}
}
bool post_ops_ok(const primitive_attr_t *attr) const {
const auto &p = attr->post_ops_;
auto is_eltwise
= [&](int idx) { return p.entry_[idx].is_eltwise(false); };
auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(false); };
switch (p.len()) {
case 0: return true; // no post_ops
case 1: return is_eltwise(0) || is_sum(0); // sum OR eltwise
case 2:
if (src_md_.data_type == dnnl_s8 && is_eltwise(0)
&& is_sum(1))
return true;
return (is_sum(0) && is_eltwise(1));
default: return false;
}
return false;
}
bool check_s8_configuration() const {
const auto check_nhwc = [](const dnnl_memory_desc_t &md,
bool is_weights = false) {
cudnnTensorFormat_t fmt;
get_format(&md, fmt, is_weights);
return fmt == CUDNN_TENSOR_NHWC;
};
return check_nhwc(src_md_) && check_nhwc(dst_md_)
&& check_nhwc(weights_md_, true)
&& (src_md_.dims[1] % 4) == 0 && (dst_md_.dims[1] % 4) == 0
&& ndims() < 5;
}
};
cudnn_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
status_t init_temp_dst(engine_t *engine) {
auto sycl_engine = utils::downcast<sycl_cuda_engine_t *>(engine);
memory_storage_t *scratch_ptr = nullptr;
auto wrap = memory_desc_wrapper(pd()->dst_md_temp_);
CHECK(sycl_engine->create_memory_storage(
&scratch_ptr, memory_flags_t::alloc, wrap.size(), nullptr));
scratch_storage.reset(scratch_ptr);
CHECK(sycl_engine->create_memory_storage(
&scratch_ptr, memory_flags_t::alloc, wrap.size(), nullptr));
scratch_storage_2.reset(scratch_ptr);
return status::success;
}
virtual status_t init(engine_t *engine) {
if (pd()->use_temp_dst_) { init_temp_dst(engine); }
return status::success;
}
status_t execute(const exec_ctx_t &ctx) const override {
if (pd()->check_for_zero_dims()) { return status::success; }
execute_convolution(ctx, pd()->with_bias(), pd()->with_scratchpad());
return status::success;
}
status_t execute_convolution(
const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const;
private:
cl::sycl::buffer<uint8_t, 1> &buffer(memory_storage_t *mem_storage) const {
return utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
mem_storage)
->buffer();
}
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
std::shared_ptr<memory_storage_t> scratch_storage;
std::shared_ptr<memory_storage_t> scratch_storage_2;
};
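// [Illustrative sketch] When post-ops are attached (use_temp_dst_ above), the
// convolution result is written to a temporary dst-sized buffer so the
// sum/eltwise chain can be applied before the final dst is produced; the
// second buffer backs the int8 reorder path. A hypothetical size computation
// for those buffers, mirroring init_temp_dst():
static inline size_t temp_dst_bytes_example(const memory_desc_t &dst_md) {
    return memory_desc_wrapper(&dst_md).size();
}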
struct cudnn_convolution_bwd_data_t : public primitive_t {
struct pd_t : public cudnn_convolution_bwd_data_pd_t {
using cudnn_convolution_bwd_data_pd_t::cudnn_convolution_bwd_data_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_convolution_bwd_data_t);
status_t init(engine_t *engine) {
using namespace data_type;
bool ok = desc()->prop_kind == prop_kind::backward_data;
ok = ok && this->set_default_formats();
ok = ok
&& (utils::everyone_is(f32, diff_src_md_.data_type,
weights_md_.data_type, diff_dst_md_.data_type)
|| utils::everyone_is(f16, diff_src_md_.data_type,
weights_md_.data_type,
diff_dst_md_.data_type));
ok = ok
&& IMPLICATION(
desc()->alg_kind == dnnl_convolution_winograd,
ndims() < 5);
ok = ok && memory_format_ok(&diff_src_md_);
ok = ok && memory_format_ok(&weights_md_);
ok = ok && memory_format_ok(&diff_dst_md_);
if (with_bias()) {
ok = ok && memory_format_ok(&bias_md_);
ok = ok && bias_md_.data_type == diff_dst_md_.data_type;
}
if (!ok) return status::unimplemented;
if (check_for_zero_dims()) return status::success;
impl_.reset(new cudnn_convolution_impl_bwd_data_t());
return impl_->init(engine, this);
}
std::shared_ptr<cudnn_convolution_impl_base_t> impl_;
bool set_default_formats() {
using namespace format_tag;
auto dat_tag = utils::pick(ndims() - 3, ncw, nchw, ncdhw);
auto wei_tag = with_groups()
? utils::pick(ndims() - 3, goiw, goihw, goidhw)
: utils::pick(ndims() - 3, oiw, oihw, oidhw);
return set_default_formats_common(dat_tag, wei_tag, dat_tag);
}
bool with_scratchpad() const { return impl_->with_scratchpad(); }
bool support_bias() const override { return true; }
};
cudnn_convolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) {}
~cudnn_convolution_bwd_data_t() {}
status_t execute(const exec_ctx_t &ctx) const override {
if (pd()->check_for_zero_dims()) { return status::success; }
return execute_convolution(
ctx, pd()->with_bias(), pd()->with_scratchpad());
}
status_t execute_convolution(
const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
struct cudnn_convolution_bwd_weights_t : public primitive_t {
struct pd_t : public cudnn_convolution_bwd_weights_pd_t {
using cudnn_convolution_bwd_weights_pd_t::
cudnn_convolution_bwd_weights_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_convolution_bwd_weights_t);
status_t init(engine_t *engine) {
using namespace data_type;
bool ok = desc()->prop_kind == prop_kind::backward_weights;
ok = ok && this->set_default_formats();
ok = ok
&& (utils::everyone_is(f32, src_md_.data_type,
diff_weights_md_.data_type,
diff_dst_md_.data_type)
|| utils::everyone_is(f16, src_md_.data_type,
diff_weights_md_.data_type,
diff_dst_md_.data_type));
ok = ok
&& IMPLICATION(
desc()->alg_kind == dnnl_convolution_winograd,
ndims() < 5);
ok = ok && memory_format_ok(&src_md_);
ok = ok && memory_format_ok(&diff_weights_md_);
ok = ok && memory_format_ok(&diff_dst_md_);
if (with_bias()) {
ok = ok && memory_format_ok(&diff_bias_md_);
ok = ok && diff_bias_md_.data_type == diff_dst_md_.data_type;
}
if (!ok) return status::unimplemented;
impl_.reset(new cudnn_convolution_impl_bwd_weights_t());
if (check_for_zero_dims()) { return impl_->init_zero_dims(this); };
return impl_->init(engine, this);
}
std::shared_ptr<cudnn_convolution_impl_base_t> impl_;
bool set_default_formats() {
using namespace format_tag;
auto dat_tag = utils::pick(ndims() - 3, ncw, nchw, ncdhw);
auto wei_tag = with_groups()
? utils::pick(ndims() - 3, goiw, goihw, goidhw)
: utils::pick(ndims() - 3, oiw, oihw, oidhw);
return set_default_formats_common(dat_tag, wei_tag, dat_tag);
}
bool with_scratchpad() const { return impl_->with_scratchpad(); }
};
cudnn_convolution_bwd_weights_t(const pd_t *apd) : primitive_t(apd) {}
~cudnn_convolution_bwd_weights_t() {}
status_t execute(const exec_ctx_t &ctx) const override {
if (pd()->check_for_zero_dims()) { return execute_zero_dims(ctx); }
return execute_convolution(
ctx, pd()->with_bias(), pd()->with_scratchpad());
}
status_t execute_convolution(
const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const;
status_t execute_zero_dims(const exec_ctx_t &ctx) const;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,900 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_CONVOLUTION_IMPL_HPP
#define GPU_NVIDIA_CUDNN_CONVOLUTION_IMPL_HPP
#include "cudnn.h"
#include "common/c_types_map.hpp"
#include "common/convolution_pd.hpp"
#include "gpu/nvidia/cudnn_conv_filter_adjustment_base.hpp"
#include "gpu/nvidia/cudnn_convolution_pd.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_convolution_impl_base_t
: public cudnn_conv_filter_adjustment_base_t {
protected:
enum io { x = 0, bias, weights, y, NUM_IO };
memory_desc_t dnnl_descs[NUM_IO];
cudnnConvolutionDescriptor_t conv_desc;
int padding[CUDNN_DIM_MAX];
int dilation[CUDNN_DIM_MAX];
cudnnTensorDescriptor_t descs[NUM_IO];
cudnnDataType_t data_types[NUM_IO];
int ndims[NUM_IO];
int dims[NUM_IO][DNNL_MAX_NDIMS];
int strides[NUM_IO + 1][DNNL_MAX_NDIMS];
int filter_strides[DNNL_MAX_NDIMS];
cudnnTensorFormat_t formats[NUM_IO];
bool filter_needs_transform = false;
cudnnFilterDescriptor_t weights_desc;
float alpha = 0.f;
float beta = 0.f;
int group_count = 1;
bool with_groups = false;
size_t scratchpad_size = 0;
bool with_bias = false;
bool do_scaling = false;
float output_scaling = 1.0f;
cudnnDataType_t computation_data_type = CUDNN_DATA_FLOAT;
cudnnDataType_t reorder_type = CUDNN_DATA_INT8;
public:
virtual ~cudnn_convolution_impl_base_t() {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyFilterDescriptor, weights_desc);
CUDNN_EXECUTE_FUNC_V(cudnnDestroyConvolutionDescriptor, conv_desc);
for (size_t i = 0; i < io::NUM_IO; i++) {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, descs[i]);
}
}
virtual status_t configure_alg_kind(engine_t *, convolution_pd_t *pd) = 0;
virtual bool supported_filter_format(const memory_desc_t *md) const {
const memory_desc_wrapper mem_wrapper(md);
return (mem_wrapper.matches_one_of_tag(format_tag::ab, format_tag::abc,
format_tag::abcd, format_tag::abcde, format_tag::abcdef)
|| (with_groups ? mem_wrapper.matches_one_of_tag(
format_tag::gowi, format_tag::gohwi,
format_tag::godhwi)
: mem_wrapper.matches_one_of_tag(
format_tag::owi, format_tag::ohwi,
format_tag::odhwi)));
}
bool using_transformed_filter() const { return filter_needs_transform; }
bool with_scratchpad() const { return scratchpad_size > 0; }
virtual status_t init(engine_t *engine, convolution_pd_t *pd,
bool use_scratch_dst = false) {
CHECK(configure_parameters(pd, use_scratch_dst));
CHECK(create_cudnn_descs(pd));
CHECK(check_output_dims());
CHECK(configure_alg_kind(engine, pd));
CHECK(init_scratchpad(engine, pd));
return status::success;
}
virtual status_t init_zero_dims(convolution_pd_t *pd) {
return status::success;
}
void get_dims_and_strides(int io) {
convert_dims(
dnnl_descs[io].dims, dims[io], dnnl_descs[io].ndims, ndims[io]);
if (ndims[io] > dnnl_descs[io].ndims) {
std::swap(dims[io][ndims[io] - 1], dims[io][ndims[io] - 2]);
if (ndims[io] == 4) {
if (formats[io] == CUDNN_TENSOR_NHWC) {
propagate_strides(strides[io], dims[io], {1, 3, 2, 0});
} else {
propagate_strides(strides[io], dims[io], {3, 2, 1, 0});
}
}
} else {
convert_dims(dnnl_descs[io].format_desc.blocking.strides,
strides[io], dnnl_descs[io].ndims, ndims[io]);
}
}
status_t configure_parameters(
const convolution_pd_t *pd, bool use_scratch_dst) {
if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
CHECK(set_padding_and_dilation(pd));
with_groups = pd->with_groups();
with_bias = pd->with_bias();
alpha = 1.0f;
beta = 0.0f;
output_scaling = pd->attr()->output_scales_.scales_[0];
do_scaling = output_scaling != 1.f;
dnnl_descs[x] = *pd->invariant_src_md();
dnnl_descs[weights] = *pd->invariant_wei_md();
dnnl_descs[y] = *pd->invariant_dst_md();
if (with_bias) dnnl_descs[bias] = *pd->invariant_bia_md();
ndims[x] = std::max(dnnl_descs[x].ndims, 4);
ndims[weights] = std::max(dnnl_descs[weights].ndims, 4 + with_groups);
ndims[y] = std::max(dnnl_descs[y].ndims, 4);
CHECK(convert_data_type(&dnnl_descs[x], &data_types[x]));
CHECK(convert_data_type(&dnnl_descs[weights], &data_types[weights]));
CHECK(convert_data_type(&dnnl_descs[y], &data_types[y]));
CHECK(get_formats());
set_compute_format();
get_dims_and_strides(x);
get_dims_and_strides(weights);
get_dims_and_strides(y);
if (!supported_filter_format(&dnnl_descs[weights])) {
set_filter_format(
ndims[weights], dims[weights], strides[NUM_IO], formats[x]);
CHECK(init_filter_transformation(data_types[weights],
ndims[weights], dims[weights], strides[weights],
strides[NUM_IO]));
filter_needs_transform = true;
// The filter is transformed to match the src format.
formats[weights] = formats[x];
} else {
CHECK(get_filter_format());
get_dims_and_strides(weights);
}
if (with_groups) {
dims[weights][1] *= pd->G();
ndims[weights] = std::max(4, ndims[weights] - with_groups);
}
if (with_bias) {
ndims[bias] = dnnl_descs[bias].ndims;
CHECK(convert_data_type(&dnnl_descs[bias], &data_types[bias]));
convert_dims(
dnnl_descs[bias].dims, dims[bias], ndims[bias], ndims[y]);
std::swap(dims[bias][0], dims[bias][1]);
convert_dims(dnnl_descs[bias].format_desc.blocking.strides,
strides[bias], ndims[bias], ndims[y]);
ndims[bias] = ndims[y];
}
return status::success;
}
status_t create_cudnn_descs(const convolution_pd_t *pd) {
CHECK(create_and_set_convolution_desc(pd));
CHECK(create_and_set_tensor_descriptor(
&descs[x], data_types[x], ndims[x], dims[x], strides[x]));
CHECK(create_and_set_filter_descriptor(&weights_desc, formats[weights],
data_types[weights], ndims[weights],
dims[weights] + with_groups, strides[weights]));
CHECK(create_and_set_tensor_descriptor(
&descs[y], data_types[y], ndims[y], dims[y], strides[y]));
if (with_bias) {
CHECK(create_and_set_tensor_descriptor(&descs[bias],
data_types[bias], ndims[bias], dims[bias], strides[bias]));
}
return status::success;
}
virtual status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) {
if (filter_needs_transform) {
auto sz = memory_desc_wrapper(&dnnl_descs[weights]).size();
auto data_size
= types::data_type_size(pd->invariant_wei_md(0)->data_type);
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_conv_cudnn_filter, sz,
data_size);
}
return status::success;
};
status_t create_and_set_convolution_desc(const convolution_pd_t *pd) {
CUDNN_EXECUTE_FUNC_V(cudnnCreateConvolutionDescriptor, &conv_desc);
CUDNN_EXECUTE_FUNC_V(cudnnSetConvolutionNdDescriptor, conv_desc,
ndims[x] - 2, padding, filter_strides, dilation,
cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
computation_data_type);
// Check for groups and set group count if necessary
if (with_groups) {
group_count = pd->G();
if (group_count > 1)
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnSetConvolutionGroupCount, conv_desc, group_count));
}
return status::success;
}
status_t set_padding_and_dilation(const convolution_pd_t *pd) {
int actual_ndims = pd->ndims();
if (actual_ndims == 3) {
padding[0] = 0;
padding[1] = static_cast<int>(pd->padL());
dilation[0] = 1;
dilation[1] = static_cast<int>(pd->KDW() + 1);
filter_strides[0] = 1;
filter_strides[1] = static_cast<int>(pd->KSW());
} else if (actual_ndims == 4) {
padding[0] = static_cast<int>(pd->padT());
padding[1] = static_cast<int>(pd->padL());
dilation[0] = static_cast<int>(pd->KDH() + 1);
dilation[1] = static_cast<int>(pd->KDW() + 1);
filter_strides[0] = static_cast<int>(pd->KSH());
filter_strides[1] = static_cast<int>(pd->KSW());
} else {
padding[0] = static_cast<int>(pd->padFront());
padding[1] = static_cast<int>(pd->padT());
padding[2] = static_cast<int>(pd->padL());
dilation[0] = static_cast<int>(pd->KDD() + 1);
dilation[1] = static_cast<int>(pd->KDH() + 1);
dilation[2] = static_cast<int>(pd->KDW() + 1);
filter_strides[0] = static_cast<int>(pd->KSD());
filter_strides[1] = static_cast<int>(pd->KSH());
filter_strides[2] = static_cast<int>(pd->KSW());
}
return status::success;
}
virtual void execute(
cudnnHandle_t handle, const std::vector<void *> &args) const = 0;
void execute_sum(cudnnHandle_t handle, void *x, void *y, float alpha_,
float beta_) const {
float alpha = alpha_;
float beta = beta_;
CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &alpha, descs[io::y], x,
&beta, descs[io::y], y);
}
void execute_scale(cudnnHandle_t handle, void *y) const {
if (do_scaling) {
CUDNN_EXECUTE_FUNC_V(
cudnnScaleTensor, handle, descs[io::y], y, &output_scaling);
}
}
void execute_set_weights_bias(
cudnnHandle_t handle, void *weights, void *bias, float value) {
CUDNN_EXECUTE_FUNC_V(
cudnnSetTensor, handle, descs[io::weights], weights, &value);
if (bias) {
CUDNN_EXECUTE_FUNC_V(
cudnnSetTensor, handle, descs[io::bias], bias, &value);
}
}
bool with_eltwise(const convolution_pd_t *pd, int position) const {
return pd->attr()->post_ops_.contain(primitive_kind::eltwise, position);
}
status_t check_output_dims() const {
int expected_dims[CUDNN_DIM_MAX] = {};
CUDNN_EXECUTE_FUNC_V(cudnnGetConvolutionNdForwardOutputDim, conv_desc,
descs[x], weights_desc, ndims[y], &expected_dims[0]);
for (size_t i = 0; i < ndims[y]; i++) {
if (dims[y][i] != expected_dims[i]) return status::unimplemented;
}
return status::success;
}
void set_compute_format() {
if (data_types[x] == CUDNN_DATA_INT8) {
computation_data_type = CUDNN_DATA_INT32;
} else {
computation_data_type = data_types[y];
}
}
status_t get_filter_format() {
memory_desc_wrapper wrapper(&dnnl_descs[weights]);
if (wrapper.matches_one_of_tag(format_tag::ab, format_tag::abc,
format_tag::abcd, format_tag::abcde, format_tag::abcdef)) {
formats[weights] = cudnnTensorFormat_t::CUDNN_TENSOR_NCHW;
} else if ((!with_groups
&& wrapper.matches_one_of_tag(format_tag::owi,
format_tag::ohwi, format_tag::odhwi))
|| (with_groups
&& wrapper.matches_one_of_tag(format_tag::gowi,
format_tag::gohwi, format_tag::godhwi))) {
formats[weights] = cudnnTensorFormat_t::CUDNN_TENSOR_NHWC;
} else {
return status::unimplemented;
}
return status::success;
}
status_t get_formats() {
CHECK(get_format(&dnnl_descs[x], formats[x]));
CHECK(get_format(&dnnl_descs[y], formats[y]));
return status::success;
}
void set_filter_nhwc(int filter_ndims, int *transform_filter_strides,
int *filter_dims) override {
if (with_groups) {
switch (filter_ndims) {
case 4: // Convert to krsc
return propagate_strides(transform_filter_strides,
filter_dims, {2, 3, 1, 0});
case 5:
return propagate_strides(transform_filter_strides,
filter_dims, {2, 4, 3, 1, 0});
case 6:
return propagate_strides(transform_filter_strides,
filter_dims, {2, 5, 4, 3, 1, 0});
}
} else {
cudnn_conv_filter_adjustment_base_t::set_filter_nhwc(
filter_ndims, transform_filter_strides, filter_dims);
}
}
};
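// [Illustrative sketch] oneDNN encodes dilation as the number of extra gaps
// between filter taps (0 means a dense filter), while cuDNN expects the
// multiplicative dilation factor (1 means dense), which is why
// set_padding_and_dilation() above adds 1 to every KD*() value. A
// hypothetical one-liner making that conversion explicit:
static inline int to_cudnn_dilation_example(int dnnl_dilation) {
    return dnnl_dilation + 1;
}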
struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
protected:
cudnnActivationDescriptor_t activation_desc = nullptr;
cudnnActivationDescriptor_t eltwise_desc = nullptr;
cudnnTensorDescriptor_t reorder_dst_desc = nullptr;
cudnnConvolutionFwdAlgo_t fwd_alg_kind;
std::vector<cudnnConvolutionFwdAlgoPerf_t> perf;
int requested_algo_count = 0;
int returned_algo_count = 0;
int num_post_ops = 0;
primitive_kind_t post_ops[2];
bool need_reorder = false;
bool use_temp_dst = false;
float sum_scale = 1.0f;
public:
virtual ~cudnn_convolution_impl_fwd_t() {
if (activation_desc)
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyActivationDescriptor, activation_desc);
if (eltwise_desc)
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyActivationDescriptor, eltwise_desc);
if (reorder_dst_desc)
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyTensorDescriptor, reorder_dst_desc);
}
status_t configure_post_ops(convolution_pd_t *pd) {
auto &p = pd->attr()->post_ops_;
num_post_ops = p.len();
if (data_types[y] == CUDNN_DATA_INT8 && p.len() > 0) {
data_types[y] = CUDNN_DATA_FLOAT;
need_reorder = true;
}
for (size_t i = 0; i < p.len(); i++) {
post_ops[i] = p.entry_[i].kind;
if (post_ops[i] == dnnl_eltwise) {
create_and_set_eltwise_descriptor(pd);
}
if (post_ops[i] == dnnl_sum) { sum_scale = p.entry_[i].sum.scale; }
}
if (need_reorder)
CHECK(create_and_set_tensor_descriptor_ex(&reorder_dst_desc,
formats[y], reorder_type, ndims[y], dims[y]));
return status::success;
}
status_t init(engine_t *engine, convolution_pd_t *pd,
bool use_scratch_dst) override {
use_temp_dst = use_scratch_dst;
CHECK(configure_parameters(pd, use_temp_dst));
CHECK(configure_post_ops(pd));
CHECK(create_cudnn_descs(pd));
CHECK(configure_alg_kind(engine, pd));
CHECK(init_scratchpad(engine, pd));
return status::success;
}
void execute_reorder(cudnnHandle_t handle, void *src, void *dst,
bool flip_formats) const {
const float alpha = 1.0f;
const float beta = 0.0f;
if (flip_formats) {
CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha,
reorder_dst_desc, src, &beta, descs[y], dst);
} else {
CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha, descs[y],
src, &beta, reorder_dst_desc, dst);
}
}
void execute_eltwise(cudnnHandle_t handle, void *src, void *dst) const {
float alpha = 1.0f;
float beta = 0.0f;
CUDNN_EXECUTE_FUNC_V(cudnnActivationForward, handle, eltwise_desc,
&alpha, descs[io::y], src, &beta, descs[io::y], dst);
}
void execute(cudnnHandle_t handle,
const std::vector<void *> &args) const override {
auto x = args[0], weights = args[1], y = args[2], bias = args[3],
scratchpad = args[4], post_op_scratch = args[6],
post_op_reorder = args[7];
void *output = use_temp_dst ? post_op_scratch : y;
if (using_transformed_filter()) {
auto w_scratch = args[5];
transform_filter(handle, weights, w_scratch);
weights = w_scratch;
}
if (computation_data_type == CUDNN_DATA_INT32 && bias) {
CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBiasActivationForward, handle,
&alpha, descs[io::x], x, weights_desc, weights, conv_desc,
fwd_alg_kind, scratchpad, scratchpad_size, &beta,
descs[io::y], output, descs[io::bias], bias,
activation_desc, descs[io::y], output);
} else {
const float bias_alpha = 1.0f;
const float bias_beta = 1.0f;
CUDNN_EXECUTE_FUNC_V(cudnnConvolutionForward, handle, &alpha,
descs[io::x], x, weights_desc, weights, conv_desc,
fwd_alg_kind, scratchpad, scratchpad_size, &beta,
descs[io::y], output);
if (with_bias) {
CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &bias_alpha,
descs[io::bias], bias, &bias_beta, descs[io::y],
output);
}
}
execute_scale(handle, output);
for (int i = 0; i < num_post_ops; i++) {
bool last_op = i == num_post_ops - 1 && !need_reorder;
if (last_op) output = y;
switch (post_ops[i]) {
case dnnl_sum:
if (need_reorder) {
execute_reorder(handle, y, post_op_reorder, true);
execute_sum(handle, post_op_reorder, post_op_scratch,
sum_scale, 1.0f);
} else if (last_op) {
execute_sum(
handle, post_op_scratch, y, 1.0f, sum_scale);
} else {
execute_sum(
handle, y, post_op_scratch, sum_scale, 1.0f);
}
break;
case dnnl_eltwise:
execute_eltwise(handle, post_op_scratch, output);
break;
}
}
if (need_reorder) {
execute_reorder(handle, post_op_scratch, y, false);
}
}
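    // [Illustrative sketch] For the common {sum, eltwise} chain the loop in
    // execute() above computes, element-wise,
    // dst = eltwise(conv_out + sum_scale * dst_old). A scalar model of that
    // pipeline (hypothetical, f32, with ReLU standing in for the eltwise op):
    static float post_op_chain_model_example(
            float conv_out, float dst_old, float sum_scale) {
        float acc = conv_out + sum_scale * dst_old;
        return acc > 0.f ? acc : 0.f;
    }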
status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
stream_t *service_stream;
CHECK(sycl_engine.get_service_stream(service_stream));
auto cuda_stream
= utils::downcast<sycl_cuda_stream_t *>(service_stream);
auto handle = cuda_stream->get_cudnn_handle();
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardWorkspaceSize,
handle, descs[x], weights_desc, conv_desc, descs[y],
fwd_alg_kind, &scratchpad_size));
if (scratchpad_size > 0)
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_conv_cudnn_algo,
scratchpad_size, size_t(1));
return cudnn_convolution_impl_base_t::init_scratchpad(engine, pd);
}
status_t configure_alg_kind(
engine_t *engine, convolution_pd_t *pd) override {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
stream_t *service_stream;
CHECK(sycl_engine.get_service_stream(service_stream));
auto cuda_stream
= utils::downcast<sycl_cuda_stream_t *>(service_stream);
auto handle = cuda_stream->get_cudnn_handle();
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardAlgorithmMaxCount,
handle, &requested_algo_count));
perf.resize(requested_algo_count);
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnFindConvolutionForwardAlgorithm, handle,
descs[x], weights_desc, conv_desc, descs[y],
requested_algo_count, &returned_algo_count, perf.data()));
for (size_t i = 0; i < returned_algo_count; i++) {
if (perf[i].status == CUDNN_STATUS_SUCCESS) {
// cudnnFindConvolutionForwardAlgorithm can erroneously report
// algorithms for int8 that do not work, so ensure that only
// CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is allowed
// in this case.
if (computation_data_type == CUDNN_DATA_INT32
&& perf[i].algo
!= CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
continue;
}
switch (pd->desc()->alg_kind) {
case dnnl_convolution_auto:
if (utils::one_of(perf[i].algo,
CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM)) {
utils::downcast<cudnn_convolution_fwd_pd_t *>(pd)
->set_alg_kind(dnnl_convolution_direct);
} else {
utils::downcast<cudnn_convolution_fwd_pd_t *>(pd)
->set_alg_kind(dnnl_convolution_winograd);
}
break;
case dnnl_convolution_direct:
if (!utils::one_of(perf[i].algo,
CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM))
continue;
break;
case dnnl_convolution_winograd:
if (!utils::one_of(perf[i].algo,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED))
continue;
break;
default: return status::unimplemented;
}
fwd_alg_kind = perf[i].algo;
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionMathType,
conv_desc, perf[i].mathType));
break;
} else {
return status::unimplemented;
}
}
if (fwd_alg_kind == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnCreateActivationDescriptor, &activation_desc));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor,
activation_desc,
cudnnActivationMode_t::CUDNN_ACTIVATION_IDENTITY,
CUDNN_NOT_PROPAGATE_NAN, 1.0));
}
return status::success;
}
status_t create_and_set_eltwise_descriptor(const convolution_pd_t *pd) {
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnCreateActivationDescriptor, &eltwise_desc));
cudnnActivationMode_t act_mode;
switch (eltwise_algorithm_kind(pd)) {
case alg_kind::eltwise_tanh:
act_mode = CUDNN_ACTIVATION_TANH;
break;
case alg_kind::eltwise_elu: act_mode = CUDNN_ACTIVATION_ELU; break;
case alg_kind::eltwise_relu:
act_mode = CUDNN_ACTIVATION_RELU;
break;
case alg_kind::eltwise_logistic:
act_mode = CUDNN_ACTIVATION_SIGMOID;
break;
case alg_kind::eltwise_bounded_relu:
act_mode = CUDNN_ACTIVATION_CLIPPED_RELU;
break;
default: return status::unimplemented;
}
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, eltwise_desc,
act_mode, cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN,
eltwise_alpha(pd)));
return status::success;
}
dnnl::impl::alg_kind_t eltwise_algorithm_kind(
const convolution_pd_t *pd) const {
const int eltwise_idx
= pd->attr()->post_ops_.find(primitive_kind::eltwise);
return pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alg;
}
float eltwise_alpha(const convolution_pd_t *pd) const {
const int eltwise_idx
= pd->attr()->post_ops_.find(primitive_kind::eltwise);
return pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alpha;
}
};
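// [Illustrative sketch] configure_alg_kind() above follows the standard cuDNN
// "find" pattern: query the maximum algorithm count, run the search, then
// walk the results in preference order and keep the first usable one. A
// minimal standalone version without the oneDNN alg_kind filtering:
static inline cudnnStatus_t pick_fwd_algo_example(cudnnHandle_t handle,
        cudnnTensorDescriptor_t x_desc, cudnnFilterDescriptor_t w_desc,
        cudnnConvolutionDescriptor_t conv_desc,
        cudnnTensorDescriptor_t y_desc, cudnnConvolutionFwdAlgo_t *algo) {
    int max_algos = 0, found = 0;
    cudnnStatus_t status = cudnnGetConvolutionForwardAlgorithmMaxCount(
            handle, &max_algos);
    if (status != CUDNN_STATUS_SUCCESS) return status;
    std::vector<cudnnConvolutionFwdAlgoPerf_t> perf(max_algos);
    status = cudnnFindConvolutionForwardAlgorithm(handle, x_desc, w_desc,
            conv_desc, y_desc, max_algos, &found, perf.data());
    if (status != CUDNN_STATUS_SUCCESS) return status;
    for (int i = 0; i < found; i++) {
        if (perf[i].status == CUDNN_STATUS_SUCCESS) {
            *algo = perf[i].algo;
            return CUDNN_STATUS_SUCCESS;
        }
    }
    return CUDNN_STATUS_NOT_SUPPORTED;
}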
struct cudnn_convolution_impl_bwd_data_t
: public cudnn_convolution_impl_base_t {
protected:
cudnnConvolutionBwdDataAlgo_t bwd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
std::vector<cudnnConvolutionBwdDataAlgoPerf_t> perf;
int requested_algo_count = 0;
int returned_algo_count = 0;
status_t configure_alg_kind(
engine_t *engine, convolution_pd_t *pd) override {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
stream_t *service_stream;
CHECK(sycl_engine.get_service_stream(service_stream));
auto cuda_stream
= utils::downcast<sycl_cuda_stream_t *>(service_stream);
auto handle = cuda_stream->get_cudnn_handle();
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnGetConvolutionBackwardDataAlgorithmMaxCount, handle,
&requested_algo_count));
perf.resize(requested_algo_count);
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnFindConvolutionBackwardDataAlgorithm,
handle, weights_desc, descs[y], conv_desc, descs[x],
requested_algo_count, &returned_algo_count, perf.data()));
for (size_t i = 0; i < returned_algo_count; i++) {
if (perf[i].status == CUDNN_STATUS_SUCCESS) {
switch (pd->desc()->alg_kind) {
case dnnl_convolution_auto:
if (utils::one_of(perf[i].algo,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1)) {
utils::downcast<cudnn_convolution_bwd_data_pd_t *>(
pd)
->set_alg_kind(dnnl_convolution_direct);
} else {
utils::downcast<cudnn_convolution_bwd_data_pd_t *>(
pd)
->set_alg_kind(dnnl_convolution_winograd);
}
break;
case dnnl_convolution_direct:
if (!utils::one_of(perf[i].algo,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1))
continue;
break;
case dnnl_convolution_winograd:
if (!utils::one_of(perf[i].algo,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED))
continue;
break;
default: return status::unimplemented;
}
bwd_algo = perf[i].algo;
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionMathType,
conv_desc, perf[i].mathType));
break;
} else {
return status::unimplemented;
}
}
return status::success;
}
status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) override {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
stream_t *service_stream;
CHECK(sycl_engine.get_service_stream(service_stream));
auto cuda_stream
= utils::downcast<sycl_cuda_stream_t *>(service_stream);
auto handle = cuda_stream->get_cudnn_handle();
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionBackwardDataWorkspaceSize,
handle, weights_desc, descs[io::y], conv_desc, descs[io::x],
bwd_algo, &scratchpad_size));
if (scratchpad_size > 0)
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_conv_cudnn_algo,
scratchpad_size, size_t(1));
return cudnn_convolution_impl_base_t::init_scratchpad(engine, pd);
}
void execute(cudnnHandle_t handle,
const std::vector<void *> &args) const override {
auto x = args[0], weights = args[1], y = args[2], bias = args[3],
scratchpad = args[4];
if (using_transformed_filter()) {
auto w_scratch = args[5];
transform_filter(handle, weights, w_scratch);
weights = w_scratch;
}
const float bias_alpha = 1.0f;
const float bias_beta = 1.0f;
CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardData, handle, &alpha,
weights_desc, weights, descs[io::y], y, conv_desc, bwd_algo,
scratchpad, scratchpad_size, &beta, descs[io::x], x);
if (with_bias) {
CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &bias_alpha,
descs[io::bias], bias, &bias_beta, descs[io::x], x);
}
}
};
struct cudnn_convolution_impl_bwd_weights_t
: public cudnn_convolution_impl_base_t {
protected:
cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo
= CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> perf;
int requested_algo_count = 0;
int returned_algo_count = 0;
public:
status_t init_zero_dims(convolution_pd_t *pd) override {
if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
dnnl_descs[weights] = *pd->invariant_wei_md();
CHECK(get_format(&dnnl_descs[weights], formats[weights], true));
ndims[y] = pd->invariant_dst_md()->ndims;
ndims[weights] = dnnl_descs[weights].ndims - pd->with_groups();
CHECK(convert_data_type(&dnnl_descs[weights], &data_types[weights]));
convert_dims(dnnl_descs[weights].dims + pd->with_groups(),
dims[weights], ndims[weights]);
ndims[weights] = std::max(4, ndims[weights]);
convert_dims(dnnl_descs[weights].format_desc.blocking.strides,
strides[weights], ndims[weights]);
CHECK(create_and_set_tensor_descriptor(&descs[weights],
data_types[weights], ndims[weights], dims[weights],
strides[weights]));
if (pd->with_bias()) {
dnnl_descs[bias] = *pd->invariant_bia_md();
ndims[bias] = dnnl_descs[bias].ndims;
CHECK(convert_data_type(&dnnl_descs[bias], &data_types[bias]));
convert_dims(dnnl_descs[bias].padded_dims, dims[bias], ndims[bias],
ndims[y]);
std::swap(dims[bias][0], dims[bias][1]);
convert_dims(dnnl_descs[bias].format_desc.blocking.strides,
strides[bias], ndims[bias], ndims[weights]);
ndims[bias] = ndims[y];
CHECK(create_and_set_tensor_descriptor(&descs[bias],
data_types[bias], ndims[bias], dims[bias], strides[bias]));
}
return status::success;
}
virtual status_t configure_alg_kind(
engine_t *engine, convolution_pd_t *pd) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
stream_t *service_stream;
CHECK(sycl_engine.get_service_stream(service_stream));
auto cuda_stream
= utils::downcast<sycl_cuda_stream_t *>(service_stream);
auto handle = cuda_stream->get_cudnn_handle();
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnGetConvolutionBackwardFilterAlgorithmMaxCount, handle,
&requested_algo_count));
perf.resize(requested_algo_count);
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnFindConvolutionBackwardFilterAlgorithm,
handle, descs[x], descs[y], conv_desc, weights_desc,
requested_algo_count, &returned_algo_count, perf.data()));
for (size_t i = 0; i < returned_algo_count; i++) {
if (perf[i].status == CUDNN_STATUS_SUCCESS) {
switch (pd->desc()->alg_kind) {
case dnnl_convolution_auto:
if (utils::one_of(perf[i].algo,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3)) {
utils::downcast<
cudnn_convolution_bwd_weights_pd_t *>(pd)
->set_alg_kind(dnnl_convolution_direct);
} else {
utils::downcast<
cudnn_convolution_bwd_weights_pd_t *>(pd)
->set_alg_kind(dnnl_convolution_winograd);
}
break;
case dnnl_convolution_direct:
if (!utils::one_of(perf[i].algo,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3))
continue;
break;
case dnnl_convolution_winograd:
if (!utils::one_of(perf[i].algo,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED))
continue;
break;
default: return status::unimplemented;
}
bwd_filter_algo = perf[i].algo;
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionMathType,
conv_desc, perf[i].mathType));
break;
} else {
return status::unimplemented;
}
}
return status::success;
}
status_t init_scratchpad(engine_t *engine, convolution_pd_t *pd) override {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
stream_t *service_stream;
CHECK(sycl_engine.get_service_stream(service_stream));
auto cuda_stream
= utils::downcast<sycl_cuda_stream_t *>(service_stream);
auto handle = cuda_stream->get_cudnn_handle();
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnGetConvolutionBackwardFilterWorkspaceSize, handle,
descs[io::x], descs[io::y], conv_desc, weights_desc,
bwd_filter_algo, &scratchpad_size));
if (scratchpad_size > 0)
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_conv_cudnn_algo,
scratchpad_size, size_t(1));
return cudnn_convolution_impl_base_t::init_scratchpad(engine, pd);
}
void execute(cudnnHandle_t handle,
const std::vector<void *> &args) const override {
auto x = args[0], weights = args[1], y = args[2], bias = args[3],
scratchpad = args[4];
auto filter = weights;
if (using_transformed_filter()) {
auto w_scratch = args[5];
transform_filter(handle, weights, w_scratch);
filter = w_scratch;
}
const float bias_alpha = 1.0f;
const float bias_beta = 0.0f;
CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardFilter, handle, &alpha,
descs[io::x], x, descs[io::y], y, conv_desc, bwd_filter_algo,
scratchpad, scratchpad_size, &beta, weights_desc, filter);
if (with_bias) {
CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardBias, handle,
&bias_alpha, descs[io::y], y, &bias_beta, descs[io::bias],
bias);
}
if (using_transformed_filter()) {
undo_transform_filter(handle, filter, weights);
}
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,77 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_CONVOLUTION_PD_HPP
#define GPU_NVIDIA_CUDNN_CONVOLUTION_PD_HPP
#include "common/convolution_pd.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_convolution_fwd_pd_t : public convolution_fwd_pd_t {
using convolution_fwd_pd_t::convolution_fwd_pd_t;
bool set_alg_kind(alg_kind_t kind) { return set_default_alg_kind(kind); }
bool check_for_zero_dims() const {
return has_zero_dims(
invariant_src_md()->dims, invariant_src_md()->ndims)
|| has_zero_dims(
invariant_wei_md(0)->dims, invariant_wei_md(0)->ndims)
|| has_zero_dims(
invariant_dst_md()->dims, invariant_dst_md()->ndims);
}
};
struct cudnn_convolution_bwd_data_pd_t : public convolution_bwd_data_pd_t {
using convolution_bwd_data_pd_t::convolution_bwd_data_pd_t;
bool set_alg_kind(alg_kind_t kind) { return set_default_alg_kind(kind); }
bool check_for_zero_dims() const {
return has_zero_dims(
invariant_src_md()->dims, invariant_src_md()->ndims)
|| has_zero_dims(
invariant_wei_md(0)->dims, invariant_wei_md(0)->ndims)
|| has_zero_dims(
invariant_dst_md()->dims, invariant_dst_md()->ndims);
}
};
struct cudnn_convolution_bwd_weights_pd_t
: public convolution_bwd_weights_pd_t {
using convolution_bwd_weights_pd_t::convolution_bwd_weights_pd_t;
bool set_alg_kind(alg_kind_t kind) { return set_default_alg_kind(kind); }
bool check_for_zero_dims() const {
return has_zero_dims(
invariant_src_md()->dims, invariant_src_md()->ndims)
|| has_zero_dims(
invariant_wei_md(0)->dims, invariant_wei_md(0)->ndims)
|| has_zero_dims(
invariant_dst_md()->dims, invariant_dst_md()->ndims);
}
};
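// [Illustrative sketch] The has_zero_dims() helper used above simply reports
// whether any dimension is zero so the primitives can skip cuDNN entirely
// for empty tensors; a hypothetical equivalent:
static inline bool has_zero_dims_example(const dnnl_dim_t *dims, int ndims) {
    for (int i = 0; i < ndims; i++)
        if (dims[i] == 0) return true;
    return false;
}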
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,57 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/cudnn_deconvolution.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
status_t cudnn_deconvolution_bwd_weights_t::execute_bias(
const exec_ctx_t &ctx) const {
if (memory_desc_wrapper(pd()->diff_dst_md(0)).has_zero_dim())
return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto bias_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS);
auto y_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
auto bias = sc.memory<void *>(ih, bias_acc);
auto y = sc.memory<void *>(ih, y_acc);
impl_->execute_bias(handle, y, bias);
});
});
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl


@ -0,0 +1,476 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_DECONVOLUTION_HPP
#define GPU_NVIDIA_CUDNN_DECONVOLUTION_HPP
#include "cudnn.h"
#include "common/c_types_map.hpp"
#include "common/deconvolution_pd.hpp"
#include "common/primitive_iterator.hpp"
#include "gpu/nvidia/cudnn_convolution.hpp"
#include "gpu/nvidia/cudnn_deconvolution_impl.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
namespace {
static status_t compute_blocked_format(
bool with_groups, const memory_desc_t *oi_md, memory_desc_t *io_md) {
/* Computes blocking for *i*o* format from *o*i* format */
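/* Illustrative example: a plain "oihw" weights layout becomes "iohw", and
 * a blocked "OIhw4i4o" layout becomes "IOhw4o4i", since both the OC/IC
 * strides and any inner-block indices referring to OC/IC are swapped. */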
bool sanity_check_ok = true && oi_md->ndims == io_md->ndims
&& oi_md->format_kind == format_kind::blocked;
if (!sanity_check_ok) return status::invalid_arguments;
const blocking_desc_t &oi_blk = oi_md->format_desc.blocking;
blocking_desc_t io_blk = io_md->format_desc.blocking;
io_md->format_kind = format_kind::blocked;
io_blk = oi_blk;
const int ID_OC = 0 + with_groups;
const int ID_IC = 1 + with_groups;
nstl::swap(io_blk.strides[ID_OC], io_blk.strides[ID_IC]);
for (int i_blk = 0; i_blk < io_blk.inner_nblks; ++i_blk) {
if (utils::one_of(io_blk.inner_idxs[i_blk], ID_OC, ID_IC)) {
io_blk.inner_idxs[i_blk]
= (io_blk.inner_idxs[i_blk] == ID_OC ? ID_IC : ID_OC);
}
}
return memory_desc_init_by_blocking_desc(*io_md, io_blk);
}
static status_t conv_descr_create(
const deconvolution_desc_t *dd, convolution_desc_t *cd) {
using namespace prop_kind;
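// The deconvolution primitive is expressed in terms of a convolution:
//   - deconvolution forward maps to convolution backward-data,
//   - deconvolution backward-data maps to convolution forward,
//   - deconvolution backward-weights stays backward-weights,
// with the src/dst (or diff_src/diff_dst) roles swapped and, below, the
// OC/IC dimensions of the weights swapped as well.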
alg_kind_t alg_kind = dd->alg_kind == alg_kind::deconvolution_direct
? alg_kind::convolution_direct
: alg_kind::convolution_winograd;
const memory_desc_t *src_md, *dst_md, *d_weights_d;
prop_kind_t prop_kind;
memory_desc_t c_weights_d;
if (utils::one_of(dd->prop_kind, forward_training, forward_inference)) {
prop_kind = backward_data;
src_md = &dd->dst_desc;
dst_md = &dd->src_desc;
d_weights_d = &dd->weights_desc;
} else if (dd->prop_kind == backward_data) {
prop_kind = forward_training;
src_md = &dd->diff_dst_desc;
dst_md = &dd->diff_src_desc;
d_weights_d = &dd->weights_desc;
} else {
prop_kind = dd->prop_kind;
src_md = &dd->diff_dst_desc;
dst_md = &dd->src_desc;
d_weights_d = &dd->diff_weights_desc;
}
const bool with_groups = d_weights_d->ndims == src_md->ndims + 1;
/* create weights desc for convolution */
c_weights_d = *d_weights_d;
const int ID_OC = 0 + with_groups;
const int ID_IC = 1 + with_groups;
nstl::swap(c_weights_d.dims[ID_OC], c_weights_d.dims[ID_IC]);
nstl::swap(c_weights_d.padded_dims[ID_OC], c_weights_d.padded_dims[ID_IC]);
nstl::swap(c_weights_d.padded_offsets[ID_OC],
c_weights_d.padded_offsets[ID_IC]);
if (c_weights_d.format_kind != format_kind::any)
CHECK(compute_blocked_format(with_groups, d_weights_d, &c_weights_d));
return conv_desc_init(cd, prop_kind, alg_kind, src_md, &c_weights_d,
prop_kind != backward_weights ? &dd->bias_desc : nullptr, dst_md,
dd->strides, dd->dilates, dd->padding[0], dd->padding[1]);
}
} // namespace
struct cudnn_deconvolution_fwd_t : public primitive_t {
struct pd_t : public deconvolution_fwd_pd_t {
pd_t(const deconvolution_desc_t *adesc, const primitive_attr_t *attr,
const deconvolution_fwd_pd_t *hint_fwd_pd)
: deconvolution_fwd_pd_t(adesc, attr, hint_fwd_pd)
, conv_pd_(nullptr) {}
pd_t(const pd_t &other)
: deconvolution_fwd_pd_t(other)
, conv_pd_(other.conv_pd_->clone())
, conv_supports_bias_(other.conv_supports_bias_)
, dst_tag_(other.dst_tag_) {}
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_deconvolution_fwd_t);
status_t init_convolution(engine_t *engine) {
using namespace format_tag;
using namespace data_type;
convolution_desc_t cd;
CHECK(conv_descr_create(desc(), &cd));
primitive_attr_t conv_attr = *attr();
conv_attr.set_scratchpad_mode(scratchpad_mode::user);
dnnl_primitive_desc_iterator it(
engine, (op_desc_t *)&cd, &conv_attr, nullptr);
while (++it != it.end()) {
primitive_desc_t *conv_pd = it.fetch_once();
conv_supports_bias_
= static_cast<convolution_bwd_data_pd_t *>(conv_pd)
->support_bias();
bool ref_deconv_supports_bias = true
&& desc()->accum_data_type == data_type::f32
&& utils::one_of(desc()->dst_desc.data_type, f32, f16)
&& IMPLICATION(desc()->src_desc.data_type == f16,
memory_desc_matches_one_of_tag(
*conv_pd->diff_src_md(),
utils::pick(ndims() - 3, ncw, nchw,
ncdhw)));
bool ok = true
&& conv_pd->weights_md()->extra.flags == 0
/* deconv reference code can process only f32 bias */
&& IMPLICATION(with_bias(),
conv_supports_bias_
|| ref_deconv_supports_bias);
if (ok) {
conv_pd_.reset(conv_pd);
return status::success;
}
}
conv_pd_.reset();
return status::unimplemented;
}
status_t init(engine_t *engine) {
using namespace format_tag;
bool ok = true && is_fwd();
ok = ok
&& utils::one_of(desc()->alg_kind,
alg_kind::deconvolution_direct,
alg_kind::deconvolution_winograd);
ok = ok && attr_.has_default_values();
ok = ok
&& (utils::everyone_is(data_type::f32,
desc()->src_desc.data_type,
desc()->weights_desc.data_type,
desc()->dst_desc.data_type)
|| utils::everyone_is(data_type::f16,
desc()->src_desc.data_type,
desc()->weights_desc.data_type,
desc()->dst_desc.data_type));
if (ok) {
CHECK(init_convolution(engine));
if (weights_md_.format_kind == format_kind::any) {
CHECK(compute_blocked_format(with_groups(),
conv_pd_->weights_md(), &desc_.weights_desc));
weights_md_ = desc_.weights_desc;
}
if (src_md_.format_kind == format_kind::any)
src_md_ = *conv_pd_->diff_dst_md();
if (dst_md_.format_kind == format_kind::any)
dst_md_ = *conv_pd_->diff_src_md();
if (bias_md_.format_kind == format_kind::any)
CHECK(memory_desc_init_by_tag(bias_md_, x));
dst_tag_ = memory_desc_matches_one_of_tag(dst_md_,
utils::pick(ndims() - 3, ncw, nchw, ncdhw),
utils::pick(ndims() - 3, nCw4c, nChw4c, nCdhw4c));
init_scratchpad();
return status::success;
}
return status::unimplemented;
}
void init_scratchpad() {
auto scratchpad = scratchpad_registry().registrar();
scratchpad.book(memory_tracking::names::key_nested,
conv_pd_->scratchpad_registry());
}
std::unique_ptr<primitive_desc_t> conv_pd_;
bool conv_supports_bias_;
format_tag_t dst_tag_;
};
cudnn_deconvolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
~cudnn_deconvolution_fwd_t() {}
virtual status_t init(engine_t *engine) {
return pd()->conv_pd_->create_primitive(conv_p_, engine);
}
status_t execute(const exec_ctx_t &ctx) const {
using namespace memory_tracking::names;
const auto &args = ctx.args();
exec_args_t conv_args;
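// Re-map the deconvolution arguments onto the backward-data convolution:
// the deconvolution src acts as the convolution diff_dst and the
// deconvolution dst as its diff_src; weights and bias pass through unchanged.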
conv_args[DNNL_ARG_DIFF_DST] = args.at(DNNL_ARG_SRC);
conv_args[DNNL_ARG_WEIGHTS] = args.at(DNNL_ARG_WEIGHTS);
conv_args[DNNL_ARG_DIFF_SRC] = args.at(DNNL_ARG_DST);
if (pd()->with_bias())
conv_args[DNNL_ARG_BIAS] = args.at(DNNL_ARG_BIAS);
exec_ctx_t conv_ctx(ctx.stream(), std::move(conv_args));
nested_scratchpad_t ns(ctx, key_nested, conv_p_);
conv_ctx.set_scratchpad_grantor(ns.grantor());
// Execute the underlying convolution primitive
status_t status = conv_p_->execute(conv_ctx);
return status;
}
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
std::shared_ptr<primitive_t> conv_p_;
};
struct cudnn_deconvolution_bwd_data_t : public primitive_t {
struct pd_t : public deconvolution_bwd_data_pd_t {
pd_t(const deconvolution_desc_t *adesc, const primitive_attr_t *attr,
const deconvolution_fwd_pd_t *hint_fwd_pd)
: deconvolution_bwd_data_pd_t(adesc, attr, hint_fwd_pd)
, conv_pd_(nullptr) {}
pd_t(const pd_t &other)
: deconvolution_bwd_data_pd_t(other)
, conv_pd_(other.conv_pd_->clone()) {}
~pd_t() {}
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_deconvolution_bwd_data_t);
status_t init_convolution(engine_t *engine) {
convolution_desc_t cd;
CHECK(conv_descr_create(desc(), &cd));
primitive_attr_t conv_attr = *attr();
conv_attr.set_scratchpad_mode(scratchpad_mode::user);
dnnl_primitive_desc_iterator it(
engine, (op_desc_t *)&cd, &conv_attr, nullptr);
while (++it != it.end()) {
primitive_desc_t *_conv_pd = it.fetch_once();
conv_pd_.reset(_conv_pd);
return status::success;
}
return status::unimplemented;
}
status_t init(engine_t *engine) {
bool ok = true && desc()->prop_kind == prop_kind::backward_data
&& (utils::everyone_is(data_type::f32,
desc()->diff_src_desc.data_type,
desc()->weights_desc.data_type,
desc()->diff_dst_desc.data_type)
|| utils::everyone_is(data_type::f16,
desc()->weights_desc.data_type,
desc()->diff_dst_desc.data_type))
&& utils::one_of(desc()->diff_src_desc.data_type,
data_type::f16, data_type::f32)
&& desc()->alg_kind == alg_kind::deconvolution_direct
&& attr()->has_default_values();
if (ok) {
CHECK(init_convolution(engine));
if (weights_md_.format_kind == format_kind::any) {
CHECK(compute_blocked_format(with_groups(),
conv_pd_->weights_md(), &desc_.weights_desc));
weights_md_ = desc_.weights_desc;
}
if (diff_src_md_.format_kind == format_kind::any)
diff_src_md_ = *conv_pd_->dst_md();
if (diff_dst_md_.format_kind == format_kind::any)
diff_dst_md_ = *conv_pd_->src_md();
init_scratchpad();
return status::success;
}
return status::unimplemented;
}
void init_scratchpad() {
auto scratchpad = scratchpad_registry().registrar();
scratchpad.book(memory_tracking::names::key_nested,
conv_pd_->scratchpad_registry());
}
std::unique_ptr<primitive_desc_t> conv_pd_;
};
cudnn_deconvolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) {}
~cudnn_deconvolution_bwd_data_t() {}
virtual status_t init(engine_t *engine) {
return pd()->conv_pd_->create_primitive(conv_p_, engine);
}
status_t execute(const exec_ctx_t &ctx) const {
using namespace memory_tracking::names;
const auto &args = ctx.args();
exec_args_t conv_args;
conv_args[DNNL_ARG_SRC] = args.at(DNNL_ARG_DIFF_DST);
conv_args[DNNL_ARG_WEIGHTS] = args.at(DNNL_ARG_WEIGHTS);
conv_args[DNNL_ARG_DST] = args.at(DNNL_ARG_DIFF_SRC);
if (!types::is_zero_md(pd()->scratchpad_md()))
conv_args[DNNL_ARG_SCRATCHPAD] = args.at(DNNL_ARG_SCRATCHPAD);
exec_ctx_t conv_ctx(ctx.stream(), std::move(conv_args));
nested_scratchpad_t ns(ctx, key_nested, conv_p_);
conv_ctx.set_scratchpad_grantor(ns.grantor());
// Execute the underlying convolution primitive
status_t status = conv_p_->execute(conv_ctx);
return status;
}
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
std::shared_ptr<primitive_t> conv_p_;
};
struct cudnn_deconvolution_bwd_weights_t : public primitive_t {
struct pd_t : public deconvolution_bwd_weights_pd_t {
pd_t(const deconvolution_desc_t *adesc, const primitive_attr_t *attr,
const deconvolution_fwd_pd_t *hint_fwd_pd)
: deconvolution_bwd_weights_pd_t(adesc, attr, hint_fwd_pd)
, conv_pd_(nullptr) {}
pd_t(const pd_t &other)
: deconvolution_bwd_weights_pd_t(other)
, conv_pd_(other.conv_pd_->clone()) {}
~pd_t() {}
DECLARE_COMMON_PD_T(
"cuda:cudnn:any", cudnn_deconvolution_bwd_weights_t);
status_t init_convolution(engine_t *engine) {
convolution_desc_t cd;
CHECK(conv_descr_create(desc(), &cd));
primitive_attr_t conv_attr = *attr();
conv_attr.set_scratchpad_mode(scratchpad_mode::user);
dnnl_primitive_desc_iterator it(
engine, (op_desc_t *)&cd, &conv_attr, nullptr);
while (++it != it.end()) {
primitive_desc_t *_conv_pd = it.fetch_once();
conv_pd_.reset(_conv_pd);
if (conv_pd_ == nullptr) return status::out_of_memory;
return status::success;
}
return status::unimplemented;
}
status_t init(engine_t *engine) {
using namespace format_tag;
bool ok = true && desc()->prop_kind == prop_kind::backward_weights
&& (utils::everyone_is(data_type::f32,
desc()->src_desc.data_type,
desc()->diff_weights_desc.data_type,
desc()->diff_dst_desc.data_type)
|| utils::everyone_is(data_type::f16,
desc()->diff_dst_desc.data_type,
desc()->src_desc.data_type))
&& utils::one_of(
desc()->alg_kind, alg_kind::deconvolution_direct)
&& attr()->has_default_values()
&& utils::one_of(desc()->diff_weights_desc.data_type,
data_type::f16, data_type::f32);
if (ok) {
CHECK(init_convolution(engine));
if (diff_weights_md_.format_kind == format_kind::any) {
CHECK(compute_blocked_format(with_groups(),
conv_pd_->diff_weights_md(),
&desc_.diff_weights_desc));
diff_weights_md_ = desc_.diff_weights_desc;
}
if (src_md_.format_kind == format_kind::any)
src_md_ = *conv_pd_->diff_dst_md();
if (diff_dst_md_.format_kind == format_kind::any)
diff_dst_md_ = *conv_pd_->src_md();
if (diff_bias_md_.format_kind == format_kind::any)
CHECK(memory_desc_init_by_tag(diff_bias_md_, x));
init_scratchpad();
return status::success;
}
return status::unimplemented;
}
void init_scratchpad() {
auto scratchpad = scratchpad_registry().registrar();
scratchpad.book(memory_tracking::names::key_nested,
conv_pd_->scratchpad_registry());
}
std::unique_ptr<primitive_desc_t> conv_pd_;
};
cudnn_deconvolution_bwd_weights_t(const pd_t *apd) : primitive_t(apd) {}
~cudnn_deconvolution_bwd_weights_t() {}
virtual status_t init(engine_t *engine) {
if (pd()->with_bias()) {
if (pd()->ndims() > CUDNN_DIM_MAX) return status::invalid_arguments;
impl_ = std::make_shared<cudnn_deconvolution_bwd_bias_impl_t>();
impl_->init(pd()->invariant_dst_md(), pd()->invariant_bia_md());
}
return pd()->conv_pd_->create_primitive(conv_p_, engine);
}
status_t execute(const exec_ctx_t &ctx) const {
using namespace memory_tracking::names;
const auto &args = ctx.args();
exec_args_t conv_args;
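// Re-map the deconvolution arguments onto the backward-weights
// convolution: src and diff_dst swap roles, while diff_weights is
// passed through unchanged.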
conv_args[DNNL_ARG_DIFF_DST] = args.at(DNNL_ARG_SRC);
conv_args[DNNL_ARG_SRC] = args.at(DNNL_ARG_DIFF_DST);
conv_args[DNNL_ARG_DIFF_WEIGHTS] = args.at(DNNL_ARG_DIFF_WEIGHTS);
if (!types::is_zero_md(pd()->scratchpad_md()))
conv_args[DNNL_ARG_SCRATCHPAD] = args.at(DNNL_ARG_SCRATCHPAD);
exec_ctx_t conv_ctx(ctx, std::move(conv_args));
nested_scratchpad_t ns(ctx, key_nested, conv_p_);
conv_ctx.set_scratchpad_grantor(ns.grantor());
status_t status = conv_p_->execute(conv_ctx);
if (status != status::success) return status;
if (pd()->with_bias()) { return execute_bias(ctx); }
return status::success;
}
status_t execute_bias(const exec_ctx_t &ctx) const;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
std::shared_ptr<primitive_t> conv_p_;
std::shared_ptr<cudnn_deconvolution_bwd_bias_impl_t> impl_;
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif


@ -0,0 +1,92 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_DECONVOLUTION_IMPL_HPP
#define GPU_NVIDIA_CUDNN_DECONVOLUTION_IMPL_HPP
#include "cudnn.h"
#include "common/c_types_map.hpp"
#include "common/deconvolution_pd.hpp"
#include "gpu/nvidia/cudnn_convolution_pd.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_deconvolution_bwd_bias_impl_t {
protected:
enum io { y = 0, bias, NUM_IO };
memory_desc_t dnnl_descs[NUM_IO];
cudnnTensorDescriptor_t descs[NUM_IO];
int dims[NUM_IO][DNNL_MAX_NDIMS];
int strides[NUM_IO][DNNL_MAX_NDIMS];
int ndims[NUM_IO];
cudnnDataType_t data_types[NUM_IO];
public:
~cudnn_deconvolution_bwd_bias_impl_t() {
for (size_t i = 0; i < NUM_IO; i++) {
if (descs[i]) {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, descs[i]);
}
}
}
status_t init(const memory_desc_t *dst, const memory_desc_t *bia) {
dnnl_descs[y] = *dst;
dnnl_descs[bias] = *bia;
ndims[y] = dnnl_descs[y].ndims;
ndims[bias] = dnnl_descs[bias].ndims;
convert_dims(dnnl_descs[y].padded_dims, dims[y], ndims[y]);
CHECK(convert_data_type(&dnnl_descs[y], &data_types[y]));
CHECK(convert_data_type(&dnnl_descs[bias], &data_types[bias]));
convert_dims(dnnl_descs[y].format_desc.blocking.strides, strides[y],
ndims[y]);
ndims[y] = std::max(4, ndims[y]);
convert_dims(dnnl_descs[bias].format_desc.blocking.strides,
strides[bias], ndims[bias], ndims[y]);
convert_dims(dnnl_descs[bias].padded_dims, dims[bias], ndims[bias],
ndims[y]);
std::swap(dims[bias][0], dims[bias][1]);
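// The 1-D bias of size OC is expanded to a 4-D (1, OC, 1, 1) tensor so
// that cuDNN treats OC as the channel dimension of an NCHW tensor.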
ndims[bias] = ndims[y];
CHECK(create_and_set_tensor_descriptor(
&descs[y], data_types[y], ndims[y], dims[y], strides[y]));
CHECK(create_and_set_tensor_descriptor(&descs[bias], data_types[bias],
ndims[bias], dims[bias], strides[bias]));
return status::success;
}
void execute_bias(cudnnHandle_t handle, void *y, void *bias) const {
const float bias_alpha = 1.0f;
const float bias_beta = 0.0f;
CUDNN_EXECUTE_FUNC_V(cudnnConvolutionBackwardBias, handle, &bias_alpha,
descs[io::y], y, &bias_beta, descs[io::bias], bias);
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif


@ -0,0 +1,85 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/cudnn_eltwise.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "sycl/sycl_buffer_memory_storage.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
status_t cudnn_eltwise_fwd_t::execute(const exec_ctx_t &ctx) const {
if (memory_desc_wrapper(pd()->src_md()).has_zero_dim())
return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
std::vector<void *> args;
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
args.push_back(sc.memory<void *>(ih, src_acc));
args.push_back(sc.memory<void *>(ih, dst_acc));
pd()->eltwise_fwd_impl_->execute(handle, args.data(), args.size());
});
});
}
status_t cudnn_eltwise_bwd_t::execute(const exec_ctx_t &ctx) const {
if (memory_desc_wrapper(pd()->src_md()).has_zero_dim())
return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
std::vector<void *> args;
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
args.push_back(sc.memory<void *>(ih, src_acc));
args.push_back(sc.memory<void *>(ih, diff_dst_acc));
args.push_back(sc.memory<void *>(ih, diff_src_acc));
pd()->eltwise_bwd_impl_->execute(handle, args.data(), args.size());
});
});
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl


@ -0,0 +1,116 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_SYCL_CUDA_ELTWISE_HPP
#define GPU_NVIDIA_SYCL_CUDA_ELTWISE_HPP
#include "common/eltwise_pd.hpp"
#include "common/primitive.hpp"
#include "gpu/nvidia/cudnn_eltwise_impl.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_eltwise_fwd_t : public primitive_t {
struct pd_t : public eltwise_fwd_pd_t {
using eltwise_fwd_pd_t::eltwise_fwd_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_eltwise_fwd_t);
status_t init(engine_t *) {
using namespace alg_kind;
bool ok = true
&& utils::one_of(desc()->prop_kind,
prop_kind::forward_training,
prop_kind::forward_inference)
// Supported algorithms
&& utils::one_of(desc()->alg_kind, eltwise_relu,
eltwise_bounded_relu, eltwise_tanh, eltwise_elu,
eltwise_logistic)
// Supported data types
&& utils::one_of(desc()->data_desc.data_type,
data_type::f32, data_type::f16, data_type::s8)
&& IMPLICATION(desc()->alg_kind == eltwise_relu,
desc()->alpha == 0)
// Eltwise does not support blocking
&& src_md()->format_desc.blocking.inner_nblks == 0
&& attr()->has_default_values();
if (!ok) return status::unimplemented;
eltwise_fwd_impl_.reset(new cudnn_eltwise_fwd_impl_t());
return eltwise_fwd_impl_->init(this);
}
std::shared_ptr<cudnn_eltwise_impl_base_t> eltwise_fwd_impl_;
};
cudnn_eltwise_fwd_t(const pd_t *apd) : primitive_t(apd) {}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
struct cudnn_eltwise_bwd_t : public primitive_t {
struct pd_t : public eltwise_bwd_pd_t {
using eltwise_bwd_pd_t::eltwise_bwd_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_eltwise_bwd_t);
status_t init(engine_t *) {
using namespace alg_kind;
bool ok = true
&& desc()->prop_kind == prop_kind::backward_data
// Supported algorithms
&& utils::one_of(desc()->alg_kind, eltwise_bounded_relu,
eltwise_relu)
// Supported data types
&& desc()->data_desc.data_type == data_type::f32
&& IMPLICATION(desc()->alg_kind == eltwise_relu,
desc()->alpha == 0)
&& set_default_formats_common()
// Eltwise does not support blocking
&& src_md()->format_desc.blocking.inner_nblks == 0
&& diff_dst_md()->format_desc.blocking.inner_nblks == 0
&& attr()->has_default_values();
if (!ok) return status::unimplemented;
eltwise_bwd_impl_.reset(new cudnn_eltwise_bwd_impl_t());
return eltwise_bwd_impl_->init(this);
}
std::shared_ptr<cudnn_eltwise_impl_base_t> eltwise_bwd_impl_;
};
cudnn_eltwise_bwd_t(const pd_t *apd) : primitive_t(apd) {}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif


@ -0,0 +1,203 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_SYCL_CUDA_ELTWISE_IMPL_HPP
#define GPU_NVIDIA_SYCL_CUDA_ELTWISE_IMPL_HPP
#include "cudnn.h"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_eltwise_impl_base_t {
public:
virtual status_t init(const eltwise_pd_t *pd) = 0;
virtual void execute(cudnnHandle_t handle, void **x, int size) const = 0;
virtual status_t create_and_set_act_descriptor() {
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnCreateActivationDescriptor, &act_desc_));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, act_desc_,
alg_kind, cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN, coef));
return status::success;
}
// Mapping between dnnl algorithm and cuDNN activation mode
status_t convert_alg_kind(
alg_kind_t alg_kind, cudnnActivationMode_t *cuda_alg_kind) const {
switch (alg_kind) {
case alg_kind::eltwise_relu:
*cuda_alg_kind = cudnnActivationMode_t::CUDNN_ACTIVATION_RELU;
break;
case alg_kind::eltwise_bounded_relu:
*cuda_alg_kind
= cudnnActivationMode_t::CUDNN_ACTIVATION_CLIPPED_RELU;
break;
case alg_kind::eltwise_tanh:
*cuda_alg_kind = cudnnActivationMode_t::CUDNN_ACTIVATION_TANH;
break;
case alg_kind::eltwise_elu:
*cuda_alg_kind = cudnnActivationMode_t::CUDNN_ACTIVATION_ELU;
break;
case alg_kind::eltwise_logistic:
*cuda_alg_kind
= cudnnActivationMode_t::CUDNN_ACTIVATION_SIGMOID;
break;
default: return status::unimplemented;
}
return status::success;
}
virtual ~cudnn_eltwise_impl_base_t() {
if (act_desc_) {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyActivationDescriptor, act_desc_);
}
}
protected:
int ndims;
cudnnActivationDescriptor_t act_desc_ = nullptr;
cudnnActivationMode_t alg_kind;
// alpha and beta are post operation scaling parameters used by cuDNN
float alpha = 1;
float beta = 0;
// coef in cuDNN is used for ReLU (where it is equal to zero) and for
// bounded ReLU (where it represents the upper bound)
double coef = 0;
};
struct cudnn_eltwise_fwd_impl_t : public cudnn_eltwise_impl_base_t {
public:
status_t init(const eltwise_pd_t *pd) override {
// If any of the dimensions are 0, we should not continue creating the
// cuDNN descriptors
if (has_zero_dims(pd->src_md()->dims, pd->ndims())) {
return status::success;
}
if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
ndims = pd->ndims() < 4 ? 4 : pd->ndims();
// Obtain source and destination dimensions, strides and datatype
convert_dims(pd->src_md()->padded_dims, dims_, pd->ndims());
convert_dims(pd->src_md()->format_desc.blocking.strides, strides_,
pd->ndims());
CHECK(convert_data_type(pd->src_md(), &data_type_));
// Get cuDNN activation mode
alg_kind_t alg = pd->desc()->alg_kind;
auto alg_ok = convert_alg_kind(alg, &alg_kind);
if (alg_ok != status::success) { return status::unimplemented; }
coef = pd->desc()->alpha;
CHECK(create_and_set_tensor_descriptor(
&tensor_desc_, data_type_, ndims, dims_, strides_));
CHECK(create_and_set_act_descriptor());
return status::success;
}
void execute(cudnnHandle_t handle, void **x, int size) const override {
// Confirm that two arguments were passed: src and dst
assert(size == 2);
CUDNN_EXECUTE_FUNC(cudnnActivationForward, handle, act_desc_, &alpha,
tensor_desc_, x[0], &beta, tensor_desc_, x[1]);
}
~cudnn_eltwise_fwd_impl_t() {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_desc_);
}
private:
int strides_[DNNL_MAX_NDIMS];
int dims_[DNNL_MAX_NDIMS];
cudnnDataType_t data_type_;
cudnnTensorDescriptor_t tensor_desc_;
};
struct cudnn_eltwise_bwd_impl_t : public cudnn_eltwise_impl_base_t {
public:
status_t init(const eltwise_pd_t *pd) override {
// If any of the dimensions are 0, we should not continue creating the
// cuDNN descriptors
if (memory_desc_wrapper(pd->desc()->data_desc).has_zero_dim())
return status::success;
if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
ndims = pd->ndims() < 4 ? 4 : pd->ndims();
// Obtain the dimensions and strides for the backward eltwise operation
convert_dims(pd->src_md()->padded_dims, dims_, pd->ndims());
convert_dims(pd->src_md()->format_desc.blocking.strides, strides_,
pd->ndims());
alg_kind_t alg = pd->desc()->alg_kind;
auto alg_ok = convert_alg_kind(alg, &alg_kind);
if (alg_ok != status::success) { return status::unimplemented; }
coef = pd->desc()->alpha;
// Check validity of input
assert(pd->diff_dst_md()->data_type == pd->src_md()->data_type);
assert(pd->diff_dst_md()->data_type == pd->diff_src_md()->data_type);
CHECK(convert_data_type(pd->src_md(), &data_type_));
CHECK(create_and_set_tensor_descriptor(
&tensor_desc_src_, data_type_, ndims, dims_, strides_));
CHECK(create_and_set_tensor_descriptor(
&tensor_diff_desc_, data_type_, ndims, dims_, strides_));
CHECK(create_and_set_act_descriptor());
return status::success;
}
void execute(cudnnHandle_t handle, void **x, int size) const override {
// Assert that three arguments were passed: src, diff_dst and diff_src
assert(size == 3);
void *dy = x[1];
void *dx = x[2];
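// Only (bounded) ReLU is supported on the backward path, so the source
// tensor can stand in for both the x and y arguments of
// cudnnActivationBackward.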
CUDNN_EXECUTE_FUNC(cudnnActivationBackward, handle, act_desc_, &alpha,
tensor_desc_src_, x[0], tensor_diff_desc_, dy, tensor_desc_src_,
x[0], &beta, tensor_diff_desc_, dx);
}
~cudnn_eltwise_bwd_impl_t() {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_desc_src_);
CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_diff_desc_);
}
private:
int dims_[DNNL_MAX_NDIMS];
int strides_[DNNL_MAX_NDIMS];
cudnnTensorDescriptor_t tensor_diff_desc_;
cudnnDataType_t data_type_;
cudnnTensorDescriptor_t tensor_desc_src_;
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif


@ -0,0 +1,347 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_GEMM_INNER_PRODUCT_HPP
#define GPU_NVIDIA_CUDNN_GEMM_INNER_PRODUCT_HPP
#include "cudnn.h"
#include <CL/sycl.hpp>
#include "common/c_types_map.hpp"
#include "common/inner_product_pd.hpp"
#include "common/primitive.hpp"
#include "gpu/nvidia/cudnn_gemm_inner_product_impl.hpp"
#include "gpu/nvidia/cudnn_inner_product.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
namespace {
inline bool gemm_consistency_check(const memory_desc_wrapper &src_d,
const memory_desc_wrapper &wei_d, const memory_desc_wrapper &dst_d) {
using namespace utils;
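// The GEMM path flattens src to a dense (MB x IC_total) matrix and the
// weights to (OC x IC_total); the checks below verify that the (possibly
// blocked) layouts collapse to such 2-D views without an explicit reorder.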
auto strides_compatible = [&]() {
bool ok = true;
auto w_str = wei_d.blocking_desc().strides;
auto d_str = src_d.blocking_desc().strides;
for (int i = 1; i < src_d.ndims() - 1; i++) {
ok = ok && w_str[i] / d_str[i] == w_str[i + 1] / d_str[i + 1];
}
return ok && one_of(w_str[1] / d_str[1], 1, wei_d.padded_dims()[0]);
};
auto inner_blk_compatible = [&]() {
auto d_inner_blks = src_d.blocking_desc().inner_blks;
auto w_inner_blks = wei_d.blocking_desc().inner_blks;
auto d_inner_idxs = src_d.blocking_desc().inner_idxs;
auto w_inner_idxs = wei_d.blocking_desc().inner_idxs;
int d_inner_nblks = src_d.blocking_desc().inner_nblks;
int w_inner_nblks = wei_d.blocking_desc().inner_nblks;
bool ok = true;
if ((wei_d.blocking_desc().strides[0] == 1) && (w_inner_nblks > 0)) {
ok = ok && wei_d.dims()[0] / w_inner_blks[w_inner_nblks - 1] == 1
&& w_inner_idxs[w_inner_nblks - 1] == 0;
w_inner_nblks--;
}
// cuDNN only supports blocking for the channel dimension (C) with type
// s8, and only a block size of 4 is supported.
ok = ok && d_inner_nblks == w_inner_nblks;
bool supported_block_size = (d_inner_nblks == 0
|| (d_inner_nblks == 1 && d_inner_idxs[0] == w_inner_idxs[0]
&& w_inner_idxs[0] == 1
&& d_inner_blks[0] == w_inner_blks[0]
&& d_inner_blks[0] == 4
&& src_d.data_type() == data_type::s8));
ok = ok && supported_block_size;
for (int d = 1; d < w_inner_nblks; d++)
ok = ok && (d_inner_blks[d] == w_inner_blks[d] == 0)
&& (d_inner_idxs[d] == w_inner_idxs[d] == 0);
return ok;
};
return true && src_d.is_blocking_desc() && wei_d.is_blocking_desc()
&& src_d.ndims() == wei_d.ndims() && inner_blk_compatible()
&& strides_compatible() && dst_d.matches_tag(format_tag::nc)
&& src_d.only_padded_dim(1) && wei_d.only_padded_dim(1)
&& src_d.padded_dims()[1] == wei_d.padded_dims()[1];
}
inline bool reorder_check(const memory_desc_wrapper &src_d,
const memory_desc_wrapper &wei_d, const memory_desc_wrapper &dst_d) {
using namespace format_tag;
using namespace utils;
return true
&& ((src_d.matches_tag(nwc)
&& (wei_d.matches_one_of_tag(oiw, iwo) != undef))
|| (src_d.matches_tag(ncw)
&& (wei_d.matches_one_of_tag(wio, owi) != undef))
|| (src_d.matches_tag(nhwc)
&& (wei_d.matches_one_of_tag(oihw, ihwo) != undef))
|| (src_d.matches_tag(nchw)
&& (wei_d.matches_one_of_tag(ohwi, hwio) != undef))
|| (src_d.matches_tag(ndhwc)
&& (wei_d.matches_one_of_tag(oidhw, idhwo)
!= undef))
|| (src_d.matches_tag(ncdhw)
&& (wei_d.matches_one_of_tag(odhwi, dhwio)
!= undef)))
&& dst_d.matches_tag(nc);
}
inline bool dense_check(const memory_desc_wrapper &src_d,
const memory_desc_wrapper &wei_d, const memory_desc_wrapper &dst_d) {
return true && src_d.is_dense(true) && dst_d.is_dense()
&& wei_d.is_dense(true);
}
status_t template_set_default_params(memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t *bias_md, int ndims) {
using namespace format_tag;
auto init_md = [&](memory_desc_t &out_md, const memory_desc_t &in_md) {
format_tag_t md_tag;
if (memory_desc_matches_one_of_tag(in_md, ab, abc, abcd, abcde))
md_tag = utils::pick(ndims - 2, ab, abc, abcd, abcde);
else if (memory_desc_matches_one_of_tag(in_md, acb, acdb, acdeb))
md_tag = utils::pick(ndims - 3, cba, cdba, cdeba);
else if (memory_desc_matches_one_of_tag(in_md, ba, cba, cdba, cdeba))
md_tag = utils::pick(ndims - 2, ab, acb, acdb, acdeb);
else {
memory_desc_wrapper md_desc_wrapper(in_md);
return memory_desc_init_by_blocking_desc(
out_md, md_desc_wrapper.blocking_desc());
}
return memory_desc_init_by_tag(out_md, md_tag);
};
if (src_md.format_kind == format_kind::any
&& weights_md.format_kind == format_kind::any) {
CHECK(memory_desc_init_by_tag(
src_md, utils::pick(ndims - 2, nc, ncw, nchw, ncdhw)));
CHECK(memory_desc_init_by_tag(
weights_md, utils::pick(ndims - 2, oi, oiw, oihw, oidhw)));
} else if (src_md.format_kind == format_kind::any) {
CHECK(init_md(src_md, weights_md));
} else if (weights_md.format_kind == format_kind::any) {
CHECK(init_md(weights_md, src_md));
}
if (dst_md.format_kind == format_kind::any) {
CHECK(memory_desc_init_by_tag(dst_md, nc));
}
if (bias_md->format_kind == format_kind::any) {
CHECK(memory_desc_init_by_tag(*bias_md, x));
}
return status::success;
}
} // namespace
struct cudnn_gemm_inner_product_fwd_t : public cudnn_inner_product_fwd_t {
using cudnn_inner_product_fwd_t::cudnn_inner_product_fwd_t;
using parent_pd_t = cudnn_inner_product_fwd_t::pd_t;
struct pd_t : public parent_pd_t {
using parent_pd_t::parent_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:gemm", cudnn_gemm_inner_product_fwd_t);
status_t init(engine_t *engine) {
using namespace data_type;
using namespace prop_kind;
assert(engine->kind() == engine_kind::gpu);
bool ok = true && is_fwd()
&& (set_default_params() == status::success);
if (!ok) return status::unimplemented;
if (has_zero_dim_memory()) return status::success;
bool gemm_compatible
= gemm_consistency_check(src_md(), weights_md(), dst_md());
bool need_reorder = (gemm_compatible
? false
: reorder_check(src_md(), weights_md(), dst_md()));
const auto attr_skip_mask = primitive_attr_t::skip_mask_t::oscale
| primitive_attr_t::skip_mask_t::post_ops;
bool with_eltwise
= attr()->post_ops_.find(primitive_kind::eltwise) != -1;
bool with_sum = attr()->post_ops_.find(primitive_kind::sum) != -1;
ok = ok
&& utils::one_of(true,
expect_data_types(f16, f16, f16, f16, f16),
expect_data_types(f16, f16, f32, f16, f32),
expect_data_types(s8, s8, f32, s8, s32),
expect_data_types(s8, s8, f32, f32, f32),
expect_data_types(f32, f32, f32, f32, f32))
&& memory_format_ok(src_md())
&& memory_format_ok(weights_md(0))
&& memory_format_ok(dst_md())
&& IMPLICATION(!attr()->output_scales_.has_default_values(),
utils::one_of(src_md_.data_type, s8)
&& attr()->output_scales_.mask_ == 0)
&& attr()->has_default_values(attr_skip_mask)
&& post_ops_ok(attr())
&& dense_check(src_md(), weights_md(), dst_md())
&& (gemm_compatible || need_reorder);
if (!ok) return status::unimplemented;
inner_product_impl_.reset(
new cudnn_gemm_inner_product_fwd_impl_t());
return inner_product_impl_->init(engine, this, with_eltwise,
with_eltwise, with_sum, need_reorder);
}
bool post_ops_ok(const primitive_attr_t *attr) const {
const auto &p = attr->post_ops_;
auto is_eltwise
= [&](int idx) { return p.entry_[idx].is_eltwise(false); };
auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(false); };
switch (p.len()) {
case 0: return true; // no post_ops
case 1: return is_eltwise(0) || is_sum(0); // sum OR eltwise
case 2: return is_sum(0) && is_eltwise(1); // sum -> eltwise
default: return false;
}
return false;
}
status_t set_default_params() {
return template_set_default_params(
src_md_, weights_md_, dst_md_, &bias_md_, ndims());
}
};
const pd_t *pd() const override {
return (const pd_t *)primitive_t::pd().get();
}
};
struct cudnn_gemm_inner_product_bwd_data_t
: public cudnn_inner_product_bwd_data_t {
using cudnn_inner_product_bwd_data_t::cudnn_inner_product_bwd_data_t;
using parent_pd_t = cudnn_inner_product_bwd_data_t::pd_t;
struct pd_t : public parent_pd_t {
using parent_pd_t::parent_pd_t;
DECLARE_COMMON_PD_T(
"cuda:cudnn:gemm", cudnn_gemm_inner_product_bwd_data_t);
status_t init(engine_t *engine) {
using namespace prop_kind;
using namespace data_type;
assert(engine->kind() == engine_kind::gpu);
bool ok = true && this->desc()->prop_kind == backward_data
&& set_default_params() == status::success;
if (!ok) return status::unimplemented;
if (has_zero_dim_memory()) return status::success;
bool gemm_compatible = gemm_consistency_check(
diff_src_md(), weights_md(), diff_dst_md());
bool need_reorder = gemm_compatible
? false
: reorder_check(diff_src_md(), weights_md(), diff_dst_md());
ok = ok && expect_data_types(f32, f32, data_type::undef, f32, f32)
&& attr()->has_default_values()
&& dense_check(diff_src_md(), weights_md(), diff_dst_md())
&& (gemm_compatible || need_reorder);
if (!ok) return status::unimplemented;
inner_product_impl_.reset(
new cudnn_gemm_inner_product_bwd_data_impl_t());
return inner_product_impl_->init(
engine, this, false, false, false, need_reorder);
}
status_t set_default_params() {
return template_set_default_params(diff_src_md_, weights_md_,
diff_dst_md_, &glob_zero_md, ndims());
}
};
const pd_t *pd() const override {
return (const pd_t *)primitive_t::pd().get();
}
};
struct cudnn_gemm_inner_product_bwd_weights_t
: public cudnn_inner_product_bwd_weights_t {
using cudnn_inner_product_bwd_weights_t::cudnn_inner_product_bwd_weights_t;
using parent_pd_t = cudnn_inner_product_bwd_weights_t::pd_t;
struct pd_t : public parent_pd_t {
using parent_pd_t::parent_pd_t;
DECLARE_COMMON_PD_T(
"cuda:cudnn:gemm", cudnn_gemm_inner_product_bwd_weights_t);
status_t init(engine_t *engine) {
using namespace prop_kind;
using namespace data_type;
assert(engine->kind() == engine_kind::gpu);
bool ok = true && this->desc()->prop_kind == backward_weights
&& set_default_params() == status::success;
if (!ok) return status::unimplemented;
if (has_zero_dim_memory()) return status::success;
bool gemm_compatible = gemm_consistency_check(
src_md(), diff_weights_md(), diff_dst_md());
bool need_reorder = gemm_compatible
? false
: reorder_check(src_md(), diff_weights_md(), diff_dst_md());
ok = ok && expect_data_types(f32, f32, f32, f32, f32)
&& attr()->has_default_values()
&& dense_check(src_md(), diff_weights_md(), diff_dst_md())
&& (gemm_compatible || need_reorder);
if (!ok) return status::unimplemented;
inner_product_impl_.reset(
new cudnn_gemm_inner_product_bwd_weights_impl_t());
return inner_product_impl_->init(
engine, this, false, false, false, need_reorder);
}
status_t set_default_params() {
return template_set_default_params(src_md_, diff_weights_md_,
diff_dst_md_, &diff_bias_md_, ndims());
}
};
const pd_t *pd() const override {
return (const pd_t *)primitive_t::pd().get();
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif


@ -0,0 +1,463 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_GEMM_INNER_PRODUCT_IMPL_HPP
#define GPU_NVIDIA_CUDNN_GEMM_INNER_PRODUCT_IMPL_HPP
#include "cublas_v2.h"
#include "cudnn.h"
#include "common/type_helpers.hpp"
#include "gpu/nvidia/cudnn_inner_product_impl.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
// GEMM Implementation
struct cudnn_gemm_inner_product_base_t {
protected:
int m_, n_, k_, lda_, ldb_, ldc_;
cublasOperation_t trans_a_, trans_b_;
// compute_type is always equal to c_type_;
// if the data type is f16 or s8 and a bias is present, the compute type
// must be f32 and we need to do the operation in f32
cudaDataType_t a_type_, b_type_, c_type_,
// Despite the claim in cuBLAS
// (https://docs.nvidia.com/cuda/cublas/index.html#cublas-GemmEx)
// that fp16 computation is supported when all the types are fp16,
// in cuBLAS 10.1 and 10.2 choosing fp16 as the computation mode
// silently performs no computation. So we force the computation
// type to f32 in order to get the correct result.
// This can be reverted once the bug in cuBLAS is fixed.
compute_type_ = CUDA_R_32F;
cublasGemmAlgo_t algo_ = CUBLAS_GEMM_DEFAULT;
status_t get_cublas_data_type(
const cudnnDataType_t &cudnn_dt, cudaDataType_t &blas_dt) const {
switch (cudnn_dt) {
case CUDNN_DATA_FLOAT: blas_dt = CUDA_R_32F; return status::success;
case CUDNN_DATA_HALF: blas_dt = CUDA_R_16F; return status::success;
case CUDNN_DATA_INT8: blas_dt = CUDA_R_8I; return status::success;
case CUDNN_DATA_INT8x4: blas_dt = CUDA_R_8I; return status::success;
default: return status::unimplemented;
}
return status::unimplemented;
}
};
struct cudnn_gemm_inner_product_fwd_impl_t
: public cudnn_inner_product_fwd_base_t,
public cudnn_gemm_inner_product_base_t,
public cudnn_conv_filter_adjustment_base_t {
cudnnActivationDescriptor_t act_desc_;
bool use_acc_dst_;
cudnnTensorDescriptor_t y_acc_desc_;
bool need_reorder_;
bool ip_using_scratchpad() const override { return (use_acc_dst_ > 0); }
virtual bool need_to_transform_filter() const { return need_reorder_; }
virtual status_t init(engine_t *, inner_product_pd_t *pd, bool with_relu,
bool with_eltwise, bool with_sum, bool need_reorder) override {
need_reorder_ = need_reorder;
// GEMM is column major, here the data is row major.
// By switching the weight and source we convert the row major to
// column major without transposing matrices.
// B * A = C, where B is weight, A is src and C is dst
bool wie_tr = (pd->weights_md()->format_desc.blocking.strides[0] != 1);
CHECK(convert_data_type(pd->src_md(), &data_types_[io::src]));
CHECK(convert_data_type(pd->weights_md(0), &data_types_[io::wei]));
if (need_reorder) {
cudnnTensorFormat_t source_format;
CHECK(get_format(pd->src_md(), source_format));
ndims_ = pd->ndims() < 4 ? 4 : pd->ndims();
get_4d_tensor_descriptor(
pd->weights_md(0), dims_[io::wei], strides_[io::wei]);
set_filter_format(
ndims_, dims_[io::wei], strides_[NUM_IO], source_format);
CHECK(init_filter_transformation(data_types_[io::wei], ndims_,
dims_[io::wei], strides_[io::wei], strides_[NUM_IO]));
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_none,
memory_desc_wrapper(pd->weights_md(0)).size(), size_t(1));
wie_tr = strides_[NUM_IO][0] != 1;
}
trans_a_ = wie_tr ? CUBLAS_OP_T : CUBLAS_OP_N;
trans_b_ = CUBLAS_OP_N;
int ic = pd->IC_total_padded();
int oc = pd->OC();
int mb = pd->MB();
n_ = mb;
k_ = ic;
m_ = oc;
lda_ = wie_tr ? k_ : m_;
ldb_ = k_;
ldc_ = m_;
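// Illustrative sizes (hypothetical): MB = 32, IC = 1024, OC = 10 give
// m_ = 10, n_ = 32, k_ = 1024, so cuBLAS computes the column-major
// product C(10 x 32) = W(10 x 1024) * X(1024 x 32), whose memory is
// exactly the row-major dst of shape (32 x 10).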
with_bias_ = pd->with_bias();
with_eltwise_ = with_eltwise || with_relu;
with_relu_ = with_eltwise;
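// Accumulate into a separate f32 buffer when dst is s8 or when the bias
// data type differs from the dst data type; the accumulated result is
// converted into dst afterwards (see execute()).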
use_acc_dst_ = ((pd->dst_md()->data_type == data_type::s8)
|| (with_bias_
&& pd->weights_md(1)->data_type
!= pd->dst_md()->data_type));
// This scale must also be applied to the bias, if it exists.
output_scales_ = pd->attr()->output_scales_.scales_[0]; // alpha
with_sum_ = with_sum;
// Scaling factor used to add the previous destination value to the
// current computation; this is the equivalent of the sum post-op scale.
sum_scale_ = sum_scale(pd);
ndims_ = 4;
bool input_is_blocked
= pd->src_md()->format_desc.blocking.inner_blks[0] == 4
&& pd->weights_md(0)->format_desc.blocking.inner_blks[0] == 4;
if (input_is_blocked) {
// Since we flatten the tensors and use GEMM, we do not care about
// the blocked data type.
data_types_[io::src] = CUDNN_DATA_INT8;
data_types_[io::wei] = CUDNN_DATA_INT8;
data_types_[io::dst] = CUDNN_DATA_INT8;
} else {
CHECK(convert_data_type(pd->dst_md(), &data_types_[io::dst]));
}
CHECK(get_cublas_data_type(data_types_[io::wei], a_type_));
CHECK(get_cublas_data_type(data_types_[io::src], b_type_));
c_type_ = (data_types_[io::dst] == CUDNN_DATA_HALF && !use_acc_dst_)
? CUDA_R_16F
: CUDA_R_32F;
get_4d_tensor_descriptor(
pd->dst_md(), dims_[io::dst], strides_[io::dst]);
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::dst],
data_types_[io::dst], ndims_, dims_[io::dst],
strides_[io::dst]));
if (with_bias_) {
CHECK(convert_data_type(pd->weights_md(1), &data_types_[io::bia]));
// format is always nchw
set_bias_dims(CUDNN_TENSOR_NCHW, ndims_, pd->OC());
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::bia],
data_types_[io::bia], ndims_, dims_[io::bia],
strides_[io::bia]));
}
if (use_acc_dst_) {
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_iprod_int_dat_in_acc_dt,
memory_desc_wrapper(pd->dst_md()).size(), size_t(1));
CHECK(create_and_set_tensor_descriptor(&y_acc_desc_,
CUDNN_DATA_FLOAT, ndims_, dims_[io::dst],
strides_[io::dst]));
} else {
y_acc_desc_ = tensor_descs_[io::dst];
}
if (with_eltwise_) { CHECK(create_and_set_op_descriptor(pd)); }
return status::success;
}
void execute(cudnnHandle_t cudnn_handle, cublasHandle_t cublas_handle,
const std::vector<void *> &args) const override {
assert(args.size() == 7);
auto x = args[0], w = args[1], b = args[2], y = args[3],
workspace = args[4];
auto w_arg = w;
if (need_reorder_) {
void *transformed_w = args[5];
transform_filter(cudnn_handle, w, transformed_w);
w_arg = transformed_w;
}
auto y_dst = use_acc_dst_ ? workspace : y;
auto sum_scale = use_acc_dst_ ? 0.0f : sum_scale_;
// do gemm
CUBLAS_EXECUTE_FUNC(cublasGemmEx, cublas_handle, trans_a_, trans_b_, m_,
n_, k_, &output_scales_, w_arg, a_type_, lda_, x, b_type_, ldb_,
&sum_scale, y_dst, c_type_, ldc_, compute_type_, algo_);
if (with_bias_) {
CUDNN_EXECUTE_FUNC(cudnnAddTensor, cudnn_handle, &output_scales_,
tensor_descs_[io::bia], b, &alpha_, y_acc_desc_, y_dst);
}
if (use_acc_dst_) {
CUDNN_EXECUTE_FUNC(cudnnTransformTensor, cudnn_handle, &alpha_,
y_acc_desc_, y_dst, &sum_scale_, tensor_descs_[io::dst], y);
}
if (with_eltwise_) {
CUDNN_EXECUTE_FUNC(cudnnActivationForward, cudnn_handle, act_desc_,
&alpha_, tensor_descs_[io::dst], y, &beta_,
tensor_descs_[io::dst], y);
}
}
status_t create_and_set_op_descriptor(const inner_product_pd_t *pd) {
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnCreateActivationDescriptor, &act_desc_));
cudnnActivationMode_t act_mode;
switch (eltwise_algorithm_kind(pd)) {
case alg_kind::eltwise_tanh:
act_mode = CUDNN_ACTIVATION_TANH;
break;
case alg_kind::eltwise_elu: act_mode = CUDNN_ACTIVATION_ELU; break;
case alg_kind::eltwise_relu:
act_mode = CUDNN_ACTIVATION_RELU;
break;
case alg_kind::eltwise_logistic:
act_mode = CUDNN_ACTIVATION_SIGMOID;
break;
case alg_kind::eltwise_bounded_relu:
act_mode = CUDNN_ACTIVATION_CLIPPED_RELU;
break;
default: return status::unimplemented;
}
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, act_desc_,
act_mode, cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN,
eltwise_alpha(pd)));
return status::success;
}
};
struct cudnn_gemm_inner_product_bwd_data_impl_t
: public cudnn_inner_product_impl_base_t,
public cudnn_gemm_inner_product_base_t,
public cudnn_conv_filter_adjustment_base_t {
bool need_reorder_;
virtual bool need_to_transform_filter() const { return need_reorder_; }
virtual status_t init(engine_t *, inner_product_pd_t *pd,
bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
bool need_reorder) override {
need_reorder_ = need_reorder;
// GEMM is column major, here the data is row major.
// By switching the weight and source we convert the row major to
// column major without transposing matrices.
// B * A = C, where B is weight, A is d_dst and C is d_src
bool wie_tr = (pd->weights_md(0)->format_desc.blocking.strides[0] == 1);
CHECK(convert_data_type(pd->diff_src_md(), &data_types_[io::src]));
CHECK(convert_data_type(pd->weights_md(0), &data_types_[io::wei]));
CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[io::dst]));
if (need_reorder) {
cudnnTensorFormat_t diff_source_format_;
CHECK(get_format(pd->diff_src_md(), diff_source_format_));
ndims_ = pd->ndims() < 4 ? 4 : pd->ndims();
get_4d_tensor_descriptor(
pd->weights_md(0), dims_[io::wei], strides_[io::wei]);
set_filter_format(ndims_, dims_[io::wei], strides_[NUM_IO],
diff_source_format_);
CHECK(init_filter_transformation(data_types_[io::wei], ndims_,
dims_[io::wei], strides_[io::wei], strides_[NUM_IO]));
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_none,
memory_desc_wrapper(pd->weights_md(0)).size(), size_t(1));
wie_tr = strides_[NUM_IO][0] == 1;
}
trans_a_ = wie_tr ? CUBLAS_OP_T : CUBLAS_OP_N;
trans_b_ = CUBLAS_OP_N;
int ic = pd->IC_total_padded();
int oc = pd->OC();
int mb = pd->MB();
n_ = mb;
k_ = oc;
m_ = ic;
lda_ = wie_tr ? k_ : m_;
ldb_ = k_;
ldc_ = m_;
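// Illustrative sizes (hypothetical): MB = 32, IC = 1024, OC = 10 give
// m_ = 1024, n_ = 32, k_ = 10, i.e. diff_src(1024 x 32) = op(W) * dy(10 x 32)
// in column-major terms, which is the row-major diff_src of shape (32 x 1024).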
CHECK(get_cublas_data_type(data_types_[io::wei], a_type_));
CHECK(get_cublas_data_type(data_types_[io::dst], b_type_));
CHECK(get_cublas_data_type(data_types_[io::src], c_type_));
return status::success;
}
void execute(cudnnHandle_t cudnn_handle, cublasHandle_t cublas_handle,
const std::vector<void *> &args) const override {
assert(args.size() == 5);
auto dx = args[0], w = args[1], dy = args[2];
auto w_arg = w;
if (need_reorder_) {
void *transformed_w = args[4];
transform_filter(cudnn_handle, w, transformed_w);
w_arg = transformed_w;
}
// do gemm
CUBLAS_EXECUTE_FUNC(cublasGemmEx, cublas_handle, trans_a_, trans_b_, m_,
n_, k_, &alpha_, w_arg, a_type_, lda_, dy, b_type_, ldb_,
&beta_, dx, c_type_, ldc_, compute_type_, algo_);
}
};
struct cudnn_gemm_inner_product_bwd_weights_impl_t
: public cudnn_inner_product_impl_base_t,
public cudnn_gemm_inner_product_base_t,
public cudnn_conv_filter_adjustment_base_t {
cudnnReduceTensorDescriptor_t reduceTensorDesc_ = nullptr;
bool wie_tr_;
bool need_reorder_;
virtual bool need_to_transform_filter() const { return need_reorder_; }
virtual ~cudnn_gemm_inner_product_bwd_weights_impl_t() {
if (reduceTensorDesc_) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyReduceTensorDescriptor, reduceTensorDesc_);
}
}
status_t create_and_set_reduce_descriptor() {
CUDNN_EXECUTE_FUNC_S(
cudnnCreateReduceTensorDescriptor, &reduceTensorDesc_);
CUDNN_EXECUTE_FUNC_S(cudnnSetReduceTensorDescriptor, reduceTensorDesc_,
CUDNN_REDUCE_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES);
return status::success;
}
virtual status_t init(engine_t *engine, inner_product_pd_t *pd,
bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
bool need_reorder) override {
need_reorder_ = need_reorder;
with_bias_ = pd->with_bias();
// GEMM is column major, here the data is row major.
// By switching the weight and source we convert the row major to
// column major without transposing matrices.
// B * A = C.
// Here the backward weights are equivalent to d_dst * src^T when the
// weight filter is IC*OC. Therefore B is d_dst, A is the transposed src,
// and C is d_wei. However, when the filter format is OC*IC, the backward
// weights are equivalent to src * d_dst^T. In this case, B is src, A is
// the transposed d_dst, and C is d_wei.
wie_tr_ = (pd->diff_weights_md(0)->format_desc.blocking.strides[0]
== 1);
CHECK(convert_data_type(pd->src_md(), &data_types_[io::src]));
CHECK(convert_data_type(pd->diff_weights_md(0), &data_types_[io::wei]));
CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[io::dst]));
if (need_reorder_) {
cudnnTensorFormat_t source_format;
CHECK(get_format(pd->src_md(), source_format));
ndims_ = pd->ndims() < 4 ? 4 : pd->ndims();
get_4d_tensor_descriptor(
pd->diff_weights_md(0), dims_[io::wei], strides_[io::wei]);
set_filter_format(
ndims_, dims_[io::wei], strides_[NUM_IO], source_format);
CHECK(init_filter_transformation(data_types_[io::wei], ndims_,
dims_[io::wei], strides_[NUM_IO], strides_[io::wei]));
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_none,
memory_desc_wrapper(pd->diff_weights_md(0)).size(),
size_t(1));
wie_tr_ = (strides_[NUM_IO][0] == 1);
}
trans_a_ = CUBLAS_OP_N;
trans_b_ = CUBLAS_OP_T;
int ic = pd->IC_total_padded();
int oc = pd->OC();
int mb = pd->MB();
n_ = wie_tr_ ? ic : oc;
k_ = mb;
m_ = wie_tr_ ? oc : ic;
lda_ = m_;
ldb_ = n_;
ldc_ = m_;
CHECK(get_cublas_data_type(
data_types_[(wie_tr_ ? io::dst : io::src)], a_type_));
CHECK(get_cublas_data_type(
data_types_[(wie_tr_ ? io::src : io::dst)], b_type_));
CHECK(get_cublas_data_type(data_types_[io::wei], c_type_));
if (with_bias_) {
ndims_ = 4;
get_4d_tensor_descriptor(
pd->diff_dst_md(), dims_[io::dst], strides_[io::dst]);
CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[io::dst]));
set_bias_dims(CUDNN_TENSOR_NCHW, ndims_, pd->OC());
CHECK(convert_data_type(
pd->diff_weights_md(1), &data_types_[io::bia]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::dst],
data_types_[io::dst], ndims_, dims_[io::dst],
strides_[io::dst]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[io::bia],
data_types_[io::bia], ndims_, dims_[io::bia],
strides_[io::bia]));
CHECK(create_and_set_reduce_descriptor());
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
stream_t *service_stream;
CHECK(sycl_engine.get_service_stream(service_stream));
auto cuda_stream
= utils::downcast<sycl_cuda_stream_t *>(service_stream);
auto handle = cuda_stream->get_cudnn_handle();
// get the required workspace size
CUDNN_EXECUTE_FUNC_S(cudnnGetReductionWorkspaceSize, handle,
reduceTensorDesc_, tensor_descs_[io::dst],
tensor_descs_[io::bia], &workspace_size_);
}
if (workspace_size_ > 0) {
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_iprod_int_dat_in_acc_dt,
workspace_size_, size_t(1));
}
return status::success;
}
void execute(cudnnHandle_t cudnn_handle, cublasHandle_t cublas_handle,
const std::vector<void *> &args) const override {
assert(args.size() == 6);
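// args layout, as filled by cudnn_inner_product_bwd_weights_t::execute:
// {src, diff_dst, diff_weights, diff_bias, reduction workspace,
//  filter-transform scratchpad}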
auto x = args[0], dy = args[1], dw = args[2], db = args[3],
workspace = args[4];
auto dw_arg = need_reorder_ ? args[5] : dw;
// do gemm
CUBLAS_EXECUTE_FUNC(cublasGemmEx, cublas_handle, trans_a_, trans_b_, m_,
n_, k_, &alpha_, (wie_tr_ ? dy : x), a_type_, lda_,
(wie_tr_ ? x : dy), b_type_, ldb_, &beta_, dw_arg, c_type_,
ldc_, compute_type_, algo_);
if (need_reorder_) {
// The computed weights are in the NVIDIA-specific (cuDNN filter) format;
// however, the user requires the oneDNN format as output.
transform_filter(cudnn_handle, dw_arg, dw);
}
if (with_bias_) {
// The backward bias for inner product is a reduction of dy over dim[0],
// so cudnnReduceTensor can be used to partially reduce dy.
CUDNN_EXECUTE_FUNC(cudnnReduceTensor, cudnn_handle,
reduceTensorDesc_, nullptr, 0, workspace, workspace_size_,
&alpha_, tensor_descs_[io::dst], dy, &beta_,
tensor_descs_[io::bia], db);
}
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@@ -0,0 +1,238 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/cudnn_inner_product.hpp"
#include "gpu/nvidia/cudnn_conv_inner_product.hpp"
#include "gpu/nvidia/cudnn_gemm_inner_product.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "sycl/sycl_buffer_memory_storage.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
status_t cudnn_inner_product_fwd_t::execute(const exec_ctx_t &ctx) const {
if (pd()->has_zero_dim_memory()) return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
using scratch_acc_t = cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::global_buffer>;
using read_acc_t
= cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read,
cl::sycl::access::target::global_buffer>;
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto wei_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
std::shared_ptr<read_acc_t> bias_acc;
if (pd()->with_bias()) {
bias_acc = std::make_shared<read_acc_t>(
CTX_IN_ACCESSOR(DNNL_ARG_BIAS));
}
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
std::shared_ptr<scratch_acc_t> ip_scratch_acc;
std::shared_ptr<scratch_acc_t> spacial_scratch_acc;
std::shared_ptr<scratch_acc_t> scaled_bias_scratch_acc;
if (pd()->inner_product_impl_->ip_using_scratchpad()) {
ip_scratch_acc = std::make_shared<
scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
memory_tracking::names::key_iprod_int_dat_in_acc_dt));
}
if (pd()->inner_product_impl_->need_to_transform_filter()) {
spacial_scratch_acc = std::make_shared<scratch_acc_t>(
CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none));
}
if (pd()->inner_product_impl_->conv_using_scale_scratchpad()) {
scaled_bias_scratch_acc
= std::make_shared<scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
memory_tracking::names::key_conv_adjusted_scales));
}
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto cudnn_handle = cuda_stream->get_cudnn_handle();
auto cublas_handle = cuda_stream->get_cublas_handle();
std::vector<void *> args;
args.push_back(sc.memory<void *>(ih, src_acc));
args.push_back(sc.memory<void *>(ih, wei_acc));
args.push_back(
((pd()->with_bias()) ? sc.memory<void *>(ih, *bias_acc)
: nullptr));
args.push_back(sc.memory<void *>(ih, dst_acc));
args.push_back((pd()->inner_product_impl_->ip_using_scratchpad()
? sc.memory<void *>(ih, *ip_scratch_acc)
: nullptr));
args.push_back((
pd()->inner_product_impl_->need_to_transform_filter()
? sc.memory<void *>(ih, *spacial_scratch_acc)
: nullptr));
args.push_back((
pd()->inner_product_impl_->conv_using_scale_scratchpad()
? sc.memory<void *>(ih, *scaled_bias_scratch_acc)
: nullptr));
pd()->inner_product_impl_->execute(
cudnn_handle, cublas_handle, args);
});
});
}
status_t cudnn_inner_product_bwd_data_t::execute(const exec_ctx_t &ctx) const {
if (pd()->has_zero_dim_memory()) return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
using scratch_acc_t = cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::global_buffer>;
auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
auto wei_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC);
std::shared_ptr<scratch_acc_t> ip_scratch_acc;
std::shared_ptr<scratch_acc_t> spacial_scratch_acc;
if (pd()->inner_product_impl_->ip_using_scratchpad()) {
ip_scratch_acc = std::make_shared<
scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
memory_tracking::names::key_iprod_int_dat_in_acc_dt));
}
if (pd()->inner_product_impl_->need_to_transform_filter()) {
spacial_scratch_acc = std::make_shared<scratch_acc_t>(
CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none));
}
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto cudnn_handle = cuda_stream->get_cudnn_handle();
auto cublas_handle = cuda_stream->get_cublas_handle();
std::vector<void *> args;
args.push_back(sc.memory<void *>(ih, diff_src_acc));
args.push_back(sc.memory<void *>(ih, wei_acc));
args.push_back(sc.memory<void *>(ih, diff_dst_acc));
args.push_back((pd()->inner_product_impl_->ip_using_scratchpad()
? sc.memory<void *>(ih, *ip_scratch_acc)
: nullptr));
args.push_back((
pd()->inner_product_impl_->need_to_transform_filter()
? sc.memory<void *>(ih, *spacial_scratch_acc)
: nullptr));
pd()->inner_product_impl_->execute(
cudnn_handle, cublas_handle, args);
});
});
}
status_t cudnn_inner_product_bwd_weights_t::execute(
const exec_ctx_t &ctx) const {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
if (pd()->has_zero_dim_memory()) {
auto wei_sz = memory_desc_wrapper(pd()->diff_weights_md(0)).size();
size_t bias_sz = (pd()->with_bias()
? memory_desc_wrapper(pd()->diff_weights_md(1)).size()
: 0);
if (wei_sz != 0) {
auto status
= cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto diff_wei_acc
= CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_WEIGHTS);
cgh.fill(diff_wei_acc, static_cast<uint8_t>(0));
});
if (status != status::success) return status;
}
if (bias_sz != 0) {
auto status
= cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto diff_bia_acc
= CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS);
cgh.fill(diff_bia_acc, static_cast<uint8_t>(0));
});
if (status != status::success) return status;
}
return status::success;
}
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
using scratch_acc_t = cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read_write,
cl::sycl::access::target::global_buffer>;
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
auto diff_wei_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_WEIGHTS);
using write_acc_t
= cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
cl::sycl::access::target::global_buffer>;
std::shared_ptr<write_acc_t> diff_bias_acc;
if (pd()->with_bias()) {
diff_bias_acc = std::make_shared<write_acc_t>(
CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS));
}
std::shared_ptr<scratch_acc_t> ip_scratch_acc;
std::shared_ptr<scratch_acc_t> spacial_scratch_acc;
if (pd()->inner_product_impl_->ip_using_scratchpad()) {
ip_scratch_acc = std::make_shared<
scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
memory_tracking::names::key_iprod_int_dat_in_acc_dt));
}
if (pd()->inner_product_impl_->need_to_transform_filter()) {
spacial_scratch_acc = std::make_shared<scratch_acc_t>(
CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none));
}
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto cudnn_handle = cuda_stream->get_cudnn_handle();
auto cublas_handle = cuda_stream->get_cublas_handle();
std::vector<void *> args;
args.push_back(sc.memory<void *>(ih, src_acc));
args.push_back(sc.memory<void *>(ih, diff_dst_acc));
args.push_back(sc.memory<void *>(ih, diff_wei_acc));
args.push_back(
((pd()->with_bias()) ? sc.memory<void *>(ih, *diff_bias_acc)
: nullptr));
args.push_back((pd()->inner_product_impl_->ip_using_scratchpad()
? sc.memory<void *>(ih, *ip_scratch_acc)
: nullptr));
args.push_back((
pd()->inner_product_impl_->need_to_transform_filter()
? sc.memory<void *>(ih, *spacial_scratch_acc)
: nullptr));
pd()->inner_product_impl_->execute(
cudnn_handle, cublas_handle, args);
});
});
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@@ -0,0 +1,90 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_INNER_PRODUCT_HPP
#define GPU_NVIDIA_CUDNN_INNER_PRODUCT_HPP
#include "cudnn.h"
#include <CL/sycl.hpp>
#include "common/c_types_map.hpp"
#include "common/inner_product_pd.hpp"
#include "common/primitive.hpp"
#include "gpu/nvidia/cudnn_inner_product_impl.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_inner_product_fwd_t : public primitive_t {
public:
using primitive_t::primitive_t;
struct pd_t : public inner_product_fwd_pd_t {
using inner_product_fwd_pd_t::inner_product_fwd_pd_t;
std::shared_ptr<cudnn_inner_product_impl_base_t> inner_product_impl_;
};
status_t execute(const exec_ctx_t &ctx) const override;
virtual const pd_t *pd() const {
return (const pd_t *)primitive_t::pd().get();
}
};
struct cudnn_inner_product_bwd_data_t : public primitive_t {
public:
using primitive_t::primitive_t;
struct pd_t : public inner_product_bwd_data_pd_t {
using inner_product_bwd_data_pd_t::inner_product_bwd_data_pd_t;
std::shared_ptr<cudnn_inner_product_impl_base_t> inner_product_impl_;
};
status_t execute(const exec_ctx_t &ctx) const override;
virtual const pd_t *pd() const {
return (const pd_t *)primitive_t::pd().get();
}
};
struct cudnn_inner_product_bwd_weights_t : public primitive_t {
public:
using primitive_t::primitive_t;
struct pd_t : public inner_product_bwd_weights_pd_t {
using inner_product_bwd_weights_pd_t::inner_product_bwd_weights_pd_t;
std::shared_ptr<cudnn_inner_product_impl_base_t> inner_product_impl_;
};
status_t execute(const exec_ctx_t &ctx) const override;
virtual const pd_t *pd() const {
return (const pd_t *)primitive_t::pd().get();
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@@ -0,0 +1,191 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_INNER_PRODUCT_IMPL_HPP
#define GPU_NVIDIA_CUDNN_INNER_PRODUCT_IMPL_HPP
#include "cublas_v2.h"
#include "cudnn.h"
#include "common/type_helpers.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
namespace {
inline void get_4d_tensor_descriptor(
const memory_desc_t *mem_desc1, int *dims, int *strides) {
memory_desc_t mem_desc = *mem_desc1;
// Forcing tensors dims less than 4 to be 4 {n c h w};
using namespace format_tag;
auto set_dim = [&]() {
if (mem_desc.ndims == 3) {
mem_desc.ndims = 4;
mem_desc.dims[3] = mem_desc.dims[2];
mem_desc.dims[2] = 1;
mem_desc.padded_dims[3] = mem_desc.padded_dims[2];
mem_desc.padded_dims[2] = 1;
} else if (mem_desc.ndims == 2) {
mem_desc.ndims = 4;
mem_desc.dims[3] = 1;
mem_desc.dims[2] = 1;
mem_desc.padded_dims[3] = 1;
mem_desc.padded_dims[2] = 1;
}
};
auto &stride = mem_desc.format_desc.blocking.strides;
auto &dim = mem_desc.dims;
// Forcing strides < 4 to be 4
if (memory_desc_matches_tag(mem_desc, nwc)) {
set_dim();
// promoting nwc(owi) to NHWC = {wc 1 c} to {wc 1 wc c}
mem_desc.format_desc.blocking.strides[3]
= mem_desc.format_desc.blocking.strides[2];
mem_desc.format_desc.blocking.strides[2]
= mem_desc.format_desc.blocking.strides[0];
assert(memory_desc_matches_tag(mem_desc, nhwc)
&& "Tag is not set to NHWC");
} else if (memory_desc_matches_tag(mem_desc, ncw)) {
set_dim();
// promoting ncw(oiw) to NCHW = {wc w 1} to {wc w w 1}
mem_desc.format_desc.blocking.strides[3]
= mem_desc.format_desc.blocking.strides[2];
mem_desc.format_desc.blocking.strides[2]
= mem_desc.format_desc.blocking.strides[1];
assert(memory_desc_matches_tag(mem_desc, nchw)
&& "Tag is not set to NCHW");
} else if (memory_desc_matches_tag(mem_desc, wio)) {
set_dim();
// promoting wcn(wio) to HWCN = {1 n nc} to {1 n ncw nc}
mem_desc.format_desc.blocking.strides[3]
= mem_desc.format_desc.blocking.strides[2];
mem_desc.format_desc.blocking.strides[2] *= mem_desc.dims[3];
assert(memory_desc_matches_tag(mem_desc, hwio)
&& " Tag is not set to HWIO");
} else if (memory_desc_matches_tag(mem_desc, nc)) {
set_dim();
// fixing strides
// promoting nc(oi) to NCHW = {c 1} to {c 1 1 1}
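// e.g. (hypothetical shapes) a 32 x 512 nc tensor with strides {512, 1}
// becomes a {32, 512, 1, 1} nchw tensor with strides {512, 1, 1, 1}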
mem_desc.format_desc.blocking.strides[2]
= mem_desc.format_desc.blocking.strides[1];
mem_desc.format_desc.blocking.strides[3]
= mem_desc.format_desc.blocking.strides[1];
assert(memory_desc_matches_tag(mem_desc, nchw)
&& " Tag is not set to NCHW");
} else if (memory_desc_matches_tag(mem_desc, cn)) {
set_dim();
// fixing strides cn(oi) to HWCN = {1 n} to {1 n nc nc}.
// Note that CHWN exists as well, but for inner product
// we convert it to HWCN. Other primitives may need
// different conversion.
mem_desc.format_desc.blocking.strides[2]
= mem_desc.format_desc.blocking.strides[1]
* mem_desc.padded_dims[1];
mem_desc.format_desc.blocking.strides[3]
= mem_desc.format_desc.blocking.strides[2];
assert(memory_desc_matches_tag(mem_desc, hwio)
&& "Tag is not set to HWIO");
}
convert_dnnl_dims_array(mem_desc.dims, dims, mem_desc.ndims);
convert_dnnl_dims_array(
mem_desc.format_desc.blocking.strides, strides, mem_desc.ndims);
}
} // namespace
struct cudnn_inner_product_impl_base_t {
// The io enum requires the weights to be the last entry so that
// tensor_descs_, which excludes the weights, stays contiguous.
enum io { src = 0, bia, dst, wei, NUM_IO };
cudnnDataType_t data_types_[NUM_IO + 1]; // +1 data-type for accumulation
int ndims_;
int dims_[NUM_IO][DNNL_MAX_NDIMS];
// one extra stride added for transform filter
int strides_[NUM_IO + 1][DNNL_MAX_NDIMS];
cudnnTensorDescriptor_t tensor_descs_[NUM_IO - 1] = {};
size_t workspace_size_ = 0;
float alpha_ = 1, beta_ = 0;
bool with_bias_;
bool scale_bias_ = false;
bool with_relu_ = false, with_eltwise_ = false, with_sum_ = false;
bool filter_using_spatial_format_ = false;
virtual bool need_to_transform_filter() const {
return filter_using_spatial_format_;
}
virtual bool ip_using_scratchpad() const { return (workspace_size_ > 0); }
bool conv_using_scale_scratchpad() const { return scale_bias_; }
void set_bias_dims(cudnnTensorFormat_t format, int ndims, int bias_dim) {
// Set the dimensions and strides for the bias.
// Note that the second dimension of the bias must equal the first
// dimension of the filter, since cuDNN always stores dimensions in
// NCDHW order.
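// e.g. (hypothetical) for CUDNN_TENSOR_NCHW and bias_dim = 64 this yields
// dims_[io::bia] = {1, 64, 1, 1} and strides_[io::bia] = {64, 1, 1, 1}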
for (size_t i = 0; i < ndims; ++i) {
dims_[io::bia][i] = 1;
strides_[io::bia][i] = (format != CUDNN_TENSOR_NHWC ? 1 : bias_dim);
}
dims_[io::bia][1] = bias_dim;
strides_[io::bia][1] = 1;
strides_[io::bia][0] = bias_dim;
}
virtual status_t init(engine_t * /*engine*/, inner_product_pd_t * /*pd*/,
bool /*with_relu*/, bool /*with_eltwise*/, bool /*with_sum */,
bool /*using_fused_path_for_blocking*/)
= 0;
virtual void execute(cudnnHandle_t /*handle*/,
cublasHandle_t /*cublas_handle*/,
const std::vector<void *> & /*args*/) const = 0;
};
struct cudnn_inner_product_fwd_base_t : public cudnn_inner_product_impl_base_t {
float output_scales_; // alpha in gemm
float sum_scale_; // beta in gemm
float eltwise_alpha(const inner_product_pd_t *pd) const {
const int eltwise_idx
= pd->attr()->post_ops_.find(primitive_kind::eltwise);
return with_eltwise_
? pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alpha
: 0.0f;
}
float sum_scale(const inner_product_pd_t *pd) const {
const int sum_idx = pd->attr()->post_ops_.find(primitive_kind::sum);
return with_sum_ ? pd->attr()->post_ops_.entry_[sum_idx].sum.scale
: 0.0f;
}
dnnl::impl::alg_kind_t eltwise_algorithm_kind(
const inner_product_pd_t *pd) const {
const int eltwise_idx
= pd->attr()->post_ops_.find(primitive_kind::eltwise);
return pd->attr()->post_ops_.entry_[eltwise_idx].eltwise.alg;
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@@ -0,0 +1,89 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/cudnn_lrn.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "sycl/sycl_buffer_memory_storage.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
status_t cudnn_lrn_fwd_t::execute(const exec_ctx_t &ctx) const {
if (memory_desc_wrapper(pd()->desc()->data_desc).has_zero_dim())
return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto wrksp_acc = pd()->is_training()
? CTX_OUT_ACCESSOR(DNNL_ARG_WORKSPACE)
: dst_acc;
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
std::vector<void *> args {sc.memory<void *>(ih, src_acc),
sc.memory<void *>(ih, dst_acc),
sc.memory<void *>(ih, wrksp_acc)};
pd()->lrn_impl_->execute(handle, args);
});
});
}
status_t cudnn_lrn_bwd_t::execute(const exec_ctx_t &ctx) const {
if (memory_desc_wrapper(pd()->desc()->data_desc).has_zero_dim())
return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC);
auto ws_acc = CTX_IN_ACCESSOR(DNNL_ARG_WORKSPACE);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) mutable {
std::vector<void *> args;
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
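// Argument order follows the io enum of the LRN implementation:
// src, workspace (the forward dst), diff_src, diff_dst.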
args.push_back(sc.memory<void *>(ih, src_acc));
args.push_back(sc.memory<void *>(ih, ws_acc));
args.push_back(sc.memory<void *>(ih, diff_src_acc));
args.push_back(sc.memory<void *>(ih, diff_dst_acc));
pd()->lrn_impl_->execute(handle, args);
});
});
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@@ -0,0 +1,132 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_LRN_HPP
#define GPU_NVIDIA_CUDNN_LRN_HPP
#include "cudnn.h"
#include <CL/sycl.hpp>
#include "common/c_types_map.hpp"
#include "common/lrn_pd.hpp"
#include "common/primitive.hpp"
#include "gpu/nvidia/cudnn_lrn_impl.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_lrn_fwd_t : public primitive_t {
struct pd_t : public lrn_fwd_pd_t {
using lrn_fwd_pd_t::lrn_fwd_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_lrn_fwd_t);
status_t init(engine_t *) {
using namespace data_type;
bool ok = true && is_fwd()
&& utils::one_of(desc()->prop_kind,
prop_kind::forward_inference,
prop_kind::forward_training)
&& utils::one_of(
desc()->alg_kind, alg_kind::lrn_across_channels)
&& utils::one_of(desc()->data_desc.data_type, f32, f16)
&& attr()->has_default_values()
// Make sure local size is not even (issue #75)
&& desc_.local_size % 2
// lrn does not support blocking
&& src_md()->format_desc.blocking.inner_nblks == 0;
if (!ok) return status::unimplemented;
if (has_zero_dim_memory()) return status::success;
if (is_training()) { ws_md_ = *dst_md(); }
lrn_impl_.reset(new cudnn_lrn_fwd_impl_t());
return lrn_impl_->init(this);
}
bool is_training() const {
return desc_.prop_kind == prop_kind::forward_training;
}
std::shared_ptr<cudnn_lrn_impl_base_t> lrn_impl_;
};
cudnn_lrn_fwd_t(const pd_t *apd) : primitive_t(apd) {}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
struct cudnn_lrn_bwd_t : public primitive_t {
struct pd_t : public lrn_bwd_pd_t {
using lrn_bwd_pd_t::lrn_bwd_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_lrn_bwd_t);
status_t init(engine_t *) {
bool ok = true && !is_fwd()
&& utils::one_of(
desc()->alg_kind, alg_kind::lrn_across_channels)
&& utils::one_of(desc()->data_desc.data_type,
data_type::f16, data_type::f32)
&& set_default_formats_common()
&& attr()->has_default_values()
// Make sure local size is not even (issue #75)
&& desc_.local_size % 2
// lrn does not support blocking
&& src_md()->format_desc.blocking.inner_nblks == 0
&& diff_dst_md()->format_desc.blocking.inner_nblks == 0;
if (!ok) return status::unimplemented;
if (has_zero_dim_memory()) { return status::success; };
ws_md_ = *diff_dst_md();
if (!compare_ws(hint_fwd_pd_)) return status::unimplemented;
lrn_impl_.reset(new cudnn_lrn_bwd_impl_t());
return lrn_impl_->init(this);
}
std::shared_ptr<cudnn_lrn_impl_base_t> lrn_impl_;
};
cudnn_lrn_bwd_t(const pd_t *apd) : primitive_t(apd) {}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@@ -0,0 +1,201 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_LRN_IMPL_HPP
#define GPU_NVIDIA_CUDNN_LRN_IMPL_HPP
#include "cudnn.h"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_lrn_impl_base_t {
virtual ~cudnn_lrn_impl_base_t() {
if (lrn_desc) {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyLRNDescriptor, lrn_desc);
}
for (size_t i = 0; i < NUM_IO; i++) {
if (tensor_descs[i]) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyTensorDescriptor, tensor_descs[i]);
}
}
}
virtual status_t init(const lrn_pd_t *pd) = 0;
virtual void execute(
cudnnHandle_t handle, const std::vector<void *> &args) const = 0;
protected:
enum io { src_idx = 0, dst_idx, d_src_idx, d_dst_idx, NUM_IO };
cudnnDataType_t data_types[NUM_IO];
int ndims;
int dst_size;
int dims[NUM_IO][DNNL_MAX_NDIMS];
int strides[NUM_IO][DNNL_MAX_NDIMS];
float alpha = 1.0f;
float beta = 0.0f;
bool is_training;
double lrn_alpha;
double lrn_beta;
double lrn_K;
unsigned int lrn_N;
cudnnLRNMode_t lrn_mode;
cudnnLRNDescriptor_t lrn_desc = nullptr;
cudnnTensorDescriptor_t tensor_descs[NUM_IO] = {};
virtual status_t init_common(const lrn_pd_t *pd) {
ndims = std::max(4, pd->ndims());
if (ndims > 6) { return status::invalid_arguments; }
const bool do_scaling
= pd->src_md()->data_type == dnnl_data_type_t::dnnl_s8;
const auto scales_0 = pd->attr()->scales_.get(1).scales_;
const auto lrn_desc = pd->desc();
const auto dst_wrap = memory_desc_wrapper(pd->dst_md());
dst_size = dst_wrap.nelems();
alpha = do_scaling ? scales_0[0] : 1.0f;
is_training = pd->desc()->prop_kind == prop_kind::forward_training;
lrn_K = lrn_desc->lrn_k;
lrn_N = lrn_desc->local_size;
lrn_alpha = lrn_desc->lrn_alpha;
lrn_beta = lrn_desc->lrn_beta;
// Initialise lrn algorithm
CHECK(convert_alg_kind(pd->desc()->alg_kind, &lrn_mode));
// Set strides and dimensions
convert_dims(pd->src_md()->padded_dims, dims[src_idx], pd->ndims());
convert_dims(pd->src_md()->format_desc.blocking.strides,
strides[src_idx], pd->ndims());
// Set datatype
CHECK(convert_data_type(pd->src_md(), &data_types[src_idx]));
// Initialise tensor descriptor
CHECK(create_and_set_tensor_descriptor(&tensor_descs[src_idx],
data_types[src_idx], ndims, dims[src_idx], strides[src_idx]));
CHECK(create_and_set_lrn_descriptor());
return status::success;
}
virtual status_t create_and_set_lrn_descriptor() {
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateLRNDescriptor, &lrn_desc));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetLRNDescriptor, lrn_desc, lrn_N,
lrn_alpha, lrn_beta, lrn_K));
return status::success;
}
status_t convert_alg_kind(
alg_kind_t alg_kind, cudnnLRNMode_t *cuda_alg_kind) {
if (alg_kind == alg_kind::lrn_across_channels) {
*cuda_alg_kind = cudnnLRNMode_t::CUDNN_LRN_CROSS_CHANNEL_DIM1;
} else {
return status::unimplemented;
}
return status::success;
}
};
struct cudnn_lrn_fwd_impl_t : public cudnn_lrn_impl_base_t {
status_t init(const lrn_pd_t *pd) override {
CHECK(init_common(pd));
convert_dims(pd->dst_md()->padded_dims, dims[dst_idx], pd->ndims());
convert_dims(pd->dst_md()->format_desc.blocking.strides,
strides[dst_idx], pd->ndims());
CHECK(convert_data_type(pd->dst_md(), &data_types[dst_idx]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs[dst_idx],
data_types[dst_idx], ndims, dims[dst_idx], strides[dst_idx]));
return status::success;
}
void execute(cudnnHandle_t handle,
const std::vector<void *> &args) const override {
CUDNN_EXECUTE_FUNC(cudnnLRNCrossChannelForward, handle, lrn_desc,
lrn_mode, &alpha, tensor_descs[src_idx], args[0], &beta,
tensor_descs[dst_idx], args[1]);
if (is_training) {
float alpha = 1.0f;
float beta = 0.0f;
// With alpha = 1 and beta = 0 cudnnAddTensor copies the forward dst
// into the workspace for later use in backward.
CUDNN_EXECUTE_FUNC(cudnnAddTensor, handle, &alpha,
tensor_descs[dst_idx], args[dst_idx], &beta,
tensor_descs[dst_idx], args[2]);
}
}
};
struct cudnn_lrn_bwd_impl_t : public cudnn_lrn_impl_base_t {
status_t init(const lrn_pd_t *pd) override {
CHECK(init_common(pd));
// Set dimensions
convert_dims(
pd->diff_dst_md()->padded_dims, dims[dst_idx], pd->ndims());
convert_dims(
pd->diff_src_md()->padded_dims, dims[d_src_idx], pd->ndims());
convert_dims(
pd->diff_dst_md()->padded_dims, dims[d_dst_idx], pd->ndims());
// Set strides
convert_dims(pd->diff_dst_md()->format_desc.blocking.strides,
strides[dst_idx], pd->ndims());
convert_dims(pd->diff_src_md()->format_desc.blocking.strides,
strides[d_src_idx], pd->ndims());
convert_dims(pd->diff_dst_md()->format_desc.blocking.strides,
strides[d_dst_idx], pd->ndims());
// Set datatypes
CHECK(convert_data_type(pd->diff_dst_md(), &data_types[dst_idx]));
CHECK(convert_data_type(pd->diff_src_md(), &data_types[d_src_idx]));
CHECK(convert_data_type(pd->diff_dst_md(), &data_types[d_dst_idx]));
// Initialise tensor descriptors
CHECK(create_and_set_tensor_descriptor(&tensor_descs[dst_idx],
data_types[dst_idx], ndims, dims[dst_idx], strides[dst_idx]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs[d_src_idx],
data_types[d_src_idx], ndims, dims[d_src_idx],
strides[d_src_idx]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs[d_dst_idx],
data_types[d_dst_idx], ndims, dims[d_dst_idx],
strides[d_dst_idx]));
return status::success;
}
void execute(cudnnHandle_t handle,
const std::vector<void *> &args) const override {
CUDNN_EXECUTE_FUNC_V(cudnnLRNCrossChannelBackward, handle, lrn_desc,
lrn_mode, &alpha, tensor_descs[dst_idx], args[dst_idx],
tensor_descs[d_dst_idx], args[d_dst_idx], tensor_descs[src_idx],
args[src_idx], &beta, tensor_descs[d_src_idx], args[d_src_idx]);
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@@ -0,0 +1,87 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/cudnn_matmul.hpp"
#include "common/c_types_map.hpp"
#include "common/dnnl_thread.hpp"
#include "common/type_helpers.hpp"
#include "gpu/nvidia/cudnn_matmul_executor.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
status_t cudnn_matmul_t::execute(const exec_ctx_t &ctx) const {
const bool with_bias = matmul_impl_->with_bias();
const bool has_runtime_args = matmul_impl_->has_runtime_params();
const auto src_d = ctx.memory_mdw(DNNL_ARG_SRC, pd()->src_md());
const auto weights_d = ctx.memory_mdw(DNNL_ARG_WEIGHTS, pd()->weights_md());
const auto dst_d = ctx.memory_mdw(DNNL_ARG_DST, pd()->dst_md());
const auto bias_d = with_bias
? ctx.memory_mdw(DNNL_ARG_BIAS, pd()->weights_md(1))
: nullptr;
status_t status;
if (has_runtime_args) {
// Initialise all runtime parameters
status = matmul_impl_->init_parameters(src_d, weights_d, dst_d, bias_d);
if (status != status::success) return status;
}
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
if (!pd()->attr()->output_scales_.defined()) {
auto &buff = utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
&CTX_IN_STORAGE(DNNL_ARG_ATTR_OUTPUT_SCALES))
->buffer();
auto ev = copy(cuda_stream->queue(), buff,
reinterpret_cast<uint8_t *>(output_scale_));
ev.wait();
}
const auto scratchpad_type = matmul_impl_->get_scratchpad_type();
const auto scratchpad_size = matmul_impl_->with_scratchpad()
? (dst_d.nelems() * types::data_type_size(scratchpad_type))
: 0;
status = executor_->execute(ctx, ctx.stream()->engine(), matmul_impl_,
*output_scale_, scratchpad_size);
if (has_runtime_args) {
auto &evts = cuda_stream->get_deps();
for (auto e : evts) {
e.wait();
}
matmul_impl_->cleanup();
}
return status;
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@@ -0,0 +1,151 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_MATMUL_HPP
#define GPU_NVIDIA_CUDNN_MATMUL_HPP
#include <assert.h>
#include "common/matmul_pd.hpp"
#include "common/primitive.hpp"
#include "gpu/nvidia/cudnn_matmul_executor.hpp"
#include "gpu/nvidia/cudnn_matmul_impl.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_matmul_t : public primitive_t {
struct pd_t : public matmul_pd_t {
using matmul_pd_t::matmul_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_matmul_t);
status_t init(engine_t *) {
using namespace data_type;
using smask_t = primitive_attr_t::skip_mask_t;
data_type_t src_dt = src_md()->data_type;
data_type_t dst_dt = dst_md()->data_type;
data_type_t wei_dt = weights_md(0)->data_type;
data_type_t bia_dt
= with_bias() ? weights_md(1)->data_type : data_type::f32;
bool f32_case = utils::everyone_is(f32, src_dt, wei_dt, dst_dt);
bool f16_case = utils::everyone_is(f16, src_dt, wei_dt, dst_dt);
bool s8_case = utils::everyone_is(s8, src_dt, wei_dt)
&& utils::one_of(dst_dt, s8, f32);
bool ok = attr()->has_default_values(
smask_t::oscale_runtime | smask_t::post_ops)
&& attr_oscale_ok() && attr_post_ops_ok()
&& set_default_formats()
&& (f32_case || f16_case || s8_case)
&& IMPLICATION(with_bias(),
(IMPLICATION(f32_case, utils::one_of(bia_dt, f32))
&& IMPLICATION(f16_case,
utils::one_of(bia_dt, f16, f32))
&& IMPLICATION(s8_case,
utils::one_of(bia_dt, s8, f32))));
if (!ok) return status::unimplemented;
return status::success;
}
private:
bool attr_oscale_ok() const {
const auto &oscale = attr()->output_scales_;
return oscale.mask_ == 0 || oscale.mask_ == (1 << (batched() + 1));
}
bool attr_post_ops_ok() const {
using namespace primitive_kind;
const auto &p = attr()->post_ops_;
switch (p.len()) {
case 0: return true;
case 1: return p.contain(sum, 0) || p.contain(eltwise, 0);
case 2: return p.contain(sum, 0) && p.contain(eltwise, 1);
default: return false;
}
}
};
cudnn_matmul_t(const pd_t *apd) : primitive_t(apd) {}
status_t init(engine_t *engine) override {
matmul_impl_.reset(new cudnn_matmul_impl_t());
const auto status
= matmul_impl_->init((matmul_pd_t *)primitive_t::pd().get());
if (pd()->attr()->output_scales_.defined()) {
output_scale_ = pd()->attr()->output_scales_.scales_;
} else {
// Only single-element scale is supported
output_scale_ = new float;
}
const bool with_bias = matmul_impl_->with_bias();
const bool has_runtime_args = matmul_impl_->has_runtime_params();
const bool with_scratchpad = matmul_impl_->with_scratchpad();
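// Pick the executor specialized for this combination of runtime
// dimensions, bias and scratchpad usage.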
if (with_scratchpad && has_runtime_args && with_bias) {
executor_.reset(new cudnn_matmul_scratch_runtime_args_bias_exec_t);
} else if (with_scratchpad && has_runtime_args) {
executor_.reset(new cudnn_matmul_runtime_args_scratch_exec_t);
} else if (has_runtime_args && with_bias) {
executor_.reset(new cudnn_matmul_runtime_args_bias_exec_t);
} else if (has_runtime_args) {
executor_.reset(new cudnn_matmul_runtime_args_exec_t);
} else if (with_bias && with_scratchpad) {
executor_.reset(new cudnn_matmul_bias_scratch_exec_t);
} else if (with_scratchpad) {
executor_.reset(new cudnn_matmul_scratch_exec_t);
} else if (with_bias) {
executor_.reset(new cudnn_matmul_bias_exec_t);
} else if (!with_scratchpad && !has_runtime_args && !with_bias) {
executor_.reset(new cudnn_matmul_exec_t);
} else {
return status::unimplemented;
}
return status;
}
status_t execute(const exec_ctx_t &ctx) const override;
virtual ~cudnn_matmul_t() {
if (!pd()->attr()->output_scales_.defined()) { delete output_scale_; }
}
std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_;
std::shared_ptr<cudnn_matmul_exec_base_t> executor_;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
float *output_scale_;
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@@ -0,0 +1,300 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_MATMUL_EXECUTOR_HPP
#define GPU_NVIDIA_CUDNN_MATMUL_EXECUTOR_HPP
#include "gpu/nvidia/cudnn_matmul.hpp"
#include "gpu/nvidia/cudnn_matmul_impl.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include <memory>
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_matmul_exec_base_t {
virtual status_t execute(const exec_ctx_t &ctx, engine_t *engine,
const std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_,
float output_scale, std::size_t scratchpad_size)
= 0;
protected:
template <typename read_acc_t, typename write_acc_t, typename scratch_acc_t,
typename bias_acc_t>
void interop_task(std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_,
engine_t *engine, cl::sycl::handler &cgh,
nvidia::sycl_cuda_stream_t *cuda_stream, read_acc_t weights_acc,
read_acc_t src_acc, write_acc_t dst_acc, bias_acc_t bias_acc,
scratch_acc_t scratch_acc, float output_scale) {
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto cublas_handle = cuda_stream->get_cublas_handle();
auto cudnn_handle = cuda_stream->get_cudnn_handle();
auto scratch = maybe_cast_to_ptr(scratch_acc, sc, ih);
auto bias = maybe_cast_to_ptr(bias_acc, sc, ih);
auto weights = sc.memory<void *>(ih, weights_acc);
auto src = sc.memory<void *>(ih, src_acc);
auto dst = sc.memory<void *>(ih, dst_acc);
matmul_impl_->execute(cublas_handle, cudnn_handle, weights, src,
dst, bias, scratch, output_scale);
});
}
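// These overloads let interop_task accept either a real SYCL accessor or a
// nullptr for the optional bias / scratchpad arguments.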
template <typename T, cl::sycl::access::mode md, typename sc_t>
void *maybe_cast_to_ptr(cl::sycl::accessor<T, 1, md> acc, sc_t &sc,
const cl::sycl::interop_handler &ih) const {
return sc.template memory<void *>(ih, acc);
}
template <typename sc_t>
std::nullptr_t maybe_cast_to_ptr(std::nullptr_t acc, sc_t &,
const cl::sycl::interop_handler &ih) const {
return acc;
}
};
struct cudnn_matmul_scratch_runtime_args_base_exec_t
: public cudnn_matmul_exec_base_t {
virtual status_t execute(const exec_ctx_t &ctx, engine_t *engine,
const std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_,
float output_scale, std::size_t scratchpad_size)
= 0;
protected:
void init_scratch_buffer(std::size_t scratch_size) {
if (scratch_size > 0) {
scratch_buff_.reset(new cl::sycl::buffer<uint8_t, 1>(scratch_size));
}
}
std::shared_ptr<cl::sycl::buffer<uint8_t, 1>> scratch_buff_ {nullptr};
};
struct cudnn_matmul_scratch_runtime_args_bias_exec_t
: public cudnn_matmul_scratch_runtime_args_base_exec_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
const std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_,
float output_scale, std::size_t scratchpad_size) override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
init_scratch_buffer(scratchpad_size);
return cuda_stream->interop_task([=](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto bias_acc = CTX_IN_ACCESSOR(DNNL_ARG_BIAS);
auto scratch_acc
= scratch_buff_
->get_access<cl::sycl::access::mode::read_write>(
cgh);
interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc,
src_acc, dst_acc, bias_acc, scratch_acc, output_scale);
});
}
};
struct cudnn_matmul_runtime_args_scratch_exec_t
: public cudnn_matmul_scratch_runtime_args_base_exec_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
const std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_,
float output_scale, std::size_t scratchpad_size) override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
init_scratch_buffer(scratchpad_size);
return cuda_stream->interop_task([=](cl::sycl::handler &cgh) {
auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto scratch_acc
= scratch_buff_
->get_access<cl::sycl::access::mode::read_write>(
cgh);
interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc,
src_acc, dst_acc, nullptr, scratch_acc, output_scale);
});
}
};
struct cudnn_matmul_runtime_args_bias_exec_t : public cudnn_matmul_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
const std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_,
float output_scale, std::size_t scratchpad_size) override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([=](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto bias_acc = CTX_IN_ACCESSOR(DNNL_ARG_BIAS);
interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc,
src_acc, dst_acc, bias_acc, nullptr, output_scale);
});
}
};
struct cudnn_matmul_runtime_args_exec_t : public cudnn_matmul_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
const std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_,
float output_scale, std::size_t scratchpad_size) override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([=](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc,
src_acc, dst_acc, nullptr, nullptr, output_scale);
});
}
};
struct cudnn_matmul_bias_scratch_exec_t : public cudnn_matmul_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
const std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_,
float output_scale, std::size_t scratchpad_size) override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([=](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto bias_acc = CTX_IN_ACCESSOR(DNNL_ARG_BIAS);
using read_write_acc_t = cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read_write>;
auto scratch_acc = read_write_acc_t(
utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
ctx.get_scratchpad_grantor()
.get_memory_storage(memory_tracking::names::
key_matmul_dst_in_acc_dt)
.get())
->buffer()
.get_access<cl::sycl::access::mode::read_write>(
cgh));
interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc,
src_acc, dst_acc, bias_acc, scratch_acc, output_scale);
});
}
};
struct cudnn_matmul_scratch_exec_t : public cudnn_matmul_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
const std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_,
float output_scale, std::size_t scratchpad_size) override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([=](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
using read_write_acc_t = cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::read_write>;
auto scratch_acc = read_write_acc_t(
utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
ctx.get_scratchpad_grantor()
.get_memory_storage(memory_tracking::names::
key_matmul_dst_in_acc_dt)
.get())
->buffer()
.get_access<cl::sycl::access::mode::read_write>(
cgh));
interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc,
src_acc, dst_acc, nullptr, scratch_acc, output_scale);
});
}
};
struct cudnn_matmul_bias_exec_t : public cudnn_matmul_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
const std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_,
float output_scale, std::size_t scratchpad_size) override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([=](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto bias_acc = CTX_IN_ACCESSOR(DNNL_ARG_BIAS);
interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc,
src_acc, dst_acc, bias_acc, nullptr, output_scale);
});
}
};
struct cudnn_matmul_exec_t : public cudnn_matmul_exec_base_t {
status_t execute(const exec_ctx_t &ctx, engine_t *engine,
const std::shared_ptr<cudnn_matmul_impl_t> matmul_impl_,
float output_scale, std::size_t scratchpad_size) override {
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([=](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto wt_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
interop_task(matmul_impl_, engine, cgh, cuda_stream, wt_acc,
src_acc, dst_acc, nullptr, nullptr, output_scale);
});
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@@ -0,0 +1,403 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_MATMUL_IMPL_HPP
#define GPU_NVIDIA_CUDNN_MATMUL_IMPL_HPP
#include "cudnn.h"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_matmul_impl_t {
bool with_eltwise(int position, const matmul_pd_t *pd) const {
return pd->attr()->post_ops_.contain(primitive_kind::eltwise, position);
}
float eltwise_alpha(const matmul_pd_t *pd) const {
int eltwise_idx_ = pd->attr()->post_ops_.find(primitive_kind::eltwise);
return with_eltwise(0, pd) || with_eltwise(1, pd)
? pd->attr()->post_ops_.entry_[eltwise_idx_].eltwise.alpha
: 1.0f;
}
float eltwise_beta(const matmul_pd_t *pd) const {
int eltwise_idx_ = pd->attr()->post_ops_.find(primitive_kind::eltwise);
return with_eltwise(0, pd) || with_eltwise(1, pd)
? pd->attr()->post_ops_.entry_[eltwise_idx_].eltwise.beta
: 0.0f;
}
alg_kind_t eltwise_algo(const matmul_pd_t *pd) const {
int eltwise_idx_ = pd->attr()->post_ops_.find(primitive_kind::eltwise);
return with_eltwise(0, pd) || with_eltwise(1, pd)
? pd->attr()->post_ops_.entry_[eltwise_idx_].eltwise.alg
: dnnl_alg_kind_undef;
}
bool with_sum(const matmul_pd_t *pd) const {
return pd->attr()->post_ops_.contain(primitive_kind::sum, 0)
|| pd->attr()->post_ops_.contain(primitive_kind::sum, 1);
}
// Returns scaling factor for post-ops=sum operation
float sum_scale(const matmul_pd_t *pd) const {
int sum_idx_ = pd->attr()->post_ops_.find(primitive_kind::sum);
return pd->attr()->post_ops_.entry_[sum_idx_].sum.scale;
}
// creates an activation descriptor based on the element-wise post-op specified
status_t create_and_set_op_descriptor(const matmul_pd_t *pd) {
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnCreateActivationDescriptor, &act_desc_));
cudnnActivationMode_t mode;
switch (eltwise_algo(pd)) {
case alg_kind::eltwise_relu:
mode = cudnnActivationMode_t::CUDNN_ACTIVATION_RELU;
break;
case alg_kind::eltwise_bounded_relu:
mode = cudnnActivationMode_t::CUDNN_ACTIVATION_CLIPPED_RELU;
break;
case alg_kind::eltwise_tanh:
mode = cudnnActivationMode_t::CUDNN_ACTIVATION_TANH;
break;
case alg_kind::eltwise_elu:
mode = cudnnActivationMode_t::CUDNN_ACTIVATION_ELU;
break;
case alg_kind::eltwise_logistic:
mode = cudnnActivationMode_t::CUDNN_ACTIVATION_SIGMOID;
break;
default: return status::unimplemented;
}
// NaNs by default are propagated in oneDNN, although the forward
// convolution routine does not support this.
auto propagate_nan = cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN;
// For ReLU, a ceiling of 0 means no limit.
double ceiling = eltwise_alpha(pd);
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetActivationDescriptor, act_desc_,
mode, propagate_nan, ceiling));
return status::success;
}
status_t init(matmul_pd_t *pd) {
CHECK(get_cublas_data_type(pd->src_md()->data_type, src_type_));
CHECK(get_cublas_data_type(pd->weights_md()->data_type, weights_type_));
isbatched_ = pd->batched();
memory_desc_wrapper src_d = memory_desc_wrapper(pd->src_md());
memory_desc_wrapper weights_d = memory_desc_wrapper(pd->weights_md());
memory_desc_wrapper dst_d = memory_desc_wrapper(pd->dst_md());
with_bias_ = pd->with_bias();
if ((with_bias_)
&& (pd->weights_md(1)->data_type != pd->dst_md()->data_type)) {
// When the bias data type differs from that of dst,
// the output needs to be reordered.
bias_dt_mismatch_ = true;
reorder_required_ = true;
CHECK(get_cublas_data_type(
pd->weights_md(1)->data_type, dst_type_));
} else {
CHECK(get_cublas_data_type(pd->dst_md()->data_type, dst_type_));
}
// cuBLAS only supports the s8s8f32 configuration, so one final
// reorder is required when the cfg is s8s8s8.
if (dst_type_ == cudaDataType_t::CUDA_R_8I) {
reorder_required_ = true;
dst_type_ = cudaDataType_t::CUDA_R_32F;
}
if (with_eltwise(0, pd) || with_eltwise(1, pd)) {
with_eltwise_ = true;
CHECK(create_and_set_op_descriptor(pd));
}
// Set parameter when post-op sum is specified
if (with_sum(pd)) { post_op_sum_ = sum_scale(pd); }
has_runtime_params_ = src_d.has_runtime_dims_or_strides()
|| dst_d.has_runtime_dims_or_strides()
|| weights_d.has_runtime_dims_or_strides();
if (!has_runtime_params_) {
// Initialise all gemm parameters if there are no runtime parameters
init_parameters(src_d, weights_d, dst_d,
memory_desc_wrapper(pd->weights_md(1)));
if (with_scratchpad()) { book_scratchpad(pd, dst_d.nelems()); }
}
if (reorder_required_ || bias_dt_mismatch_) { with_scratchpad_ = true; }
return status::success;
}
status_t book_scratchpad(matmul_pd_t *pd, dim_t num_elems) {
if (has_runtime_params_) { return status::unimplemented; }
// This case should only be called when no runtime parameters are
// specified
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_matmul_dst_in_acc_dt, num_elems,
types::data_type_size(get_scratchpad_type()));
return status::success;
}
bool isbatched() { return isbatched_; }
bool with_bias() { return with_bias_; }
bool with_scratchpad() { return with_scratchpad_; }
bool has_runtime_params() { return has_runtime_params_; }
dnnl_data_type_t get_scratchpad_type() { return scratchpad_type_; }
void convert_dims_matmul(
const dnnl_dim_t *dims, int *new_dims, int n_dims) {
// Shift the dimensions because cudnnAddTensor() doesn't work when
// bia_mask=1.
if (n_dims == 3) { return convert_dims(dims, new_dims, n_dims); }
new_dims[0] = 1;
for (size_t i = 0; i < n_dims; i++) {
new_dims[i + 1] = static_cast<int>(dims[i]);
}
for (size_t i = n_dims; i < 4; i++) {
new_dims[i + 1] = 1;
}
}
status_t init_gemm_parameters(const memory_desc_wrapper src_d,
const memory_desc_wrapper weights_d,
const memory_desc_wrapper dst_d) {
const auto &dst_bd = dst_d.blocking_desc();
if (isbatched_) { batch_count_ = dst_d.dims()[0]; }
const dim_t M = dst_d.dims()[isbatched_ + 1];
const dim_t N = dst_d.dims()[isbatched_ + 0];
const dim_t K = src_d.dims()[isbatched_ + 1];
M_ = (int)M;
N_ = (int)N;
K_ = (int)K;
const auto &src_strides = &src_d.blocking_desc().strides[isbatched_];
const auto &weights_strides
= &weights_d.blocking_desc().strides[isbatched_];
// A matrix is the weights
transA_ = weights_strides[1] == 1
&& weights_d.dims()[isbatched_ + 0] > 1
? cublasOperation_t::CUBLAS_OP_N
: cublasOperation_t::CUBLAS_OP_T;
// B matrix is the src
transB_ = src_strides[1] == 1 && src_d.dims()[isbatched_ + 0] > 1
? cublasOperation_t::CUBLAS_OP_N
: cublasOperation_t::CUBLAS_OP_T;
lda_ = (int)
weights_strides[transA_ == cublasOperation_t::CUBLAS_OP_N ? 0
: 1];
ldb_ = (int)
src_strides[transB_ == cublasOperation_t::CUBLAS_OP_N ? 0 : 1];
ldc_ = (int)dst_bd.strides[isbatched_ + 0];
if (isbatched_) {
// These parameters are required for cublasGemmStridedBatchedEx()
stride_a_ = (transA_ == cublasOperation_t::CUBLAS_OP_N) ? lda_ * K_
: lda_ * M_;
stride_b_ = (transB_ == cublasOperation_t::CUBLAS_OP_N) ? ldb_ * N_
: ldb_ * K_;
stride_c_ = ldc_ * N_;
}
return status::success;
}
status_t init_parameters(const memory_desc_wrapper src_d,
const memory_desc_wrapper weights_d,
const memory_desc_wrapper dst_d, const memory_desc_wrapper bias_d) {
// Matmul supports runtime parameters for dimensions and scales.
// We need to initialize them in the execute function.
init_gemm_parameters(src_d, weights_d, dst_d);
if (with_bias_ || reorder_required_ || with_eltwise_) {
// Initialise cuDNN descriptors
cudnnDataType_t data_types[NUM_IO];
int ndims = dst_d.ndims() < 4 ? 4 : dst_d.ndims();
int dims[NUM_IO][DNNL_MAX_NDIMS];
int strides[NUM_IO][DNNL_MAX_NDIMS];
convert_dims_matmul(dst_d.dims(), dims[dst], dst_d.ndims());
CHECK(convert_data_type(dst_d.md_, &data_types[dst], false));
convert_dims_matmul(
dst_d.blocking_desc().strides, strides[dst], dst_d.ndims());
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[dst],
data_types[dst], ndims, dims[dst], strides[dst]));
if (reorder_required_ && !bias_dt_mismatch_) {
// If reorder is required, we need to create a scratchpad memory
// to store the intermediate result
with_scratchpad_ = true;
scratchpad_type_ = data_type::f32;
CHECK(create_and_set_tensor_descriptor(&temp_mem_desc_,
cudnnDataType_t::CUDNN_DATA_FLOAT, ndims, dims[dst],
strides[dst]));
}
if (with_bias_) {
// Create bias and destination tensor descriptors
convert_dims_matmul(bias_d.dims(), dims[bias], bias_d.ndims());
convert_dims_matmul(bias_d.blocking_desc().strides,
strides[bias], bias_d.ndims());
CHECK(convert_data_type(bias_d.md_, &data_types[bias], false));
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[bias],
data_types[bias], ndims, dims[bias], strides[bias]));
if (bias_dt_mismatch_) {
with_scratchpad_ = true;
scratchpad_type_ = bias_d.data_type();
CHECK(create_and_set_tensor_descriptor(&temp_mem_desc_,
data_types[bias], ndims, dims[dst], strides[dst]));
}
}
}
return status::success;
}
void execute(cublasHandle_t cublas_handle, cudnnHandle_t cudnn_handle,
void *a, void *b, void *c, void *bias, void *scratch,
const float scales) {
float gemm_beta = 0;
if (!bias_dt_mismatch_ && !reorder_required_) {
// Case where no reorder is required, scratchpad points to dst (c)
scratch = c;
temp_mem_desc_ = tensor_descs_[io::dst];
gemm_beta = post_op_sum_;
}
if (isbatched_) {
// Calls cublasGemmStridedBatchedEx()
CUBLAS_EXECUTE_FUNC(cublasGemmStridedBatchedEx, cublas_handle,
transA_, transB_, M_, N_, K_, &scales, a, weights_type_,
lda_, stride_a_, b, src_type_, ldb_, stride_b_, &gemm_beta,
scratch, dst_type_, ldc_, stride_c_, batch_count_,
acc_type_, gemm_algo_);
} else {
// Calls cublasGemmEx()
CUBLAS_EXECUTE_FUNC(cublasGemmEx, cublas_handle, transA_, transB_,
M_, N_, K_, &scales, a, weights_type_, lda_, b, src_type_,
ldb_, &gemm_beta, scratch, dst_type_, ldc_, acc_type_,
gemm_algo_);
}
if (with_bias_) {
// When bias is specified call cudnnAddTensor()
float bias_beta = 1;
CUDNN_EXECUTE_FUNC(cudnnAddTensor, cudnn_handle, &scales,
tensor_descs_[io::bias], bias, &bias_beta, temp_mem_desc_,
scratch);
}
if (with_eltwise_) {
// Perform elementwise operation if specified
float alpha = 1;
float beta = 0;
CUDNN_EXECUTE_FUNC(cudnnActivationForward, cudnn_handle, act_desc_,
&alpha, temp_mem_desc_, scratch, &beta, temp_mem_desc_,
scratch);
}
if (reorder_required_) {
// Reorder from scratchpad to destination if required
float reorder_alpha = 1, reorder_beta = 0;
CUDNN_EXECUTE_FUNC(cudnnTransformTensor, cudnn_handle,
&reorder_alpha, temp_mem_desc_, scratch, &post_op_sum_,
tensor_descs_[io::dst], c);
}
}
~cudnn_matmul_impl_t() { cleanup(); }
void cleanup() {
if (act_desc_) {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyActivationDescriptor, act_desc_);
act_desc_ = nullptr;
}
if (((reorder_required_ && !bias_dt_mismatch_)
            || (with_bias_ && bias_dt_mismatch_))
        && temp_mem_desc_) {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, temp_mem_desc_);
temp_mem_desc_ = nullptr;
}
for (size_t i = 0; i < NUM_IO; i++) {
if (tensor_descs_[i]) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyTensorDescriptor, tensor_descs_[i]);
tensor_descs_[i] = nullptr;
}
}
}
private:
status_t get_cublas_data_type(
dnnl_data_type_t data_type, cudaDataType_t &blas_dt) {
switch (data_type) {
case dnnl_data_type_t::dnnl_f32:
blas_dt = CUDA_R_32F;
return status::success;
case dnnl_data_type_t::dnnl_f16:
blas_dt = CUDA_R_16F;
return status::success;
case dnnl_data_type_t::dnnl_s8:
blas_dt = CUDA_R_8I;
return status::success;
default: return status::unimplemented;
}
return status::unimplemented;
}
cublasOperation_t transA_;
cublasOperation_t transB_;
int M_, N_, K_;
int lda_, ldb_, ldc_;
long long int stride_a_, stride_b_, stride_c_;
bool isbatched_ = false, with_bias_ = false, bias_dt_mismatch_ = false;
bool reorder_required_ = false, with_eltwise_ = false;
bool with_scratchpad_ = false, has_runtime_params_ = false;
dnnl_data_type_t scratchpad_type_;
cudaDataType_t src_type_, weights_type_, dst_type_;
cudaDataType_t acc_type_ = cudaDataType_t::CUDA_R_32F, bias_type_;
cublasGemmAlgo_t gemm_algo_
= cublasGemmAlgo_t::CUBLAS_GEMM_DEFAULT_TENSOR_OP;
int batch_count_;
enum io { bias = 0, dst, NUM_IO };
cudnnTensorDescriptor_t tensor_descs_[NUM_IO] = {},
temp_mem_desc_ = nullptr;
cudnnActivationDescriptor_t act_desc_ = nullptr;
float post_op_sum_;
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif
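The GEMM calls in execute() above handle oneDNN's row-major dst = src x weights with column-major cuBLAS by passing the weights as matrix A, the src as matrix B, and swapping M and N, which amounts to computing dst^T = weights^T x src^T. Below is a minimal standalone sketch of that convention with plain cuBLAS, assuming f32 data and an existing cuBLAS handle; the function name is illustrative and not part of the library.

// Illustrative sketch: row-major dst[M][N] = src[M][K] * wei[K][N] expressed
// through column-major cuBLAS as dst^T = wei^T * src^T.
#include <cublas_v2.h>

void row_major_gemm_f32(cublasHandle_t handle, const float *src,
        const float *wei, float *dst, int M, int N, int K) {
    const float alpha = 1.f, beta = 0.f;
    // Column-major views: A = wei^T (N x K, lda = N), B = src^T (K x M,
    // ldb = K), C = dst^T (N x M, ldc = N).
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, wei, N,
            src, K, &beta, dst, N);
}

Note that the sum post-op above simply reuses the GEMM beta: when no reorder is needed, post_op_sum_ is passed as gemm_beta so the previous dst contents are accumulated into the result.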

View File

@ -0,0 +1,157 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/cudnn_pooling.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "sycl/sycl_buffer_memory_storage.hpp"
#include <CL/sycl.hpp>
#include "common/nstl.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
status_t cudnn_pooling_fwd_t::execute(const exec_ctx_t &ctx) const {
// If dst is empty, do nothing
memory_desc_wrapper dst_wrap(pd()->dst_md());
if (dst_wrap.size() == 0) return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
bool is_training = pd()->desc()->prop_kind == prop_kind::forward_training;
auto wkspace_st = is_training
? ctx.output(DNNL_ARG_WORKSPACE)->memory_storage()
: &memory_storage_t::empty_storage();
memory_desc_wrapper src_wrap(pd()->src_md());
auto dst_offset_bytes = src_wrap.nelems() * src_wrap.data_type_size();
// If src is empty and dst is not, fill dst with
// numeric_limits<dt>::lowest() to match the other backends' behaviour
if (src_wrap.size() == 0 && dst_wrap.size() != 0) {
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto dst = sc.memory<void *>(ih, dst_acc);
if (dst_wrap.data_type() == data_type_t::dnnl_f32) {
auto val = nstl::numeric_limits<float>::lowest();
cuMemsetD32Async(reinterpret_cast<CUdeviceptr>(dst),
reinterpret_cast<int &>(val), dst_wrap.nelems(),
cuda_stream->get_underlying_stream());
} else if (dst_wrap.data_type() == data_type_t::dnnl_f16) {
float16_t val = nstl::numeric_limits<float16_t>::lowest();
cuMemsetD16Async(reinterpret_cast<CUdeviceptr>(dst),
reinterpret_cast<unsigned short &>(val),
dst_wrap.nelems(),
cuda_stream->get_underlying_stream());
} else if (dst_wrap.data_type() == data_type_t::dnnl_s8) {
auto val = nstl::numeric_limits<int8_t>::lowest();
cuMemsetD8Async(reinterpret_cast<CUdeviceptr>(dst),
reinterpret_cast<unsigned char &>(val),
dst_wrap.nelems(),
cuda_stream->get_underlying_stream());
}
});
});
}
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
std::shared_ptr<
cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write>>
wkspace_acc;
if (!wkspace_st->is_null()) {
wkspace_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
cl::sycl::access::mode::write>>(
utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
wkspace_st)
->buffer()
.template get_access<cl::sycl::access::mode::write>(
cgh));
}
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
auto x = sc.memory<void *>(ih, src_acc);
auto y = sc.memory<void *>(ih, dst_acc);
uint8_t *ws_x = nullptr, *ws_y = nullptr;
if (!wkspace_st->is_null()) {
ws_x = sc.memory<uint8_t *>(ih, *wkspace_acc);
ws_y = ws_x + dst_offset_bytes;
}
pd()->pooling_impl_->execute(handle, x, y, ws_x, ws_y);
});
});
}
status_t cudnn_pooling_bwd_t::execute(const exec_ctx_t &ctx) const {
if (has_zero_dims(pd()->diff_src_md()->dims, pd()->diff_src_md()->ndims)
|| has_zero_dims(
pd()->diff_dst_md()->dims, pd()->diff_dst_md()->ndims)) {
return status::success;
}
memory_desc_wrapper wrap(pd()->diff_src_md());
if (wrap.size() == 0) { return status::success; }
const auto dst_offset_bytes = wrap.size();
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC);
auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
auto wkspace_acc = CTX_IN_ACCESSOR(DNNL_ARG_WORKSPACE);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
auto dx = sc.memory<void *>(ih, diff_src_acc);
auto dy = sc.memory<void *>(ih, diff_dst_acc);
auto ws_x = sc.memory<uint8_t *>(ih, wkspace_acc);
auto ws_y = ws_x + dst_offset_bytes;
pd()->pooling_impl_->execute(handle, dx, dy, ws_x, ws_y);
});
});
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
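The empty-src path above fills dst with numeric_limits<dt>::lowest() by handing cuMemsetD32Async/D16Async/D8Async the raw bit pattern of the fill value. A host-only sketch of that reinterpretation for f32 follows, using memcpy instead of reinterpret_cast to avoid type-punning concerns; it is illustrative only.

#include <cstdint>
#include <cstring>
#include <limits>

// Illustrative sketch: cuMemsetD32Async takes a 32-bit pattern, so the f32
// fill value has to be passed through its bit representation.
inline uint32_t f32_lowest_fill_pattern() {
    const float lowest = std::numeric_limits<float>::lowest();
    uint32_t pattern;
    std::memcpy(&pattern, &lowest, sizeof(pattern)); // same bits, no aliasing UB
    return pattern; // would be passed as the 'ui' argument of cuMemsetD32Async
}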

View File

@ -0,0 +1,200 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_POOLING_HPP
#define GPU_NVIDIA_CUDNN_POOLING_HPP
#include "common/c_types_map.hpp"
#include "common/pooling_pd.hpp"
#include "common/primitive.hpp"
#include "common/type_helpers.hpp"
#include "gpu/nvidia/cudnn_pooling_impl.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_pooling_common_t {
template <typename pd_t>
void init_ws(const pd_t *pd, memory_desc_t &ws_md) {
bool is_fwd = pd->is_fwd();
memory_desc_wrapper src_wrap(is_fwd ? pd->src_md() : pd->diff_src_md());
memory_desc_wrapper dst_wrap(is_fwd ? pd->dst_md() : pd->diff_dst_md());
const auto src_size = src_wrap.nelems();
const auto dst_size = dst_wrap.nelems();
const dims_t ws_size = {(dim_t)(src_size + dst_size)};
dnnl_memory_desc_init_by_tag(
&ws_md, 1, ws_size, src_wrap.data_type(), format_tag::x);
}
status_t init_mem_by_tag(format_tag_t tag, memory_desc_t &md) {
if (tag == format_tag::undef) { return status::unimplemented; }
CHECK(memory_desc_init_by_tag(md, tag));
return status::success;
}
format_tag_t get_tag(const memory_desc_t &md) const {
using namespace format_tag;
auto tag = memory_desc_matches_one_of_tag(md, ab, abc, abcd,
abcde, // NCHW derivatives
ba, bca, bcda, bcdea, cba, cdba,
cdeba, // IO and spatial derivatives
acb, acdb, acdeb, // NHWC derivatives
aBcd16b, aBcde16b, aBcd8b, aBcde8b, aBcd4b,
aBcde4b); // blocked layouts
return tag;
}
};
struct cudnn_pooling_fwd_t : public primitive_t {
struct pd_t : public pooling_fwd_pd_t, public cudnn_pooling_common_t {
using pooling_fwd_pd_t::pooling_fwd_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_pooling_fwd_t);
status_t init(engine_t *engine) {
using namespace data_type;
using namespace prop_kind;
using namespace alg_kind;
using namespace format_tag;
assert(engine->kind() == engine_kind::gpu);
auto src_dt = src_md()->data_type;
bool ok = true && is_fwd();
ok = ok && set_default_params() == status::success;
ok = ok
&& utils::one_of(desc()->prop_kind, forward_training,
forward_inference);
ok = ok
&& utils::one_of(desc()->alg_kind, pooling_max,
pooling_avg_include_padding,
pooling_avg_exclude_padding);
ok = ok && utils::one_of(src_dt, s8, f16, f32);
ok = ok
&& IMPLICATION(utils::one_of(src_dt, f16),
desc()->prop_kind == forward_inference);
ok = ok
&& IMPLICATION(
src_dt == s8, desc()->accum_data_type == s32);
ok = ok && attr()->has_default_values();
ok = ok && blocking_ok();
if (!ok) return status::unimplemented;
bool is_training = desc_.prop_kind == forward_training;
if (is_training) init_ws(this, ws_md_);
if (has_zero_dim_memory()) return status::success;
pooling_impl_.reset(new cudnn_pooling_fwd_impl_t());
return pooling_impl_->init(this);
}
bool blocking_ok() const {
if (!utils::one_of(src_md()->data_type, data_type::s8)
&& src_md()->format_desc.blocking.inner_nblks > 0)
return false;
if (src_md()->format_desc.blocking.inner_nblks > 1) return false;
if (utils::one_of(src_md()->data_type, data_type::s8)
&& src_md()->format_desc.blocking.inner_nblks == 1) {
return memory_desc_matches_nchw_vect_c(src_md())
&& memory_desc_matches_nchw_vect_c(dst_md());
}
return true;
}
std::shared_ptr<cudnn_pooling_impl_base_t> pooling_impl_;
};
cudnn_pooling_fwd_t(const pd_t *apd) : primitive_t(apd) {}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
struct cudnn_pooling_bwd_t : public primitive_t {
struct pd_t : public pooling_bwd_pd_t, public cudnn_pooling_common_t {
using pooling_bwd_pd_t::pooling_bwd_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_pooling_bwd_t);
status_t init(engine_t *engine) {
using namespace prop_kind;
using namespace alg_kind;
using namespace format_tag;
assert(engine->kind() == engine_kind::gpu);
bool ok = true && !is_fwd()
&& set_default_params() == status::success
&& desc()->prop_kind == backward_data
&& utils::one_of(desc()->alg_kind, pooling_max,
pooling_avg_include_padding,
pooling_avg_exclude_padding)
&& (utils::everyone_is(data_type::f32,
diff_dst_md()->data_type,
diff_src_md()->data_type)
|| utils::everyone_is(data_type::f16,
diff_dst_md()->data_type,
diff_src_md()->data_type))
&& attr()->has_default_values() && no_blocking();
if (!ok) return status::unimplemented;
init_mem_by_tag(get_tag(diff_dst_md_), diff_src_md_);
init_ws(this, ws_md_);
if (!compare_ws(hint_fwd_pd_)) return status::unimplemented;
if (has_zero_dim_memory()) { return status::success; };
pooling_impl_.reset(new cudnn_pooling_bwd_impl_t());
return pooling_impl_->init(this);
}
bool no_blocking() const {
return diff_src_md()->format_desc.blocking.inner_nblks
+ diff_dst_md()->format_desc.blocking.inner_nblks
== 0;
}
std::shared_ptr<cudnn_pooling_impl_base_t> pooling_impl_;
};
cudnn_pooling_bwd_t(const pd_t *apd) : primitive_t(apd) {}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif
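For forward training, the workspace booked by init_ws() above is a single flat buffer holding a copy of src followed by a copy of dst; the execute() functions split it with a byte offset computed from the src size. A short host-side sketch of that layout under the same assumptions (names are illustrative, not part of the library):

#include <cstddef>
#include <cstdint>

// Illustrative sketch: the pooling workspace holds nelems(src) + nelems(dst)
// elements of the src data type; ws_x is the src copy consumed by the backward
// pass and ws_y is the dst copy, matching dst_offset_bytes in execute().
struct pooling_ws_view_t {
    uint8_t *ws_x;
    uint8_t *ws_y;
};

inline pooling_ws_view_t split_pooling_ws(
        uint8_t *ws, size_t src_nelems, size_t src_dt_size) {
    return {ws, ws + src_nelems * src_dt_size};
}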

View File

@ -0,0 +1,234 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_POOLING_IMPL_HPP
#define GPU_NVIDIA_CUDNN_POOLING_IMPL_HPP
#include <cudnn.h>
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_pooling_impl_base_t {
virtual status_t init(const pooling_pd_t *pd) = 0;
virtual ~cudnn_pooling_impl_base_t() {
for (size_t i = 0; i < NUM_IO; ++i) {
if (tensor_descs_[i]) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyTensorDescriptor, tensor_descs_[i]);
}
}
if (pool_desc_) {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyPoolingDescriptor, pool_desc_);
}
}
virtual void execute(cudnnHandle_t handle, void *x, void *y, void *ws_x,
void *ws_y) const = 0;
protected:
status_t init_common(const pooling_pd_t *pd) {
ndims_ = std::max(4, pd->ndims());
kernel_ndims_ = ndims_ - 2;
// Only 1D, 2D and 3D pooling is supported by cuDNN
if (kernel_ndims_ > 3) { return status::unimplemented; }
// cuDNN requires symmetric padding. Configurations where the leading
// padding of a dimension is greater than its trailing padding still work
// as expected, but when the trailing padding exceeds the leading padding
// the data is rearranged incorrectly and the results are wrong. This
// affects configurations that use the "average include padding"
// algorithm, so such configurations return status::unimplemented.
if (pd->desc()->alg_kind == alg_kind::pooling_avg_include_padding
&& (pd->padL() < pd->padR() || pd->padT() < pd->padB()
|| pd->padFront() < pd->padBack())) {
return status::unimplemented;
}
is_training_ = pd->desc()->prop_kind == prop_kind::forward_training;
bool is_fwd = pd->is_fwd();
auto src_md = is_fwd ? pd->src_md() : pd->diff_src_md();
auto dst_md = is_fwd ? pd->dst_md() : pd->diff_dst_md();
if (has_zero_dims(src_md->dims, pd->ndims())
|| has_zero_dims(dst_md->dims, pd->ndims())) {
return status::success;
}
if (is_training_) {
auto src_wrap = memory_desc_wrapper(src_md);
auto dst_wrap = memory_desc_wrapper(dst_md);
x_size_bytes_ = src_wrap.size();
y_size_bytes_ = dst_wrap.size();
}
convert_dims(src_md->padded_dims, dims_[src], pd->ndims());
convert_dims(dst_md->padded_dims, dims_[dst], pd->ndims());
convert_dims(src_md->format_desc.blocking.strides, strides_[src],
pd->ndims());
convert_dims(dst_md->format_desc.blocking.strides, strides_[dst],
pd->ndims());
convert_dims(pd->desc()->kernel, kernel_dims_, kernel_ndims_);
// If 1D pooling
if (pd->ndims() == 3) {
// Convert to [n, c, 1, w] since the current format is
// [n, c, w, 1]
dims_[src][3] = dims_[src][2];
dims_[src][2] = 1;
dims_[dst][3] = dims_[dst][2];
dims_[dst][2] = 1;
// Set kernel dimensions to [1, kw]
kernel_dims_[1] = kernel_dims_[0];
kernel_dims_[0] = 1;
}
if (ndims_ == 4) {
kernel_padding_[0] = static_cast<int>(pd->padT());
kernel_padding_[1] = static_cast<int>(pd->padL());
kernel_strides_[0] = static_cast<int>(pd->KSH());
kernel_strides_[1] = static_cast<int>(pd->KSW());
} else {
kernel_padding_[0] = static_cast<int>(pd->padFront());
kernel_padding_[1] = static_cast<int>(pd->padT());
kernel_padding_[2] = static_cast<int>(pd->padL());
kernel_strides_[0] = static_cast<int>(pd->KSD());
kernel_strides_[1] = static_cast<int>(pd->KSH());
kernel_strides_[2] = static_cast<int>(pd->KSW());
}
CHECK(convert_data_type(src_md, &data_types_[src]));
CHECK(convert_data_type(dst_md, &data_types_[dst]));
CHECK(convert_alg_kind(pd->desc()->alg_kind, &pool_mode_));
cudnnTensorFormat_t src_format, dst_format;
CHECK(get_format(src_md, src_format));
CHECK(get_format(dst_md, dst_format));
CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[src],
src_format, data_types_[src], ndims_, dims_[src]));
CHECK(create_and_set_tensor_descriptor_ex(&tensor_descs_[dst],
dst_format, data_types_[dst], ndims_, dims_[dst]));
CHECK(create_and_set_pooling_descriptor(pd));
return status::success;
}
status_t create_and_set_pooling_descriptor(const pooling_pd_t *pd) {
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreatePoolingDescriptor, &pool_desc_));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetPoolingNdDescriptor, pool_desc_,
pool_mode_, CUDNN_PROPAGATE_NAN, kernel_ndims_, kernel_dims_,
kernel_padding_, kernel_strides_));
return status::success;
}
status_t convert_alg_kind(
alg_kind_t alg_kind, cudnnPoolingMode_t *cudnn_alg_kind) const {
switch (alg_kind) {
case alg_kind::pooling_max:
*cudnn_alg_kind = CUDNN_POOLING_MAX;
break;
case alg_kind::pooling_avg_include_padding:
*cudnn_alg_kind = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
case alg_kind::pooling_avg_exclude_padding:
*cudnn_alg_kind = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
break;
default: return status::unimplemented;
}
return status::success;
}
enum io { src = 0, dst, NUM_IO };
cudnnDataType_t data_types_[NUM_IO];
cudnnTensorDescriptor_t tensor_descs_[NUM_IO] = {};
cudnnPoolingDescriptor_t pool_desc_;
cudnnPoolingMode_t pool_mode_ = CUDNN_POOLING_MAX;
int dims_[NUM_IO][DNNL_MAX_NDIMS];
int strides_[NUM_IO][DNNL_MAX_NDIMS];
int kernel_dims_[DNNL_MAX_NDIMS];
int kernel_padding_[DNNL_MAX_NDIMS];
int kernel_strides_[DNNL_MAX_NDIMS];
const float alpha_ = 1.f, beta_ = 0.f;
int ndims_, kernel_ndims_;
bool is_training_ = false;
std::size_t x_size_bytes_ = 0, y_size_bytes_ = 0;
};
struct cudnn_pooling_fwd_impl_t : public cudnn_pooling_impl_base_t {
status_t init(const pooling_pd_t *pd) override {
return cudnn_pooling_impl_base_t::init_common(pd);
}
void execute(cudnnHandle_t handle, void *x, void *y, void *ws_x,
void *ws_y) const override {
CUDNN_EXECUTE_FUNC(cudnnPoolingForward, handle, pool_desc_, &alpha_,
tensor_descs_[src], x, &beta_, tensor_descs_[dst], y);
if (is_training_) {
// Copy x and y into workspace so that they can be used
// in the backward pass
cudnnAddTensor(handle, &alpha_, tensor_descs_[src], x, &beta_,
tensor_descs_[src], ws_x);
cudnnAddTensor(handle, &alpha_, tensor_descs_[dst], y, &beta_,
tensor_descs_[dst], ws_y);
}
}
};
struct cudnn_pooling_bwd_impl_t : public cudnn_pooling_impl_base_t {
status_t init(const pooling_pd_t *pd) override {
return cudnn_pooling_impl_base_t::init_common(pd);
}
void execute(cudnnHandle_t handle, void *dx, void *dy, void *ws_x,
void *ws_y) const override {
CUDNN_EXECUTE_FUNC(cudnnPoolingBackward, handle, pool_desc_, &alpha_,
tensor_descs_[dst], ws_y, tensor_descs_[dst], dy,
tensor_descs_[src], ws_x, &beta_, tensor_descs_[src], dx);
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif
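The descriptor setup above reduces to cudnnSetPoolingNdDescriptor() with per-dimension kernel, padding and stride arrays. Below is a minimal standalone usage sketch for a 3x3, stride-2, pad-1 max pooling; the values and the helper name are illustrative only.

#include <cudnn.h>

// Illustrative sketch mirroring create_and_set_pooling_descriptor() above.
cudnnStatus_t make_max_pool_2d(cudnnPoolingDescriptor_t *desc) {
    cudnnStatus_t st = cudnnCreatePoolingDescriptor(desc);
    if (st != CUDNN_STATUS_SUCCESS) return st;
    const int kernel[2] = {3, 3}, padding[2] = {1, 1}, strides[2] = {2, 2};
    return cudnnSetPoolingNdDescriptor(*desc, CUDNN_POOLING_MAX,
            CUDNN_PROPAGATE_NAN, /*nbDims=*/2, kernel, padding, strides);
}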

View File

@ -0,0 +1,55 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/cudnn_reorder.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
status_t cudnn_reorder_t::execute(const exec_ctx_t &ctx) const {
memory_desc_wrapper wrap(pd()->src_md());
if (wrap.size() == 0) { return status::success; }
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
auto a = sc.memory<uint8_t *>(ih, src_acc)
+ pd()->reorder_->src_offset_in_bytes();
auto b = sc.memory<uint8_t *>(ih, dst_acc)
+ pd()->reorder_->dst_offset_in_bytes();
pd()->reorder_->execute(handle, a, b);
});
});
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,122 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_REORDER_HPP
#define GPU_NVIDIA_CUDNN_REORDER_HPP
#include "common/memory_desc_wrapper.hpp"
#include "common/primitive.hpp"
#include "common/reorder_pd.hpp"
#include "gpu/nvidia/cudnn_reorder_impl.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_reorder_t : public primitive_t {
using primitive_t::primitive_t;
struct pd_t : public reorder_pd_t {
using reorder_pd_t::reorder_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_reorder_t);
static status_t create(reorder_pd_t **reorder_pd, engine_t *engine,
const primitive_attr_t *attr, engine_t *src_engine,
const memory_desc_t *src_md, engine_t *dst_engine,
const memory_desc_t *dst_md) {
auto _pd = new pd_t(attr, src_engine->kind(), src_md,
dst_engine->kind(), dst_md);
if (_pd == nullptr) return status::out_of_memory;
if (_pd->init(engine, src_engine, dst_engine) != status::success) {
delete _pd;
return status::unimplemented;
}
_pd->init_scratchpad_md();
return safe_ptr_assign<reorder_pd_t>(*reorder_pd, _pd);
}
// Function to verify data and memory format
bool valid_data_n_mem_format() const {
bool ok = utils::one_of(src_md()->data_type, data_type::s8,
data_type::f16, data_type::f32)
&& utils::one_of(dst_md()->data_type, data_type::s8,
data_type::f16, data_type::f32);
// Nvidia only supports blocking for Int8
if (!utils::one_of(src_md()->data_type, data_type::s8)
&& src_md()->format_desc.blocking.inner_nblks > 0)
return false;
if (!utils::one_of(dst_md()->data_type, data_type::s8)
&& dst_md()->format_desc.blocking.inner_nblks > 0)
return false;
// Nvidia supports blocking only on channel dimension C
if (dst_md()->format_desc.blocking.inner_nblks > 1
|| src_md()->format_desc.blocking.inner_nblks > 1)
return false;
if (utils::one_of(src_md()->data_type, data_type::s8)
&& src_md()->format_desc.blocking.inner_nblks == 1) {
ok = ok && memory_desc_matches_nchw_vect_c(src_md());
}
int blks = dst_md()->format_desc.blocking.inner_nblks;
if (utils::one_of(dst_md()->data_type, data_type::s8)
&& blks == 1) {
ok = ok && memory_desc_matches_nchw_vect_c(dst_md());
}
return ok;
}
bool check_scales_mask() const {
// cuDNN does not support scaling per dimension.
if (attr()->output_scales_.mask_ != 0) { return false; }
return true;
}
status_t init(
engine_t *engine, engine_t *src_engine, engine_t *dst_engine) {
bool ok = true && (engine == dst_engine)
&& (src_engine->kind() == engine_kind::gpu)
&& valid_data_n_mem_format() && check_scales_mask();
if (!ok) return status::unimplemented;
if (has_different_block_size(src_md(), dst_md())) {
reorder_.reset(new cudnn_reorder_ex_t());
} else {
reorder_.reset(new cudnn_reorder_stride_t());
}
return reorder_->init(this);
}
std::shared_ptr<cudnn_reorder_generic_t> reorder_;
};
cudnn_reorder_t(const pd_t *apd) : primitive_t(apd) {}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,46 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "common/engine.hpp"
#include "gpu/nvidia/cudnn_reorder.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/ocl/cross_engine_reorder.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
namespace {
using rpd_create_f = dnnl::impl::engine_t::reorder_primitive_desc_create_f;
const rpd_create_f cuda_reorder_impl_list[]
= {gpu::ocl::cross_engine_reorder_t::pd_t::create,
cudnn_reorder_t::pd_t::create, nullptr};
} // namespace
const rpd_create_f *
cuda_gpu_engine_impl_list_t::get_reorder_implementation_list(
const memory_desc_t *, const memory_desc_t *) {
return cuda_reorder_impl_list;
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
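The list above follows oneDNN's usual dispatch convention: the creation functions are tried in order and the nullptr terminates iteration. A generic, self-contained sketch of that convention (the types and names below are stand-ins, not library API):

#include <cstddef>

// Illustrative sketch of walking a nullptr-terminated creation-function list
// such as cuda_reorder_impl_list above: the first entry that succeeds wins.
typedef int (*create_fn)(void *ctx); // 0 = success (stand-in for rpd_create_f)

inline int create_first(const create_fn *list, void *ctx) {
    for (; *list != nullptr; ++list)
        if ((*list)(ctx) == 0) return 0;
    return -1; // nothing applicable, i.e. unimplemented
}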

View File

@ -0,0 +1,182 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_REORDER_IMPL_HPP
#define GPU_NVIDIA_CUDNN_REORDER_IMPL_HPP
#include "common/type_helpers.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_reorder_generic_t {
public:
virtual status_t init(const reorder_pd_t *pd) = 0;
virtual void execute(cudnnHandle_t handle, void *src, void *dst) const = 0;
virtual ~cudnn_reorder_generic_t() {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, src_desc_);
CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, dst_desc_);
}
int dst_offset_in_bytes() { return dst_offset_in_bytes_; }
int src_offset_in_bytes() { return src_offset_in_bytes_; }
protected:
cudnnDataType_t src_data_type_;
cudnnDataType_t dst_data_type_;
int ndims_;
int dims_[DNNL_MAX_NDIMS];
cudnnTensorDescriptor_t src_desc_;
cudnnTensorDescriptor_t dst_desc_;
float alpha_, beta_;
int dst_offset_in_bytes_ = 0;
int src_offset_in_bytes_ = 0;
};
// This structure is used when the memory format includes blocking
struct cudnn_reorder_ex_t : public cudnn_reorder_generic_t {
public:
status_t init(const reorder_pd_t *pd) override {
// If any of the dimensions are 0 we should not continue with creating
// cudnn descriptors
memory_desc_wrapper wrap(pd->src_md());
if (wrap.size() == 0) { return status::success; }
// Validity checks
assert(pd->dst_md()->ndims == pd->src_md()->ndims);
get_format(pd->src_md(), src_format_);
get_format(pd->dst_md(), dst_format_);
dst_offset_in_bytes_ = pd->dst_md()->offset0
* types::data_type_size(pd->dst_md()->data_type);
src_offset_in_bytes_ = pd->src_md()->offset0
* types::data_type_size(pd->src_md()->data_type);
alpha_ = pd->alpha();
beta_ = pd->beta();
CHECK(convert_data_type(pd->src_md(), &src_data_type_));
CHECK(convert_data_type(pd->dst_md(), &dst_data_type_));
convert_dims(pd->src_md()->padded_dims, dims_, pd->src_md()->ndims);
ndims_ = pd->dst_md()->ndims > 4 ? pd->dst_md()->ndims : 4;
// Create and set tensor transform descriptor
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnCreateTensorTransformDescriptor, &trans_desc_));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorTransformDescriptor,
trans_desc_, ndims_, dst_format_, nullptr, nullptr, nullptr,
cudnnFoldingDirection_t::CUDNN_TRANSFORM_FOLD));
// Create and set source tensor descriptor
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &src_desc_));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptorEx, src_desc_,
src_format_, src_data_type_, ndims_, dims_));
// Create and set destination tensor descriptor
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &dst_desc_));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptorEx, dst_desc_,
dst_format_, dst_data_type_, ndims_, dims_));
return status::success;
}
void execute(cudnnHandle_t handle, void *src, void *dst) const override {
// The cudnnTransformTensorEx() function is required to support blocking.
// It requires the output tensor to be in a cuDNN-supported format.
CUDNN_EXECUTE_FUNC(cudnnTransformTensorEx, handle, trans_desc_, &alpha_,
src_desc_, src, &beta_, dst_desc_, dst);
}
~cudnn_reorder_ex_t() {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyTensorTransformDescriptor, trans_desc_);
}
private:
cudnnTensorFormat_t src_format_;
cudnnTensorFormat_t dst_format_;
cudnnTensorTransformDescriptor_t trans_desc_;
using cudnn_reorder_generic_t::cudnn_reorder_generic_t;
};
// This structure is used when the memory format does not include blocking
struct cudnn_reorder_stride_t : public cudnn_reorder_generic_t {
public:
status_t init(const reorder_pd_t *pd) override {
// If any of the dimensions are 0 we should not continue with creating
// cudnn descriptors
memory_desc_wrapper wrap(pd->src_md());
if (wrap.size() == 0) { return status::success; }
// Validity checks
assert(pd->dst_md()->ndims == pd->src_md()->ndims);
dst_offset_in_bytes_ = pd->dst_md()->offset0
* types::data_type_size(pd->dst_md()->data_type);
src_offset_in_bytes_ = pd->src_md()->offset0
* types::data_type_size(pd->src_md()->data_type);
alpha_ = pd->alpha();
beta_ = pd->beta();
convert_dims(pd->dst_md()->dims, dims_, pd->dst_md()->ndims);
convert_dims(pd->src_md()->format_desc.blocking.strides, src_strides_,
pd->src_md()->ndims);
convert_dims(pd->dst_md()->format_desc.blocking.strides, dst_strides_,
pd->dst_md()->ndims);
adjust_dim_for_dnn(dims_, pd->dst_md()->ndims, pd->src_md());
adjust_stride_for_dnn(src_strides_, pd->dst_md()->ndims, pd->src_md());
adjust_stride_for_dnn(dst_strides_, pd->dst_md()->ndims, pd->dst_md());
ndims_ = pd->dst_md()->ndims >= 4 ? pd->dst_md()->ndims
+ pd->dst_md()->format_desc.blocking.inner_nblks
: 4;
bool vectorized = has_different_block_size(pd->src_md(), pd->dst_md());
CHECK(convert_data_type(pd->src_md(), &src_data_type_, vectorized));
CHECK(convert_data_type(pd->dst_md(), &dst_data_type_, vectorized));
// Create and set source tensor descriptor
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &src_desc_));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptor, src_desc_,
src_data_type_, ndims_, dims_, src_strides_));
// Create and set destination tensor descriptor
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, &dst_desc_));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptor, dst_desc_,
dst_data_type_, ndims_, dims_, dst_strides_));
return status::success;
}
void execute(cudnnHandle_t handle, void *src, void *dst) const override {
// cudnnTransformTensor() does not need an explicit format, since it can
// be deduced from the strides. This is useful, for example, when
// converting from abcd to bacd.
CUDNN_EXECUTE_FUNC(cudnnTransformTensor, handle, &alpha_, src_desc_,
src, &beta_, dst_desc_, dst);
}
private:
int src_strides_[DNNL_MAX_NDIMS];
int dst_strides_[DNNL_MAX_NDIMS];
using cudnn_reorder_generic_t::cudnn_reorder_generic_t;
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif
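The stride-based path above lets cuDNN deduce both layouts purely from the stride arrays. Semantically, the transform (ignoring the alpha/beta scaling) is the plain strided copy sketched below for a 4D f32 tensor, which may help when reasoning about permutations such as abcd -> bacd; the helper is illustrative and not part of the library.

#include <cstddef>

// Illustrative sketch: dims are the logical dimensions; src_strides and
// dst_strides are element strides, as prepared by cudnn_reorder_stride_t.
inline void reorder_4d(const float *src, float *dst, const int dims[4],
        const int src_strides[4], const int dst_strides[4]) {
    for (int a = 0; a < dims[0]; ++a)
        for (int b = 0; b < dims[1]; ++b)
            for (int c = 0; c < dims[2]; ++c)
                for (int d = 0; d < dims[3]; ++d) {
                    size_t s = (size_t)a * src_strides[0]
                            + (size_t)b * src_strides[1]
                            + (size_t)c * src_strides[2]
                            + (size_t)d * src_strides[3];
                    size_t t = (size_t)a * dst_strides[0]
                            + (size_t)b * dst_strides[1]
                            + (size_t)c * dst_strides[2]
                            + (size_t)d * dst_strides[3];
                    dst[t] = src[s];
                }
}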

View File

@ -0,0 +1,94 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "sycl/sycl_buffer_memory_storage.hpp"
#include "gpu/nvidia/cudnn_resampling.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
status_t cudnn_resampling_fwd_t::execute(const exec_ctx_t &ctx) const {
if (memory_desc_wrapper(pd()->src_md()).has_zero_dim())
return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
auto grid_acc = buffer(grid_storage_.get())
.get_access<cl::sycl::access::mode::read>(cgh);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
std::vector<void *> args;
args.push_back(sc.memory<void *>(ih, src_acc));
args.push_back(sc.memory<void *>(ih, grid_acc));
args.push_back(sc.memory<void *>(ih, dst_acc));
pd()->resampling_impl_->execute(handle, args);
});
});
return status::success;
}
status_t cudnn_resampling_bwd_t::execute(const exec_ctx_t &ctx) const {
if (memory_desc_wrapper(pd()->diff_src_md()).has_zero_dim())
return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC);
auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
auto grid_acc = buffer(grid_storage_.get())
.get_access<cl::sycl::access::mode::read>(cgh);
auto diff_grid_acc
= CTX_SCRATCH_ACCESSOR(memory_tracking::names::key_none);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
std::vector<void *> args;
args.push_back(sc.memory<void *>(ih, diff_src_acc));
args.push_back(sc.memory<void *>(ih, diff_dst_acc));
args.push_back(sc.memory<void *>(ih, grid_acc));
args.push_back(sc.memory<void *>(ih, diff_grid_acc));
pd()->resampling_impl_->execute(handle, args);
});
});
return status::success;
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,269 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_RESAMPLING_HPP
#define GPU_NVIDIA_CUDNN_RESAMPLING_HPP
#include <cudnn.h>
#include <CL/sycl.hpp>
#include "common/c_types_map.hpp"
#include "common/primitive.hpp"
#include "common/resampling_pd.hpp"
#include "common/type_helpers.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
#include "gpu/nvidia/cudnn_resampling_impl.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_resampling_pd_base_t {
protected:
status_t init_mem_by_tag(format_tag_t tag, memory_desc_t &md) {
if (tag == format_tag::undef) return status::unimplemented;
CHECK(memory_desc_init_by_tag(md, tag));
return status::success;
}
};
struct cudnn_resampling_base_t : public primitive_t {
protected:
using primitive_t::primitive_t;
template <typename data_t>
struct theta_t {
data_t s0_, i_, tx_;
data_t j_, s1_, ty_;
theta_t(data_t s0, data_t i, data_t tx, data_t j, data_t s1, data_t ty)
: s0_(s0), i_(i), tx_(tx), j_(j), s1_(s1), ty_(ty) {}
};
cl::sycl::buffer<uint8_t, 1> &buffer(memory_storage_t *mem_storage) {
return utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
mem_storage)
->buffer();
}
cl::sycl::buffer<uint8_t, 1> &buffer(memory_storage_t *mem_storage) const {
return utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
mem_storage)
->buffer();
}
template <typename data_t, typename pd_t>
status_t prepare_coordinate_grid(engine_t *engine, const pd_t *pd) {
using io = cudnn_resampling_impl_base_t::io;
int ndims = pd->resampling_impl_->ndims();
data_t OW = pd->resampling_impl_->dims_[io::dst][ndims - 1],
IW = pd->resampling_impl_->dims_[io::src][ndims - 1],
OH = pd->resampling_impl_->dims_[io::dst][ndims - 2],
IH = pd->resampling_impl_->dims_[io::src][ndims - 2];
// cuDNN builds the grid from coordinates normalized to -1 <= (xsi, ysi) <= 1,
// so the scaling parameters of tau_theta must be adjusted to produce
// normalized values for the grid.
data_t w = 1;
if (IW != 1 && IW != OW) w = IW * (OW - 1) / (OW * (IW - 1));
data_t h = 1;
if (IH != 1 && IH != OH) h = IH * (OH - 1) / (OH * (IH - 1));
// the tau theta size (a 2 x 3 affine matrix) is fixed by cuDNN
int tau_thea_size = 2 * 3;
auto theta_size = pd->MB();
auto tau_theta = theta_t<data_t> {w, 0.f, 0.f, 0.f, h, 0.f};
std::vector<theta_t<data_t>> theta_data(theta_size, tau_theta);
auto grid_size = pd->MB() * pd->OH() * pd->OW() * 2;
auto sycl_engine = utils::downcast<sycl_cuda_engine_t *>(engine);
auto theta_size_in_byte = tau_thea_size * theta_size * sizeof(data_t);
auto grid_size_in_byte = grid_size * sizeof(data_t);
memory_storage_t *mem_grid_ptr;
CHECK(sycl_engine->create_memory_storage(&mem_grid_ptr,
memory_flags_t::alloc, grid_size_in_byte, nullptr));
grid_storage_.reset(mem_grid_ptr);
memory_storage_t *mem_theta_ptr;
CHECK(sycl_engine->create_memory_storage(&mem_theta_ptr,
memory_flags_t::alloc, theta_size_in_byte, nullptr));
theta_storage_.reset(mem_theta_ptr);
stream_t *service_stream;
CHECK(sycl_engine->get_service_stream(service_stream));
auto cuda_stream
= utils::downcast<sycl_cuda_stream_t *>(service_stream);
auto event = copy(cuda_stream->queue(),
reinterpret_cast<uint8_t *>(theta_data.data()),
buffer(theta_storage_.get()));
auto &st_desc_ = pd->resampling_impl_->st_desc_;
cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
cgh.depends_on(event);
auto theta_acc
= buffer(theta_storage_.get())
.get_access<cl::sycl::access::mode::read>(cgh);
auto grid_acc
= buffer(grid_storage_.get())
.get_access<cl::sycl::access::mode::write>(cgh);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
// The scoped context ensures the top-of-stack context is the engine
// context while the cuDNN handle is used.
auto &s_engine = *utils::downcast<sycl_cuda_engine_t *>(engine);
cuda_sycl_scoped_context_handler_t sc(s_engine);
auto handle = cuda_stream->get_cudnn_handle();
auto theta = sc.memory<void *>(ih, theta_acc);
auto grid = sc.memory<void *>(ih, grid_acc);
CUDNN_EXECUTE_FUNC(cudnnSpatialTfGridGeneratorForward, handle,
st_desc_, theta, grid);
});
});
// cuDNN requires the grid data to be normalized to (-1, -1) <= (xsi, ysi)
// <= (1, 1). When a value falls outside this range cuDNN assumes it is 0,
// while oneDNN uses the boundary values, so out-of-range values are
// clamped to the boundary. This fixes the upsampling issue.
std::vector<data_t> unbound_raw_grid(grid_size);
auto event2 = copy(cuda_stream->queue(), buffer(grid_storage_.get()),
reinterpret_cast<uint8_t *>(unbound_raw_grid.data()));
event2.wait();
for (int i = 0; i < grid_size; i++) {
if (std::fabs(unbound_raw_grid[i]) > 1)
unbound_raw_grid[i] = unbound_raw_grid[i]
/ (std::fabs(unbound_raw_grid[i]));
}
auto event3 = copy(cuda_stream->queue(),
reinterpret_cast<uint8_t *>(unbound_raw_grid.data()),
buffer(grid_storage_.get()));
event3.wait();
return status::success;
}
std::unique_ptr<memory_storage_t> grid_storage_;
std::unique_ptr<memory_storage_t> theta_storage_;
};
struct cudnn_resampling_fwd_t : public cudnn_resampling_base_t {
using cudnn_resampling_base_t::cudnn_resampling_base_t;
struct pd_t : public resampling_fwd_pd_t,
public cudnn_resampling_pd_base_t {
using cudnn_resampling_pd_base_t::cudnn_resampling_pd_base_t;
using resampling_fwd_pd_t::resampling_fwd_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_resampling_fwd_t);
status_t init(engine_t *engine) {
using namespace data_type;
using namespace format_tag;
assert(engine->kind() == engine_kind::gpu);
bool ok = desc()->alg_kind == alg_kind::resampling_linear
&& is_fwd() && utils::one_of(src_md()->data_type, f32, f16)
&& src_md()->data_type == dst_md()->data_type
&& set_default_params() == status::success
&& attr()->has_default_values();
if (!ok) return status::unimplemented;
// src must have a tag and dst must follow the same tag
format_tag_t dat_tag = memory_desc_matches_one_of_tag(
*src_md(), ncw, nchw, nwc, nhwc);
if (dat_tag == format_tag::undef) return status::unimplemented;
if (!memory_desc_matches_tag(*dst_md(), dat_tag)) {
return status::unimplemented;
}
resampling_impl_.reset(new cudnn_resampling_fwd_impl_t());
return resampling_impl_->init(this);
}
std::shared_ptr<cudnn_resampling_impl_base_t> resampling_impl_;
};
status_t init(engine_t *engine) override {
status_t status;
auto wrap = memory_desc_wrapper(pd()->src_md());
switch (wrap.data_type()) {
case data_type::f32:
status = prepare_coordinate_grid<float>(engine, pd());
break;
case data_type::f16:
status = prepare_coordinate_grid<float16_t>(engine, pd());
break;
default: status = status::unimplemented;
}
return status;
}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
struct cudnn_resampling_bwd_t : public cudnn_resampling_base_t {
using cudnn_resampling_base_t::cudnn_resampling_base_t;
struct pd_t : public resampling_bwd_pd_t,
public cudnn_resampling_pd_base_t {
using cudnn_resampling_pd_base_t::cudnn_resampling_pd_base_t;
using resampling_bwd_pd_t::resampling_bwd_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_resampling_bwd_t);
status_t init(engine_t *engine) {
using namespace data_type;
using namespace format_tag;
assert(engine->kind() == engine_kind::gpu);
bool ok = desc()->alg_kind == alg_kind::resampling_linear
&& !is_fwd() && utils::one_of(diff_src_md()->data_type, f32)
&& diff_src_md()->data_type == diff_dst_md()->data_type
&& set_default_params() == status::success
&& attr()->has_default_values();
if (!ok) return status::unimplemented;
// diff_dst must have a tag and diff_src must follow the same tag
format_tag_t dat_tag = memory_desc_matches_one_of_tag(
*diff_dst_md(), ncw, nchw, nwc, nhwc);
if (dat_tag == format_tag::undef) return status::unimplemented;
if (!memory_desc_matches_tag(*diff_src_md(), dat_tag)) {
return status::unimplemented;
}
resampling_impl_.reset(new cudnn_resampling_bwd_impl_t());
return resampling_impl_->init(this);
}
std::shared_ptr<cudnn_resampling_impl_base_t> resampling_impl_;
};
status_t init(engine_t *engine) override {
return prepare_coordinate_grid<float>(engine, pd());
}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif
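prepare_coordinate_grid() above builds a diagonal affine theta whose scale compensates for cuDNN's normalized sampling coordinates and then clamps out-of-range grid values to the boundary. A host-only sketch of those two computations with the same formulas (the function names are illustrative):

#include <cmath>
#include <vector>

// Illustrative sketch: the scale factor applied to theta, i.e.
// w = IW * (OW - 1) / (OW * (IW - 1)), and 1 when the dimension is trivial.
inline float theta_scale(int in_dim, int out_dim) {
    if (in_dim == 1 || in_dim == out_dim) return 1.f;
    return (float)in_dim * (out_dim - 1) / ((float)out_dim * (in_dim - 1));
}

// Illustrative sketch: clamp grid coordinates to [-1, 1], as done after
// cudnnSpatialTfGridGeneratorForward() above.
inline void clamp_grid(std::vector<float> &grid) {
    for (auto &g : grid)
        if (std::fabs(g) > 1.f) g = g / std::fabs(g); // snap to +/-1
}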

View File

@ -0,0 +1,171 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_RESAMPLING_IMPL_HPP
#define GPU_NVIDIA_CUDNN_RESAMPLING_IMPL_HPP
#include <cudnn.h>
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_resampling_impl_base_t {
virtual ~cudnn_resampling_impl_base_t() {
for (int i = 0; i < NUM_IO; ++i) {
if (tensor_descs_[i]) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroyTensorDescriptor, tensor_descs_[i]);
}
}
if (st_desc_) {
CUDNN_EXECUTE_FUNC_V(
cudnnDestroySpatialTransformerDescriptor, st_desc_);
}
}
virtual status_t init(resampling_pd_t *pd) = 0;
virtual void execute(
cudnnHandle_t handle, const std::vector<void *> &args) const = 0;
int ndims() { return ndims_; }
status_t create_and_set_st_desc() {
CHECK(CUDNN_EXECUTE_FUNC_S(
cudnnCreateSpatialTransformerDescriptor, &st_desc_));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetSpatialTransformerNdDescriptor,
st_desc_, CUDNN_SAMPLER_BILINEAR, data_types_[dst], ndims_,
dims_[dst]));
return status::success;
}
enum io { src, dst, NUM_IO };
int dims_[NUM_IO][DNNL_MAX_NDIMS];
int strides_[NUM_IO][DNNL_MAX_NDIMS];
cudnnDataType_t data_types_[NUM_IO];
cudnnTensorDescriptor_t tensor_descs_[NUM_IO] = {};
cudnnSpatialTransformerDescriptor_t st_desc_;
int ndims_;
const float alpha_ = 1.f, beta_ = 0.f;
};
struct cudnn_resampling_fwd_impl_t : public cudnn_resampling_impl_base_t {
status_t init(resampling_pd_t *pd) override {
ndims_ = std::max(4, pd->ndims());
if (ndims_ > 4) return status::unimplemented;
cudnnTensorFormat_t src_format, dst_format;
CHECK(get_format(pd->src_md(), dst_format));
CHECK(get_format(pd->dst_md(), src_format));
convert_dims(pd->src_md()->padded_dims, dims_[src], pd->ndims());
convert_dims(pd->src_md()->format_desc.blocking.strides, strides_[src],
pd->ndims(), 4,
(dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[src][1]));
convert_dims(pd->dst_md()->padded_dims, dims_[dst], pd->ndims());
convert_dims(pd->dst_md()->format_desc.blocking.strides, strides_[dst],
pd->ndims(), 4,
(dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[dst][1]));
CHECK(convert_data_type(pd->src_md(), &data_types_[src]));
CHECK(convert_data_type(pd->dst_md(), &data_types_[dst]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[src],
data_types_[src], ndims_, dims_[src], strides_[src]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[dst],
data_types_[dst], ndims_, dims_[dst], strides_[dst]));
CHECK(create_and_set_st_desc());
return status::success;
}
void execute(cudnnHandle_t handle,
const std::vector<void *> &args) const override {
CUDNN_EXECUTE_FUNC(cudnnSpatialTfSamplerForward, handle, st_desc_,
&alpha_, tensor_descs_[src], args[0], args[1], &beta_,
tensor_descs_[dst], args[2]);
}
};
struct cudnn_resampling_bwd_impl_t : public cudnn_resampling_impl_base_t {
status_t init(resampling_pd_t *pd) override {
ndims_ = std::max(4, pd->ndims());
if (ndims_ > 4) return status::unimplemented;
cudnnTensorFormat_t src_format, dst_format;
CHECK(get_format(pd->diff_src_md(), dst_format));
CHECK(get_format(pd->diff_dst_md(), src_format));
convert_dims(pd->diff_src_md()->padded_dims, dims_[src], pd->ndims());
convert_dims(pd->diff_src_md()->format_desc.blocking.strides,
strides_[src], pd->ndims(), 4,
(dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[src][1]));
convert_dims(pd->diff_dst_md()->padded_dims, dims_[dst], pd->ndims());
convert_dims(pd->diff_dst_md()->format_desc.blocking.strides,
strides_[dst], pd->ndims(), 4,
(dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[dst][1]));
CHECK(convert_data_type(pd->diff_src_md(), &data_types_[src]));
CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[dst]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[src],
data_types_[src], ndims_, dims_[src], strides_[src]));
CHECK(create_and_set_tensor_descriptor(&tensor_descs_[dst],
data_types_[dst], ndims_, dims_[dst], strides_[dst]));
CHECK(create_and_set_st_desc());
auto wrap = memory_desc_wrapper(pd->diff_src_md());
auto grid_size = pd->MB() * pd->OH() * pd->OW() * 2;
auto grid_size_in_byte = grid_size * wrap.data_type_size();
        // cuDNN does not allow dgrid to be a NULL pointer. Although we don't
        // need to compute dgrid (theta does not come from a localization
        // network), cuDNN still requires a valid pointer, so we allocate a
        // scratchpad for dgrid. The grid stores an (x, y) coordinate pair per
        // output spatial location, hence the factor of 2 in grid_size above.
pd->scratchpad_registry().registrar().book(
memory_tracking::names::key_none, grid_size_in_byte, size_t(1));
return status::success;
}
void execute(cudnnHandle_t handle,
const std::vector<void *> &args) const override {
        // We are not backpropagating with respect to the grid here, so both
        // alpha and beta for the dgrid computation are zero and the dgrid
        // value won't be used.
CUDNN_EXECUTE_FUNC(cudnnSpatialTfSamplerBackward, handle, st_desc_,
&alpha_, tensor_descs_[src], args[0], &beta_,
tensor_descs_[src], args[0], &beta_, tensor_descs_[dst],
args[1], args[2], &beta_, args[3]);
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,85 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/cudnn_softmax.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "sycl/sycl_buffer_memory_storage.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
status_t cudnn_softmax_fwd_t::execute(const exec_ctx_t &ctx) const {
if (memory_desc_wrapper(pd()->desc()->data_desc).has_zero_dim())
return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto src_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
auto dst_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
std::vector<void *> args;
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
args.push_back(sc.memory<void *>(ih, src_acc));
args.push_back(sc.memory<void *>(ih, dst_acc));
pd()->softmax_impl_->execute(handle, args.data(), args.size());
});
});
}
status_t cudnn_softmax_bwd_t::execute(const exec_ctx_t &ctx) const {
if (memory_desc_wrapper(pd()->desc()->diff_desc).has_zero_dim())
return status::success;
nvidia::sycl_cuda_stream_t *cuda_stream
= utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
auto dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DST);
auto diff_dst_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
auto diff_src_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC);
cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
std::vector<void *> args;
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
cuda_stream->engine());
auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
auto handle = cuda_stream->get_cudnn_handle();
args.push_back(sc.memory<void *>(ih, dst_acc));
args.push_back(sc.memory<void *>(ih, diff_dst_acc));
args.push_back(sc.memory<void *>(ih, diff_src_acc));
pd()->softmax_impl_->execute(handle, args.data(), args.size());
});
});
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,116 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_SOFTMAX_HPP
#define GPU_NVIDIA_CUDNN_SOFTMAX_HPP
#include "cudnn.h"
#include <CL/sycl.hpp>
#include "common/primitive.hpp"
#include "common/softmax_pd.hpp"
#include "gpu/nvidia/cudnn_softmax_impl.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_softmax_fwd_t : public primitive_t {
using primitive_t::primitive_t;
struct pd_t : public softmax_fwd_pd_t {
using softmax_fwd_pd_t::softmax_fwd_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_softmax_fwd_t);
status_t init(engine_t *) {
bool ok = true
&& utils::one_of(desc()->prop_kind,
prop_kind::forward_inference,
prop_kind::forward_training)
&& utils::one_of(desc()->data_desc.data_type,
data_type::f32, data_type::f16)
                    // Blocked formats are used only for s8, and softmax does
                    // not support them.
&& src_md()->format_desc.blocking.inner_nblks == 0
&& dst_md()->format_desc.blocking.inner_nblks == 0
&& attr()->has_default_values();
if (!ok) return status::unimplemented;
softmax_impl_.reset(new cudnn_softmax_fwd_impl_t());
return softmax_impl_->init(this);
}
std::shared_ptr<cudnn_softmax_impl_base_t> softmax_impl_;
};
cudnn_softmax_fwd_t(const pd_t *apd) : primitive_t(apd) {}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
struct cudnn_softmax_bwd_t : public primitive_t {
using primitive_t::primitive_t;
struct pd_t : public softmax_bwd_pd_t {
using softmax_bwd_pd_t::softmax_bwd_pd_t;
DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_softmax_bwd_t);
status_t init(engine_t *) {
bool ok = true && desc()->prop_kind == prop_kind::backward_data
&& utils::one_of(desc()->data_desc.data_type,
data_type::f32, data_type::f16)
&& set_default_formats_common()
// Blocking is not supported
&& dst_md()->format_desc.blocking.inner_nblks == 0
&& diff_dst_md()->format_desc.blocking.inner_nblks == 0
&& attr()->has_default_values();
if (!ok) return status::unimplemented;
softmax_impl_.reset(new cudnn_softmax_bwd_impl_t());
return softmax_impl_->init(this);
}
std::shared_ptr<cudnn_softmax_impl_base_t> softmax_impl_;
};
cudnn_softmax_bwd_t(const pd_t *apd) : primitive_t(apd) {}
status_t execute(const exec_ctx_t &ctx) const override;
private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,255 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_SOFTMAX_IMPL_HPP
#define GPU_NVIDIA_CUDNN_SOFTMAX_IMPL_HPP
#include "cudnn.h"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_softmax_impl_base_t {
cudnnDataType_t data_type;
int ndims;
cudnnSoftmaxAlgorithm_t alg_kind;
// cuDNN only supports softmax on channel dimension
cudnnSoftmaxMode_t mode = cudnnSoftmaxMode_t::CUDNN_SOFTMAX_MODE_CHANNEL;
// oneDNN softmax primitive doesn't support any post-ops or attributes,
// hence we can set alpha = 1 and beta = 0 for all cases
float alpha = 1.0f;
float beta = 0.0f;
virtual ~cudnn_softmax_impl_base_t() {}
virtual status_t init(const softmax_pd_t *pd) = 0;
virtual void execute(cudnnHandle_t handle, void **x, int size) const = 0;
// Mapping between dnnl algorithm and cuDNN softmax algorithm
status_t convert_alg_kind(
bool is_log_softmax, cudnnSoftmaxAlgorithm_t *cuda_alg_kind) const {
if (is_log_softmax) {
*cuda_alg_kind = cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_LOG;
} else {
*cuda_alg_kind = cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_ACCURATE;
}
return status::success;
}
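    // cuDNN can only apply softmax over the C dimension of a 4D tensor, so
    // the helper below reshapes the problem: dimensions before the softmax
    // axis are collapsed into N, the axis dimension becomes C, and the
    // remaining dimensions are folded into H/W (with adjustments for NHWC
    // layouts).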
status_t convert_dims_softmax(const dims_t &orig_dims, int *modified_dims,
int axis, int ndims, format_tag_t tag,
cudnnTensorFormat_t &format) const {
// Initialise all dims to 1
for (int i = 0; i < 4; i++) {
modified_dims[i] = 1;
}
if (axis == 1) {
// Copy dimensions into the new array
format = tag == dnnl_nhwc ? cudnnTensorFormat_t::CUDNN_TENSOR_NHWC
: cudnnTensorFormat_t::CUDNN_TENSOR_NCHW;
int num_dims = ndims < 4 ? ndims : 4;
for (int i = 0; i < num_dims; i++) {
modified_dims[i] = orig_dims[i];
}
for (int i = 4; i < ndims; i++) {
modified_dims[3] *= orig_dims[i];
}
return status::success;
}
format = cudnnTensorFormat_t::CUDNN_TENSOR_NCHW;
switch (tag) {
case dnnl_cn: {
modified_dims[0] = orig_dims[1];
modified_dims[1] = orig_dims[0];
break;
}
case dnnl_nchw: {
switch (axis) {
case 0:
modified_dims[1] = orig_dims[axis];
modified_dims[2] = orig_dims[1];
for (int i = 2; i < ndims; i++) {
modified_dims[3] *= orig_dims[i];
}
break;
default: {
for (int i = 0; i < axis; i++) {
modified_dims[0] *= orig_dims[i];
}
modified_dims[1] = orig_dims[axis];
if (axis == ndims - 1) { return status::success; }
for (int i = axis + 1; i < ndims; i++) {
modified_dims[2] *= orig_dims[i];
}
break;
}
}
break;
}
case dnnl_nhwc:
switch (axis) {
case 0:
modified_dims[1] = orig_dims[0];
for (int i = 1; i < ndims; i++) {
modified_dims[2] *= orig_dims[i];
}
break;
case 2:
modified_dims[0] = orig_dims[0];
modified_dims[1] = orig_dims[2];
for (int i = 3; i < ndims; i++) {
modified_dims[2] *= orig_dims[i];
}
modified_dims[3] = orig_dims[1];
break;
case 3:
modified_dims[0] = orig_dims[0] * orig_dims[2];
modified_dims[1] = orig_dims[3];
modified_dims[2] = ndims == 4 ? 1 : orig_dims[4];
modified_dims[3] = orig_dims[1];
break;
}
break;
default: return status::unimplemented;
}
return status::success;
}
status_t convert_tag(const memory_desc_t *md, format_tag_t &tag) const {
const memory_desc_wrapper mem_wrapper(md);
if (mem_wrapper.matches_one_of_tag(format_tag::ba)) {
tag = dnnl_cn;
} else if (mem_wrapper.matches_one_of_tag(format_tag::ab,
format_tag::abc, format_tag::abcd, format_tag::abcde,
format_tag::abcdef)) {
tag = dnnl_nchw;
} else if (mem_wrapper.matches_one_of_tag(format_tag::acb,
format_tag::acdb, format_tag::acdeb)) {
tag = dnnl_nhwc;
} else {
return status::unimplemented;
}
return status::success;
}
};
struct cudnn_softmax_fwd_impl_t : public cudnn_softmax_impl_base_t {
int dims[DNNL_MAX_NDIMS];
cudnnTensorDescriptor_t tensor_desc;
cudnnTensorFormat_t format;
status_t init(const softmax_pd_t *pd) override {
// If any of the dimensions are 0 we should not continue with
// creating cudnn descriptors
if (has_zero_dims(pd->src_md(0)->dims, pd->ndims())) {
return status::success;
}
if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
ndims = pd->ndims() < 4 ? 4 : pd->ndims();
format_tag_t tag;
CHECK(convert_tag(pd->src_md(), tag));
CHECK(convert_dims_softmax(pd->src_md()->padded_dims, dims, pd->axis(),
pd->ndims(), tag, format));
convert_alg_kind(pd->is_logsoftmax(), &alg_kind);
assert(pd->src_md()->data_type == pd->dst_md()->data_type);
CHECK(convert_data_type(pd->src_md(), &data_type));
CHECK(create_and_set_tensor_descriptor_ex(
&tensor_desc, format, data_type, 4, dims));
return status::success;
}
void execute(cudnnHandle_t handle, void **x, int size) const override {
// Confirm that 2 arguments were passed, src and dst
assert(size == 2);
CUDNN_EXECUTE_FUNC(cudnnSoftmaxForward, handle, alg_kind, mode, &alpha,
tensor_desc, x[0], &beta, tensor_desc, x[1]);
}
~cudnn_softmax_fwd_impl_t() {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_desc);
}
};
struct cudnn_softmax_bwd_impl_t : public cudnn_softmax_impl_base_t {
int dims[DNNL_MAX_NDIMS];
int dims_dst[DNNL_MAX_NDIMS];
cudnnTensorDescriptor_t tensor_dst_desc;
cudnnTensorDescriptor_t tensor_diff_desc;
cudnnTensorFormat_t format;
status_t init(const softmax_pd_t *pd) override {
// If any of the dimensions are 0 we should not continue with
// creating cudnn descriptors
if (memory_desc_wrapper(pd->desc()->diff_desc).has_zero_dim())
return status::success;
if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
ndims = pd->ndims() < 4 ? 4 : pd->ndims();
format_tag_t tag;
CHECK(convert_tag(pd->dst_md(), tag));
CHECK(convert_dims_softmax(pd->dst_md()->padded_dims, dims_dst,
pd->axis(), pd->ndims(), tag, format));
CHECK(convert_dims_softmax(pd->diff_src_md()->padded_dims, dims,
pd->axis(), pd->ndims(), tag, format));
convert_alg_kind(pd->is_logsoftmax(), &alg_kind);
assert(pd->diff_dst_md()->data_type == pd->dst_md()->data_type);
assert(pd->diff_dst_md()->data_type == pd->diff_src_md()->data_type);
CHECK(convert_data_type(pd->dst_md(), &data_type));
CHECK(create_and_set_tensor_descriptor_ex(
&tensor_dst_desc, format, data_type, 4, dims_dst));
CHECK(create_and_set_tensor_descriptor_ex(
&tensor_diff_desc, format, data_type, 4, dims));
return status::success;
}
void execute(cudnnHandle_t handle, void **x, int size) const override {
// Assert that 3 arguments were passed src, diff_dst and diff_src
assert(size == 3);
CUDNN_EXECUTE_FUNC(cudnnSoftmaxBackward, handle, alg_kind, mode, &alpha,
tensor_dst_desc, x[0], tensor_diff_desc, x[1], &beta,
tensor_diff_desc, x[2]);
}
~cudnn_softmax_bwd_impl_t() {
CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_dst_desc);
CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_diff_desc);
}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,41 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/cudnn_sum.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
namespace {
using spd_create_f = dnnl::impl::engine_t::sum_primitive_desc_create_f;
const spd_create_f cuda_sum_impl_list[]
= {cudnn_ref_sum_t::pd_t::create, nullptr};
} // namespace
const spd_create_f *cuda_gpu_engine_impl_list_t::get_sum_implementation_list() {
return cuda_sum_impl_list;
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,70 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_CUDNN_SUM_HPP
#define GPU_NVIDIA_CUDNN_SUM_HPP
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
#include "gpu/ocl/ref_sum.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
struct cudnn_ref_sum_t : public ::dnnl::impl::gpu::ocl::ref_sum_t {
using base_t = dnnl::impl::gpu::ocl::ref_sum_t;
using base_t::base_t;
using base_pd_t = base_t::pd_t;
struct pd_t : public base_pd_t {
using base_pd_t::base_pd_t;
DECLARE_SUM_PD_T("ref:any", cudnn_ref_sum_t);
        // This function can be used for backends that do not support
        // blocking on f32: it converts the blocked format to plain nchw for
        // accumulation. Since the final destination preserves the blocking,
        // the last reorder that puts the accumulated result into the final
        // output adds the blocking back.
void define_dst_acc_md() override {
dst_acc_md_ = dst_md_;
dst_acc_md_.data_type = dnnl_f32;
if ((dst_md_.data_type == data_type::s8)
&& (memory_desc_matches_nchw_vect_c(&dst_md_))) {
dst_acc_md_.format_desc.blocking.inner_nblks = 0;
dst_acc_md_.format_desc.blocking.inner_idxs[0] = 0;
dst_acc_md_.format_desc.blocking.inner_blks[0] = 0;
dst_acc_md_.format_desc.blocking.strides[dst_acc_md_.ndims - 1]
= 1;
for (int i = dst_acc_md_.ndims - 2; i >= 0; i--) {
dst_acc_md_.format_desc.blocking.strides[i]
= dst_acc_md_.format_desc.blocking.strides[i + 1]
* dst_acc_md_.dims[i + 1];
}
}
}
};
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,199 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <CL/sycl/backend/cuda.hpp>
#include "sycl/sycl_utils.hpp"
#include "gpu/nvidia/cudnn_batch_normalization.hpp"
#include "gpu/nvidia/cudnn_binary.hpp"
#include "gpu/nvidia/cudnn_conv_inner_product.hpp"
#include "gpu/nvidia/cudnn_convolution.hpp"
#include "gpu/nvidia/cudnn_deconvolution.hpp"
#include "gpu/nvidia/cudnn_eltwise.hpp"
#include "gpu/nvidia/cudnn_gemm_inner_product.hpp"
#include "gpu/nvidia/cudnn_lrn.hpp"
#include "gpu/nvidia/cudnn_matmul.hpp"
#include "gpu/nvidia/cudnn_pooling.hpp"
#include "gpu/nvidia/cudnn_resampling.hpp"
#include "gpu/nvidia/cudnn_softmax.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
#include "gpu/nvidia/sycl_cuda_stream.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
bool is_nvidia_gpu(const cl::sycl::device &dev) {
constexpr int nvidia_vendor_id = 0x10DE;
return dev.is_gpu()
&& dev.get_info<cl::sycl::info::device::vendor_id>()
== nvidia_vendor_id;
}
status_t cuda_engine_create(engine_t **engine, engine_kind_t engine_kind,
const cl::sycl::device &dev, const cl::sycl::context &ctx) {
CHECK(nvidia::check_device(engine_kind));
std::unique_ptr<nvidia::sycl_cuda_engine_t> cuda_engine(
(new nvidia::sycl_cuda_engine_t(dev, ctx)));
if (!cuda_engine) return status::out_of_memory;
CHECK(cuda_engine->init());
*engine = cuda_engine.release();
return status::success;
}
sycl_cuda_engine_t::sycl_cuda_engine_t(engine_kind_t kind,
const cl::sycl::device &dev, const cl::sycl::context &ctx)
: base_t(kind, dev, ctx) {
underlying_context_type();
set_cudnn_handle();
set_cublas_handle();
}
sycl_cuda_engine_t::sycl_cuda_engine_t(
const cl::sycl::device &dev, const cl::sycl::context &ctx)
: sycl_cuda_engine_t(engine_kind::gpu, dev, ctx) {
assert(is_nvidia_gpu(dev));
}
status_t sycl_cuda_engine_t::set_cublas_handle() {
    // The scoped context makes sure the engine context is at the top of the
    // context stack while the cuBLAS handle is created.
cublasHandle_t handle;
cuda_sycl_scoped_context_handler_t sc(*this);
CHECK(CUBLAS_EXECUTE_FUNC_S(cublasCreate, &handle));
cublas_handle_.reset(new cublasHandle_t(handle));
handle = nullptr;
return status::success;
}
status_t sycl_cuda_engine_t::set_cudnn_handle() {
    // The scoped context makes sure the engine context is at the top of the
    // context stack while the cuDNN handle is created.
cudnnHandle_t handle;
cuda_sycl_scoped_context_handler_t sc(*this);
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreate, &handle));
cudnn_handle_.reset(new cudnnHandle_t(handle));
handle = nullptr;
return status::success;
}
CUcontext sycl_cuda_engine_t::get_underlying_context() const {
return cl::sycl::get_native<cl::sycl::backend::cuda>(context());
}
status_t sycl_cuda_engine_t::create_stream(stream_t **stream, unsigned flags) {
return sycl_cuda_stream_t::create_stream(stream, this, flags);
}
status_t sycl_cuda_engine_t::create_stream(
stream_t **stream, cl::sycl::queue &queue) {
return sycl_cuda_stream_t::create_stream(stream, this, queue);
}
status_t sycl_cuda_engine_t::underlying_context_type() {
    // This is a costly function which takes on average up to 75ms on a
    // Titan RTX, so we run it only once and cache the result in
    // primary_context_.
CUcontext primary;
CUcontext desired
= cl::sycl::get_native<cl::sycl::backend::cuda>(context());
CUdevice cuda_device
= cl::sycl::get_native<cl::sycl::backend::cuda>(device());
CHECK(CUDA_EXECUTE_FUNC_S(cuDevicePrimaryCtxRetain, &primary, cuda_device));
CHECK(CUDA_EXECUTE_FUNC_S(cuDevicePrimaryCtxRelease, cuda_device));
primary_context_ = (primary == desired);
return status::success;
}
device_id_t sycl_cuda_engine_t::device_id() const {
return device_id_t(static_cast<int>(sycl::backend_t::nvidia),
static_cast<uint64_t>(
cl::sycl::get_native<cl::sycl::backend::cuda>(device())),
static_cast<uint64_t>(0));
}
namespace {
using namespace dnnl::impl::data_type;
#define INSTANCE(...) &primitive_desc_t::create<__VA_ARGS__::pd_t>
// clang-format off
const dnnl::impl::engine_t::primitive_desc_create_f sycl_cuda_impl_list[] = {
// Elementwise
INSTANCE(cudnn_eltwise_fwd_t),
INSTANCE(cudnn_eltwise_bwd_t),
// Deconvolution
INSTANCE(cudnn_deconvolution_fwd_t),
INSTANCE(cudnn_deconvolution_bwd_data_t),
INSTANCE(cudnn_deconvolution_bwd_weights_t),
// Convolution
INSTANCE(cudnn_convolution_fwd_t),
INSTANCE(cudnn_convolution_bwd_data_t),
INSTANCE(cudnn_convolution_bwd_weights_t),
// Batch Normalization
INSTANCE(cudnn_batch_normalization_fwd_t),
INSTANCE(cudnn_batch_normalization_bwd_t),
// Pooling
INSTANCE(cudnn_pooling_fwd_t),
INSTANCE(cudnn_pooling_bwd_t),
// LRN
INSTANCE(cudnn_lrn_fwd_t),
INSTANCE(cudnn_lrn_bwd_t),
// Inner Product
INSTANCE(cudnn_gemm_inner_product_fwd_t),
INSTANCE(cudnn_conv_inner_product_fwd_t),
INSTANCE(cudnn_gemm_inner_product_bwd_data_t),
INSTANCE(cudnn_conv_inner_product_bwd_data_t),
INSTANCE(cudnn_gemm_inner_product_bwd_weights_t),
INSTANCE(cudnn_conv_inner_product_bwd_weights_t),
// Softmax
INSTANCE(cudnn_softmax_fwd_t),
INSTANCE(cudnn_softmax_bwd_t),
// Binary
INSTANCE(cudnn_binary_t),
// MatMul
INSTANCE(cudnn_matmul_t),
// Resampling
INSTANCE(cudnn_resampling_fwd_t),
INSTANCE(cudnn_resampling_bwd_t),
nullptr,
};
// clang-format on
#undef INSTANCE
} // namespace
const dnnl::impl::engine_t::primitive_desc_create_f *
sycl_cuda_engine_t::get_implementation_list(const op_desc_t *) const {
return sycl_cuda_impl_list;
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,121 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_SYCL_CUDA_ENGINE_HPP
#define GPU_NVIDIA_SYCL_CUDA_ENGINE_HPP
#include <cudnn.h>
#include <cublas_v2.h>
#include <CL/sycl.hpp>
#include "common/stream.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
#include "sycl/sycl_device_info.hpp"
#include "sycl/sycl_engine_base.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
class cuda_gpu_engine_impl_list_t {
public:
static const dnnl::impl::engine_t::reorder_primitive_desc_create_f *
get_reorder_implementation_list(
const memory_desc_t *src_md, const memory_desc_t *dst_md);
static const dnnl::impl::engine_t::concat_primitive_desc_create_f *
get_concat_implementation_list();
static const dnnl::impl::engine_t::sum_primitive_desc_create_f *
get_sum_implementation_list();
};
class sycl_cuda_engine_t : public dnnl::impl::sycl::sycl_engine_base_t {
public:
using base_t = dnnl::impl::sycl::sycl_engine_base_t;
sycl_cuda_engine_t(engine_kind_t kind, const cl::sycl::device &dev,
const cl::sycl::context &ctx);
sycl_cuda_engine_t(
const cl::sycl::device &dev, const cl::sycl::context &ctx);
status_t create_stream(stream_t **stream, unsigned flags) override;
status_t create_stream(stream_t **stream, cl::sycl::queue &queue);
const dnnl::impl::engine_t::reorder_primitive_desc_create_f *
get_reorder_implementation_list(const memory_desc_t *src_md,
const memory_desc_t *dst_md) const override {
return cuda_gpu_engine_impl_list_t::get_reorder_implementation_list(
src_md, dst_md);
}
const dnnl::impl::engine_t::concat_primitive_desc_create_f *
get_concat_implementation_list() const override {
return cuda_gpu_engine_impl_list_t::get_concat_implementation_list();
}
const dnnl::impl::engine_t::sum_primitive_desc_create_f *
get_sum_implementation_list() const override {
return cuda_gpu_engine_impl_list_t::get_sum_implementation_list();
}
const primitive_desc_create_f *get_implementation_list(
const op_desc_t *) const override;
CUcontext get_underlying_context() const;
cudnnHandle_t *get_cudnn_handle() const { return cudnn_handle_.get(); }
cublasHandle_t *get_cublas_handle() const { return cublas_handle_.get(); }
    bool has_primary_context() const { return primary_context_; }
device_id_t device_id() const override;
private:
    // This function determines the context type, since CUDA requires
    // different approaches for retaining/releasing primary and non-primary
    // contexts.
status_t underlying_context_type();
status_t set_cudnn_handle();
status_t set_cublas_handle();
    // To avoid a performance penalty, cuDNN/cuBLAS require one handle per
    // thread per context, therefore the handles are properties of the
    // engine. An engine can be assigned to multiple streams, e.g.
    //   engine eng(kind, 0);
    //   stream str1(eng, ...), str2(eng, ...), str3(eng, ...);
    // In a multi-threaded environment each engine and stream should be
    // created in its own thread to allow safe multi-threaded programming.
    // If all the streams belong to one thread, the same handle is used for
    // all of them. Handle creation is expensive and must be avoided when it
    // is not necessary.
std::unique_ptr<cudnnHandle_t, std::function<void(cudnnHandle_t *)>>
cudnn_handle_ {nullptr, [](cudnnHandle_t *h) {
if (h != nullptr) {
CUDNN_EXECUTE_FUNC_V(cudnnDestroy, *h);
h = nullptr;
}
}};
std::unique_ptr<cublasHandle_t, std::function<void(cublasHandle_t *)>>
cublas_handle_ {nullptr, [](cublasHandle_t *h) {
if (h != nullptr) {
CUBLAS_EXECUTE_FUNC_V(cublasDestroy, *h);
h = nullptr;
}
}};
bool primary_context_;
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,63 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
cuda_sycl_scoped_context_handler_t::cuda_sycl_scoped_context_handler_t(
const sycl_cuda_engine_t &engine)
: need_to_recover_(false) {
try {
auto desired = engine.get_underlying_context();
CUDA_EXECUTE_FUNC(cuCtxGetCurrent, &original_);
if (original_ != desired) {
// Sets the desired context as the active one for the thread
CUDA_EXECUTE_FUNC(cuCtxSetCurrent, desired);
            // If no context was installed and the suggested context is the
            // primary one, this is the most common case. We can activate the
            // context in the thread and leave it there until all the PI
            // contexts referring to the same underlying CUDA primary context
            // are destroyed. This emulates the behaviour of the CUDA runtime
            // API and avoids costly context switches. No action is required
            // on this side of the if.
need_to_recover_
= !(original_ == nullptr && engine.has_primary_context());
}
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
}
}
cuda_sycl_scoped_context_handler_t::
~cuda_sycl_scoped_context_handler_t() noexcept(false) {
    // We need to restore the original context: the desired context obtained
    // from the engine retains the underlying CUDA context, so the previously
    // active one has to be put back.
try {
if (need_to_recover_) { CUDA_EXECUTE_FUNC(cuCtxSetCurrent, original_); }
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
}
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,60 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_SYCL_CUDA_SCOPED_CONTEXT_HPP
#define GPU_NVIDIA_SYCL_CUDA_SCOPED_CONTEXT_HPP
#include <memory>
#include <thread>
#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
// A scoped context is required to set the current context of a thread to
// the context of the queue being used. The scoped handler class puts the
// stream's context on top of the CUDA context stack.
class cuda_sycl_scoped_context_handler_t {
CUcontext original_;
bool need_to_recover_;
public:
cuda_sycl_scoped_context_handler_t(const sycl_cuda_engine_t &);
    // Restores the previously active context, if needed, on destruction.
~cuda_sycl_scoped_context_handler_t() noexcept(false);
    // This is a work-around function for reinterpret_cast-ing the memory. It
    // will be fixed once SYCL 2020 is implemented for the PI backend.
template <typename T, typename U>
inline T memory(const cl::sycl::interop_handler &ih, U acc) {
return reinterpret_cast<T>(ih.get_mem<cl::sycl::backend::cuda>(acc));
}
};
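// A minimal usage sketch (names here are illustrative): activate the
// engine's context for the duration of a scope,
//
//     {
//         cuda_sycl_scoped_context_handler_t sc(cuda_engine);
//         CUDNN_EXECUTE_FUNC(cudnnSetStream, handle, stream);
//     } // the previous context is restored here when required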
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,126 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/nvidia/sycl_cuda_stream.hpp"
#include "gpu/nvidia/sycl_cuda_engine.hpp"
#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
cublasHandle_t &sycl_cuda_stream_t::get_cublas_handle() {
return *(utils::downcast<sycl_cuda_engine_t *>(engine())
->get_cublas_handle());
}
cudnnHandle_t &sycl_cuda_stream_t::get_cudnn_handle() {
return *(utils::downcast<sycl_cuda_engine_t *>(engine())
->get_cudnn_handle());
}
// The sycl_cuda_stream_t does not own this; it is an observer pointer.
CUstream sycl_cuda_stream_t::get_underlying_stream() {
return cl::sycl::get_native<cl::sycl::backend::cuda>(*queue_);
}
// The sycl_cuda_stream_t does not own this; it is an observer pointer.
CUcontext sycl_cuda_stream_t::get_underlying_context() {
return cl::sycl::get_native<cl::sycl::backend::cuda>(queue_->get_context());
}
status_t sycl_cuda_stream_t::init() {
if ((flags() & stream_flags::in_order) == 0
&& (flags() & stream_flags::out_of_order) == 0)
return status::invalid_arguments;
// If queue_ is not set then construct it
auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(engine());
auto status = status::success;
if (!queue_) {
auto &sycl_ctx = sycl_engine.context();
auto &sycl_dev = sycl_engine.device();
if (!sycl_engine.is_service_stream_created())
queue_.reset(new cl::sycl::queue(sycl_ctx, sycl_dev));
else {
stream_t *service_stream;
CHECK(sycl_engine.get_service_stream(service_stream));
auto sycl_stream = utils::downcast<sycl_stream_t *>(service_stream);
queue_.reset(new cl::sycl::queue(sycl_stream->queue()));
}
} else {
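        // A user-provided queue must match the engine: same CUDA device,
        // same CUDA context, and the same native stream as the engine's
        // service stream; otherwise the arguments are invalid.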
auto queue_streamId = get_underlying_stream();
auto sycl_dev = queue().get_device();
        bool args_ok = IMPLICATION(
                engine()->kind() == engine_kind::gpu, sycl_dev.is_gpu());
        if (!args_ok) return status::invalid_arguments;
auto queue_context = get_underlying_context();
CUdevice queue_device
= cl::sycl::get_native<cl::sycl::backend::cuda>(sycl_dev);
auto engine_context = sycl_engine.get_underlying_context();
auto engine_device = cl::sycl::get_native<cl::sycl::backend::cuda>(
sycl_engine.device());
stream_t *service_stream;
CHECK(sycl_engine.get_service_stream(service_stream));
auto cuda_stream
= utils::downcast<sycl_cuda_stream_t *>(service_stream);
auto engine_streamId = cuda_stream->get_underlying_stream();
status = ((engine_device != queue_device)
|| (engine_context != queue_context)
|| (engine_streamId != queue_streamId))
? status::invalid_arguments
: status::success;
}
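    // Bind the engine's cuDNN and cuBLAS handles to this stream's native
    // CUstream so that subsequent library calls are enqueued on it.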
cuda_sycl_scoped_context_handler_t sc(sycl_engine);
auto streamId = get_underlying_stream();
auto cublas_handle = sycl_engine.get_cublas_handle();
auto cudnn_handle = sycl_engine.get_cudnn_handle();
assert(sycl_engine.context() == base_t::queue().get_context());
cudaStream_t current_stream_id = nullptr;
CUDNN_EXECUTE_FUNC(cudnnGetStream, *cudnn_handle, &current_stream_id);
if (current_stream_id != streamId) {
CUDNN_EXECUTE_FUNC(cudnnSetStream, *cudnn_handle, streamId);
}
CUBLAS_EXECUTE_FUNC(cublasGetStream, *cublas_handle, &current_stream_id);
if (current_stream_id != streamId) {
CUBLAS_EXECUTE_FUNC(cublasSetStream, *cublas_handle, streamId);
}
return status;
}
status_t sycl_cuda_stream_t::interop_task(
std::function<void(cl::sycl::handler &)> sycl_cuda_interop_) {
try {
this->set_deps({queue().submit(
[&](cl::sycl::handler &cgh) { sycl_cuda_interop_(cgh); })});
return status::success;
} catch (std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
return status::runtime_error;
}
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,81 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_SYCL_CUDA_STREAM_HPP
#define GPU_NVIDIA_SYCL_CUDA_STREAM_HPP
#include <cuda.h>
#include <cudnn.h>
#include <cublas_v2.h>
#include "common/engine.hpp"
#include "sycl/sycl_stream.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
class sycl_cuda_stream_t : public dnnl::impl::sycl::sycl_stream_t {
public:
using base_t = dnnl::impl::sycl::sycl_stream_t;
cublasHandle_t &get_cublas_handle();
cudnnHandle_t &get_cudnn_handle();
static status_t create_stream(
stream_t **stream, engine_t *engine, unsigned flags) {
std::unique_ptr<sycl_cuda_stream_t> sycl_stream(
new sycl_cuda_stream_t(engine, flags));
if (!sycl_stream) return status::out_of_memory;
CHECK(sycl_stream->init());
*stream = sycl_stream.release();
return status::success;
}
static status_t create_stream(
stream_t **stream, engine_t *engine, cl::sycl::queue &queue) {
unsigned flags;
CHECK(base_t::init_flags(&flags, queue));
std::unique_ptr<sycl_cuda_stream_t> sycl_stream(
new sycl_cuda_stream_t(engine, flags, queue));
CHECK(sycl_stream->init());
*stream = sycl_stream.release();
return status::success;
}
status_t interop_task(std::function<void(cl::sycl::handler &)>);
CUstream get_underlying_stream();
CUcontext get_underlying_context();
private:
status_t init();
sycl_cuda_stream_t(engine_t *engine, unsigned flags, cl::sycl::queue &queue)
: base_t(engine, flags, queue) {}
sycl_cuda_stream_t(engine_t *engine, unsigned flags)
: base_t(engine, flags) {}
};
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -0,0 +1,522 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_NVIDIA_SYCL_CUDA_UTILS_HPP
#define GPU_NVIDIA_SYCL_CUDA_UTILS_HPP
#include <cuda.h>
#include <cudnn.h>
#include <stdexcept>
#include <cublas_v2.h>
#include "dnnl_sycl.hpp"
#include "common/engine.hpp"
#include "common/z_magic.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
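// The accessor macros below extract the SYCL buffer backing a memory
// argument from the execution context and request the corresponding
// read/write/read-write accessor inside the current command group handler
// (cgh).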
#define CTX_OUT_ACCESSOR(arg) \
utils::downcast<sycl::sycl_buffer_memory_storage_t *>( \
&CTX_OUT_STORAGE(arg)) \
->buffer() \
.get_access<cl::sycl::access::mode::write>(cgh)
#define CTX_IN_ACCESSOR(arg) \
utils::downcast<sycl::sycl_buffer_memory_storage_t *>( \
&CTX_IN_STORAGE(arg)) \
->buffer() \
.get_access<cl::sycl::access::mode::read>(cgh)
#define CTX_SCRATCH_ACCESSOR(arg) \
utils::downcast<sycl::sycl_buffer_memory_storage_t *>( \
ctx.get_scratchpad_grantor().get_memory_storage(arg).get()) \
->buffer() \
.get_access<cl::sycl::access::mode::read_write>(cgh)
// Check if the device type matches the passed engine kind
inline status_t check_device(dnnl::impl::engine_kind_t eng_kind) {
return (eng_kind == dnnl::impl::engine_kind::gpu
? status::success
: status::invalid_arguments);
}
static void convert_dnnl_dims_array(
const dnnl_dim_t *dims, int *new_dims, int n_dims) {
for (size_t i = 0; i < n_dims; i++) {
new_dims[i] = static_cast<int>(dims[i]);
}
}
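// Copies the oneDNN dims and pads the trailing entries with
// `adjustment_value` up to `adjustment_size`, since cuDNN descriptors
// expect at least 4 dimensions.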
static void convert_dims(const dnnl_dim_t *dims, int *new_dims, int n_dims,
int adjustment_size = 4, int adjustment_value = 1) {
convert_dnnl_dims_array(dims, new_dims, n_dims);
for (size_t i = n_dims; i < adjustment_size; i++) {
new_dims[i] = adjustment_value;
}
}
static bool memory_desc_matches_nchw_vect_c(const memory_desc_t *mem_desc) {
    // Only one block is supported and it must be on the second (C) dimension;
    // the block size must be 4 and the dimension has to be a multiple of the
    // block size.
auto is_int_8 = utils::one_of(mem_desc->data_type, data_type::s8);
auto &strides = mem_desc->format_desc.blocking.strides;
if (is_int_8 && mem_desc->format_desc.blocking.inner_nblks == 1
&& mem_desc->format_desc.blocking.inner_idxs[0] == 1
&& mem_desc->format_desc.blocking.inner_blks[0] == 4
&& mem_desc->dims[1] % 4 == 0) {
for (int d = 0; d < mem_desc->ndims - 1; ++d)
if (strides[d] < strides[d + 1]) return false;
return true;
}
return false;
}
static bool has_different_block_size(
const memory_desc_t *src_md, const memory_desc_t *dst_md) {
return ((src_md->format_desc.blocking.inner_nblks > 0
&& dst_md->format_desc.blocking.inner_nblks == 0)
|| (src_md->format_desc.blocking.inner_nblks == 0
&& dst_md->format_desc.blocking.inner_nblks > 0));
}
static bool adjust_dim_for_dnn(
int *dims, int n_dims, const memory_desc_t *mem_desc) {
if (memory_desc_matches_nchw_vect_c(mem_desc)) {
dims[n_dims] = mem_desc->format_desc.blocking.inner_blks[0];
dims[mem_desc->format_desc.blocking.inner_idxs[0]]
/= mem_desc->format_desc.blocking.inner_blks[0];
return true;
}
return false;
}
static bool adjust_stride_for_dnn(
int *stride, int n_dims, const memory_desc_t *mem_desc) {
if (memory_desc_matches_nchw_vect_c(mem_desc)) {
stride[n_dims] = mem_desc->format_desc.blocking.inner_nblks;
return true;
}
return false;
}
// Check if the dimensions contain any zeros, returns true if they do.
static bool has_zero_dims(const dnnl_dim_t *dims, int n_dims) {
for (size_t i = 0; i < n_dims; i++) {
if (dims[i] == 0) { return true; }
}
return false;
}
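// Maps a plain (or nchw-vect-c blocked) oneDNN memory format onto the
// corresponding cuDNN tensor format.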
static status_t get_format(const memory_desc_t *md, cudnnTensorFormat_t &format,
bool consider_ab_as_nhwc = false) {
const memory_desc_wrapper mem_wrapper(md);
if (memory_desc_matches_nchw_vect_c(md)) {
format = cudnnTensorFormat_t::CUDNN_TENSOR_NCHW_VECT_C;
} else if (mem_wrapper.matches_one_of_tag(format_tag::ab, format_tag::abc,
format_tag::abcd, format_tag::abcde,
format_tag::abcdef)) {
format = cudnnTensorFormat_t::CUDNN_TENSOR_NCHW;
} else if (mem_wrapper.matches_one_of_tag(
format_tag::acb, format_tag::acdb, format_tag::acdeb)) {
format = cudnnTensorFormat_t::CUDNN_TENSOR_NHWC;
} else {
return status::unimplemented;
}
if (consider_ab_as_nhwc && mem_wrapper.matches_one_of_tag(format_tag::ab)) {
format = cudnnTensorFormat_t::CUDNN_TENSOR_NHWC;
}
return status::success;
}
static bool memory_format_ok(const memory_desc_t *mem_desc) {
return (memory_desc_matches_nchw_vect_c(mem_desc)
|| mem_desc->format_desc.blocking.inner_nblks == 0);
}
static status_t convert_data_type(const memory_desc_t *mem_desc,
cudnnDataType_t *cudnn_data_type, bool vectorized = true) {
switch (mem_desc->data_type) {
case dnnl_data_type_t::dnnl_f16:
*cudnn_data_type = cudnnDataType_t::CUDNN_DATA_HALF;
break;
case dnnl_data_type_t::dnnl_f32:
*cudnn_data_type = cudnnDataType_t::CUDNN_DATA_FLOAT;
break;
// CUDNN_TENSOR_NCHW_VECT_C format is only supported with tensor
// data types CUDNN_DATA_INT8x4, CUDNN_DATA_INT8x32, and
// CUDNN_DATA_UINT8x4. oneDNN does not support UINT8 and block size
// of 32, hence the only valid case is CUDNN_DATA_INT8x4
case dnnl_data_type_t::dnnl_s8:
*cudnn_data_type
= ((vectorized
&& mem_desc->format_desc.blocking.inner_blks[0]
== 4)
? cudnnDataType_t::CUDNN_DATA_INT8x4
: cudnnDataType_t::CUDNN_DATA_INT8);
break;
default: return status::unimplemented;
}
return status::success;
}
class cublas_error : virtual public std::runtime_error {
protected:
const char *cublas_error_map(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
default: return "<unknown>";
}
}
int error_number_;
public:
explicit cublas_error(const std::string &message, cublasStatus_t result)
: std::runtime_error(
(message + std::string(cublas_error_map(result)))) {
error_number_ = static_cast<int>(result);
}
virtual ~cublas_error() throw() {}
virtual int get_error_number() const throw() { return error_number_; }
};
class cuda_error : virtual public std::runtime_error {
protected:
inline const char *cuda_error_map(CUresult result) {
switch (result) {
case CUDA_SUCCESS: return "CUDA_SUCCESS";
case CUDA_ERROR_NOT_PERMITTED: return "CUDA_ERROR_NOT_PERMITTED";
case CUDA_ERROR_INVALID_CONTEXT:
return "CUDA_ERROR_INVALID_CONTEXT";
case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE";
case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE";
case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY";
case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
default: return "<unknown>";
}
}
int error_number_;
public:
explicit cuda_error(const std::string &message, CUresult result)
: std::runtime_error((message + std::string(cuda_error_map(result)))) {
error_number_ = static_cast<int>(result);
}
explicit cuda_error(const std::string &message, cudaError_t result)
: std::runtime_error(
(message + std::to_string(static_cast<int>(result)))) {
error_number_ = static_cast<int>(result);
}
virtual ~cuda_error() throw() {}
virtual int get_error_number() const throw() { return error_number_; }
};
class cudnn_error : virtual public std::runtime_error {
protected:
inline const char *cudnn_get_error_string(cudnnStatus_t status) {
switch (status) {
case CUDNN_STATUS_SUCCESS: return "CUDNN_STATUS_SUCCESS";
case CUDNN_STATUS_NOT_INITIALIZED:
return "CUDNN_STATUS_NOT_INITIALIZED";
case CUDNN_STATUS_ALLOC_FAILED: return "CUDNN_STATUS_ALLOC_FAILED";
case CUDNN_STATUS_BAD_PARAM: return "CUDNN_STATUS_BAD_PARAM";
case CUDNN_STATUS_INTERNAL_ERROR:
return "CUDNN_STATUS_INTERNAL_ERROR";
case CUDNN_STATUS_INVALID_VALUE:
return "CUDNN_STATUS_INVALID_VALUE";
case CUDNN_STATUS_ARCH_MISMATCH:
return "CUDNN_STATUS_ARCH_MISMATCH";
case CUDNN_STATUS_MAPPING_ERROR:
return "CUDNN_STATUS_MAPPING_ERROR";
case CUDNN_STATUS_EXECUTION_FAILED:
return "CUDNN_STATUS_EXECUTION_FAILED";
case CUDNN_STATUS_NOT_SUPPORTED:
return "CUDNN_STATUS_NOT_SUPPORTED";
case CUDNN_STATUS_LICENSE_ERROR:
return "CUDNN_STATUS_LICENSE_ERROR";
case CUDNN_STATUS_RUNTIME_IN_PROGRESS:
return "CUDNN_STATUS_RUNTIME_IN_PROGRESS";
case CUDNN_STATUS_RUNTIME_FP_OVERFLOW:
return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW";
default: return "<unknown>";
}
}
int error_number_;
public:
explicit cudnn_error(const std::string &message, cudnnStatus_t result)
: std::runtime_error(
(message + std::string(cudnn_get_error_string(result)))) {
error_number_ = static_cast<int>(result);
}
virtual ~cudnn_error() throw() {}
virtual int get_error_number() const throw() { return error_number_; }
};
template <typename T>
cl::sycl::event copy(cl::sycl::queue &q, T *src, cl::sycl::buffer<T, 1> &dst) {
auto event = q.submit([&, src](cl::sycl::handler &cgh) {
// Retrieve a write accessor to a global buffer
auto acc = dst.template get_access<cl::sycl::access::mode::write,
cl::sycl::access::target::global_buffer>(cgh);
// Copy from the input pointer into the buffer associated with the
// accessor
cgh.copy(src, acc);
});
return event;
}
template <typename T>
cl::sycl::event copy(cl::sycl::queue &q, cl::sycl::buffer<T, 1> &src, T *dst) {
auto event = q.submit([&, dst](cl::sycl::handler &cgh) {
// Retrieve a read accessor to a global buffer
auto acc = src.template get_access<cl::sycl::access::mode::read,
cl::sycl::access::target::global_buffer>(cgh);
// Copy from the buffer associated with the accessor into the output
// pointer
cgh.copy(acc, dst);
});
return event;
}
template <typename T>
cl::sycl::event copy(cl::sycl::queue &q, cl::sycl::buffer<T, 1> &src,
cl::sycl::buffer<T, 1> &dst) {
auto event = q.submit([&](cl::sycl::handler &cgh) {
auto src_acc
= src.template get_access<cl::sycl::access::mode::read_write>(
cgh);
auto dst_acc
= dst.template get_access<cl::sycl::access::mode::read_write>(
cgh);
cgh.copy(src_acc, dst_acc);
});
return event;
}
static status_t cudnn_to_dnnl_status(cudnnStatus_t cu_status) {
switch (cu_status) {
case CUDNN_STATUS_SUCCESS: return status::success;
case CUDNN_STATUS_BAD_PARAM: return status::invalid_arguments;
case CUDNN_STATUS_NOT_SUPPORTED: return status::unimplemented;
default: return status::runtime_error;
}
}
static status_t cublas_to_dnnl_status(cublasStatus_t cu_status) {
switch (cu_status) {
case CUBLAS_STATUS_SUCCESS: return status::success;
default: return status::runtime_error;
}
}
static status_t cuda_to_dnnl_status(CUresult cu_result) {
switch (cu_result) {
        case CUDA_SUCCESS: return status::success;
default: return status::runtime_error;
}
}
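// Three flavors of wrappers around CUDA/cuBLAS/cuDNN calls are provided
// below: *_EXECUTE_FUNC throws on failure, *_EXECUTE_FUNC_V only logs the
// error (for use in destructors and other no-throw contexts), and
// *_EXECUTE_FUNC_S converts the library status into a dnnl status.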
#define CUDA_ERROR_LOCATION __FILE__ " : " STRINGIFY(__LINE__)
#define CUDA_EXECUTE_FUNC(name, ...) \
{ \
auto err = name(__VA_ARGS__); \
if (err != CUDA_SUCCESS) { \
throw cuda_error(std::string("At :") \
+ std::string(CUDA_ERROR_LOCATION) \
+ std::string(#name) + std::string(" : "), \
err); \
} \
}
#define CUBLAS_EXECUTE_FUNC(name, ...) \
{ \
auto err = name(__VA_ARGS__); \
if (err != CUBLAS_STATUS_SUCCESS) { \
throw cublas_error(std::string("At :") \
+ std::string(CUDA_ERROR_LOCATION) \
+ std::string(#name) + std::string(" : "), \
err); \
} \
}
#define CUDNN_EXECUTE_FUNC(name, ...) \
{ \
auto err = name(__VA_ARGS__); \
if (err != CUDNN_STATUS_SUCCESS) { \
throw cudnn_error(std::string("At :") \
+ std::string(CUDA_ERROR_LOCATION) \
+ std::string(#name) + std::string(" : "), \
err); \
} \
}
#define CUDA_EXECUTE_FUNC_V(name, ...) \
{ \
auto err = name(__VA_ARGS__); \
if (err != CUDA_SUCCESS) { \
std::cout << cuda_error(std::string("At :") \
+ std::string(CUDA_ERROR_LOCATION) \
+ std::string(#name) + std::string(" : "), \
err) \
.what() \
<< std::endl; \
} \
}
#define CUDNN_EXECUTE_FUNC_V(name, ...) \
{ \
auto err = name(__VA_ARGS__); \
if (err != CUDNN_STATUS_SUCCESS) { \
std::cout << cudnn_error(std::string("At :") \
+ std::string(CUDA_ERROR_LOCATION) \
+ std::string(#name) + std::string(" : "), \
err) \
.what() \
<< std::endl; \
} \
}
#define CUBLAS_EXECUTE_FUNC_V(name, ...) \
{ \
auto err = name(__VA_ARGS__); \
if (err != CUBLAS_STATUS_SUCCESS) { \
std::cout << cublas_error(std::string("At :") \
+ std::string(CUDA_ERROR_LOCATION) \
+ std::string(#name) + std::string(" : "), \
err) \
.what() \
<< std::endl; \
} \
}
#define CUDA_EXECUTE_FUNC_S(name, ...) \
[&]() { \
auto err = name(__VA_ARGS__); \
return cuda_to_dnnl_status(err); \
}()
#define CUBLAS_EXECUTE_FUNC_S(name, ...) \
[&]() { \
auto err = name(__VA_ARGS__); \
return cublas_to_dnnl_status(err); \
}()
#define CUDNN_EXECUTE_FUNC_S(name, ...) \
[&]() { \
auto err = name(__VA_ARGS__); \
if (err != CUDNN_STATUS_SUCCESS) { return cudnn_to_dnnl_status(err); } \
return status::success; \
}()
static status_t create_and_set_tensor_descriptor(
cudnnTensorDescriptor_t *tensor_desc, cudnnDataType_t data_type,
int ndims, int *dims, int *strides) {
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, tensor_desc));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptor, *tensor_desc,
data_type, ndims, dims, strides));
return status::success;
}
static status_t create_and_set_tensor_descriptor_ex(
cudnnTensorDescriptor_t *tensor_desc, cudnnTensorFormat_t format,
cudnnDataType_t data_type, int ndims, int *dims) {
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateTensorDescriptor, tensor_desc));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetTensorNdDescriptorEx, *tensor_desc,
format, data_type, ndims, dims));
return status::success;
}
static status_t create_and_set_filter_descriptor(
cudnnFilterDescriptor_t *filter_desc, cudnnTensorFormat_t format,
cudnnDataType_t data_type, int ndims, int *dims, int *) {
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateFilterDescriptor, filter_desc));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetFilterNdDescriptor, *filter_desc,
data_type, format, ndims, dims));
return status::success;
}
static status_t create_and_set_conv_descriptor(
cudnnConvolutionDescriptor_t *conv_desc, int ndims, int *padding,
int *strides, int *dilation, cudnnConvolutionMode_t mode,
cudnnDataType_t data_type) {
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnCreateConvolutionDescriptor, conv_desc));
CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetConvolutionNdDescriptor, *conv_desc,
ndims, padding, strides, dilation, mode, data_type));
return status::success;
}
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif

View File

@ -153,7 +153,9 @@ struct ref_sum_t : public gpu_primitive_t {
nested_scratchpad_t ns(ctx, key_nested_multiple + i, reorders_[i]);
r_ctx.set_scratchpad_grantor(ns.grantor());
CHECK(reorders_[i]->execute(r_ctx));
#ifndef DNNL_SYCL_CUDA
ctx.stream()->wait();
#endif
}
if (pd()->need_output_reorder()) {

View File

@ -240,6 +240,14 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
if (is_nvidia_gpu()) {
const bool alg_ok = !(prb->alg == alg_t::DIV || prb->alg == alg_t::SUB);
if (!alg_ok || !prb->attr.post_ops.is_def()) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -218,6 +218,15 @@ static int compare(const prb_t *prb, data_kind_t kind, const dnn_mem_t &fp_mem,
float eps = eps_coeff * (kind == DATA ? 5e-7 : 0);
if (kind == SS && prb->dir & FLAG_BWD) eps = eps_coeff * 5e-6;
if (is_nvidia_gpu()) {
// cuDNN stores unbiased variance which requires rescaling by
// `(N - 1) / N`, where `N = MB * Spatial`. Hence, we cannot set the
// threshold to 0...
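// E.g. with MB = 32 and 16x16 spatial, N = 32 * 256 = 8192, so the two
// estimates differ by a factor of (N - 1) / N = 8191 / 8192 ~= 0.99988.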
// The mean could also be rounded incorrectly (how?!)
if (kind == MEAN) eps = 1e-7;
if (kind == VAR) eps = 4e-7;
}
// Since bwd testing is done using results from forward which are random
// fp32 values, diff_ss starts fluctuating, so we check norm for both data
// and SS.
@ -457,6 +466,20 @@ int init_pd(dnnl_engine_t engine, const prb_t *prb, dnnl_primitive_desc_t &bpd,
void check_known_skipped_case(const prb_t *prb, res_t *res) {
check_known_skipped_case_common({prb->dt}, prb->dir, res);
if (res->state == SKIPPED) return;
if (is_nvidia_gpu()) {
const bool bwd_ok
= !((prb->dir & FLAG_BWD) && (prb->flags & GLOB_STATS));
const bool inference_ok
= IMPLICATION(prb->dt == dnnl_s8 || prb->dt == dnnl_f16,
(prb->dir & FLAG_INF) && (prb->flags & GLOB_STATS));
if (!bwd_ok || !inference_ok) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -696,6 +696,47 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) {
return;
}
}
if (is_nvidia_gpu()) {
const int64_t ID = prb->id, IH = prb->ih, IW = prb->iw;
const int64_t OD = prb->od, OH = prb->oh, OW = prb->ow;
const int64_t KD = prb->kd, KH = prb->kh, KW = prb->kw;
const int64_t SD = prb->sd, SH = prb->sh, SW = prb->sw;
const int64_t PD = prb->pd, PH = prb->ph, PW = prb->pw;
const int64_t PD_R = prb->pd_r, PH_R = prb->ph_r, PW_R = prb->pw_r;
const bool pad_ok = PD >= PD_R && PH >= PH_R && PW >= PW_R;
// copy-pasted from str2desc, dilation is not supported for Nvidia
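// E.g. i = 32, k = 3, s = 1, p = 1 gives o = (32 - 3 + 2 * 1) / 1 + 1 = 32.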
const auto compute_out
= [](int64_t i, int64_t k, int64_t s, int64_t p) {
return (i - k + 2 * p) / s + 1;
};
const bool out_ok = OD == compute_out(ID, KD, SD, PD)
&& OH == compute_out(IH, KH, SH, PH)
&& OW == compute_out(IW, KW, SW, PW);
const auto &po = prb->attr.post_ops;
bool post_ops_ok = true;
for (int i = 0; i < po.len(); ++i) {
const auto &e = po.entry[i];
if (e.is_sum_kind())
continue;
else if (e.is_eltwise_kind())
post_ops_ok = post_ops_ok && is_nvidia_eltwise_ok(prb->dir, e);
else if (e.is_binary_kind() || e.is_convolution_kind())
post_ops_ok = false;
else
assert(!"unknown post-op type");
}
const auto dtag = normalize_tag(prb->dtag, prb->ndims);
const bool dtag_is_axb = dtag == normalize_tag(tag::axb, prb->ndims);
const bool tag_ok = !((prb->dir & FLAG_BWD) && dtag_is_axb);
// TODO: specified wtag (even for supported formats) is not working?
if (!pad_ok || !out_ok || !post_ops_ok || !tag_ok) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -197,6 +197,49 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
if (is_nvidia_gpu()) {
const int64_t ID = prb->id, IH = prb->ih, IW = prb->iw;
const int64_t OD = prb->od, OH = prb->oh, OW = prb->ow;
const int64_t KD = prb->kd, KH = prb->kh, KW = prb->kw;
const int64_t SD = prb->sd, SH = prb->sh, SW = prb->sw;
const int64_t PD = prb->pd, PH = prb->ph, PW = prb->pw;
const int64_t PD_R = prb->pd_r, PH_R = prb->ph_r, PW_R = prb->pw_r;
const bool pad_ok = PD >= PD_R && PH >= PH_R && PW >= PW_R;
// copy-pasted from str2desc, dilation is not supported for Nvidia
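// E.g. i = 32, k = 3, s = 1, p = 1 gives o = (32 - 1) * 1 + 3 - 2 * 1 = 32.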
const auto compute_out
= [](int64_t i, int64_t k, int64_t s, int64_t p) {
return (i - 1) * s + k - 2 * p;
};
const bool out_ok = OD == compute_out(ID, KD, SD, PD)
&& OH == compute_out(IH, KH, SH, PH)
&& OW == compute_out(IW, KW, SW, PW);
bool post_ops_ok = prb->attr.post_ops.is_def();
const auto stag = normalize_tag(prb->stag, prb->ndims);
const bool stag_is_axb = stag == normalize_tag(tag::axb, prb->ndims);
const bool fwd_tag_ok = !((prb->dir & FLAG_FWD) && stag_is_axb);
const bool bwd_tag_ok
= !((prb->dir == BWD_W || prb->dir == BWD_WB) && stag_is_axb);
const bool tag_ok = fwd_tag_ok && bwd_tag_ok;
// TODO: specified wtag (even for supported formats) is not working?
if (!pad_ok || !out_ok || !post_ops_ok || !tag_ok) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
// FIXME: there's a bug in the library resulting in
// memory_tracking.hpp:458: Assertion `registry_.size() == 0' failed.
// Specifically for 3D spatial case, when both BWD_W and BWD_WB are
// run. It must be cache interaction, but not clear which side is
// guilty. Likely Nvidia implementation. Switch it off until further
// investigation.
if (prb->ndims == 5 && prb->dir == BWD_WB) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -17,6 +17,11 @@
#include <assert.h>
#include "oneapi/dnnl/dnnl.h"
// For is_nvidia_gpu(...)
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP
#include "oneapi/dnnl/dnnl_sycl.hpp"
#endif
#include "dnnl_common.hpp"
#include "dnnl_memory.hpp"
@ -255,5 +260,45 @@ void check_known_skipped_case_common(
r->state = SKIPPED, r->reason = DATA_TYPE_NOT_SUPPORTED;
break;
}
// cuda supports only f32, f16 and s8 data types
if (is_nvidia_gpu()
&& (i_dt == dnnl_bf16 || i_dt == dnnl_u8 || i_dt == dnnl_s32)) {
r->state = SKIPPED, r->reason = DATA_TYPE_NOT_SUPPORTED;
break;
}
}
}
bool is_nvidia_gpu(const engine_t &engine) {
dnnl_engine_kind_t engine_kind = dnnl_any_engine;
DNN_SAFE_V(dnnl_engine_get_kind(engine, &engine_kind));
if (engine_kind != dnnl_gpu) return false;
#if DNNL_WITH_SYCL
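// 0x10DE is the PCI vendor ID assigned to NVIDIA; the engine is treated as
// an Nvidia GPU when the underlying SYCL device reports that vendor ID.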
constexpr int nvidia_vendor_id = 0x10DE;
auto eng = dnnl::engine(engine, true);
auto device = dnnl::sycl_interop::get_device(eng);
const auto eng_vendor_id
= device.get_info<cl::sycl::info::device::vendor_id>();
return eng_vendor_id == nvidia_vendor_id;
#endif
return false;
}
bool is_nvidia_eltwise_ok(
dir_t dir, attr_t::post_ops_t::kind_t alg, float alpha) {
using pk_t = attr_t::post_ops_t::kind_t;
switch (alg) {
case pk_t::BRELU: return true;
case pk_t::ELU: return (dir & FLAG_FWD);
case pk_t::LOGISTIC: return (dir & FLAG_FWD);
case pk_t::TANH: return (dir & FLAG_FWD);
case pk_t::RELU: return alpha == 0.f;
// TODO: can be easily supported by Nvidia backend
// case pk_t::ELU_DST: return true;
// case pk_t::LOGISTIC_DST: return true;
// case pk_t::TANH_DST: return true;
// case pk_t::RELU_DST: return alpha == 0.f;
default: return false;
};
}

View File

@ -320,4 +320,12 @@ bool check_md_consistency_with_tag(
void check_known_skipped_case_common(
const std::vector<dnnl_data_type_t> &v_dt, dir_t dir, res_t *r);
bool is_nvidia_gpu(const engine_t &engine = get_test_engine());
bool is_nvidia_eltwise_ok(
dir_t dir, attr_t::post_ops_t::kind_t alg, float alpha);
inline bool is_nvidia_eltwise_ok(
dir_t dir, const attr_t::post_ops_t::entry_t &e) {
return is_nvidia_eltwise_ok(dir, e.kind, e.eltwise.alpha);
}
#endif

View File

@ -238,7 +238,22 @@ private:
} else {
is_data_owner_ = false;
data_ = NULL;
#if DNNL_WITH_SYCL
// XXX: A hack to work around an issue with create_from_host_ptr when
// performing a CPU reorder: USM is not supported on Nvidia, and it is
// not allowed to convert a host_ptr into a SYCL buffer.
engine_t e(engine_kind_);
if (is_nvidia_gpu(e)) {
DNN_SAFE(dnnl_sycl_interop_memory_create(&m_, &md_, engine,
dnnl_sycl_interop_buffer, handle),
CRIT);
} else {
DNN_SAFE(dnnl_memory_create(&m_, &md_, engine, handle), CRIT);
}
#else
DNN_SAFE(dnnl_memory_create(&m_, &md_, engine, handle), CRIT);
#endif
}
if (handle == DNNL_MEMORY_ALLOCATE) {

View File

@ -19,7 +19,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "dnnl.h"
#include "oneapi/dnnl/dnnl.h"
#include "tests/test_thread.hpp"
@ -341,6 +341,14 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) {
res->state = SKIPPED, res->reason = INVALID_CASE;
return;
}
if (is_nvidia_gpu()) {
if (!is_nvidia_eltwise_ok(prb->dir, prb->alg, prb->alpha)
|| !prb->attr.post_ops.is_def()) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -12,4 +12,3 @@
# bf16
--batch=test_resampling_bfloat16

View File

@ -304,6 +304,29 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) {
{prb->cfg[SRC].dt, prb->cfg[WEI].dt, prb->cfg[DST].dt}, prb->dir,
res);
if (res->state == SKIPPED) return;
if (is_nvidia_gpu()) {
const auto &po = prb->attr.post_ops;
bool post_ops_ok = true;
for (int i = 0; i < po.len(); ++i) {
const auto &e = po.entry[i];
if (e.is_sum_kind())
continue;
else if (e.is_eltwise_kind())
post_ops_ok = post_ops_ok && is_nvidia_eltwise_ok(prb->dir, e);
else if (e.is_binary_kind() || e.is_convolution_kind())
post_ops_ok = false;
else
assert(!"unknown post-op type");
}
const bool oscale_ok = prb->attr.oscale.policy == policy_t::COMMON;
if (!post_ops_ok || !oscale_ok) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -470,6 +470,12 @@ static int init_pd(dnnl_engine_t engine, const prb_t *prb,
void check_known_skipped_case(const prb_t *prb, res_t *res) {
check_known_skipped_case_common({prb->dt}, prb->dir, res);
if (res->state == SKIPPED) return;
if (is_nvidia_gpu()) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -167,6 +167,14 @@ static int init_pd(dnnl_engine_t engine, const prb_t *prb,
void check_known_skipped_case(const prb_t *prb, res_t *res) {
check_known_skipped_case_common({prb->dt}, prb->dir, res);
if (res->state == SKIPPED) return;
if (is_nvidia_gpu()) {
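// Only cross-channel LRN with an odd local size is supported by the
// Nvidia backend; skip everything else.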
if (prb->alg != ACROSS || prb->ls % 2 != 1) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -290,6 +290,31 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) {
return;
}
}
if (is_nvidia_gpu()) {
const auto &po = prb->attr.post_ops;
bool post_ops_ok = true;
for (int i = 0; i < po.len(); ++i) {
const auto &e = po.entry[i];
if (e.is_sum_kind())
continue;
else if (e.is_eltwise_kind())
post_ops_ok = post_ops_ok && is_nvidia_eltwise_ok(FLAG_FWD, e);
else if (e.is_binary_kind() || e.is_convolution_kind())
post_ops_ok = false;
else
assert(!"unknown post-op type");
}
const bool oscale_ok = prb->attr.oscale.policy == policy_t::COMMON;
const bool zp_ok = prb->attr.zero_points.is_def();
if (!post_ops_ok || !oscale_ok || !zp_ok) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -59,6 +59,12 @@ inline int compare_dat(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
else
ok = (fabs(fp) > 1e-5 ? rel_diff : diff) <= prb->cfg[kind].eps;
// XXX: bug in cuDNN: it spits fp16 min value as -inf, not -65504
if (!ok && is_nvidia_gpu() && prb->cfg[kind].dt == dnnl_f16) {
ok = fp == lowest_dt(prb->cfg[kind].dt) && std::isinf(dt)
&& std::signbit(dt);
}
res->errors += !ok;
bool dump = (!ok && (res->errors < 10 || verbose >= 10))
@ -258,6 +264,23 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
if (is_nvidia_gpu()) {
const int64_t PD = prb->pd, PH = prb->ph, PW = prb->pw;
const int64_t PD_R = prb->pd_r, PH_R = prb->ph_r, PW_R = prb->pw_r;
const bool pad_ok
= !(prb->alg == AVG_P && (PD < PD_R || PH < PH_R || PW < PW_R));
const int64_t DD = prb->dd, DH = prb->dh, DW = prb->dw;
const bool dilation_ok = DD == 0 && DH == 0 && DW == 0;
const bool post_ops_ok = prb->attr.post_ops.is_def();
if (!pad_ok || !dilation_ok || !post_ops_ok) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -191,6 +191,11 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) {
res->state = SKIPPED, res->reason = INVALID_CASE;
return;
}
if (is_nvidia_gpu()) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -318,6 +318,14 @@ void check_known_skipped_case(const prb_t *prb, res_t *res) {
return;
}
}
if (is_nvidia_gpu()) {
const bool oscale_ok = prb->attr.oscale.policy == policy_t::COMMON;
if (!oscale_ok) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -39,6 +39,7 @@ inline int compare_dat(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
res->total = nelems;
float trh = 0;
float eps = 1e-5;
if (prb->alg == nearest) {
// On forward, `dst` consists of exact `src` elements, hence the result
// shall be exact (no matter what data type is). On backward, the
@ -54,6 +55,12 @@ inline int compare_dat(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
} else {
assert(prb->alg == linear);
trh = prb->dt == dnnl_f32 ? 1e-6 : 1e-2;
if (is_nvidia_gpu()) {
// cuDNN precision is different from ref one due to different
// computation algorithm used for resampling.
trh = prb->dt == dnnl_f16 ? 4e-1 : 8e-4;
eps = prb->dt == dnnl_f16 ? 1e-1 : 8e-5;
}
}
for (int64_t i = 0; i < nelems; ++i) {
@ -63,7 +70,7 @@ inline int compare_dat(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
const float diff = fabsf(fp - dt);
const float rel_diff = diff / (fabsf(fp) > FLT_MIN ? fabsf(fp) : 1);
const bool ok = (fabsf(fp) > 1e-5 ? rel_diff : diff) <= trh;
const bool ok = (fabsf(fp) > eps ? rel_diff : diff) <= trh;
res->errors += !ok;
@ -150,7 +157,7 @@ static int init_pd(dnnl_engine_t engine, const prb_t *prb,
: prb->ndims == 4 ? dst_2d_dims : dst_1d_dims;
std::string src_tag = (prb->dir & FLAG_FWD) ? prb->tag : tag::any;
std::string dst_tag = tag::any;
std::string dst_tag = (prb->dir & FLAG_BWD) ? prb->tag : tag::any;
SAFE(init_md(&src_d, prb->ndims, src_dims, prb->dt, src_tag), CRIT);
@ -219,6 +226,14 @@ static int init_pd(dnnl_engine_t engine, const prb_t *prb,
void check_known_skipped_case(const prb_t *prb, res_t *res) {
check_known_skipped_case_common({prb->dt}, prb->dir, res);
if (res->state == SKIPPED) return;
if (is_nvidia_gpu()) {
if (prb->ndims == 5 || prb->alg == nearest) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -766,6 +766,11 @@ void check_known_skipped_case(const prb_t &prb, res_t *res) {
return;
}
}
if (is_nvidia_gpu()) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
int doit(const prb_t &prb, res_t *res) {

View File

@ -146,6 +146,12 @@ static int init_pd(dnnl_engine_t engine, const prb_t *prb,
void check_known_skipped_case(const prb_t *prb, res_t *res) {
check_known_skipped_case_common({prb->dt}, prb->dir, res);
if (res->state == SKIPPED) return;
if (is_nvidia_gpu()) {
res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
return;
}
}
int doit(const prb_t *prb, res_t *res) {

View File

@ -185,7 +185,7 @@ endif()
foreach(TEST_FILE ${PRIM_TEST_CASES_SRC})
get_filename_component(exe ${TEST_FILE} NAME_WE)
if(NOT ${exe} MATCHES "${skip_usm_pattern}")
if(NOT ${exe} MATCHES "${skip_usm_pattern}" AND NOT DNNL_SYCL_CUDA)
register_gtest(${exe} ${TEST_FILE})
endif()

View File

@ -19,8 +19,13 @@ set(TEST_EXE test_api)
file(GLOB TEST_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/test_*.cpp)
list(APPEND TEST_SOURCES ${MAIN_SRC_GTEST})
# Switch off C API tests for CUDA since USM model is not supported
if(NOT DNNL_SYCL_CUDA)
register_exe(${TEST_EXE} "${TEST_SOURCES}" "test" "dnnl_gtest")
endif()
# Create DPC++ buffer target.
if(DNNL_SYCL_DPCPP)
if(DNNL_SYCL_DPCPP AND NOT DNNL_SYCL_CUDA)
register_exe(${TEST_EXE}_buffer "${TEST_SOURCES}" "test" "dnnl_gtest")
target_compile_definitions(${TEST_EXE}_buffer PUBLIC -DTEST_DNNL_DPCPP_BUFFER)
endif()

View File

@ -53,6 +53,12 @@ protected:
dnnl::memory::desc md(p.dims, memory::data_type::f32, p.fmt_tag);
dnnl::memory::dim phys_size = md.get_size() / sizeof(data_t);
#ifdef DNNL_SYCL_CUDA
const dnnl::impl::memory_desc_wrapper mdw(md.data);
SKIP_IF(!mdw.is_plain() && !mdw.format_any(),
"Non-plain formats are not supported on CUDA backend");
#endif
// mem0
// Initially spoiled by putting non-zero values in padded area.
// The test will manually fix it later.

View File

@ -0,0 +1,29 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "dnnl_test_common.hpp"
#include "gtest/gtest.h"
#include "oneapi/dnnl/dnnl.hpp"
namespace dnnl {
TEST(namespace_test, TestAliasNamespace) {
const version_t *version = ::oneapi::dnnl::version();
(void)version;
}
} // namespace dnnl

View File

@ -87,6 +87,27 @@ dnnl::engine::kind get_test_engine_kind();
dnnl::engine get_test_engine();
#endif
inline int get_vendor_id(const std::string &vendor) {
if (vendor == "nvidia") {
return 0x10DE;
} else if (vendor == "intel") {
return 0x8086;
} else {
return -1;
}
}
inline bool is_nvidia_gpu(const dnnl::engine &eng) {
#if DNNL_WITH_SYCL
const int nvidia_vendor_id = get_vendor_id("nvidia");
const auto device = dnnl::sycl_interop::get_device(eng);
const auto eng_vendor_id
= device.get_info<cl::sycl::info::device::vendor_id>();
return eng_vendor_id == nvidia_vendor_id;
#endif
return false;
}
inline bool unsupported_data_type(memory::data_type dt, dnnl::engine eng) {
dnnl::engine::kind kind = eng.get_kind();
@ -94,7 +115,16 @@ inline bool unsupported_data_type(memory::data_type dt, dnnl::engine eng) {
if (kind == dnnl::engine::kind::cpu)
supported = dnnl::impl::cpu::platform::has_data_type_support(
memory::convert_to_c(dt));
#ifdef DNNL_SYCL_CUDA
if (is_nvidia_gpu(eng)) {
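// The Nvidia backend supports only f32, f16 and s8 data types.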
switch (dt) {
case memory::data_type::f32: return false;
case memory::data_type::f16: return false;
case memory::data_type::s8: return false;
default: return true;
}
}
#endif
return !supported;
}

View File

@ -33,6 +33,27 @@
} \
} while (0)
#define SKIP_FOR_LOOP(cond, msg) \
if (cond) { \
std::cout << "[ SKIPPED ] " << (msg) << std::endl; \
continue; \
}
#ifdef DNNL_SYCL_CUDA
#define SKIP_IF_CUDA(cond, message) \
do { \
SKIP_IF(get_test_engine_kind() == engine::kind::gpu && (cond), \
(message)); \
} while (0)
#define SKIP_FOR_LOOP_CUDA(cond, message) \
SKIP_FOR_LOOP( \
get_test_engine_kind() == engine::kind::gpu && (cond), (message));
#else
#define SKIP_IF_CUDA(cond, message)
#define SKIP_FOR_LOOP_CUDA(cond, message)
#endif
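// Typical usage, e.g.:
//   SKIP_IF_CUDA(!cuda_check_format_tag(p.dst_format),
//           "Unsupported destination format tag");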
#define TEST_F_(test_fixture, test_name) TEST_F(test_fixture, test_name)
#define CPU_TEST_F(test_fixture, test_name) \

View File

@ -75,10 +75,34 @@ private:
protected:
virtual void SetUp() {
p = ::testing::TestWithParam<decltype(p)>::GetParam();
SKIP_IF_CUDA(!cuda_check_format_tags(p.tags.data_tag, p.tags.diff_tag),
"Unsupported format tag");
catch_expected_failures(
[=]() { Test(); }, p.expect_to_fail, p.expected_status);
}
bool cuda_check_format_tags(
memory::format_tag src_format, memory::format_tag diff_format) {
bool src_ok = src_format == memory::format_tag::ncdhw
|| src_format == memory::format_tag::ndhwc
|| src_format == memory::format_tag::nchw
|| src_format == memory::format_tag::nhwc
|| src_format == memory::format_tag::ncw
|| src_format == memory::format_tag::nwc
|| src_format == memory::format_tag::any;
bool diff_ok = diff_format == memory::format_tag::oidhw
|| diff_format == memory::format_tag::odhwi
|| diff_format == memory::format_tag::oihw
|| diff_format == memory::format_tag::hwio
|| diff_format == memory::format_tag::oiw
|| diff_format == memory::format_tag::any;
return src_ok && diff_ok;
}
void Test() {
using bf = normalization_flags;
p = ::testing::TestWithParam<decltype(p)>::GetParam();
@ -201,6 +225,11 @@ protected:
normalization_flags flags = normalization_flags::none) {
bool useScaleShift
= (bool)(flags & normalization_flags::use_scale_shift);
bool useGlobalStats
= (bool)(flags & normalization_flags::use_global_stats);
(void)useGlobalStats;
SKIP_IF_CUDA(useGlobalStats, "Global stats not supported");
auto bnorm_fwd_d = batch_normalization_forward::desc(
prop_kind::forward_training, *data_d, p.epsilon, flags);
@ -251,6 +280,11 @@ protected:
check_zero_tail<data_t>(1, diff_src->get());
check_zero_tail<data_t>(1, diff_dst->get());
// Run a forward pass first for Nvidia backend to generate the workspace
// needed by the backward pass.
if (is_nvidia_gpu(eng))
execBnormFwd(true, useGlobalStats, useScaleShift);
execBnormBwd(useScaleShift, pk);
check_bnorm_bwd(p, src->get(), diff_dst->get(), mean, variance, weights,

View File

@ -50,23 +50,37 @@ protected:
SKIP_IF(unsupported_data_type(src0_dt),
"Engine does not support this data type.");
SKIP_IF(unsupported_data_type(src1_dt),
"Engine does not support this data type.");
for (auto tag : p.srcs_format) {
MAYBE_UNUSED(tag);
SKIP_IF_CUDA(!cuda_check_format_tag(tag),
"Unsupported source format tag");
}
SKIP_IF_CUDA(!cuda_check_format_tag(p.dst_format),
"Unsupported destination format tag");
catch_expected_failures(
[=]() { Test(); }, p.expect_to_fail, p.expected_status);
}
bool cuda_check_format_tag(tag atag) {
return atag == tag::abcd || atag == tag::acdb;
}
void Test() {
auto eng = get_test_engine();
auto strm = make_stream(eng);
// binary specific types and values
using op_desc_t = binary::desc;
using pd_t = binary::primitive_desc;
allows_attr_t aa {false};
aa.po_sum = true;
aa.po_eltwise = true;
aa.po_binary = true;
aa.scales = true;
auto eng = get_test_engine();
auto strm = make_stream(eng);
aa.po_sum = !is_nvidia_gpu(eng);
aa.po_eltwise = !is_nvidia_gpu(eng);
aa.po_binary = !is_nvidia_gpu(eng);
std::vector<memory::desc> srcs_md;
std::vector<memory> srcs;

View File

@ -90,12 +90,28 @@ class concat_test_t : public ::testing::TestWithParam<concat_test_params_t> {
}
protected:
bool cuda_supported_format_tag(memory::format_tag tag) {
return impl::utils::one_of(tag, dnnl_a, dnnl_ab, dnnl_abc, dnnl_abcd,
dnnl_abcde, dnnl_abcdef, dnnl_abdec, dnnl_acb, dnnl_acbde,
dnnl_acbdef, dnnl_acdb, dnnl_acdeb, dnnl_ba, dnnl_bac,
dnnl_bacd, dnnl_bca, dnnl_bcda, dnnl_bcdea, dnnl_cba, dnnl_cdba,
dnnl_cdeba, dnnl_decab, dnnl_defcab, dnnl_aBc4b, dnnl_aBcd4b,
dnnl_aBcde4b);
}
void SetUp() override {
auto data_type = data_traits<data_t>::data_type;
SKIP_IF(unsupported_data_type(data_type),
"Engine does not support this data type.");
concat_test_params_t p
= ::testing::TestWithParam<decltype(p)>::GetParam();
for (int i = 0; i < p.srcs_cds.size(); i++) {
SKIP_IF_CUDA(!cuda_supported_format_tag(p.srcs_format[i]),
"Unsupported format tag");
}
SKIP_IF_CUDA(!cuda_supported_format_tag(p.dst_format),
"Unsupported format tag");
catch_expected_failures(
[=]() { Test(); }, p.expect_to_fail, p.expected_status, false);
}

View File

@ -92,10 +92,55 @@ protected:
virtual void SetUp() {
auto p = ::testing::TestWithParam<
test_convolution_params_t>::GetParam();
SKIP_IF_CUDA(
!(cuda_check_format_tags(p.formats.src_format)
&& cuda_check_format_tags(p.formats.dst_format)
&& (cuda_check_format_tags(p.formats.weights_format)
|| (impl::utils::one_of(
p.formats.weights_format,
/* weights formats */
memory::format_tag::gowi,
memory::format_tag::gohwi,
memory::format_tag::godhwi,
memory::format_tag::owi,
memory::format_tag::ohwi,
memory::format_tag::odhwi)))
&& data_traits<data_t_diff_src>::data_type
== memory::data_type::f32
&& data_traits<data_t_diff_dst>::data_type
== memory::data_type::f32
&& data_traits<data_t_wei>::data_type
== memory::data_type::f32
&& check_cuda_alg_format(p.formats.dst_format,
p.formats.weights_format, p.aalgorithm)),
"format is not supported.");
catch_expected_failures(
[=]() { Test(); }, p.expect_to_fail, p.expected_status);
}
bool cuda_check_format_tags(memory::format_tag tag) {
return impl::utils::one_of(tag, memory::format_tag::ab,
memory::format_tag::abc, memory::format_tag::abcd,
memory::format_tag::abcde, memory::format_tag::abcdef,
memory::format_tag::acb, memory::format_tag::acdb,
memory::format_tag::acdeb);
}
bool check_cuda_alg_format(memory::format_tag dst_fmt,
memory::format_tag wei_fmt, algorithm alg) {
bool res = dst_fmt == wei_fmt;
if (alg == dnnl::algorithm::convolution_winograd) {
res = res
&& impl::utils::one_of(wei_fmt, memory::format_tag::ab,
memory::format_tag::abc, memory::format_tag::abcd,
memory::format_tag::abcde,
memory::format_tag::abcdef);
}
return res;
}
void Test() {
auto p = ::testing::TestWithParam<
test_convolution_params_t>::GetParam();

View File

@ -124,10 +124,55 @@ protected:
virtual void SetUp() {
auto p = ::testing::TestWithParam<
test_convolution_params_t>::GetParam();
SKIP_IF_CUDA(
!(cuda_check_format_tags(p.formats.src_format)
&& cuda_check_format_tags(p.formats.dst_format)
&& (cuda_check_format_tags(p.formats.weights_format)
|| (impl::utils::one_of(
p.formats.weights_format,
/* weights formats */
memory::format_tag::gowi,
memory::format_tag::gohwi,
memory::format_tag::godhwi,
memory::format_tag::owi,
memory::format_tag::ohwi,
memory::format_tag::odhwi)))
&& data_traits<data_t_src>::data_type
== memory::data_type::f32
&& data_traits<data_t_diff_dst>::data_type
== memory::data_type::f32
&& data_traits<data_t_diff_weights>::data_type
== memory::data_type::f32
&& check_cuda_alg_format(p.formats.dst_format,
p.formats.weights_format, p.aalgorithm)),
"format is not supported.");
catch_expected_failures(
[=]() { Test(); }, p.expect_to_fail, p.expected_status);
}
bool cuda_check_format_tags(memory::format_tag tag) {
return impl::utils::one_of(tag, memory::format_tag::ab,
memory::format_tag::abc, memory::format_tag::abcd,
memory::format_tag::abcde, memory::format_tag::abcdef,
memory::format_tag::acb, memory::format_tag::acdb,
memory::format_tag::acdeb);
}
bool check_cuda_alg_format(memory::format_tag dst_fmt,
memory::format_tag wei_fmt, algorithm alg) {
bool res = dst_fmt == wei_fmt;
if (alg == dnnl::algorithm::convolution_winograd) {
res = res
&& impl::utils::one_of(wei_fmt, memory::format_tag::ab,
memory::format_tag::abc, memory::format_tag::abcd,
memory::format_tag::abcde,
memory::format_tag::abcdef);
}
return res;
}
void Test() {
auto p = ::testing::TestWithParam<
test_convolution_params_t>::GetParam();

Some files were not shown because too many files have changed in this diff.