mirror of
https://github.com/uxlfoundation/oneDNN.git
synced 2025-10-20 10:03:50 +08:00
273 lines
11 KiB
C++
273 lines
11 KiB
C++
/*******************************************************************************
|
|
* Copyright 2019-2025 Intel Corporation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*******************************************************************************/
|
|
|
|
/// @example gpu_opencl_interop.cpp
|
|
/// > Annotated version: @ref gpu_opencl_interop_cpp
|
|
|
|
/// @page gpu_opencl_interop_cpp_brief
|
|
/// @brief This C++ API example demonstrates programming for Intel(R) Processor
|
|
/// Graphics with OpenCL* extensions API in oneDNN.
|
|
|
|
/// @page gpu_opencl_interop_cpp Getting started on GPU with OpenCL extensions API
|
|
/// \copybrief gpu_opencl_interop_cpp_brief
|
|
///
|
|
/// > Example code: @ref gpu_opencl_interop.cpp
|
|
///
|
|
/// The workflow includes following steps:
|
|
/// - Create a GPU engine. It uses OpenCL as the runtime in this sample.
|
|
/// - Create a GPU memory descriptor/object.
|
|
/// - Create an OpenCL kernel for GPU data initialization
|
|
/// - Access a GPU memory via OpenCL interoperability interface
|
|
/// - Access a GPU command queue via OpenCL interoperability interface
|
|
/// - Execute a OpenCL kernel with related GPU command queue and GPU memory
|
|
/// - Create primitive descriptor/primitive
|
|
/// - Execute the primitive with the initialized GPU memory
|
|
/// - Validate the result by mapping the OpenCL memory via OpenCL interoperability
|
|
/// interface
|
|
///
|
|
|
|
/// @page gpu_opencl_interop_cpp
|
|
/// @section gpu_opencl_interop_cpp_headers Public headers
|
|
///
|
|
/// To start using oneDNN, we must first include the @ref dnnl.hpp
|
|
/// header file in the application. We also include CL/cl.h for using
|
|
/// OpenCL APIs and @ref dnnl_debug.h, which contains some debugging
|
|
/// facilities such as returning a string representation
|
|
/// for common oneDNN C types.
|
|
/// All C++ API types and functions reside in the `dnnl` namespace.
|
|
/// For simplicity of the example we import this namespace.
|
|
/// @page gpu_opencl_interop_cpp
|
|
/// @snippet gpu_opencl_interop.cpp Prologue
|
|
// [Prologue]
|
|
#include <iostream>
|
|
#include <numeric>
|
|
#include <stdexcept>
|
|
|
|
#include <CL/cl.h>
|
|
|
|
#include "oneapi/dnnl/dnnl.hpp"
|
|
#include "oneapi/dnnl/dnnl_ocl.hpp"
|
|
|
|
#include "example_utils.hpp"
|
|
|
|
using namespace dnnl;
|
|
using namespace std;
|
|
// [Prologue]
|
|
|
|
#define OCL_CHECK(x) \
|
|
do { \
|
|
cl_int s = (x); \
|
|
if (s != CL_SUCCESS) { \
|
|
std::cout << "[" << __FILE__ << ":" << __LINE__ << "] '" << #x \
|
|
<< "' failed (status code: " << s << ")." << std::endl; \
|
|
exit(1); \
|
|
} \
|
|
} while (0)
|
|
|
|
cl_kernel create_init_opencl_kernel(
|
|
cl_context ocl_ctx, const char *kernel_name, const char *ocl_code) {
|
|
cl_int err;
|
|
const char *sources[] = {ocl_code};
|
|
cl_program ocl_program
|
|
= clCreateProgramWithSource(ocl_ctx, 1, sources, nullptr, &err);
|
|
OCL_CHECK(err);
|
|
|
|
OCL_CHECK(
|
|
clBuildProgram(ocl_program, 0, nullptr, nullptr, nullptr, nullptr));
|
|
|
|
cl_kernel ocl_kernel = clCreateKernel(ocl_program, kernel_name, &err);
|
|
OCL_CHECK(err);
|
|
|
|
OCL_CHECK(clReleaseProgram(ocl_program));
|
|
return ocl_kernel;
|
|
}
|
|
|
|
/// @page gpu_opencl_interop_cpp
|
|
/// @section gpu_opencl_interop_cpp_tutorial gpu_opencl_interop_tutorial() function
|
|
///
|
|
void gpu_opencl_interop_tutorial() {
|
|
/// @page gpu_opencl_interop_cpp
|
|
/// @subsection gpu_opencl_interop_cpp_sub1 Engine and stream
|
|
///
|
|
/// All oneDNN primitives and memory objects are attached to a
|
|
/// particular @ref dnnl::engine, which is an abstraction of a
|
|
/// computational device (see also @ref dev_guide_basic_concepts). The
|
|
/// primitives are created and optimized for the device to which they are
|
|
/// attached, and the memory objects refer to memory residing on the
|
|
/// corresponding device. In particular, that means neither memory objects
|
|
/// nor primitives that were created for one engine can be used on
|
|
/// another.
|
|
///
|
|
/// To create engines, we must specify the @ref dnnl::engine::kind
|
|
/// and the index of the device of the given kind. In this example we use
|
|
/// the first available GPU engine, so the index for the engine is 0.
|
|
/// This example assumes OpenCL being a runtime for GPU. In such case,
|
|
/// during engine creation, an OpenCL context is also created and attaches
|
|
/// to the created engine.
|
|
///
|
|
/// @snippet gpu_opencl_interop.cpp Initialize engine
|
|
// [Initialize engine]
|
|
engine eng(validate_engine_kind(engine::kind::gpu), 0);
|
|
// [Initialize engine]
|
|
|
|
/// In addition to an engine, all primitives require a @ref dnnl::stream
|
|
/// for the execution. The stream encapsulates an execution context and is
|
|
/// tied to a particular engine.
|
|
///
|
|
/// In this example, a GPU stream is created.
|
|
/// This example assumes OpenCL being a runtime for GPU. During stream creation,
|
|
/// an OpenCL command queue is also created and attaches to this stream.
|
|
///
|
|
/// @snippet gpu_opencl_interop.cpp Initialize stream
|
|
// [Initialize stream]
|
|
dnnl::stream strm(eng);
|
|
// [Initialize stream]
|
|
|
|
/// @subsection gpu_opencl_interop_cpp_sub2 Wrapping data into oneDNN memory object
|
|
///
|
|
/// Next, we create a memory object. We need to specify dimensions of our
|
|
/// memory by passing a memory::dims object. Then we create a memory
|
|
/// descriptor with these dimensions, with the dnnl::memory::data_type::f32
|
|
/// data type, and with the dnnl::memory::format_tag::nchw memory format.
|
|
/// Finally, we construct a memory object and pass the memory descriptor.
|
|
/// The library allocates memory internally.
|
|
/// @snippet gpu_opencl_interop.cpp memory alloc
|
|
// [memory alloc]
|
|
memory::dims tz_dims = {2, 3, 4, 5};
|
|
const size_t N = std::accumulate(tz_dims.begin(), tz_dims.end(), (size_t)1,
|
|
std::multiplies<size_t>());
|
|
|
|
memory::desc mem_d(
|
|
tz_dims, memory::data_type::f32, memory::format_tag::nchw);
|
|
|
|
memory mem(mem_d, eng);
|
|
// [memory alloc]
|
|
|
|
/// @subsection gpu_opencl_interop_cpp_sub3 Initialize the data by executing a custom OpenCL kernel
|
|
/// We are going to create an OpenCL kernel that will initialize our data.
|
|
/// It requires writing a bit of C code to create an OpenCL program from a
|
|
/// string literal source. The kernel initializes the data by the
|
|
/// 0, -1, 2, -3, ... sequence: `data[i] = (-1)^i * i`.
|
|
/// @snippet gpu_opencl_interop.cpp ocl kernel
|
|
// [ocl kernel]
|
|
const char *ocl_code
|
|
= "__kernel void init(__global float *data) {"
|
|
" int id = get_global_id(0);"
|
|
" data[id] = (id % 2) ? -id : id;"
|
|
"}";
|
|
// [ocl kernel]
|
|
|
|
/// Create/Build Opencl kernel by `create_init_opencl_kernel()` function.
|
|
/// Refer to the full code example for the `create_init_opencl_kernel()`
|
|
/// function.
|
|
/// @snippet gpu_opencl_interop.cpp oclkernel create
|
|
// [oclkernel create]
|
|
const char *kernel_name = "init";
|
|
cl_kernel ocl_init_kernel = create_init_opencl_kernel(
|
|
ocl_interop::get_context(eng), kernel_name, ocl_code);
|
|
// [oclkernel create]
|
|
|
|
/// The next step is to execute our OpenCL kernel by setting its arguments
|
|
/// and enqueueing to an OpenCL queue. You can extract the underlying OpenCL
|
|
/// buffer from the memory object using the interoperability interface:
|
|
/// dnnl::memory::get_ocl_mem_object() . For simplicity we can just construct a
|
|
/// stream, extract the underlying OpenCL queue, and enqueue the kernel to
|
|
/// this queue.
|
|
/// @snippet gpu_opencl_interop.cpp oclexecution
|
|
// [oclexecution]
|
|
cl_mem ocl_buf = ocl_interop::get_mem_object(mem);
|
|
OCL_CHECK(clSetKernelArg(ocl_init_kernel, 0, sizeof(ocl_buf), &ocl_buf));
|
|
|
|
cl_command_queue ocl_queue = ocl_interop::get_command_queue(strm);
|
|
OCL_CHECK(clEnqueueNDRangeKernel(ocl_queue, ocl_init_kernel, 1, nullptr, &N,
|
|
nullptr, 0, nullptr, nullptr));
|
|
// [oclexecution]
|
|
|
|
/// @subsection gpu_opencl_interop_cpp_sub4 Create and execute a primitive
|
|
/// There are two steps to create an operation primitive in oneDNN:
|
|
/// 1. Create a primitive descriptor.
|
|
/// 2. Create a primitive.
|
|
///
|
|
/// Let's create the primitive to perform the ReLU (rectified linear unit)
|
|
/// operation: x = max(0, x). Primitive descriptors are attached to a specific
|
|
/// engine and represent some implementation for this engine.
|
|
/// A primitive object is a realization of a primitive descriptor,
|
|
/// and its construction is usually much "heavier".
|
|
/// @snippet gpu_opencl_interop.cpp relu creation
|
|
// [relu creation]
|
|
auto relu_pd = eltwise_forward::primitive_desc(eng, prop_kind::forward,
|
|
algorithm::eltwise_relu, mem_d, mem_d, 0.0f);
|
|
auto relu = eltwise_forward(relu_pd);
|
|
// [relu creation]
|
|
|
|
/// Next, execute the primitive.
|
|
/// @snippet gpu_opencl_interop.cpp relu exec
|
|
// [relu exec]
|
|
relu.execute(strm, {{DNNL_ARG_SRC, mem}, {DNNL_ARG_DST, mem}});
|
|
strm.wait();
|
|
// [relu exec]
|
|
///
|
|
///@note
|
|
/// Our primitive mem serves as both input and output parameter.
|
|
///
|
|
///
|
|
///@note
|
|
/// Primitive submission on GPU is asynchronous; However, the user can
|
|
/// call dnnl:stream::wait() to synchronize the stream and ensure that all
|
|
/// previously submitted primitives are completed.
|
|
///
|
|
|
|
/// @page gpu_opencl_interop_cpp
|
|
/// @subsection gpu_opencl_interop_cpp_sub5 Validate the results
|
|
///
|
|
/// Before running validation codes, we need to copy the OpenCL memory to
|
|
/// the host. This can be done using OpenCL API. For convenience, we use a
|
|
/// utility function read_from_dnnl_memory() implementing required OpenCL API
|
|
/// calls. After we read the data to the host, we can run validation codes
|
|
/// on the host accordingly.
|
|
/// @snippet gpu_opencl_interop.cpp Check the results
|
|
// [Check the results]
|
|
std::vector<float> mem_data(N);
|
|
read_from_dnnl_memory(mem_data.data(), mem);
|
|
for (size_t i = 0; i < N; i++) {
|
|
float expected = (i % 2) ? 0.0f : (float)i;
|
|
if (mem_data[i] != expected) {
|
|
std::cout << "Expect " << expected << " but got " << mem_data[i]
|
|
<< "." << std::endl;
|
|
throw std::logic_error("Accuracy check failed.");
|
|
}
|
|
}
|
|
// [Check the results]
|
|
|
|
OCL_CHECK(clReleaseKernel(ocl_init_kernel));
|
|
}
|
|
|
|
int main(int argc, char **argv) {
|
|
return handle_example_errors(
|
|
{engine::kind::gpu}, gpu_opencl_interop_tutorial);
|
|
}
|
|
|
|
/// @page gpu_opencl_interop_cpp Getting started on GPU with OpenCL extensions API
|
|
///
|
|
/// <b></b>
|
|
///
|
|
/// Upon compiling and running the example, the output should be just:
|
|
///
|
|
/// ~~~
|
|
/// Example passed.
|
|
/// ~~~
|
|
///
|