Files
oneDNN/examples/cross_engine_reorder.cpp

242 lines
9.4 KiB
C++

/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
/// @example cross_engine_reorder.cpp
/// > Annotated version: @ref cross_engine_reorder_cpp
/// @page cross_engine_reorder_cpp_brief
/// @brief This C++ API example demonstrates programming flow when reordering memory
/// between CPU and GPU engines.
/// @page cross_engine_reorder_cpp Reorder between CPU and GPU engines
/// \copybrief cross_engine_reorder_cpp_brief
///
/// > Example code: @ref cross_engine_reorder.cpp
///
/// @section cross_engine_reorder_cpp_headers Public headers
///
/// To start using oneDNN, we must first include the @ref dnnl.hpp
/// header file in the application. We also include @ref dnnl_debug.h, which
/// contains some debugging facilities such as returning a string representation
/// for common oneDNN C types.
///
/// All C++ API types and functions reside in the `dnnl` namespace.
/// For simplicity of the example we import this namespace.
/// @page cross_engine_reorder_cpp
#include <iostream>
#include <stdexcept>
#include <vector>
/// @snippet cross_engine_reorder.cpp Prologue
// [Prologue]
#include "example_utils.hpp"
#include "oneapi/dnnl/dnnl.hpp"
#include "example_utils.hpp"
using namespace dnnl;
using namespace std;
// [Prologue]
void fill(memory &mem, const memory::dims &adims) {
std::vector<float> array(product(adims));
for (size_t e = 0; e < array.size(); ++e) {
array[e] = e % 7 ? 1.0f : -1.0f;
}
write_to_dnnl_memory(array.data(), mem);
}
int find_negative(memory &mem, const memory::dims &adims) {
int negs = 0;
size_t nelems = product(adims);
std::vector<float> array(nelems);
read_from_dnnl_memory(array.data(), mem);
for (size_t e = 0; e < nelems; ++e)
negs += array[e] < 0.0f;
return negs;
}
/// @page cross_engine_reorder_cpp
/// @section cross_engine_reorder_cpp_tutorial cross_engine_reorder_tutorial() function
///
void cross_engine_reorder_tutorial() {
/// @page cross_engine_reorder_cpp
/// @subsection cross_engine_reorder_cpp_sub1 Engine and stream
///
/// All oneDNN primitives and memory objects are attached to a
/// particular @ref dnnl::engine, which is an abstraction of a
/// computational device (see also @ref dev_guide_basic_concepts). The
/// primitives are created and optimized for the device they are attached
/// to, and the memory objects refer to memory residing on the
/// corresponding device. In particular, that means neither memory objects
/// nor primitives that were created for one engine can be used on
/// another.
///
/// To create engines, we must specify the @ref dnnl::engine::kind
/// and the index of the device of the given kind. There is only one CPU
/// engine and one GPU engine, so the index for both engines must be 0.
///
/// @snippet cross_engine_reorder.cpp Initialize engine
// [Initialize engine]
auto cpu_engine = engine(validate_engine_kind(engine::kind::cpu), 0);
auto gpu_engine = engine(validate_engine_kind(engine::kind::gpu), 0);
// [Initialize engine]
/// In addition to an engine, all primitives require a @ref dnnl::stream
/// for the execution. The stream encapsulates an execution context and is
/// tied to a particular engine.
///
/// In this example, a GPU stream is created.
///
/// @snippet cross_engine_reorder.cpp Initialize stream
// [Initialize stream]
auto stream_gpu = stream(gpu_engine, stream::flags::in_order);
// [Initialize stream]
/// @subsection cross_engine_reorder_cpp_sub2 Wrapping data into oneDNN GPU memory object
/// Fill the data in CPU memory first, and then move data from CPU to GPU
/// memory by reorder.
/// @snippet cross_engine_reorder.cpp reorder cpu2gpu
// [reorder cpu2gpu]
const auto tz = memory::dims {2, 16, 1, 1};
auto m_cpu
= memory({{tz}, memory::data_type::f32, memory::format_tag::nchw},
cpu_engine);
auto m_gpu
= memory({{tz}, memory::data_type::f32, memory::format_tag::nchw},
gpu_engine);
fill(m_cpu, tz);
auto r1 = reorder(m_cpu, m_gpu);
// [reorder cpu2gpu]
/// @subsection cross_engine_reorder_cpp_sub3 Creating a ReLU primitive
///
/// Let's now create a ReLU primitive for GPU.
///
/// The library implements the ReLU primitive as a particular algorithm of a
/// more general @ref dev_guide_eltwise primitive, which applies a specified
/// function to each element of the source tensor.
///
/// Just as in the case of @ref dnnl::memory, a user should always go
/// through (at least) three creation steps (which, however, can sometimes
/// be combined thanks to C++11):
/// 1. Create an operation primitive descriptor (here @ref
/// dnnl::eltwise_forward::primitive_desc) that defines the operation
/// parameters including a GPU memory descriptor, and GPU engine.
/// Primitive descriptor is a **lightweight** descriptor of the actual
/// algorithm that **implements** the given operation.
/// 2. Create a primitive (here @ref dnnl::eltwise_forward) that can be
/// executed on GPU memory objects to compute the operation by a GPU
/// engine.
///
///@note
/// Primitive creation might be a very expensive operation, so consider
/// creating primitive objects once and executing them multiple times.
///
/// The code:
/// @snippet cross_engine_reorder.cpp Create a ReLU primitive
// [Create a ReLU primitive]
// ReLU primitive descriptor, which corresponds to a particular
// implementation in the library. Specify engine type for the ReLU
// primitive. Use a GPU engine here.
auto relu_pd = eltwise_forward::primitive_desc(gpu_engine,
prop_kind::forward, algorithm::eltwise_relu, m_gpu.get_desc(),
m_gpu.get_desc(), 0.0f);
// ReLU primitive
auto relu = eltwise_forward(relu_pd);
// [Create a ReLU primitive]
/// @subsection cross_engine_reorder_cpp_sub4 Getting results from a oneDNN GPU memory object
/// After the ReLU operation, users need to get data from GPU to CPU memory
/// by reorder.
/// @snippet cross_engine_reorder.cpp reorder gpu2cpu
// [reorder gpu2cpu]
auto r2 = reorder(m_gpu, m_cpu);
// [reorder gpu2cpu]
/// @subsection cross_engine_reorder_cpp_sub5 Executing all primitives
///
/// Finally, let's execute all primitives and wait for their completion
/// via the following sequence:
///
/// Reorder(CPU,GPU) -> ReLU -> Reorder(GPU,CPU).
///
/// 1. After execution of the first Reorder, ReLU has source data in GPU.
///
/// 2. The input and output memory objects are passed to the ReLU
/// `execute()` method using a <tag, memory> map. Each tag specifies what
/// kind of tensor each memory object represents. All @ref dev_guide_eltwise
/// primitives require the map to have two elements: a source memory
/// object (input) and a destination memory (output). For executing
/// on GPU engine, both source and destination memory object must use
/// GPU memory.
///
/// 3. After the execution of the ReLU on GPU, the second Reorder moves
/// the results from GPU to CPU.
///
/// @note
/// All primitives are executed in the SAME GPU stream (the first
/// parameter of the `execute()` method).
///
/// Execution is asynchronous on GPU. This means that we need to call @ref
/// dnnl::stream::wait before accessing the results.
///
/// @snippet cross_engine_reorder.cpp Execute primitives
// [Execute primitives]
// wrap source data from CPU to GPU
r1.execute(stream_gpu, m_cpu, m_gpu);
// Execute ReLU on a GPU stream
relu.execute(stream_gpu, {{DNNL_ARG_SRC, m_gpu}, {DNNL_ARG_DST, m_gpu}});
// Get result data from GPU to CPU
r2.execute(stream_gpu, m_gpu, m_cpu);
stream_gpu.wait();
// [Execute primitives]
/// @page cross_engine_reorder_cpp
/// @subsection cross_engine_reorder_cpp_sub6 Validate the result
///
/// Now that we have the computed the result on CPU memory, let's validate
/// that it is actually correct.
///
/// @snippet cross_engine_reorder.cpp Check the results
// [Check the results]
if (find_negative(m_cpu, tz) != 0)
throw std::logic_error(
"Unexpected output, find a negative value after the ReLU "
"execution.");
// [Check the results]
}
int main(int argc, char **argv) {
return handle_example_errors({engine::kind::cpu, engine::kind::gpu},
cross_engine_reorder_tutorial);
}
/// @page cross_engine_reorder_cpp
///
/// <b></b>
///
/// Upon compiling and running the example, the output should be just:
///
/// ~~~
/// Example passed.
/// ~~~
///