// oneDNN/examples/graph/cpu_single_op_partition.cpp

/*******************************************************************************
* Copyright 2024-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @example cpu_single_op_partition.cpp
/// > Annotated version: @ref graph_cpu_single_op_partition_cpp

/// @page graph_cpu_single_op_partition_cpp_brief
/// @brief This example demonstrates how to create a single-op partition
/// directly, without building a graph, and run it on CPU.

/// @page graph_cpu_single_op_partition_cpp Single op partition on CPU
/// \copybrief graph_cpu_single_op_partition_cpp_brief
///
/// > Example code: @ref cpu_single_op_partition.cpp
///
/// Some key take-aways included in this example:
///
/// * how to build a single-op partition quickly
/// * how to create an engine, allocator and stream
/// * how to compile a partition
/// * how to execute a compiled partition
///
/// Some assumptions in this example:
///
/// * Only the workflow is demonstrated; the correctness of the results is not checked
/// * Unsupported partitions should be handled by users themselves
///

/// @page graph_cpu_single_op_partition_cpp
/// @section graph_cpu_single_op_partition_cpp_headers Public headers
///
/// To start using oneDNN Graph, we must include the @ref dnnl_graph.hpp header
/// file in the application. All the C++ APIs reside in namespace `dnnl::graph`.
///
/// @page graph_cpu_single_op_partition_cpp
/// @snippet cpu_single_op_partition.cpp Headers and namespace

//[Headers and namespace]
#include <iostream>
#include <memory>
#include <vector>
#include <unordered_map>
#include <unordered_set>

#include <assert.h>

#include "oneapi/dnnl/dnnl_graph.hpp"

#include "example_utils.hpp"
#include "graph_example_utils.hpp"

using namespace dnnl::graph;
using data_type = logical_tensor::data_type;
using layout_type = logical_tensor::layout_type;
using dim = logical_tensor::dim;
using dims = logical_tensor::dims;
//[Headers and namespace]

/// @page graph_cpu_single_op_partition_cpp
/// @section graph_cpu_single_op_partition_cpp_tutorial cpu_single_op_partition_tutorial() function
///
void cpu_single_op_partition_tutorial() {

    dim M = 32, K = 1024, N = 2048;

    dims src0_dims {M, K};
    dims src1_dims {K, N};

    /// @page graph_cpu_single_op_partition_cpp
    /// @subsection graph_cpu_single_op_partition_cpp_get_partition Build Graph and Get Partitions
    ///
    /// In this section, we are trying to create a partition containing the
    /// single op `matmul` without building a graph and getting partition.
    ///

    /// Create the `Matmul` op (#dnnl::graph::op) and attaches attributes
    /// to it, including `transpose_a` and `transpose_b`.
    /// @snippet cpu_single_op_partition.cpp Create matmul
    //[Create matmul]
    logical_tensor matmul_src0_desc {0, data_type::f32};
    logical_tensor matmul_src1_desc {1, data_type::f32};
    logical_tensor matmul_dst_desc {2, data_type::f32};
    op matmul(0, op::kind::MatMul, {matmul_src0_desc, matmul_src1_desc},
            {matmul_dst_desc}, "matmul");
    matmul.set_attr<bool>(op::attr::transpose_a, false);
    matmul.set_attr<bool>(op::attr::transpose_b, false);
    //[Create matmul]

    /// @page graph_cpu_single_op_partition_cpp
    /// @subsection graph_cpu_single_op_partition_cpp_compile Compile and Execute Partition
    ///
    /// In the real case, users like framework should provide device information
    /// at this stage. But in this example, we just use a self-defined device to
    /// simulate the real behavior.
    ///
    /// Create a #dnnl::engine. Also, set a user-defined
    /// #dnnl::graph::allocator to this engine.
    ///
    /// @snippet cpu_single_op_partition.cpp Create engine
    //[Create engine]
    allocator alloc {};
    dnnl::engine eng
            = make_engine_with_allocator(dnnl::engine::kind::cpu, 0, alloc);
    //[Create engine]

    /// Create a #dnnl::stream on a given engine
    ///
    /// @snippet cpu_single_op_partition.cpp Create stream
    //[Create stream]
    dnnl::stream strm {eng};
    //[Create stream]

    // Memory buffers bound to the partition input/output tensors
    // that helps manage the lifetime of these tensors
    std::vector<std::shared_ptr<void>> data_buffer;

    // Mapping from logical tensor id to the concrete shapes.
    // In practical usage, concrete shapes and layouts are not given
    // until compilation stage, hence need this mapping to mock the step.
    std::unordered_map<size_t, dims> concrete_shapes {
            {0, src0_dims}, {1, src1_dims}};

    // Compile and execute the partitions, including the following steps:
    //
    // 1. Update the input/output logical tensors with concrete shape and layout
    // 2. Compile the partition
    // 3. Update the output logical tensors with queried ones after compilation
    // 4. Allocate memory and bind the data buffer for the partition
    // 5. Execute the partition
    //
    // Although they are not part of the APIs, these steps are essential for
    // the integration of Graph API., hence users need to implement similar
    // logic.

    /// Skip building graph and getting partition, and directly create
    /// the single-op partition
    ///
    /// @snippet cpu_single_op_partition.cpp Create partition
    //[Create partition]
    partition part(matmul, dnnl::engine::kind::cpu);
    //[Create partition]
    if (!part.is_supported()) {
        std::cout << "cpu_single_op_partition: Got unsupported partition, "
                     "users need to handle the operators by themselves."
                  << std::endl;
        return;
    }

    std::vector<logical_tensor> inputs = part.get_input_ports();
    std::vector<logical_tensor> outputs = part.get_output_ports();

    // Update input logical tensors with concrete shape and layout
    for (auto &input : inputs) {
        const auto id = input.get_id();
        // Create logical tensor with strided layout
        input = logical_tensor {id, input.get_data_type(), concrete_shapes[id],
                layout_type::strided};
    }

    // Update output logical tensors with concrete shape and layout
    for (auto &output : outputs) {
        const auto id = output.get_id();
        output = logical_tensor {id, output.get_data_type(),
                DNNL_GRAPH_UNKNOWN_NDIMS,
                // do not require concrete shape as the shape will be inferred
                // based on input shapes during compilation
                layout_type::strided};
    }

    /// Compile the partition to generate compiled partition with the
    /// input and output logical tensors.
    ///
    /// @snippet cpu_single_op_partition.cpp Compile partition
    //[Compile partition]
    compiled_partition cp = part.compile(inputs, outputs, eng);
    //[Compile partition]

    // Update output logical tensors with queried one
    for (auto &output : outputs) {
        const auto id = output.get_id();
        output = cp.query_logical_tensor(id);
    }

    // Allocate memory for the partition, and bind the data buffers with
    // input and output logical tensors
    std::vector<tensor> inputs_ts, outputs_ts;
    allocate_graph_mem(inputs_ts, inputs, data_buffer, eng);
    allocate_graph_mem(outputs_ts, outputs, data_buffer, eng);

    /// Execute the compiled partition on the specified stream.
    ///
    /// @snippet cpu_single_op_partition.cpp Execute compiled partition
    //[Execute compiled partition]
    cp.execute(strm, inputs_ts, outputs_ts);
    //[Execute compiled partition]

    // Wait for all compiled partition's execution finished
    strm.wait();

    /// @page graph_cpu_single_op_partition_cpp
    ///
    std::cout << "Graph:" << std::endl
              << " [matmul_src0] [matmul_src1]" << std::endl
              << "       \\       /" << std::endl
              << "         matmul" << std::endl
              << "            |" << std::endl
              << "        [matmul_dst]" << std::endl
              << "Note:" << std::endl
              << " '[]' represents a logical tensor, which refers to "
                 "inputs/outputs of the graph. "
              << std::endl;
}

// Program entry point: hands the tutorial function and the CPU engine kind to
// handle_example_errors and uses the value it returns as the process exit
// code. Command-line arguments are not used.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    const int exit_code = handle_example_errors(
            {engine::kind::cpu}, cpu_single_op_partition_tutorial);
    return exit_code;
}