oneDNN/include/oneapi/dnnl/dnnl_graph.hpp

/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @file
/// Graph C++ API

#ifndef ONEAPI_DNNL_DNNL_GRAPH_HPP
#define ONEAPI_DNNL_DNNL_GRAPH_HPP
// NOLINTBEGIN(readability-identifier-naming)

#include "oneapi/dnnl/dnnl_common.hpp"
#include "oneapi/dnnl/dnnl_graph.h"

#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>

/// @addtogroup dnnl_api
/// @{

namespace dnnl {

/// @addtogroup dnnl_graph_api Graph API
/// oneDNN Graph API
/// @{

/// oneDNN Graph namespace
namespace graph {

/// @cond DO_NOT_DOCUMENT_THIS

// Alias for common engine and stream API.
using engine = dnnl::engine;
using stream = dnnl::stream;
using fpmath_mode = dnnl::fpmath_mode;

/// @endcond

/// @addtogroup dnnl_graph_api_utils Utilities
/// Utility types and definitions
/// \ingroup dnnl_graph_api
/// @{

/// @cond DO_NOT_DOCUMENT_THIS

/// Traits class that supplies the destructor for a oneDNN graph C API handle
/// of type @p T. The primary template adds nothing of its own and inherits
/// everything from dnnl::handle_traits<T>.
///
/// @tparam T C API handle type.
template <typename T>
struct graph_handle_traits : dnnl::handle_traits<T> {};

/// Destructor traits for graph op handles.
template <>
struct graph_handle_traits<dnnl_graph_op_t> {
    /// Forwards @p c_op to dnnl_graph_op_destroy().
    ///
    /// @param c_op C API op handle.
    /// @returns The status reported by dnnl_graph_op_destroy().
    static dnnl_status_t destructor(dnnl_graph_op_t c_op) {
        return dnnl_graph_op_destroy(c_op);
    }
};

/// Destructor traits for graph handles.
template <>
struct graph_handle_traits<dnnl_graph_graph_t> {
    /// Forwards @p c_graph to dnnl_graph_graph_destroy().
    ///
    /// @param c_graph C API graph handle.
    /// @returns The status reported by dnnl_graph_graph_destroy().
    static dnnl_status_t destructor(dnnl_graph_graph_t c_graph) {
        return dnnl_graph_graph_destroy(c_graph);
    }
};

/// Destructor traits for graph tensor handles.
template <>
struct graph_handle_traits<dnnl_graph_tensor_t> {
    /// Forwards @p c_tensor to dnnl_graph_tensor_destroy().
    ///
    /// @param c_tensor C API tensor handle.
    /// @returns The status reported by dnnl_graph_tensor_destroy().
    static dnnl_status_t destructor(dnnl_graph_tensor_t c_tensor) {
        return dnnl_graph_tensor_destroy(c_tensor);
    }
};

/// Destructor traits for graph partition handles.
template <>
struct graph_handle_traits<dnnl_graph_partition_t> {
    /// Forwards @p c_partition to dnnl_graph_partition_destroy().
    ///
    /// @param c_partition C API partition handle.
    /// @returns The status reported by dnnl_graph_partition_destroy().
    static dnnl_status_t destructor(dnnl_graph_partition_t c_partition) {
        return dnnl_graph_partition_destroy(c_partition);
    }
};

/// Destructor traits for graph compiled partition handles.
template <>
struct graph_handle_traits<dnnl_graph_compiled_partition_t> {
    /// Forwards @p c_compiled to dnnl_graph_compiled_partition_destroy().
    ///
    /// @param c_compiled C API compiled partition handle.
    /// @returns The status reported by
    ///     dnnl_graph_compiled_partition_destroy().
    static dnnl_status_t destructor(
            dnnl_graph_compiled_partition_t c_compiled) {
        return dnnl_graph_compiled_partition_destroy(c_compiled);
    }
};

/// Destructor traits for graph allocator handles.
template <>
struct graph_handle_traits<dnnl_graph_allocator_t> {
    /// Forwards @p c_allocator to dnnl_graph_allocator_destroy().
    ///
    /// @param c_allocator C API allocator handle.
    /// @returns The status reported by dnnl_graph_allocator_destroy().
    static dnnl_status_t destructor(dnnl_graph_allocator_t c_allocator) {
        return dnnl_graph_allocator_destroy(c_allocator);
    }
};

// C++ handle aliases, one per graph C API handle type. Each alias pairs
// dnnl::handle with the graph_handle_traits specialization for the same
// C handle type.
using allocator_handle = dnnl::handle<dnnl_graph_allocator_t,
        graph_handle_traits<dnnl_graph_allocator_t>>;
using graph_handle = dnnl::handle<dnnl_graph_graph_t,
        graph_handle_traits<dnnl_graph_graph_t>>;
using op_handle
        = dnnl::handle<dnnl_graph_op_t, graph_handle_traits<dnnl_graph_op_t>>;
using tensor_handle = dnnl::handle<dnnl_graph_tensor_t,
        graph_handle_traits<dnnl_graph_tensor_t>>;
using compiled_partition_handle
        = dnnl::handle<dnnl_graph_compiled_partition_t,
                graph_handle_traits<dnnl_graph_compiled_partition_t>>;
using partition_handle = dnnl::handle<dnnl_graph_partition_t,
        graph_handle_traits<dnnl_graph_partition_t>>;

/// SFINAE helper alias: names the type `bool` when @p Condition is true and
/// names no type at all (substitution failure) when it is false.
///
/// @tparam Condition Compile-time predicate that enables the declaration.
template <bool Condition>
using req = typename std::enable_if<Condition, bool>::type;

/// @endcond

/// @} dnnl_graph_api_utils

/// @addtogroup dnnl_graph_api_status Status
/// Definitions of status values returned by the library functions.
/// \ingroup dnnl_graph_api
/// @{

/// Status values returned by the library functions. Every enumerator is
/// initialized from the same-named `dnnl_*` constant of the C API, so the two
/// sets of values can be converted with a plain cast.
enum class status {
    /// The operation was successful.
    success = dnnl_success,
    /// The operation failed due to an out-of-memory condition.
    out_of_memory = dnnl_out_of_memory,
    /// The operation failed because of incorrect function arguments.
    invalid_arguments = dnnl_invalid_arguments,
    /// The operation failed because the requested functionality is not
    /// implemented.
    unimplemented = dnnl_unimplemented,
    /// The last available implementation is reached.
    last_impl_reached = dnnl_last_impl_reached,
    /// Primitive or engine failed on execution.
    runtime_error = dnnl_runtime_error,
    /// Queried element is not required for the given primitive.
    not_required = dnnl_not_required,
    /// The graph is not legitimate.
    invalid_graph = dnnl_invalid_graph,
    /// The operation is not legitimate according to the op schema.
    invalid_graph_op = dnnl_invalid_graph_op,
    /// The shape cannot be inferred or compiled.
    invalid_shape = dnnl_invalid_shape,
    /// The data type cannot be inferred or compiled.
    invalid_data_type = dnnl_invalid_data_type,
};

/// @} dnnl_graph_api_status

/// @addtogroup dnnl_graph_api_allocator Allocator
///
/// Definitions of the allocator that is used to acquire memory resources
/// during partition compilation and execution. A SYCL allocator
/// (#dnnl::graph::sycl_interop::make_allocator) should be used for the SYCL
/// runtime, and a host allocator should be used for non-SYCL runtimes.
///
/// @{

/// Allocator
class allocator : public allocator_handle {
public:
    using allocator_handle::handle;

    /// Constructs an allocator according to given function pointers
    ///
    /// @param host_malloc A pointer to malloc function for CPU
    /// @param host_free A pointer to free function for CPU
    allocator(dnnl_graph_host_allocate_f host_malloc,
            dnnl_graph_host_deallocate_f host_free) {
        dnnl_graph_allocator_t a = nullptr;
        error::wrap_c_api(
                dnnl_graph_allocator_create(&a, host_malloc, host_free),
                "could not create allocator for cpu");
        reset(a);
    }

    /// Default constructor
    allocator() {
        dnnl_graph_allocator_t a = nullptr;
        error::wrap_c_api(dnnl_graph_allocator_create(&a, nullptr, nullptr),
                "could not create allocator");
        reset(a);
    }
};

/// @} dnnl_graph_api_allocator

/// @addtogroup dnnl_graph_api_engine Engine
/// @{

/// Creates an engine together with a user-provided allocator. This API is a
/// supplement for existing onednn engine API.
///
/// @param kind Engine kind; converted to the C API engine kind.
/// @param index Index passed to the C API together with @p kind.
/// @param alloc Allocator whose C handle is handed to the C API.
/// @returns An engine object wrapping the handle produced by the C API.
inline engine make_engine_with_allocator(
        engine::kind kind, size_t index, const allocator &alloc) {
    // Start from a null handle, as the other wrappers in this file do, so the
    // out-parameter never holds an indeterminate value.
    dnnl_engine_t c_engine = nullptr;
    error::wrap_c_api(
            dnnl_graph_make_engine_with_allocator(&c_engine,
                    static_cast<dnnl_engine_kind_t>(kind), index, alloc.get()),
            "could not make an engine with allocator");
    return engine(c_engine);
}

/// @} dnnl_graph_api_engine

/// @addtogroup dnnl_graph_api_logical_tensor Logical Tensor
///
/// Logical tensor describes the meta-data of the input or output tensor, like
/// elements data type, number of dimensions, size for each dimension (shape),
/// layout, and the property of the tensor.
///
/// Each logical tensor has a unique ID. The library uses logical tensor IDs to
/// build up the connections between operations if the output of one operation
/// has the same ID as the input of another operation. The meta-data in a
/// logical tensor may be enriched in the framework graph as it progresses
/// toward final execution. For example, the library doesn't require detailed
/// shape information at the operation and graph creation stage. But shape
/// information of input logical tensor will be required at partition
/// compilation stage. Logical tensor is not mutable. Users must create a new
/// logical tensor with the same ID to pass any new additional information to
/// oneDNN Graph API. Please note that the library also has unique IDs for
/// operations. The ID should be unique among different logical tensors, but it
/// can have the same value between a logical tensor and an operation.
///
/// @{

/// Logical tensor object
class logical_tensor {
    friend class op;
    friend class tensor;
    friend class partition;
    friend class compiled_partition;

    // Underlying C API descriptor; every accessor below reads from it.
    dnnl_graph_logical_tensor_t data;

public:
    /// Integer type for representing dimension sizes and indices.
    using dim = dnnl_dim_t;
    /// Vector of dimensions. Implementations are free to force a limit on the
    /// vector's length.
    using dims = std::vector<dim>;

    /// Data Type
    enum class data_type {
        /// Undefined data type.
        undef = dnnl_data_type_undef,
        /// 16-bit/half-precision floating point.
        f16 = dnnl_f16,
        /// non-standard 16-bit (bfloat16 w/ 7 bit mantissa) floating point.
        bf16 = dnnl_bf16,
        /// 32-bit/single-precision floating point.
        f32 = dnnl_f32,
        /// 32-bit signed integer.
        s32 = dnnl_s32,
        /// 8-bit signed integer.
        s8 = dnnl_s8,
        /// 8-bit unsigned integer.
        u8 = dnnl_u8,
        /// Boolean data type. Size is C++ implementation defined.
        boolean = dnnl_boolean,
        /// [OFP8 standard 8-bit
        /// floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf)
        /// with a 5-bit exponent and a 2-bit mantissa.
        f8_e5m2 = dnnl_f8_e5m2,
        /// [OFP8 standard 8-bit
        /// floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf)
        /// with a 4-bit exponent and a 3-bit mantissa.
        f8_e4m3 = dnnl_f8_e4m3,
        /// 4-bit signed integer.
        s4 = dnnl_s4,
        /// 4-bit unsigned integer.
        u4 = dnnl_u4,
    };

    /// Layout type
    enum class layout_type {
        /// Undefined layout type.
        undef = dnnl_graph_layout_type_undef,
        /// Any means to let the library to decide the layout for a tensor
        /// during partition compilation.
        any = dnnl_graph_layout_type_any,
        /// Strided means that the layout of a tensor is determined by the
        /// strides field in the logical tensor.
        strided = dnnl_graph_layout_type_strided,
        /// Opaque means that the layout of a tensor is the library specific.
        /// Usually, an opaque layout is generated by a partition which is
        /// compiled with layout type any.
        opaque = dnnl_graph_layout_type_opaque,
    };

    /// Tensor property
    enum class property_type {
        /// Undefined tensor property.
        undef = dnnl_graph_tensor_property_undef,
        /// Variable means the tensor may be changed during computation or
        /// between different iterations.
        variable = dnnl_graph_tensor_property_variable,
        /// Constant means the tensor will keep unchanged during computation and
        /// between different iterations. It's useful for the library to apply
        /// optimizations for constant tensors or cache constant tensors inside
        /// the library. For example, constant weight tensors in inference
        /// scenarios.
        constant = dnnl_graph_tensor_property_constant,
        /// Host scalar means the tensor will be a 0-D scalar tensor on host.
        /// It should be used with a CPU engine when creating the tensor.
        host_scalar = dnnl_graph_tensor_property_host_scalar,
    };

    /// Default constructor. Constructs an empty object.
    ///
    /// NOTE(review): the wrapped C descriptor has no member initializer, so a
    /// default-initialized object presumably holds indeterminate values until
    /// it is assigned from a properly constructed one -- confirm.
    logical_tensor() = default;

    /// Constructs a logical tensor object by copying a C API descriptor.
    ///
    /// @param c_data C API logical tensor descriptor to copy.
    explicit logical_tensor(const dnnl_graph_logical_tensor_t &c_data)
        : data(c_data) {}

    /// Copy constructor. Copies the wrapped descriptor.
    logical_tensor(const logical_tensor &other) = default;

    /// Copy assignment. Copies the wrapped descriptor.
    logical_tensor &operator=(const logical_tensor &other) = default;

    /// Constructs a logical tensor object with ID, data type, ndims, layout
    /// type, and property type.
    ///
    /// @param tid Logical tensor ID.
    /// @param dtype Elements data type.
    /// @param ndims Number of dimensions. -1 means unknown (see
    ///     #DNNL_GRAPH_UNKNOWN_NDIMS) and 0 means a scalar tensor.
    /// @param ltype Layout type.
    /// @param ptype Property type.
    logical_tensor(size_t tid, data_type dtype, int32_t ndims,
            layout_type ltype, property_type ptype = property_type::undef) {
        dnnl_graph_logical_tensor_t val;
        error::wrap_c_api(
                dnnl_graph_logical_tensor_init(&val, tid, convert_to_c(dtype),
                        ndims, convert_to_c(ltype), convert_to_c(ptype)),
                "could not create logical_tensor with property");
        data = val;
    }

    /// Delegated constructor. Creates a logical tensor with an unknown number
    /// of dimensions (#DNNL_GRAPH_UNKNOWN_NDIMS).
    ///
    /// @param tid Logical tensor ID.
    /// @param dtype Elements data type.
    /// @param ltype Layout type.
    logical_tensor(
            size_t tid, data_type dtype, layout_type ltype = layout_type::undef)
        : logical_tensor(tid, dtype, DNNL_GRAPH_UNKNOWN_NDIMS, ltype) {}

    /// Constructs a logical tensor object with basic information and detailed
    /// dims.
    ///
    /// @param tid Logical tensor ID.
    /// @param dtype Elements data type.
    /// @param adims Logical tensor dimensions. #DNNL_GRAPH_UNKNOWN_DIM means
    ///     the size of that dimension is unknown. 0 is used to define
    ///     zero-dimension tensor.
    /// @param ltype Layout type. If it's strided, the strides field in the
    ///     output logical tensor will be deduced accordingly.
    /// @param ptype Property type.
    logical_tensor(size_t tid, data_type dtype, const dims &adims,
            layout_type ltype, property_type ptype = property_type::undef) {
        dnnl_graph_logical_tensor_t val;
        // An empty dims vector describes a scalar, so it is initialized
        // through the ndims-only entry point with ndims == 0.
        if (adims.empty())
            error::wrap_c_api(dnnl_graph_logical_tensor_init(&val, tid,
                                      convert_to_c(dtype), 0,
                                      convert_to_c(ltype), convert_to_c(ptype)),
                    "could not create logical_tensor with property");
        else
            error::wrap_c_api(
                    dnnl_graph_logical_tensor_init_with_dims(&val, tid,
                            convert_to_c(dtype),
                            static_cast<int32_t>(adims.size()), adims.data(),
                            convert_to_c(ltype), convert_to_c(ptype)),
                    "could not create logical_tensor with dims and property");
        data = val;
    }

    /// Constructs a logical tensor object with detailed dims and strides. The
    /// layout_type of the output logical tensor object will always be strided.
    ///
    /// An exception is raised if @p adims and @p strides do not have the same
    /// number of elements.
    ///
    /// @param tid Logical tensor ID.
    /// @param dtype Elements data type.
    /// @param adims Logical tensor dimensions. #DNNL_GRAPH_UNKNOWN_DIM means
    ///     the size of that dimension is unknown. 0 is used to define
    ///     zero-dimension tensor.
    /// @param strides Logical tensor strides. #DNNL_GRAPH_UNKNOWN_DIM means
    ///     the stride of the dimension is unknown. The library currently
    ///     doesn't support other negative stride values. Must have the same
    ///     number of elements as @p adims.
    /// @param ptype Property type.
    logical_tensor(size_t tid, data_type dtype, const dims &adims,
            const dims &strides, property_type ptype = property_type::undef) {
        // Only one element count (adims.size()) is handed to the C API for
        // both arrays, so a strides vector of a different length must be
        // rejected here to avoid reading past its end.
        if (adims.size() != strides.size()) {
            error::wrap_c_api(dnnl_invalid_arguments,
                    "could not create logical_tensor with strides and "
                    "property: dims and strides have different sizes");
        }

        dnnl_graph_logical_tensor_t val;
        error::wrap_c_api(
                dnnl_graph_logical_tensor_init_with_strides(&val, tid,
                        convert_to_c(dtype), static_cast<int32_t>(adims.size()),
                        adims.data(), strides.data(), convert_to_c(ptype)),
                "could not create logical_tensor with strides and property");
        data = val;
    }

    /// Constructs a logical tensor object with detailed dims and an opaque
    /// layout ID. layout_type of the output logical tensor object will always
    /// be opaque.
    ///
    /// @param tid Logical tensor ID.
    /// @param dtype Elements data type.
    /// @param adims Logical tensor dimensions. #DNNL_GRAPH_UNKNOWN_DIM means
    ///     the size of that dimension is unknown. 0 is used to define
    ///     zero-dimension tensor.
    /// @param lid Opaque layout id.
    /// @param ptype Property type
    logical_tensor(size_t tid, data_type dtype, const dims &adims, size_t lid,
            property_type ptype = property_type::undef) {
        dnnl_graph_logical_tensor_t val;

        // Same scalar/non-scalar split as the layout-type constructor, with
        // the layout type fixed to opaque.
        if (adims.empty()) {
            error::wrap_c_api(dnnl_graph_logical_tensor_init(&val, tid,
                                      convert_to_c(dtype), 0,
                                      convert_to_c(layout_type::opaque),
                                      convert_to_c(ptype)),
                    "could not create logical_tensor");
        } else {
            error::wrap_c_api(
                    dnnl_graph_logical_tensor_init_with_dims(&val, tid,
                            convert_to_c(dtype),
                            static_cast<int32_t>(adims.size()), adims.data(),
                            convert_to_c(layout_type::opaque),
                            convert_to_c(ptype)),
                    "could not create logical_tensor with dims");
        }

        // The init calls above take no layout ID, so it is stored directly.
        val.layout.layout_id = lid;
        data = val;
    }

    /// Returns dimensions of a logical tensor. An exception is raised if the
    /// number of dimensions is negative (unknown).
    ///
    /// @returns A vector describing the size of each dimension.
    dims get_dims() const {
        if (data.ndims < 0) {
            error::wrap_c_api(dnnl_invalid_arguments,
                    "cannot return dims when ndims < 0");
        }

        return {data.dims, data.dims + data.ndims};
    }

    /// Returns the unique id of a logical tensor.
    ///
    /// @returns An integer value describing the ID.
    size_t get_id() const { return data.id; }

    /// Returns the data type of a logical tensor.
    ///
    /// @returns The data type.
    data_type get_data_type() const {
        return static_cast<data_type>(data.data_type);
    }

    /// Returns the property type of a logical tensor.
    ///
    /// @returns The property type.
    property_type get_property_type() const {
        return static_cast<property_type>(data.property);
    }

    /// Returns the layout type of a logical tensor.
    ///
    /// @returns The layout type.
    layout_type get_layout_type() const {
        return static_cast<layout_type>(data.layout_type);
    }

    /// Returns the layout ID of a logical tensor. The API should be called on a
    /// logical tensor with opaque layout type. Otherwise, an exception will be
    /// raised.
    ///
    /// @returns Layout ID.
    size_t get_layout_id() const {
        if (get_layout_type() != layout_type::opaque) {
            error::wrap_c_api(
                    dnnl_invalid_arguments, "layout type should be opaque");
        }

        return data.layout.layout_id;
    }

    /// Returns the strides of a logical tensor. The API should be called on a
    /// logical tensor with strided layout type. Otherwise, an exception will be
    /// raised. An exception is also raised if the number of dimensions is
    /// negative (unknown).
    ///
    /// @returns A vector describing the stride size of each dimension.
    dims get_strides() const {
        if (get_layout_type() != layout_type::strided) {
            error::wrap_c_api(
                    dnnl_invalid_arguments, "layout type should be strided");
        }

        if (data.ndims < 0) {
            error::wrap_c_api(dnnl_invalid_arguments,
                    "cannot return strides when ndims < 0");
        }

        return {data.layout.strides, data.layout.strides + data.ndims};
    }

    /// Returns memory size in bytes required by this logical tensor.
    ///
    /// @returns The memory size in bytes.
    size_t get_mem_size() const {
        size_t size = 0;
        error::wrap_c_api(dnnl_graph_logical_tensor_get_mem_size(&data, &size),
                "could not get memory size from the logical_tensor");
        return size;
    }

    /// Compares if two logical tensors are equal. Users can decide accordingly
    /// if layout reordering is needed for two logical tensors. The method will
    /// return true for below two circumstances:
    ///
    /// 1. the two logical tensors are equal regarding each field in the struct,
    /// eg. id, ndims, dims, layout type, property, etc.
    /// 2. If all other fields are equal but the layout types in two logical
    /// tensors are different, the method will return true when the underlying
    /// memory layout is the same. For example, one logical tensor has strided
    /// layout type while the other one has opaque layout type, but underneath,
    /// both layouts are NHWC, the method will still return true for this case.
    ///
    /// @param lt The input logical tensor to be compared.
    /// @returns @c true if the two logical tensors are equal. @c false otherwise
    bool is_equal(const logical_tensor &lt) const {
        uint8_t equal = 0;
        error::wrap_c_api(
                dnnl_graph_logical_tensor_is_equal(&data, &lt.data, &equal),
                "could not compare between the two logical tensors");
        return equal != 0;
    }

private:
    // Converts the C++ data type enum to its C API counterpart.
    static dnnl_data_type_t convert_to_c(data_type dtype) {
        return static_cast<dnnl_data_type_t>(dtype);
    }

    // Converts the C++ layout type enum to its C API counterpart.
    static dnnl_graph_layout_type_t convert_to_c(layout_type ltype) {
        return static_cast<dnnl_graph_layout_type_t>(ltype);
    }

    // Converts the C++ property type enum to its C API counterpart.
    static dnnl_graph_tensor_property_t convert_to_c(property_type ptype) {
        return static_cast<dnnl_graph_tensor_property_t>(ptype);
    }
};

/// @} dnnl_graph_api_logical_tensor

/// @addtogroup dnnl_graph_api_tensor Tensor
///
/// Tensor is an abstraction for multi-dimensional input and output data needed
/// in the execution of a compiled partition. A tensor object encapsulates a
/// handle to a memory buffer allocated on a specific engine and a logical
/// tensor which describes the dimensions, elements data type, and memory
/// layout.
///
/// @{

/// A tensor object
class tensor : public tensor_handle {
public:
    using tensor_handle::handle;

    /// Default constructor. Constructs an empty object.
    tensor() = default;

    /// Constructs a tensor object according to a given logical tensor, an
    /// engine, and a memory handle.
    ///
    /// @param lt The given logical tensor
    /// @param aengine Engine to store the data on.
    /// @param handle Handle of memory buffer to use as an underlying storage.
    ///     - A pointer to the user-allocated buffer. In this case the library
    ///       doesn't own the buffer.
    ///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
    ///       allocate the buffer for the tensor. In this case the library
    ///       owns the buffer.
    ///     - DNNL_MEMORY_NONE to create tensor without an underlying buffer.
    tensor(const logical_tensor &lt, const engine &aengine, void *handle) {
        dnnl_graph_tensor_t t = nullptr;
        error::wrap_c_api(
                dnnl_graph_tensor_create(&t, &(lt.data), aengine.get(), handle),
                "could not create tensor object with the logical_tensor, "
                "engine, and handle");
        reset(t);
    }

    /// Constructs a tensor object.
    /// The underlying buffer for the memory will be allocated by the library.
    ///
    /// @param lt The given logical tensor
    /// @param aengine Engine to store the data on.
    tensor(const logical_tensor &lt, const engine &aengine)
        : tensor(lt, aengine, DNNL_MEMORY_ALLOCATE) {}

    /// Creates a tensor object for host-side scalar value. The data type contained
    /// in the logical tensor parameter will be used to interpret the scalar
    /// pointer. The property type in the logical tensor must be `host_scalar`.
    ///
    /// @param lt The logical tensor describing the host scalar
    /// @param scalar The pointer to scalar value
    /// @returns Created tensor object
    static tensor make_scalar_tensor(const logical_tensor &lt, void *scalar) {
        dnnl_graph_tensor_t t = nullptr;
        error::wrap_c_api(
                dnnl_graph_tensor_create_scalar(&t, &(lt.data), scalar),
                "could not create a scalar tensor object");

        return tensor(t);
    }

    /// Returns the underlying memory buffer.
    ///
    /// On the CPU engine, or when using USM, this is a pointer to the
    /// allocated memory.
    void *get_data_handle() const {
        void *handle = nullptr;
        error::wrap_c_api(dnnl_graph_tensor_get_data_handle(get(), &handle),
                "could not get data handle from the tensor");
        return handle;
    }

    /// Sets the underlying memory handle.
    ///
    /// @param handle Memory handle.
    void set_data_handle(void *handle) {
        error::wrap_c_api(dnnl_graph_tensor_set_data_handle(get(), handle),
                "setting data handle to the tensor failed");
    }

    /// Returns the associated engine.
    ///
    /// @returns An engine object
    engine get_engine() const {
        dnnl_engine_t c_engine = nullptr;
        error::wrap_c_api(dnnl_graph_tensor_get_engine(get(), &c_engine),
                "could not get an engine from a tensor object");
        return engine(c_engine, true);
    }

    /// Returns the logical tensor of a tensor object.
    ///
    /// @returns A logical_tensor object.
    logical_tensor get_logical_tensor() const {
        dnnl_graph_logical_tensor_t lt;
        error::wrap_c_api(dnnl_graph_tensor_get_logical_tensor(get(), &lt),
                "could not get logical tensor from a tensor object");
        return logical_tensor(lt);
    }
};

/// @} dnnl_graph_api_tensor

/// @addtogroup dnnl_graph_api_compiled_partition Compiled Partition
///
/// A compiled partition represents the generated kernels specialized for a
/// partition on a target hardware (engine) with input and output information
/// specified by the logical tensors.
///
/// @{

/// A compiled partition object. Wraps a C API compiled partition handle.
class compiled_partition : public compiled_partition_handle {
public:
    /// Default constructor. Constructs an empty object.
    compiled_partition() = default;

    /// Constructs a compiled partition object from a C API handle.
    ///
    /// @param compiled_partition C API compiled partition handle to wrap.
    compiled_partition(dnnl_graph_compiled_partition_t compiled_partition) {
        reset(compiled_partition, false);
    }

    /// Queries an input or output logical tensor according to tensor ID. If the
    /// tensor ID doesn't belong to any input or output of the compiled
    /// partition, an exception will be raised by the API.
    ///
    /// @param tid The unique id of required tensor.
    /// @returns The logical tensor.
    logical_tensor query_logical_tensor(size_t tid) const {
        dnnl_graph_logical_tensor_t lt;
        error::wrap_c_api(dnnl_graph_compiled_partition_query_logical_tensor(
                                  get(), tid, &lt),
                "query logical tensor from compiled_partition failed");
        return logical_tensor {lt};
    }

    /// Returns the hint of in-place pairs from a compiled partition. It
    /// indicates that an input and an output of the partition can share the
    /// same memory buffer for computation. In-place computation helps to reduce
    /// the memory footprint and improves cache locality. But since the library
    /// may not have a global view of user's application, it's possible that the
    /// input tensor is used at other places in user's computation graph. In
    /// this case, the user should take the in-place pair as a hint and pass a
    /// different memory buffer for output tensor to avoid overwriting the input
    /// memory buffer which will probably cause unexpected incorrect results.
    ///
    /// @returns A list of pairs of input and output IDs.
    std::vector<std::pair<size_t, size_t>> get_inplace_ports() const {
        size_t num = 0;
        // Initialized so the out-pointer never holds an indeterminate value.
        const dnnl_graph_inplace_pair_t *inplace_pairs = nullptr;

        error::wrap_c_api(dnnl_graph_compiled_partition_get_inplace_ports(
                                  get(), &num, &inplace_pairs),
                "could not get the in-place pairs from a compiled partition");
        if (num == 0) return {};

        // Copy each C pair into an (input ID, output ID) std::pair.
        std::vector<std::pair<size_t, size_t>> inplace_options;
        inplace_options.reserve(num);
        for (size_t i = 0; i < num; ++i) {
            const dnnl_graph_inplace_pair_t &inplace_pair = inplace_pairs[i];
            inplace_options.emplace_back(
                    inplace_pair.input_id, inplace_pair.output_id);
        }
        return inplace_options;
    }

    /// Execute a compiled partition.
    ///
    /// @param astream Stream object to run over.
    /// @param inputs A list of input tensors.
    /// @param outputs A list of output tensors.
    void execute(stream &astream, const std::vector<tensor> &inputs,
            const std::vector<tensor> &outputs) const {
        // The C API takes plain arrays of tensor handles, so collect the
        // handles of the C++ wrappers first.
        std::vector<const_dnnl_graph_tensor_t> c_inputs;
        c_inputs.reserve(inputs.size());
        for (const auto &in : inputs) {
            c_inputs.push_back(in.get());
        }
        std::vector<const_dnnl_graph_tensor_t> c_outputs;
        c_outputs.reserve(outputs.size());
        for (const auto &out : outputs) {
            c_outputs.push_back(out.get());
        }

        error::wrap_c_api(
                dnnl_graph_compiled_partition_execute(get(), astream.get(),
                        c_inputs.size(), c_inputs.data(), c_outputs.size(),
                        c_outputs.data()),
                "could not execute the compiled_partition");
    }
};

/// @} dnnl_graph_api_compiled_partition

/// @addtogroup dnnl_graph_api_op Op
///
/// OP is an abstraction of computation logic for deep neural network
/// operations. An op object encapsulates an operation kind which describes the
/// computation logic, a unique ID which differentiates operations with the
/// same kind, and logical tensors which describe the inputs and outputs of the
/// operation and its connections to other operations in the graph.
///
/// @{

/// An op object.
///
/// Wraps a C API op handle. An op is created from a unique ID and an operation
/// kind; input/output logical tensors and attributes can then be attached to
/// it through the member functions below.
class op : public op_handle {
public:
    /// Kinds of operations. Each enumerator takes the value of the
    /// corresponding C API op kind.
    enum class kind {
        Abs = dnnl_graph_op_abs,
        AbsBackward = dnnl_graph_op_abs_backward,
        Add = dnnl_graph_op_add,
        AvgPool = dnnl_graph_op_avg_pool,
        AvgPoolBackward = dnnl_graph_op_avg_pool_backward,
        BatchNormForwardTraining = dnnl_graph_op_batch_norm_forward_training,
        BatchNormInference = dnnl_graph_op_batch_norm_inference,
        BatchNormTrainingBackward = dnnl_graph_op_batch_norm_backward,
        BiasAdd = dnnl_graph_op_bias_add,
        BiasAddBackward = dnnl_graph_op_bias_add_backward,
        Clamp = dnnl_graph_op_clamp,
        ClampBackward = dnnl_graph_op_clamp_backward,
        Concat = dnnl_graph_op_concat,
        Convolution = dnnl_graph_op_convolution,
        ConvolutionBackwardData = dnnl_graph_op_convolution_backward_data,
        ConvolutionBackwardWeights = dnnl_graph_op_convolution_backward_weights,
        ConvTranspose = dnnl_graph_op_conv_transpose,
        ConvTransposeBackwardData = dnnl_graph_op_conv_transpose_backward_data,
        ConvTransposeBackwardWeights
        = dnnl_graph_op_conv_transpose_backward_weights,
        Dequantize = dnnl_graph_op_dequantize,
        Divide = dnnl_graph_op_divide,
        DynamicDequantize = dnnl_graph_op_dynamic_dequantize,
        DynamicQuantize = dnnl_graph_op_dynamic_quantize,
        Elu = dnnl_graph_op_elu,
        EluBackward = dnnl_graph_op_elu_backward,
        End = dnnl_graph_op_end,
        Exp = dnnl_graph_op_exp,
        GELU = dnnl_graph_op_gelu,
        GELUBackward = dnnl_graph_op_gelu_backward,
        GroupNorm = dnnl_graph_op_group_norm,
        HardSigmoid = dnnl_graph_op_hard_sigmoid,
        HardSigmoidBackward = dnnl_graph_op_hard_sigmoid_backward,
        HardSwish = dnnl_graph_op_hard_swish,
        HardSwishBackward = dnnl_graph_op_hard_swish_backward,
        Interpolate = dnnl_graph_op_interpolate,
        InterpolateBackward = dnnl_graph_op_interpolate_backward,
        LayerNorm = dnnl_graph_op_layer_norm,
        LayerNormBackward = dnnl_graph_op_layer_norm_backward,
        LeakyReLU = dnnl_graph_op_leaky_relu,
        Log = dnnl_graph_op_log,
        LogSoftmax = dnnl_graph_op_log_softmax,
        LogSoftmaxBackward = dnnl_graph_op_log_softmax_backward,
        MatMul = dnnl_graph_op_matmul,
        Maximum = dnnl_graph_op_maximum,
        MaxPool = dnnl_graph_op_max_pool,
        MaxPoolBackward = dnnl_graph_op_max_pool_backward,
        Minimum = dnnl_graph_op_minimum,
        Mish = dnnl_graph_op_mish,
        MishBackward = dnnl_graph_op_mish_backward,
        Multiply = dnnl_graph_op_multiply,
        Pow = dnnl_graph_op_pow,
        PReLU = dnnl_graph_op_prelu,
        PReLUBackward = dnnl_graph_op_prelu_backward,
        Quantize = dnnl_graph_op_quantize,
        Reciprocal = dnnl_graph_op_reciprocal,
        ReduceL1 = dnnl_graph_op_reduce_l1,
        ReduceL2 = dnnl_graph_op_reduce_l2,
        ReduceMax = dnnl_graph_op_reduce_max,
        ReduceMean = dnnl_graph_op_reduce_mean,
        ReduceMin = dnnl_graph_op_reduce_min,
        ReduceProd = dnnl_graph_op_reduce_prod,
        ReduceSum = dnnl_graph_op_reduce_sum,
        ReLU = dnnl_graph_op_relu,
        ReLUBackward = dnnl_graph_op_relu_backward,
        Reorder = dnnl_graph_op_reorder,
        Round = dnnl_graph_op_round,
        Select = dnnl_graph_op_select,
        Sigmoid = dnnl_graph_op_sigmoid,
        SigmoidBackward = dnnl_graph_op_sigmoid_backward,
        SoftMax = dnnl_graph_op_softmax,
        SoftMaxBackward = dnnl_graph_op_softmax_backward,
        SoftPlus = dnnl_graph_op_softplus,
        SoftPlusBackward = dnnl_graph_op_softplus_backward,
        Sqrt = dnnl_graph_op_sqrt,
        SqrtBackward = dnnl_graph_op_sqrt_backward,
        Square = dnnl_graph_op_square,
        SquaredDifference = dnnl_graph_op_squared_difference,
        StaticReshape = dnnl_graph_op_static_reshape,
        StaticTranspose = dnnl_graph_op_static_transpose,
        Subtract = dnnl_graph_op_subtract,
        Tanh = dnnl_graph_op_tanh,
        TanhBackward = dnnl_graph_op_tanh_backward,
        TypeCast = dnnl_graph_op_type_cast,
        Wildcard = dnnl_graph_op_wildcard,
        GenIndex = dnnl_graph_op_gen_index,
        GreaterEqual = dnnl_graph_op_greater_equal,
        // Sentinel
        LastSymbol = dnnl_graph_op_last_symbol,
    };

    /// Attributes of operations. Different operations support different
    /// attributes. Check the document of each operation for what attributes are
    /// supported and what are the potential values for them. Missing required
    /// attribute or illegal attribute value may lead to failure when adding the
    /// operation to a graph.
    enum class attr {
        /// Undefined op attribute.
        undef = dnnl_graph_op_attr_undef,

        // float32 attributes. The value of these attributes can be any single
        // float32 number.

        /// Specifies an alpha attribute to an op.
        alpha = dnnl_graph_op_attr_alpha,
        /// Specifies a beta attribute to an op.
        beta = dnnl_graph_op_attr_beta,
        /// Specifies an epsilon attribute to an op.
        epsilon = dnnl_graph_op_attr_epsilon,
        /// Specifies a max attribute to an op.
        max = dnnl_graph_op_attr_max,
        /// Specifies a min attribute to an op.
        min = dnnl_graph_op_attr_min,
        /// Specifies a momentum attribute to an op.
        momentum = dnnl_graph_op_attr_momentum,

        // float32 vector attributes. The value of these attributes can be a
        // vector of float32 numbers.

        /// Specifies a scales attribute to an op.
        scales = dnnl_graph_op_attr_scales,

        // int64_t attributes. The value of these attributes can be any single
        // int64 number.

        /// Specifies an axis attribute to an op.
        axis = dnnl_graph_op_attr_axis,
        /// Specifies a begin_norm_axis attribute to an op.
        begin_norm_axis = dnnl_graph_op_attr_begin_norm_axis,
        /// Specifies a groups attribute to an op.
        groups = dnnl_graph_op_attr_groups,

        // int64_t vector attributes. The value of these attributes can be a
        // vector of int64 numbers.

        /// Specifies an axes attribute to an op.
        axes = dnnl_graph_op_attr_axes,
        /// Specifies a dilations attribute to an op.
        dilations = dnnl_graph_op_attr_dilations,
        /// Specifies a dst_shape attribute to an op.
        dst_shape = dnnl_graph_op_attr_dst_shape,
        /// Specifies a kernel attribute to an op.
        kernel = dnnl_graph_op_attr_kernel,
        /// Specifies an order attribute to an op.
        order = dnnl_graph_op_attr_order,
        /// Specifies an output_padding attribute to an op.
        output_padding = dnnl_graph_op_attr_output_padding,
        /// Specifies a pads_begin attribute to an op.
        pads_begin = dnnl_graph_op_attr_pads_begin,
        /// Specifies a pads_end attribute to an op.
        pads_end = dnnl_graph_op_attr_pads_end,
        /// Specifies a shape attribute to an op.
        shape = dnnl_graph_op_attr_shape,
        /// Specifies a sizes attribute to an op.
        sizes = dnnl_graph_op_attr_sizes,
        /// Specifies a src_shape attribute to an op.
        src_shape = dnnl_graph_op_attr_src_shape,
        /// Specifies a strides attribute to an op.
        strides = dnnl_graph_op_attr_strides,
        /// Specifies a weights_shape attribute to an op.
        weights_shape = dnnl_graph_op_attr_weights_shape,
        /// Specifies a zps attribute to an op.
        zps = dnnl_graph_op_attr_zps,
        /// Specifies the group shape of an op. The size of the vector should
        /// match that of the input. For the dimensions where the grouped
        /// quantization occurs, the values should correspond to the group
        /// size, which indicates the number of elements that will share the
        /// same scaling factor.
        group_shape = dnnl_graph_op_attr_group_shape,

        // bool attributes. The value of these attributes can be any single bool
        // value.

        /// Specifies an exclude_pad attribute to an op.
        exclude_pad = dnnl_graph_op_attr_exclude_pad,
        /// Specifies a keep_dims attribute to an op.
        keep_dims = dnnl_graph_op_attr_keep_dims,
        /// Specifies a keep_stats attribute to an op.
        keep_stats = dnnl_graph_op_attr_keep_stats,
        /// Specifies a per_channel_broadcast attribute to an op.
        per_channel_broadcast = dnnl_graph_op_attr_per_channel_broadcast,
        /// Specifies a special_zero attribute to an op.
        special_zero = dnnl_graph_op_attr_special_zero,
        /// Specifies a transpose_a attribute to an op.
        transpose_a = dnnl_graph_op_attr_transpose_a,
        /// Specifies a transpose_b attribute to an op.
        transpose_b = dnnl_graph_op_attr_transpose_b,
        /// Specifies a use_affine attribute to an op.
        use_affine = dnnl_graph_op_attr_use_affine,
        /// Specifies a use_dst attribute to an op.
        use_dst = dnnl_graph_op_attr_use_dst,

        // string attributes. The value of these attributes can be a string.

        /// Specifies an auto_broadcast attribute to an op. The value can be
        /// "none" or "numpy".
        auto_broadcast = dnnl_graph_op_attr_auto_broadcast,
        /// Specifies an auto_pad attribute to an op. The value can be "none",
        /// "same_upper", "same_lower", or "valid".
        auto_pad = dnnl_graph_op_attr_auto_pad,
        /// Specifies a coordinate_transformation_mode attribute to an op. The
        /// value can be "half_pixel" or "align_corners". The attribute is
        /// defined for Interpolate operations.
        coordinate_transformation_mode
        = dnnl_graph_op_attr_coordinate_transformation_mode,
        /// Specifies a data_format of an op. The value can be "NCX" or "NXC".
        data_format = dnnl_graph_op_attr_data_format,
        /// Specifies a mode attribute of an op.
        /// Interpolate: "nearest", "linear", "bilinear", or "trilinear".
        /// SoftMax: "none", "inf_as_zero".
        /// GELU/GELUBackward: "gelu_erf", "gelu_tanh".
        mode = dnnl_graph_op_attr_mode,
        /// Specifies a qtype attribute to an op. The value can be "per_channel"
        /// or "per_tensor". The attribute is defined for quantization
        /// operations.
        qtype = dnnl_graph_op_attr_qtype,
        /// Specifies a rounding_type attribute to an op. The value can be
        /// "ceil" or "floor".
        rounding_type = dnnl_graph_op_attr_rounding_type,
        /// Specifies a weights_format of an op. The value can be "OIX", "XIO",
        /// "IOX", or "XOI". Different operations may support different values.
        weights_format = dnnl_graph_op_attr_weights_format,
        /// Specifies an accumulation_mode attribute to an op. The value can be
        /// "strict", "relaxed", "any", "f32", "s32", or "f16".
        accumulation_mode = dnnl_graph_op_attr_accumulation_mode,

        /// Specifies the end of all above external attributes for check.
        end = dnnl_graph_op_attr_end,
    };

    /// Constructs an op object with a unique ID, an operation kind, and a name
    /// string.
    ///
    /// @param id The unique ID of the op.
    /// @param akind The op kind specifies which computation is represented by
    ///     the op, such as Convolution or ReLU.
    /// @param verbose_name The string added as the op name.
    op(size_t id, kind akind, const std::string &verbose_name = "") {
        dnnl_graph_op_t c_op = nullptr;
        error::wrap_c_api(dnnl_graph_op_create(&c_op, id, convert_to_c(akind),
                                  verbose_name.c_str()),
                "could not create op with id and op kind");
        reset(c_op);
    }

    /// Constructs an op object with a unique ID, an operation kind, and
    /// input/output logical tensors.
    ///
    /// The op is first created through the three-argument constructor; the
    /// inputs and then the outputs are added to it in the order given.
    ///
    /// @param id The unique ID of this op.
    /// @param akind The op kind specifies which computation is represented by
    ///     this op, such as Convolution or ReLU.
    /// @param inputs Input logical tensors to be bound to this op.
    /// @param outputs Output logical tensors to be bound to this op.
    /// @param verbose_name The string added as the op name.
    op(size_t id, kind akind, const std::vector<logical_tensor> &inputs,
            const std::vector<logical_tensor> &outputs,
            const std::string &verbose_name = "")
        : op(id, akind, verbose_name) {
        add_inputs(inputs);
        add_outputs(outputs);
    }

    /// Adds an input logical tensor to the op.
    ///
    /// @param t Input logical tensor.
    void add_input(const logical_tensor &t) {
        error::wrap_c_api(dnnl_graph_op_add_input(get(), &(t.data)),
                "adding input to the op failed");
    }

    /// Adds a vector of input logical tensors to the op, one at a time and in
    /// order.
    ///
    /// @param ts The list of input logical tensors.
    void add_inputs(const std::vector<logical_tensor> &ts) {
        for (const auto &t : ts) {
            add_input(t);
        }
    }

    /// Adds an output logical tensor to the op.
    ///
    /// @param t Output logical tensor.
    void add_output(const logical_tensor &t) {
        error::wrap_c_api(dnnl_graph_op_add_output(get(), &(t.data)),
                "adding output to the op failed");
    }

    /// Adds a vector of output logical tensors to the op, one at a time and in
    /// order.
    ///
    /// @param ts The list of output logical tensors.
    void add_outputs(const std::vector<logical_tensor> &ts) {
        for (const auto &t : ts) {
            add_output(t);
        }
    }

    /// Sets the attribute according to the name and type (int64_t).
    ///
    /// @tparam Type_i Attribute's type; this overload is selected only for
    ///     int64_t.
    /// @param name Attribute's name.
    /// @param value The attribute's value.
    /// @returns A reference to this op, so that calls can be chained.
    template <typename Type_i, req<std::is_same<Type_i, int64_t>::value> = true>
    op &set_attr(attr name, const Type_i &value) {
        // A scalar is passed to the C API as an array of length one.
        error::wrap_c_api(
                dnnl_graph_op_set_attr_s64(get(), convert_to_c(name), &value, 1),
                "could not set attribute to the op");
        return *this;
    }

    /// Sets the attribute according to the name and type (float).
    ///
    /// @tparam Type_f Attribute's type; this overload is selected only for
    ///     float.
    /// @param name Attribute's name.
    /// @param value The attribute's value.
    /// @returns A reference to this op, so that calls can be chained.
    template <typename Type_f, req<std::is_same<Type_f, float>::value> = true>
    op &set_attr(attr name, const Type_f &value) {
        // A scalar is passed to the C API as an array of length one.
        error::wrap_c_api(
                dnnl_graph_op_set_attr_f32(get(), convert_to_c(name), &value, 1),
                "could not set attribute to the op");
        return *this;
    }

    /// Sets the attribute according to the name and type (bool).
    ///
    /// @tparam Type_b Attribute's type; this overload is selected only for
    ///     bool.
    /// @param name Attribute's name.
    /// @param value The attribute's value.
    /// @returns A reference to this op, so that calls can be chained.
    template <typename Type_b, req<std::is_same<Type_b, bool>::value> = true>
    op &set_attr(attr name, const Type_b &value) {
        // The value is handed to the C API as a uint8_t, so convert it first.
        const uint8_t val = value;
        error::wrap_c_api(
                dnnl_graph_op_set_attr_bool(get(), convert_to_c(name), &val, 1),
                "could not set attribute to the op");
        return *this;
    }

    /// Sets the attribute according to the name and type (string).
    ///
    /// @tparam Type_s Attribute's type; this overload is selected only for
    ///     std::string.
    /// @param name Attribute's name.
    /// @param value The attribute's value.
    /// @returns A reference to this op, so that calls can be chained.
    template <typename Type_s,
            req<std::is_same<Type_s, std::string>::value> = true>
    op &set_attr(attr name, const Type_s &value) {
        error::wrap_c_api(dnnl_graph_op_set_attr_str(get(), convert_to_c(name),
                                  value.c_str(), value.size()),
                "could not set attribute to the op");
        return *this;
    }

    /// Sets the attribute according to the name and type
    /// (std::vector<int64_t>).
    ///
    /// @tparam Type_is Attribute's type; this overload is selected only for
    ///     std::vector<int64_t>.
    /// @param name Attribute's name.
    /// @param value The attribute's value.
    /// @returns A reference to this op, so that calls can be chained.
    template <typename Type_is,
            req<std::is_same<Type_is, std::vector<int64_t>>::value> = true>
    op &set_attr(attr name, const Type_is &value) {
        error::wrap_c_api(dnnl_graph_op_set_attr_s64(get(), convert_to_c(name),
                                  value.data(), value.size()),
                "could not set attribute to the op");
        return *this;
    }

    /// Sets the attribute according to the name and type (std::vector<float>).
    ///
    /// @tparam Type_fs Attribute's type; this overload is selected only for
    ///     std::vector<float>.
    /// @param name Attribute's name.
    /// @param value The attribute's value.
    /// @returns A reference to this op, so that calls can be chained.
    template <typename Type_fs,
            req<std::is_same<Type_fs, std::vector<float>>::value> = true>
    op &set_attr(attr name, const Type_fs &value) {
        error::wrap_c_api(dnnl_graph_op_set_attr_f32(get(), convert_to_c(name),
                                  value.data(), value.size()),
                "could not set attribute to the op");
        return *this;
    }

private:
    // Converts the C++ op kind to its C API counterpart. Stateless, hence
    // static.
    static dnnl_graph_op_kind_t convert_to_c(kind akind) {
        return static_cast<dnnl_graph_op_kind_t>(akind);
    }

    // Converts the C++ op attribute to its C API counterpart. Stateless, hence
    // static.
    static dnnl_graph_op_attr_t convert_to_c(attr aattr) {
        return static_cast<dnnl_graph_op_attr_t>(aattr);
    }
};

/// @} dnnl_graph_api_op

/// @addtogroup dnnl_graph_api_partition Partition
///
/// Partition represents a collection of operations and their input and output
/// logical tensors identified by library as the basic unit for compilation and
/// execution.
///
/// @{

/// A partition object.
///
/// Wraps a C API partition handle and exposes queries on the partition (ops,
/// ports, engine kind, support status) as well as compilation.
class partition : public partition_handle {
public:
    /// Policy specifications for partitioning.
    enum class policy {
        /// Fusion policy returns partitions with typical post-op fusions, e.g.
        /// Convolution + ReLU or other element-wise operations or a chain of
        /// post-ops.
        fusion = dnnl_graph_partition_policy_fusion,
        /// Debug policy doesn't apply any fusions. It returns partitions
        /// with single operations in each partition. The policy is useful when
        /// users notice any bug or correctness issue in fusion policy.
        debug = dnnl_graph_partition_policy_debug,
    };

    /// Default constructor.
    partition() = default;

    /// Constructs a partition object
    ///
    /// @param p A raw pointer to the C API handle
    partition(dnnl_graph_partition_t p) { reset(p, false); }

    /// Creates a new partition with a given operator and engine kind. The API
    /// is used to create a partition from an operation directly without
    /// creating the graph and calling `get_partitions()`. The output partition
    /// contains only one operation.
    ///
    /// @param aop An operation used to create the partition.
    /// @param ekind Engine kind.
    partition(const op &aop, engine::kind ekind) {
        dnnl_graph_partition_t p = nullptr;
        error::wrap_c_api(dnnl_graph_partition_create_with_op(&p, aop.get(),
                                  static_cast<dnnl_engine_kind_t>(ekind)),
                "could not create a partition with the op and engine kind");
        reset(p);
    }

    /// Returns the number of operations contained in the partition.
    ///
    /// @returns Number of operations.
    size_t get_ops_num() const {
        size_t num {0};
        error::wrap_c_api(dnnl_graph_partition_get_op_num(get(), &num),
                "could not get number of ops from the partition");
        return num;
    }

    /// Returns all operation IDs contained in the partition.
    ///
    /// @returns A vector of operation IDs; empty if the partition reports no
    ///     operations.
    std::vector<size_t> get_ops() const {
        const size_t num = get_ops_num();
        // Same early exit as get_input_ports()/get_output_ports(): the data()
        // pointer of an empty vector may be null and must not reach the C API.
        if (num == 0) return {};

        std::vector<size_t> ops(num);
        error::wrap_c_api(dnnl_graph_partition_get_ops(get(), num, ops.data()),
                "could not get op ids from the partition");
        return ops;
    }

    /// Returns the unique ID of the partition. Partition ID is generated by the
    /// library internally. The ID can be used for debugging purpose or verbose.
    ///
    /// @returns ID of the partition.
    size_t get_id() const {
        size_t id {};
        error::wrap_c_api(dnnl_graph_partition_get_id(get(), &id),
                "could not get id of the partition");
        return id;
    }

    /// Compiles a partition with given input and output logical tensors. The
    /// output logical tensors can contain unknown dimensions. For this case,
    /// the compilation will deduce the output shapes according to input shapes.
    /// The output logical tensors can also have layout type `any`. The
    /// compilation will choose the optimal layout for output tensors. The
    /// optimal layout will be represented as an opaque layout ID saved in the
    /// output logical tensor.
    ///
    /// An unsupported partition (see is_supported()) is rejected with an
    /// invalid-arguments error before any compilation is attempted.
    ///
    /// @param inputs A list of input logical tensors.
    /// @param outputs A list of output logical tensors.
    /// @param e The engine used to compile the partition.
    /// @returns A compiled partition.
    compiled_partition compile(const std::vector<logical_tensor> &inputs,
            const std::vector<logical_tensor> &outputs, const engine &e) const {
        if (!is_supported()) {
            error::wrap_c_api(dnnl_invalid_arguments,
                    "could not compile an unsupported partition");
        }

        return compile_(inputs, outputs, e);
    }

    /// Returns the supporting status of a partition. Some operations may not be
    /// supported by the library under certain circumstances. During
    /// partitioning stage, unsupported partitions will be returned to users
    /// with each containing an unsupported operation. Users should check the
    /// supporting status of a partition before transforming the computation
    /// graph or compiling the partition.
    ///
    /// @returns @c true if this partition is supported or @c false if this
    ///     partition isn't supported by the library
    bool is_supported() const {
        uint8_t supported {0};
        error::wrap_c_api(dnnl_graph_partition_is_supported(get(), &supported),
                "could not get supporting status of the partition");
        return supported != 0;
    }

    /// Returns a list of input logical tensors from the partition.
    ///
    /// @returns A list of input logical tensors; empty if there are none.
    std::vector<logical_tensor> get_input_ports() const {
        size_t num = 0;
        error::wrap_c_api(dnnl_graph_partition_get_input_ports_num(get(), &num),
                "could not get number of inputs of the partition");
        if (num == 0) return {};

        std::vector<dnnl_graph_logical_tensor_t> c_inputs(num);
        error::wrap_c_api(dnnl_graph_partition_get_input_ports(
                                  get(), num, c_inputs.data()),
                "could not get input logical tensors of the partition");

        // Wrap each C logical tensor into its C++ counterpart.
        std::vector<logical_tensor> inputs;
        inputs.reserve(num);
        for (auto &c_lt : c_inputs)
            inputs.emplace_back(c_lt);
        return inputs;
    }

    /// Returns a list of output logical tensors from the partition.
    ///
    /// @returns A list of output logical tensors; empty if there are none.
    std::vector<logical_tensor> get_output_ports() const {
        size_t num = 0;
        error::wrap_c_api(
                dnnl_graph_partition_get_output_ports_num(get(), &num),
                "cannot get number of outputs of the partition");
        if (num == 0) return {};

        std::vector<dnnl_graph_logical_tensor_t> c_outputs(num);
        error::wrap_c_api(dnnl_graph_partition_get_output_ports(
                                  get(), num, c_outputs.data()),
                "could not get output logical tensors of the partition");

        // Wrap each C logical tensor into its C++ counterpart.
        std::vector<logical_tensor> outputs;
        outputs.reserve(num);
        for (auto &c_lt : c_outputs)
            outputs.emplace_back(c_lt);
        return outputs;
    }

    /// Returns the engine kind of the partition
    ///
    /// @returns The engine kind
    engine::kind get_engine_kind() const {
        // Value-initialized like the out-locals of the other getters.
        dnnl_engine_kind_t akind {};
        error::wrap_c_api(dnnl_graph_partition_get_engine_kind(get(), &akind),
                "cannot get the engine kind from the partition");

        return static_cast<engine::kind>(akind);
    }

private:
    // Performs the actual compilation for compile(): marshals the logical
    // tensors into arrays of C pointers, creates a compiled-partition handle
    // and compiles this partition into it.
    compiled_partition compile_(const std::vector<logical_tensor> &inputs,
            const std::vector<logical_tensor> &outputs, const engine &e) const {
        std::vector<const dnnl_graph_logical_tensor_t *> c_inputs;
        std::vector<const dnnl_graph_logical_tensor_t *> c_outputs;

        c_inputs.reserve(inputs.size());
        for (const auto &in : inputs) {
            c_inputs.push_back(&(in.data));
        }

        c_outputs.reserve(outputs.size());
        for (const auto &out : outputs) {
            c_outputs.push_back(&(out.data));
        }

        dnnl_graph_compiled_partition_t c_compiled = nullptr;
        error::wrap_c_api(
                dnnl_graph_compiled_partition_create(&c_compiled, get()),
                "could not create compiled_partition");

        // Hand the freshly created handle to the C++ wrapper *before*
        // compiling. The wrapper is what carries the handle to the caller on
        // success, so placing it there first lets the handle be released
        // instead of leaked when compilation fails and wrap_c_api() throws.
        compiled_partition compiled(c_compiled);
        error::wrap_c_api(dnnl_graph_partition_compile(get(), compiled.get(),
                                  c_inputs.size(), c_inputs.data(),
                                  c_outputs.size(), c_outputs.data(), e.get()),
                "partition compile failed");

        return compiled;
    }
};

/// @} dnnl_graph_api_partition

/// @addtogroup dnnl_graph_api_graph Graph
///
/// Graph represents a computational DAG with a set of operations.
/// #dnnl::graph::graph::add_op() adds an operation and its input and output
/// logical tensors into a graph. The library accumulates the operations and
/// logical tensors and constructs and validates the graph as an internal state.
/// A graph object is associated to a specific engine kind. The partitions
/// returned from the graph will inherit the engine kind of the graph.
///
/// @{

/// A graph object.
class graph : public graph_handle {
public:
    /// Constructs a graph with an engine kind.
    ///
    /// @param engine_kind Engine kind.
    graph(engine::kind engine_kind) {
        dnnl_graph_graph_t g = nullptr;
        error::wrap_c_api(
                dnnl_graph_graph_create(&g, convert_to_c(engine_kind)),
                "could not create graph with engine kind");
        reset(g);
    }

    /// Creates a new empty graph with an engine kind and a floating-point math
    /// mode. All partitions returned from the graph will inherit the engine
    /// kind and floating-point math mode.
    ///
    /// Setting the floating-point math mode enables automatic down-conversion
    /// of inputs for the given graph, promoting speedup by using
    /// lower-precision data types when available.
    ///
    /// @param engine_kind Engine kind.
    /// @param mode Floating-point math mode.
    graph(engine::kind engine_kind, fpmath_mode mode) {
        dnnl_graph_graph_t g = nullptr;
        error::wrap_c_api(
                dnnl_graph_graph_create_with_fpmath_mode(
                        &g, convert_to_c(engine_kind), convert_to_c(mode)),
                "could not create graph with engine kind and math mode");
        reset(g);
    }

    /// Set the floating point math mode for a graph. Users can enforce the
    /// graph to comply with the mode by specifying a boolean flag with the
    /// setter function.
    ///
    /// @param mode The floating-point math mode.
    /// @param apply_to_int The flag that controls whether to use
    /// floating-point arithmetic for integral operations.
    void set_fpmath_mode(fpmath_mode mode, bool apply_to_int = false) {
        error::wrap_c_api(dnnl_graph_graph_set_fpmath_mode(
                                  get(), convert_to_c(mode), apply_to_int),
                "could not set fpmath mode graph attribute");
    }

    /// Get the floating point math mode and the boolean flag that specifies
    /// whether the graph will be enforced to comply the mode.
    ///
    /// @param mode The floating-point math mode.
    /// @param apply_to_int The flag that controls whether to use
    /// floating-point arithmetic for integral operations.
    void get_fpmath_mode(fpmath_mode &mode, bool &apply_to_int) const {
        dnnl_fpmath_mode_t c_mode;
        int c_apply_to_int;

        error::wrap_c_api(dnnl_graph_graph_get_fpmath_mode(
                                  get(), &c_mode, &c_apply_to_int),
                "could not get fpmath mode graph attribute");

        mode = fpmath_mode(c_mode);
        apply_to_int = static_cast<bool>(c_apply_to_int);
    }

    /// Adds an op into the graph to construct a computational DAG. The API will
    /// return failure if the operator has already been added to the graph or
    /// the operation cannot pass the schema check in the library (eg. input and
    /// output numbers and data types, the attributes of the operation, etc.).
    ///
    /// @param op An operation to be added.
    /// @param allow_exception A flag indicating whether the method is allowed
    ///     to throw an exception if it fails to add the op to the graph.
    /// @returns #status::success or a status describing the error otherwise.
    status add_op(const op &op, bool allow_exception = true) {
        dnnl_status_t ret = dnnl_graph_add_op(get(), op.get());

        if (allow_exception) {
            error::wrap_c_api(ret, "could not add op to the graph");
        }

        return static_cast<status>(ret);
    }

    /// Finalizes a graph. It means users have finished adding operations into
    /// the graph and the graph is ready for partitioning. Adding a new
    /// operation into a finalized graph will return failures. Similarly,
    /// partitioning on a un-finalized graph will also return failures.
    void finalize() {
        error::wrap_c_api(dnnl_graph_graph_finalize(get()),
                "could not finalize the graph");
    }

    /// Checks if a graph is finalized.
    ///
    /// @return True if the graph is finalized or false if the graph is not
    /// finalized.
    bool is_finalized() const {
        uint8_t ret = 0;
        error::wrap_c_api(dnnl_graph_graph_is_finalized(get(), &ret),
                "could not get the finalization status of the graph");

        return ret != 0;
    }

    /// Gets filtered partitions from a graph. Partitions will be claimed
    /// internally according to the capability of the library, the engine kind
    /// of the graph, and the policy.
    ///
    /// @param policy Partition policy, defaults to policy
    ///     #dnnl::graph::partition::policy::fusion.
    /// @return A vector storing the partitions.
    std::vector<partition> get_partitions(
            partition::policy policy = partition::policy::fusion) {
        if (!is_finalized()) {
            error::wrap_c_api(
                    dnnl_invalid_graph, "the graph is not finalized yet");
        }

        error::wrap_c_api(
                dnnl_graph_graph_filter(get(),
                        static_cast<dnnl_graph_partition_policy_t>(policy)),
                "could not filter the graph");

        size_t num = 0;
        error::wrap_c_api(dnnl_graph_graph_get_partition_num(get(), &num),
                "could not get number of partitions from the graph");

        // return early if there is no partitions in the graph.
        if (num == 0) return {};

        std::vector<partition> out_list;
        out_list.reserve(num);

        std::vector<dnnl_graph_partition_t> partitions(num);
        error::wrap_c_api(
                dnnl_graph_graph_get_partitions(get(), num, partitions.data()),
                "could not get partitions from the graph");

        for (auto p : partitions) {
            out_list.emplace_back(p);
        }

        return out_list;
    }

private:
    static dnnl_fpmath_mode_t convert_to_c(fpmath_mode mode) {
        return static_cast<dnnl_fpmath_mode_t>(mode);
    }

    static dnnl_engine_kind_t convert_to_c(engine::kind akind) {
        return static_cast<dnnl_engine_kind_t>(akind);
    }
};

/// @} dnnl_graph_api_graph

/// @addtogroup dnnl_graph_api_compiled_partition_cache Compiled Partition Cache
///
/// A set of functions that provide compiled partition cache control.
///
/// @{

/// Returns the number of compiled partition that can be held in the compiled
/// partition cache at the same time.
inline int get_compiled_partition_cache_capacity() {
    int result = 0;
    error::wrap_c_api(dnnl_graph_get_compiled_partition_cache_capacity(&result),
            "could not get compiled partition cache capacity");
    return result;
}

/// @copydoc dnnl_graph_set_compiled_partition_cache_capacity(int capacity)
inline void set_compiled_partition_cache_capacity(int capacity) {
    error::wrap_c_api(
            dnnl_graph_set_compiled_partition_cache_capacity(capacity),
            "could not set compiled partition cache capacity");
}

/// @} dnnl_graph_api_compiled_partition_cache

/// @addtogroup dnnl_graph_api_constant_tensor_cache Constant Tensor Cache
///
/// A set of functions that provide constant tensor cache control.
///
/// @{

/// Control the enabling or disabling of constant tensor cache. This API must be
/// called once before compilation stage. By default, constant tensor cache is
/// disabled in the library.
/// @note This API is deprecated and will be removed in future release, please
/// use the set_constant_tensor_cache_capacity API to disable
/// constant tensor cache by setting it's capacity to zero.
///
/// @param flag Set to positive value to enable the cache and set to 0 to
/// disable the cache. Negative values are invalid.
inline void set_constant_tensor_cache(int flag) {
    error::wrap_c_api(dnnl_graph_set_constant_tensor_cache(flag),
            "fail to set constant tensor cache");
}

/// Return the enabling status of constant tensor cache.
/// @note This API is deprecated and will be removed in future release, please
/// use the get_constant_tensor_cache_capacity API to check the
/// enabling status by checking it's capacity.
inline int get_constant_tensor_cache() {
    int result = 0;
    error::wrap_c_api(dnnl_graph_get_constant_tensor_cache(&result),
            "fail to get constant tensor cache");
    return result;
}

/// Controls the capacity of the constant tensor cache that is used for a
/// specific engine kind. This API is thread safe and can be called multiple
/// times at runtime. The capacity is set to zero by default, which means the
/// cache is disabled. When calling this API, the corresponding cache will be
/// flushed. Setting the capacity to 0 clears all cached tensors and disables
/// the cache. Once the capacity limit is reached, no new tensors will be
/// cached. If there are multiple devices for an engine kind, the capacity set
/// here applies to each device.
///
/// @param kind The engine kind that the constant tensor cache is used for.
/// @param size The constant tensor cache capacity size to set.
inline void set_constant_tensor_cache_capacity(engine::kind kind, size_t size) {
    const auto c_kind = static_cast<dnnl_engine_kind_t>(kind);
    const dnnl_status_t status
            = dnnl_graph_set_constant_tensor_cache_capacity(c_kind, size);
    error::wrap_c_api(status, "fail to set constant tensor cache capacity");
}

/// Returns the current capacity of the constant tensor cache.
///
/// @param kind The engine kind that the constant tensor cache is used for.
/// @return The capacity reported by the C API for the given engine kind.
inline size_t get_constant_tensor_cache_capacity(engine::kind kind) {
    const auto c_kind = static_cast<dnnl_engine_kind_t>(kind);
    size_t capacity = 0;
    const dnnl_status_t status
            = dnnl_graph_get_constant_tensor_cache_capacity(c_kind, &capacity);
    error::wrap_c_api(status, "fail to get constant tensor cache capacity");
    return capacity;
}

/// @} dnnl_graph_api_constant_tensor_cache

} // namespace graph

/// @} dnnl_graph_api

} // namespace dnnl

/// @cond DO_NOT_DOCUMENT_THIS

/// oneAPI namespace
// Contains the oneapi::dnnl namespace as an alias to the ::dnnl namespace.
namespace oneapi {
// Note: without this guard, doxygen warns of potentially recursive namespace
// (the alias below makes oneapi::dnnl name the global ::dnnl namespace).
#ifndef DOXYGEN_SHOULD_SKIP_THIS
/// oneDNN alias namespace
namespace dnnl = ::dnnl;
#endif
} // namespace oneapi

/// @endcond

/// @} dnnl_api

// NOLINTEND(readability-identifier-naming)
#endif /* ONEAPI_DNNL_DNNL_GRAPH_HPP */