/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @example memory_format_propagation.cpp
/// > Annotated version: @ref memory_format_propagation_cpp

#include <iostream>
#include <sstream>
#include <string>

/// @page memory_format_propagation_cpp_brief
/// @brief This example demonstrates memory format propagation, which is
/// critical for the performance of deep learning applications.

/// @page memory_format_propagation_cpp Memory Format Propagation
/// \copybrief memory_format_propagation_cpp_brief
///
/// > Example code: @ref memory_format_propagation.cpp
///
/// Memory format propagation is one of the central notions that needs to be
/// well-understood to use oneDNN correctly.
///
/// Convolution and inner product primitives choose the memory format when you
/// create them with the placeholder memory format
/// #dnnl::memory::format_tag::any for input or output. The memory format
/// chosen is based on different circumstances such as hardware and
/// convolutional parameters. Using the placeholder memory format is the
/// recommended practice for convolutions, since they are the most
/// compute-intensive operations in most topologies where they are present.
///
/// Other primitives, such as Elementwise, LRN, batch normalization and others,
/// on forward propagation should use the same memory format as the preceding
/// layer thus propagating the memory format through multiple oneDNN primitives.
/// This avoids unnecessary reorders which may be expensive and should be
/// avoided unless a compute-intensive primitive requires a different format.
/// For performance reasons, backward computations of such primitives require
/// a memory format consistent with the corresponding forward computations.
/// Hence, when initializing these primitives for backward computations you
/// should use the #dnnl::memory::format_tag::any memory format tag as well.
///
/// Below is a short summary of when to use and when not to use the memory
/// format #dnnl::memory::format_tag::any during operation description
/// initialization:
///
/// | Primitive Kinds                                                                                                               | Forward Propagation                                                                               | Backward Propagation                                                                                | No Propagation                                                                                    |
/// | :--                                                                                                                           | :--                                                                                               | :--                                                                                                 | :--                                                                                               |
/// | Compute intensive: (De-)convolution, Inner product, RNN                                                                       | Use #dnnl::memory::format_tag::any                                                                | Use #dnnl::memory::format_tag::any                                                                  | N/A                                                                                               |
/// | Compute intensive (no propagation): Matrix Multiplication                                                                     | N/A                                                                                               | N/A                                                                                                 | Use #dnnl::memory::format_tag::any                                                                |
/// | Memory-bandwidth limited: Pooling, Layer and Batch Normalization, Local Response Normalization, Elementwise, Shuffle, Softmax | Use memory format from preceding layer for inputs, and #dnnl::memory::format_tag::any for outputs | Use #dnnl::memory::format_tag::any for gradient tensors, and actual memory formats for data tensors | N/A                                                                                               |
/// | Memory-bandwidth limited: Reorder, Concat, Sum, Binary                                                                        | N/A                                                                                               | N/A                                                                                                 | Use memory format from preceding layer for inputs, and #dnnl::memory::format_tag::any for outputs |
///
/// Additional format synchronization is required between forward and backward
/// computations when running training workloads. This topic is covered in
/// [Training-Specific Aspects](@ref dev_guide_inference_and_training_aspects_training).
///
/// For better understanding of the architecture and design of oneDNN
/// as well as the concepts used in the library, please refer to @ref
/// dev_guide_understanding_memory_formats.
///
/// @section memory_format_propagation_intro Introduction to the tutorial
///
/// This C++ API example demonstrates how to use optimized memory formats
/// supported by oneDNN:
/// - How to configure primitives to use optimized memory formats.
/// - How to determine whether data needs to be reordered from/to optimized
///   memory formats.
///
/// This tutorial assumes that the reader has already reviewed the
/// @ref getting_started_cpp tutorial.
///
/// The example is built around a CNN consisting of a convolution followed by
/// a pooling and consists of the following steps:
/// 1. Create a pooling primitive descriptor based on the memory format chosen
///    by the convolution primitive.
/// 2. Create memory descriptors for input and output data in the NCHW memory
///    format.
/// 3. Determine if input and output data need to be reordered from/to the
///    optimized memory format.
/// 4. Create memory objects and the necessary primitives, and execute them.
///
/// These steps are implemented in the @ref memory_format_propagation_tutorial
/// which in turn is called from `main()` which is also responsible for error
/// handling.

#include "oneapi/dnnl/dnnl.hpp"

#include "example_utils.hpp"

using namespace dnnl;

/// @page memory_format_propagation_cpp
/// @section memory_format_propagation_tutorial memory_format_propagation_tutorial() function
///
void memory_format_propagation_tutorial(engine::kind engine_kind) {
    // Runs a convolution followed by a max pooling on an engine of the
    // requested kind. User data is kept in plain NCHW/OIHW layouts and is
    // reordered to/from the layouts chosen by the primitives only when the
    // corresponding memory descriptors differ.
    //
    // engine_kind: kind of the engine (device index 0) to create and run on.

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub1 Initialization
    ///
    /// We start by creating an engine and a stream that we will use when
    /// creating primitive descriptors and executing primitives.
    ///
    /// @snippet memory_format_propagation.cpp Initialize engine and stream
    // [Initialize engine and stream]
    engine eng(engine_kind, 0);
    stream s(eng);
    // [Initialize engine and stream]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub2 Create convolution and pooling primitives
    ///
    /// To specify that a primitive should pick an optimized format for the
    /// specified computation parameters, we create memory descriptors with
    /// memory format set to @ref dnnl::memory::format_tag::any.
    ///
    /// This approach works only for a limited set of primitives: convolutions
    /// and inner products. Additionally, @ref dnnl::memory::format_tag::any
    /// can be specified for destination memory descriptors which implies that
    /// destination will have the same memory format as the source.
    ///
    /// @snippet memory_format_propagation.cpp Create placeholder memory descriptors
    // [Create placeholder memory descriptors]
    // Tensor and kernel dimensions. We use the same 3x3 kernel with padding=1
    // for both convolution and pooling primitives, which means that the
    // activation tensor shapes do not change.
    const int N = 1, H = 14, W = 14, IC = 128, OC = 256, KH = 3, KW = 3;
    auto conv_src_md = memory::desc({N, IC, H, W}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    auto conv_weights_md = memory::desc(
            {OC, IC, KH, KW}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    auto conv_dst_md = memory::desc({N, OC, H, W}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    const auto &pool_dst_md = conv_dst_md; // shape does not change
    // [Create placeholder memory descriptors]

    /// @page memory_format_propagation_cpp
    ///
    /// Next, we pass the memory descriptors to primitive descriptors
    /// constructors.
    ///
    /// @snippet memory_format_propagation.cpp Create convolution and pooling primitive descriptors
    // [Create convolution and pooling primitive descriptors]
    auto conv_pd = convolution_forward::primitive_desc(
            eng, prop_kind::forward_inference, algorithm::convolution_auto,
            conv_src_md, conv_weights_md,
            conv_dst_md, // shape information
            {1, 1}, // strides
            {1, 1}, {1, 1} // left and right padding
    );

    // The pooling source descriptor is the concrete descriptor chosen by the
    // convolution for its destination, which is how the format propagates.
    auto pool_pd
            = pooling_forward::primitive_desc(eng, prop_kind::forward_inference,
                    algorithm::pooling_max, conv_pd.dst_desc(),
                    pool_dst_md, // shape information
                    {1, 1}, {KH, KW}, // strides and kernel
                    {0, 0}, // dilation
                    {1, 1}, {1, 1} // left and right padding
            );
    // [Create convolution and pooling primitive descriptors]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub3 Create source and destination memory objects
    ///
    /// We assume that the 'user' source and destination memory format is
    /// NCHW. Since there is no result validation in this tutorial, we do not
    /// bother with filling the data with some values and let oneDNN
    /// allocate the memory.
    ///
    /// @snippet memory_format_propagation.cpp Create source and destination memory objects
    // [Create source and destination memory objects]
    auto src_mem = memory(
            {{N, IC, H, W}, memory::data_type::f32, memory::format_tag::nchw},
            eng);
    auto weights_mem = memory({{OC, IC, KH, KW}, memory::data_type::f32,
                                      memory::format_tag::oihw},
            eng);
    auto dst_mem = memory(
            {{N, OC, H, W}, memory::data_type::f32, memory::format_tag::nchw},
            eng);
    // [Create source and destination memory objects]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub4 Determine if source and destination need to be reordered
    ///
    /// The idiomatic way to check if a reorder is necessary between the memory
    /// format expected by a primitive (the convolution in our case) and the
    /// available memory format is to compare the corresponding memory
    /// descriptors.
    ///
    /// @snippet memory_format_propagation.cpp Determine if source needs to be reordered
    // [Determine if source needs to be reordered]
    const bool need_reorder_src = conv_pd.src_desc() != src_mem.get_desc();
    // [Determine if source needs to be reordered]

    /// @page memory_format_propagation_cpp
    ///
    /// @warning It is by design that it is not possible to just compare
    /// memory tags. The reason behind this is that memory format tags only
    /// provide a partial description of how data is laid out in memory and do
    /// not, for example, describe memory objects obtained via sub-memory
    /// constructor.
    ///
    /// We repeat the process for the weights and destination memory format
    /// descriptors as well. The destination is checked against the pooling
    /// primitive descriptor, since the pooling is the primitive that writes
    /// the final output.
    ///
    /// @snippet memory_format_propagation.cpp Determine if weights and destination need to be reordered
    // [Determine if weights and destination need to be reordered]
    const bool need_reorder_weights
            = conv_pd.weights_desc() != weights_mem.get_desc();
    // The user's destination receives the pooling output, so it must be
    // compared with the pooling (not the convolution) destination descriptor.
    const bool need_reorder_dst = pool_pd.dst_desc() != dst_mem.get_desc();
    // [Determine if weights and destination need to be reordered]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub45 Allocate intermediate buffers if necessary
    ///
    /// Based on the flags computed before, we can now decide if we need extra
    /// intermediate buffers to hold the source and weights data for the
    /// convolution and the output of the pooling.
    ///
    /// Memory objects for the intermediate buffers are created based on the
    /// memory descriptors obtained from the primitive descriptors to ensure
    /// consistency.
    ///
    /// @snippet memory_format_propagation.cpp Allocate intermediate buffers if necessary
    // [Allocate intermediate buffers if necessary]
    auto conv_src_mem
            = need_reorder_src ? memory(conv_pd.src_desc(), eng) : src_mem;
    auto conv_weights_mem = need_reorder_weights
            ? memory(conv_pd.weights_desc(), eng)
            : weights_mem;
    // The convolution output is always an intermediate buffer: it is only
    // consumed by the pooling.
    auto conv_dst_mem = memory(conv_pd.dst_desc(), eng);
    auto pool_dst_mem
            = need_reorder_dst ? memory(pool_pd.dst_desc(), eng) : dst_mem;
    // [Allocate intermediate buffers if necessary]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub5 Perform reorders for source data if necessary
    ///
    /// Now we get to the part where we actually start executing things. We
    /// check if reorders are necessary based on the flags computed before and
    /// create and execute them immediately.
    ///
    /// @note We call @ref dnnl::stream::wait() before reorder primitives
    /// go out of scope and are destroyed to accommodate potentially
    /// asynchronous execution.
    ///
    /// @snippet memory_format_propagation.cpp Perform reorders for source data if necessary
    // [Perform reorders for source data if necessary]
    if (need_reorder_src) {
        auto reorder_src = reorder(src_mem, conv_src_mem);
        reorder_src.execute(
                s, {{DNNL_ARG_FROM, src_mem}, {DNNL_ARG_TO, conv_src_mem}});
        s.wait(); // wait for the reorder to complete
    }

    if (need_reorder_weights) {
        auto reorder_weights = reorder(weights_mem, conv_weights_mem);
        reorder_weights.execute(s,
                {{DNNL_ARG_FROM, weights_mem},
                        {DNNL_ARG_TO, conv_weights_mem}});
        s.wait(); // wait for the reorder to complete
    }
    // [Perform reorders for source data if necessary]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub6 Create and execute convolution and pooling primitives
    ///
    /// After the reorders, we are now ready to compute convolution and
    /// pooling.
    ///
    /// @snippet memory_format_propagation.cpp Create and execute convolution and pooling primitives
    // [Create and execute convolution and pooling primitives]
    // The primitive descriptors were created with default attributes, so no
    // scratchpad argument is passed to execute().
    auto conv = convolution_forward(conv_pd);
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src_mem}, {DNNL_ARG_WEIGHTS, conv_weights_mem},
                    {DNNL_ARG_DST, conv_dst_mem}});
    auto pool = pooling_forward(pool_pd);
    pool.execute(
            s, {{DNNL_ARG_SRC, conv_dst_mem}, {DNNL_ARG_DST, pool_dst_mem}});
    s.wait();
    // [Create and execute convolution and pooling primitives]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub7 Reorder destination data if necessary
    ///
    /// The only potentially remaining operation is a reorder from the pooling
    /// destination memory object to the user's one.  Similarly to the
    /// reorders for the source and weights memory objects, it is performed
    /// depending on the value of the previously computed flag.
    ///
    /// @snippet memory_format_propagation.cpp Reorder destination data if necessary
    // [Reorder destination data if necessary]
    if (need_reorder_dst) {
        auto reorder_dst = reorder(pool_dst_mem, dst_mem);
        reorder_dst.execute(
                s, {{DNNL_ARG_FROM, pool_dst_mem}, {DNNL_ARG_TO, dst_mem}});
        s.wait();
    }
    // [Reorder destination data if necessary]
}

// Program entry point: determines the engine kind from the command line and
// runs the tutorial through the shared example error-handling wrapper, whose
// result becomes the process exit code.
int main(int argc, char **argv) {
    const auto requested_kind = parse_engine_kind(argc, argv);
    return handle_example_errors(
            memory_format_propagation_tutorial, requested_kind);
}

/// @page memory_format_propagation_cpp
/// @subsection memory_format_propagation_results Results
///
/// Upon compiling and running the example, the output should be just:
///
/// ~~~sh
/// Example passed on CPU.
/// ~~~
///
/// It may be interesting to check what really happens during the run. We can
/// use `ONEDNN_VERBOSE` environment variable for that (see also @ref
/// dev_guide_verbose). Here's an example output:
///
/// ~~~sh
/// $ ONEDNN_VERBOSE=1 ./memory-format-propagation-cpp
/// onednn_verbose,v0,info,oneDNN <ver> (Git Hash <hash>)
/// onednn_verbose,v0,info,cpu,runtime:OpenMP
/// onednn_verbose,v0,info,cpu,isa:Intel AVX2
/// onednn_verbose,v0,info,gpu,runtime:none
/// onednn_verbose,v0,exec,cpu,reorder,jit:uni,undef,
///     src_f32::blocked:abcd:f0 dst_f32::blocked:aBcd8b:f0,,,1x128x14x14,0.326904
/// onednn_verbose,v0,exec,cpu,reorder,jit:uni,undef,
///     src_f32::blocked:abcd:f0 dst_f32::blocked:ABcd8b8a:f0,,,256x128x3x3,0.244141
/// onednn_verbose,v0,exec,cpu,convolution,jit:avx2,forward_inference,
///     src_f32::blocked:aBcd8b:f0 wei_f32::blocked:ABcd8b8a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd8b:f0,,
///     alg:convolution_direct,mb1_ic128oc256_ih14oh14kh3sh1dh0ph1_iw14ow14kw3sw1dw0pw1,1.20312
/// onednn_verbose,v0,exec,cpu,pooling,jit:avx,forward_inference,
///     src_f32::blocked:aBcd8b:f0 dst_f32::blocked:aBcd8b:f0 ws_undef::undef::f0,,
///     alg:pooling_max,mb1ic256_ih14oh14kh3sh1ph1_iw14ow14kw3sw1pw1,0.187012
/// onednn_verbose,v0,exec,cpu,reorder,jit:uni,undef,
///     src_f32::blocked:aBcd8b:f0 dst_f32::blocked:abcd:f0,,,1x256x14x14,0.0419922
/// Example passed on CPU.
/// ~~~
///
/// From this output we can deduce that:
/// * The convolution primitive picked up @ref
///   dnnl::memory::format_tag::aBcd8b optimized memory format for
///   activations. In this format the channels dimension (denoted by letter B
///   since it is the second dimension; see also @ref dev_guide_conventions)
///   is blocked by a factor of 8. Because this memory format is different
///   from the NCHW format the tutorial uses, the source and destination had
///   to be reordered to and from this optimized memory layout.
/// * The convolution primitive picked up @ref
///   dnnl::memory::format_tag::ABcd8b8a optimized memory format (output (A)
///   and input (B) channel dimensions blocked by 8) which we also had to
///   reorder the initial weights to since they are in the OIHW memory format.