api: adjust header structure to fulfil oneAPI requirements. Introduce API v2.

2025-10-20 18:43:49 +08:00 · 2020-11-21 06:24:33 -08:00
parent 0ac9e76eec
commit f1aa540beb
321 changed files with 18607 additions and 19002 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -141,8 +141,8 @@ enable_testing()
 include_directories(include)

 configure_file(
-    "${PROJECT_SOURCE_DIR}/include/dnnl_config.h.in"
-    "${PROJECT_BINARY_DIR}/include/dnnl_config.h"
+    "${PROJECT_SOURCE_DIR}/include/oneapi/dnnl/dnnl_config.h.in"
+    "${PROJECT_BINARY_DIR}/include/oneapi/dnnl/dnnl_config.h"
 )
 include_directories(${PROJECT_BINARY_DIR}/include)

--- a/cmake/Doxygen.cmake
+++ b/cmake/Doxygen.cmake
@ -47,8 +47,8 @@ if(DOXYGEN_FOUND)
        DESTINATION ${DOXYGEN_OUTPUT_DIR}/html/assets/mathjax/config/
        )
    file(GLOB_RECURSE HEADERS
-        ${PROJECT_SOURCE_DIR}/include/*.h
-        ${PROJECT_SOURCE_DIR}/include/*.hpp
+        ${PROJECT_SOURCE_DIR}/include/oneapi/dnnl/*.h
+        ${PROJECT_SOURCE_DIR}/include/oneapi/dnnl/*.hpp
        )
    file(GLOB_RECURSE DOX
        ${PROJECT_SOURCE_DIR}/doc/*
--- a/cmake/OpenCL.cmake
+++ b/cmake/OpenCL.cmake
@ -25,7 +25,6 @@ set(OpenCL_cmake_included true)
 if(DNNL_GPU_RUNTIME STREQUAL "OCL")
    message(STATUS "GPU support is enabled (OpenCL)")
 else()
-    message(STATUS "GPU support is disabled")
    return()
 endif()

--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@ -41,8 +41,8 @@ if(NOT GIT_FOUND OR RESULT)
 endif()

 configure_file(
-    "${PROJECT_SOURCE_DIR}/include/dnnl_version.h.in"
-    "${PROJECT_BINARY_DIR}/include/dnnl_version.h"
+    "${PROJECT_SOURCE_DIR}/include/oneapi/dnnl/dnnl_version.h.in"
+    "${PROJECT_BINARY_DIR}/include/oneapi/dnnl/dnnl_version.h"
 )

 if(WIN32)
--- a/doc/advanced/design/understanding_memory_formats.md
+++ b/doc/advanced/design/understanding_memory_formats.md
@ -115,9 +115,9 @@ in this example.

 One can create memory with **NCHW** data layout using
 #dnnl_nchw of the enum type #dnnl_format_tag_t defined in
-[dnnl_types.h](https://github.com/oneapi-src/oneDNN/blob/master/include/dnnl_types.h)
+[dnnl_types.h](https://github.com/oneapi-src/oneDNN/blob/master/include/oneapi/dnnl/dnnl_types.h)
 for the C API, and dnnl::memory::format_tag::nchw defined in
-[dnnl.hpp](https://github.com/oneapi-src/oneDNN/blob/master/include/dnnl.hpp)
+[dnnl.hpp](https://github.com/oneapi-src/oneDNN/blob/master/include/oneapi/dnnl/dnnl.hpp)
 for the C++ API.


--- a/doc/advanced/opencl_interoperability.md
+++ b/doc/advanced/opencl_interoperability.md
@ -1,17 +1,21 @@
 OpenCL Interoperability {#dev_guide_opencl_interoperability}
-===============================================================
+============================================================

-oneDNN uses the OpenCL runtime for GPU engines to interact with the GPU.
-Users may need to use oneDNN with other code that uses OpenCL. For that
-purpose, the library provides API extensions to interoperate with underlying
-OpenCL objects.
+> [API Reference](@ref dnnl_api_ocl_interop)
+
+## Overview
+
+oneDNN uses the OpenCL runtime for GPU engines to interact with the GPU. Users
+may need to use oneDNN with other code that uses OpenCL. For that purpose, the
+library provides API extensions to interoperate with underlying OpenCL objects.
+This interoperability API is defined in the `dnnl_ocl.hpp` header.

 The interoperability API is provided for two scenarios:
 - Construction of oneDNN objects based on existing OpenCL objects
 - Accessing OpenCL objects for existing oneDNN objects

-The mapping between oneDNN and OpenCL objects is provided in the
-following table:
+The mapping between oneDNN and OpenCL objects is provided in the following
+table:

 | oneDNN object        | OpenCL object(s)                |
 | :------------------- | :------------------------------ |
@ -19,46 +23,20 @@ following table:
 | Stream               | `cl_command_queue`              |
 | Memory               | `cl_mem`                        |

-## C++ API Extensions for Interoperability with OpenCL
+The table below summarizes how to construct oneDNN objects based on OpenCL
+objects and how to query underlying OpenCL objects for existing oneDNN objects.

-### API to Construct oneDNN Objects
+| oneDNN object | API to construct oneDNN object                                   | API to access OpenCL object(s)                                                                    |
+| :------------ | :--------------------------------------------------------------- | :------------------------------------------------------------------------------------------------ |
+| Engine        | dnnl::ocl_interop::make_engine(cl_device_id, cl_context)         | dnnl::ocl_interop::get_device(const engine &) <br> dnnl::ocl_interop::get_context(const engine &) |
+| Stream        | dnnl::ocl_interop::make_stream(const engine &, cl_command_queue) | dnnl::ocl_interop::get_command_queue(const stream &)                                              |
+| Memory        | dnnl::memory(const memory::desc &, const engine &, cl_mem)       | dnnl::ocl_interop::get_mem_object(const memory &)                                                 |

-| oneDNN object        | API to construct oneDNN object                                   |
-| :------------------- | :--------------------------------------------------------------- |
-| Engine               | [dnnl::engine(kind, ocl_dev, ocl_ctx)](@ref dnnl::engine)        |
-| Stream               | [dnnl::stream(engine, ocl_queue)](@ref dnnl::stream)             |
-| Memory               | [dnnl::memory(memory_desc, engine, ocl_mem)](@ref dnnl::memory)  |
-
-@note oneDNN follows retain/release OpenCL semantics when using OpenCL
-objects during construction. An OpenCL object is retained on construction and
-released on destruction - that ensures that the OpenCL object will not be
-destroyed while the oneDNN object stores a reference to it.
-
-### API to Access OpenCL Objects
-
-| oneDNN object        | API to access OpenCL object(s)                                     |
-| :------------------- | :----------------------------------------------------------------- |
-| Engine               | dnnl::engine::get_ocl_device() and dnnl::engine::get_ocl_context() |
-| Stream               | dnnl::stream::get_ocl_command_queue()                              |
-| Memory               | dnnl::memory::get_ocl_mem_object()                                 |
+@note oneDNN follows retain/release OpenCL semantics when using OpenCL objects
+during construction. An OpenCL object is retained on construction and released
+on destruction. This ensures that the OpenCL object will not be destroyed while
+the oneDNN object stores a reference to it.

@note The access interfaces do not retain the OpenCL object. It is the user's
 responsibility to retain the returned OpenCL object if necessary.

-## C API Extensions for Interoperability with OpenCL
-
-### API to Construct oneDNN Objects
-
-| oneDNN object        | API to construct oneDNN object                                                         |
-| :------------------- | :------------------------------------------------------------------------------------- |
-| Engine               | [dnnl_engine_create_ocl(&engine, kind, ocl_dev, ocl_ctx)](@ref dnnl_engine_create_ocl) |
-| Stream               | [dnnl_stream_create_ocl(&stream, engine, ocl_queue)](@ref dnnl_stream_create_ocl)      |
-| Memory               | [dnnl_memory_create(&memory, memory_desc, engine, &ocl_mem)](@ref dnnl_memory_create)  |
-
-### API to Access OpenCL Objects
-
-| oneDNN object        | API to access OpenCL object(s)                                 |
-| :------------------- | :------------------------------------------------------------- |
-| Engine               | dnnl_engine_get_ocl_device() and dnnl_engine_get_ocl_context() |
-| Stream               | dnnl_stream_get_ocl_command_queue()                            |
-| Memory               | dnnl_memory_get_ocl_mem_object()                               |
--- a/doc/advanced/threadpool.md
+++ b/doc/advanced/threadpool.md
@ -13,7 +13,7 @@ for testing (see `tests/testing_threadpool.hpp`).
 ~~~cpp
 #include "dnnl_threadpool_iface.hpp"

-class threadpool : public dnnl::threadpool_iface {
+class threadpool : public dnnl::threadpool_interop::threadpool_iface {
 private:
    // Change to Eigen::NonBlockingThreadPool if using Eigen <= 3.3.7
    std::unique_ptr<Eigen::ThreadPool> tp_;
--- a/doc/build/link.md
+++ b/doc/build/link.md
@ -7,13 +7,13 @@ on how oneDNN was built.

 ## Header Files

-| File                   | Description
-| :---                   | :---
-| include/dnnl.h         | C header
-| include/dnnl.hpp       | C++ header
-| include/dnnl_types.h   | Auxiliary C header
-| include/dnnl_config.h  | Auxiliary C header
-| include/dnnl_version.h | C header with version information
+| File                               | Description
+| :---                               | :---
+| include/oneapi/dnnl/dnnl.h         | C header
+| include/oneapi/dnnl/dnnl.hpp       | C++ header
+| include/oneapi/dnnl/dnnl_types.h   | Auxiliary C header
+| include/oneapi/dnnl/dnnl_config.h  | Auxiliary C header
+| include/oneapi/dnnl/dnnl_version.h | C header with version information

 ## Libraries

--- a/examples/cnn_inference_f32.c
+++ b/examples/cnn_inference_f32.c
@ -45,7 +45,7 @@
 #include <stdlib.h>
 #include <string.h>

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "example_utils.h"

--- a/examples/cnn_inference_f32.cpp
+++ b/examples/cnn_inference_f32.cpp
@ -44,7 +44,7 @@
 #include <vector>
 #include <unordered_map>

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/examples/cnn_inference_int8.cpp
+++ b/examples/cnn_inference_int8.cpp
@ -26,7 +26,7 @@

 #include <stdexcept>

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/examples/cnn_training_bf16.cpp
+++ b/examples/cnn_training_bf16.cpp
@ -30,7 +30,7 @@
 #include <iostream>
 #include <stdexcept>

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/examples/cnn_training_f32.cpp
+++ b/examples/cnn_training_f32.cpp
@ -27,7 +27,7 @@

 #include <math.h>

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/examples/cpu_cnn_training_f32.c
+++ b/examples/cpu_cnn_training_f32.c
@ -30,7 +30,7 @@
 #include <stdlib.h>
 #include <string.h>

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "example_utils.h"

--- a/examples/cpu_rnn_inference_f32.cpp
+++ b/examples/cpu_rnn_inference_f32.cpp
@ -41,7 +41,7 @@
 #include <numeric>
 #include <string>

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/examples/cpu_rnn_inference_int8.cpp
+++ b/examples/cpu_rnn_inference_int8.cpp
@ -41,7 +41,7 @@
 #include <numeric>
 #include <string>

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/examples/cross_engine_reorder.c
+++ b/examples/cross_engine_reorder.c
@ -26,7 +26,7 @@
 #include <stdio.h>
 #include <stdlib.h>

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "example_utils.h"

--- a/examples/cross_engine_reorder.cpp
+++ b/examples/cross_engine_reorder.cpp
@ -41,7 +41,7 @

 /// @snippet cross_engine_reorder.cpp Prologue
 // [Prologue]
-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/examples/example_utils.h
+++ b/examples/example_utils.h
@ -26,6 +26,10 @@
 #include "dnnl.h"
 #include "dnnl_debug.h"

+#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
+#include "dnnl_ocl.h"
+#endif
+
 #define COMPLAIN_DNNL_ERROR_AND_EXIT(what, status) \
    do { \
        printf("[%s:%d] `%s` returns oneDNN error: %s.\n", __FILE__, __LINE__, \
@ -42,6 +46,18 @@
        exit(2); \
    } while (0)

+// Returns akind unchanged. If a GPU engine is requested but none is found,
+// prints a notice and exits with status 0.
+static inline dnnl_engine_kind_t validate_engine_kind(
+        dnnl_engine_kind_t akind) {
+    if (akind == dnnl_gpu && !dnnl_engine_get_count(dnnl_gpu)) {
+        printf("Application couldn't find GPU, please run with CPU "
+               "instead.\n");
+        exit(0);
+    }
+    return akind;
+}
+
 #define CHECK(f) \
    do { \
        dnnl_status_t s_ = f; \
@ -51,19 +67,14 @@
 static inline dnnl_engine_kind_t parse_engine_kind(int argc, char **argv) {
    // Returns default engine kind, i.e. CPU, if none given
    if (argc == 1) {
-        return dnnl_cpu;
+        return validate_engine_kind(dnnl_cpu);
    } else if (argc == 2) {
        // Checking the engine type, i.e. CPU or GPU
        char *engine_kind_str = argv[1];
        if (!strcmp(engine_kind_str, "cpu")) {
-            return dnnl_cpu;
+            return validate_engine_kind(dnnl_cpu);
        } else if (!strcmp(engine_kind_str, "gpu")) {
-            // Checking if a GPU exists on the machine
-            if (!dnnl_engine_get_count(dnnl_gpu))
-                COMPLAIN_EXAMPLE_ERROR_AND_EXIT("%s",
-                        "could not find compatible GPU\nPlease run the example "
-                        "with CPU instead");
-            return dnnl_gpu;
+            return validate_engine_kind(dnnl_gpu);
        }
    }

@ -103,14 +114,14 @@ static inline void read_from_dnnl_memory(void *handle, dnnl_memory_t mem) {
        }
    }
 #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
-    else if (eng_kind == dnnl_gpu) {
+    if (eng_kind == dnnl_gpu) {
        dnnl_stream_t s;
        cl_command_queue q;
        cl_mem m;

-        CHECK(dnnl_memory_get_ocl_mem_object(mem, &m));
+        CHECK(dnnl_ocl_interop_memory_get_mem_object(mem, &m));
        CHECK(dnnl_stream_create(&s, eng, dnnl_stream_default_flags));
-        CHECK(dnnl_stream_get_ocl_command_queue(s, &q));
+        CHECK(dnnl_ocl_interop_stream_get_command_queue(s, &q));

        cl_int ret = clEnqueueReadBuffer(
                q, m, CL_TRUE, 0, bytes, handle, 0, NULL, NULL);
@ -121,6 +132,21 @ static inline void read_from_dnnl_memory(void *handle, dnnl_memory_t mem) {
        dnnl_stream_destroy(s);
+        // GPU read is complete; do not fall through to the assert below.
+        return;
    }
 #endif
+
+    if (eng_kind == dnnl_cpu) {
+        void *ptr = NULL;
+        CHECK(dnnl_memory_get_data_handle(mem, &ptr));
+        if (ptr) {
+            for (size_t i = 0; i < bytes; ++i) {
+                ((char *)handle)[i] = ((char *)ptr)[i];
+            }
+        }
+        return;
+    }
+
+    assert(!"not expected");
 }

 // Read from handle, write to memory
@ -146,14 +170,14 @@ static inline void write_to_dnnl_memory(void *handle, dnnl_memory_t mem) {
        }
    }
 #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
-    else if (eng_kind == dnnl_gpu) {
+    if (eng_kind == dnnl_gpu) {
        dnnl_stream_t s;
        cl_command_queue q;
        cl_mem m;

-        CHECK(dnnl_memory_get_ocl_mem_object(mem, &m));
+        CHECK(dnnl_ocl_interop_memory_get_mem_object(mem, &m));
        CHECK(dnnl_stream_create(&s, eng, dnnl_stream_default_flags));
-        CHECK(dnnl_stream_get_ocl_command_queue(s, &q));
+        CHECK(dnnl_ocl_interop_stream_get_command_queue(s, &q));

        cl_int ret = clEnqueueWriteBuffer(
                q, m, CL_TRUE, 0, bytes, handle, 0, NULL, NULL);
@ -162,8 +186,22 @@ static inline void write_to_dnnl_memory(void *handle, dnnl_memory_t mem) {
                    "clEnqueueWriteBuffer failed (status code: %d)", ret);

        dnnl_stream_destroy(s);
+        return;
    }
 #endif
+
+    if (eng_kind == dnnl_cpu) {
+        void *ptr = NULL;
+        CHECK(dnnl_memory_get_data_handle(mem, &ptr));
+        if (ptr) {
+            for (size_t i = 0; i < bytes; ++i) {
+                ((char *)ptr)[i] = ((char *)handle)[i];
+            }
+        }
+        return;
+    }
+
+    assert(!"not expected");
 }

 #endif
--- a/examples/example_utils.hpp
+++ b/examples/example_utils.hpp
@ -30,6 +30,10 @@
 #include "dnnl.hpp"
 #include "dnnl_debug.h"

+#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
+#include "dnnl_ocl.hpp"
+#endif
+
 #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP

 #ifdef _MSC_VER
@ -50,6 +54,18 @@
 #define PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(n)
 #endif

+// Returns akind unchanged; if a GPU is requested but none is found, prints a
+// notice and exits with status 0. Inline: defined in a header (ODR-safe).
+inline dnnl::engine::kind validate_engine_kind(dnnl::engine::kind akind) {
+    if (akind == dnnl::engine::kind::gpu
+            && dnnl::engine::get_count(dnnl::engine::kind::gpu) == 0) {
+        std::cout << "Application couldn't find GPU, please run with CPU "
+                     "instead.\n";
+        exit(0);
+    }
+    return akind;
+}
+
 // Exception class to indicate that the example uses a feature that is not
 // available on the current systems. It is not treated as an error then, but
 // just notifies a user.
@ -116,21 +132,14 @@ inline dnnl::engine::kind parse_engine_kind(
        int argc, char **argv, int extra_args = 0) {
    // Returns default engine kind, i.e. CPU, if none given
    if (argc == 1) {
-        return dnnl::engine::kind::cpu;
+        return validate_engine_kind(dnnl::engine::kind::cpu);
    } else if (argc <= extra_args + 2) {
        std::string engine_kind_str = argv[1];
        // Checking the engine type, i.e. CPU or GPU
        if (engine_kind_str == "cpu") {
-            return dnnl::engine::kind::cpu;
+            return validate_engine_kind(dnnl::engine::kind::cpu);
        } else if (engine_kind_str == "gpu") {
-            // Checking if a GPU exists on the machine
-            if (dnnl::engine::get_count(dnnl::engine::kind::gpu) == 0) {
-                std::cout << "Could not find compatible GPU" << std::endl
-                          << "Please run the example with CPU instead"
-                          << std::endl;
-                exit(1);
-            }
-            return dnnl::engine::kind::gpu;
+            return validate_engine_kind(dnnl::engine::kind::gpu);
        }
    }

@ -156,50 +165,59 @ inline dnnl::memory::dim product(const dnnl::memory::dims &dims) {
 // Read from memory, write to handle
 inline void read_from_dnnl_memory(void *handle, dnnl::memory &mem) {
    dnnl::engine eng = mem.get_engine();
-    size_t bytes = mem.get_desc().get_size();
+    size_t size = mem.get_desc().get_size();

-    if (eng.get_kind() == dnnl::engine::kind::cpu) {
-        uint8_t *src = static_cast<uint8_t *>(mem.get_data_handle());
-        for (size_t i = 0; i < bytes; ++i)
-            ((uint8_t *)handle)[i] = src[i];
-    }
 #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
-    else if (eng.get_kind() == dnnl::engine::kind::gpu) {
+    if (eng.get_kind() == dnnl::engine::kind::gpu) {
        dnnl::stream s(eng);
-        cl_command_queue q = s.get_ocl_command_queue();
-        cl_mem m = mem.get_ocl_mem_object();
+        cl_command_queue q = dnnl::ocl_interop::get_command_queue(s);
+        cl_mem m = dnnl::ocl_interop::get_mem_object(mem);

        cl_int ret = clEnqueueReadBuffer(
-                q, m, CL_TRUE, 0, bytes, handle, 0, NULL, NULL);
+                q, m, CL_TRUE, 0, size, handle, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
            throw std::runtime_error("clEnqueueReadBuffer failed.");
+        return;
    }
 #endif
+
+    if (eng.get_kind() == dnnl::engine::kind::cpu) {
+        uint8_t *src = static_cast<uint8_t *>(mem.get_data_handle());
+        for (size_t i = 0; i < size; ++i)
+            ((uint8_t *)handle)[i] = src[i];
+        return;
+    }
+
+    assert(!"not expected");
 }

 // Read from handle, write to memory
 inline void write_to_dnnl_memory(void *handle, dnnl::memory &mem) {
    dnnl::engine eng = mem.get_engine();
-    size_t bytes = mem.get_desc().get_size();
+    size_t size = mem.get_desc().get_size();

-    if (eng.get_kind() == dnnl::engine::kind::cpu) {
-        uint8_t *dst = static_cast<uint8_t *>(mem.get_data_handle());
-        for (size_t i = 0; i < bytes; ++i)
-            dst[i] = ((uint8_t *)handle)[i];
-    }
 #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
-    else if (eng.get_kind() == dnnl::engine::kind::gpu) {
+    if (eng.get_kind() == dnnl::engine::kind::gpu) {
        dnnl::stream s(eng);
-        cl_command_queue q = s.get_ocl_command_queue();
-        cl_mem m = mem.get_ocl_mem_object();
-        size_t bytes = mem.get_desc().get_size();
+        cl_command_queue q = dnnl::ocl_interop::get_command_queue(s);
+        cl_mem m = dnnl::ocl_interop::get_mem_object(mem);

        cl_int ret = clEnqueueWriteBuffer(
-                q, m, CL_TRUE, 0, bytes, handle, 0, NULL, NULL);
+                q, m, CL_TRUE, 0, size, handle, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
            throw std::runtime_error("clEnqueueWriteBuffer failed.");
+        return;
    }
 #endif
+
+    if (eng.get_kind() == dnnl::engine::kind::cpu) {
+        uint8_t *dst = static_cast<uint8_t *>(mem.get_data_handle());
+        for (size_t i = 0; i < size; ++i)
+            dst[i] = ((uint8_t *)handle)[i];
+        return;
+    }
+
+    assert(!"not expected");
 }

 #endif
--- a/examples/getting_started.cpp
+++ b/examples/getting_started.cpp
@ -23,8 +23,8 @@
 #include <stdexcept>
 #include <vector>

-#include "dnnl.hpp"
-#include "dnnl_debug.h"
+#include "oneapi/dnnl/dnnl.hpp"
+#include "oneapi/dnnl/dnnl_debug.h"

 #include "example_utils.hpp"

--- a/examples/gpu_opencl_interop.cpp
+++ b/examples/gpu_opencl_interop.cpp
@ -56,7 +56,8 @@

 #include <CL/cl.h>

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"
+#include "oneapi/dnnl/dnnl_ocl.hpp"

 #include "example_utils.hpp"

@ -174,7 +175,7 @@ void gpu_opencl_interop_tutorial() {
    // [oclkernel create]
    const char *kernel_name = "init";
    cl_kernel ocl_init_kernel = create_init_opencl_kernel(
-            eng.get_ocl_context(), kernel_name, ocl_code);
+            ocl_interop::get_context(eng), kernel_name, ocl_code);
    //  [oclkernel create]

    /// The next step is to execute our OpenCL kernel by setting its arguments
@ -185,10 +186,10 @@ void gpu_opencl_interop_tutorial() {
    /// this queue.
    /// @snippet  gpu_opencl_interop.cpp oclexecution
    // [oclexecution]
-    cl_mem ocl_buf = mem.get_ocl_mem_object();
+    cl_mem ocl_buf = ocl_interop::get_mem_object(mem);
    OCL_CHECK(clSetKernelArg(ocl_init_kernel, 0, sizeof(ocl_buf), &ocl_buf));

-    cl_command_queue ocl_queue = strm.get_ocl_command_queue();
+    cl_command_queue ocl_queue = ocl_interop::get_command_queue(strm);
    OCL_CHECK(clEnqueueNDRangeKernel(ocl_queue, ocl_init_kernel, 1, nullptr, &N,
            nullptr, 0, nullptr, nullptr));
    // [oclexecution]
--- a/examples/memory_format_propagation.cpp
+++ b/examples/memory_format_propagation.cpp
@ -91,7 +91,7 @@
 /// which in turn is called from `main()` which is also responsible for error
 /// handling.

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/examples/performance_profiling.cpp
+++ b/examples/performance_profiling.cpp
@ -92,7 +92,7 @@
 #include <stdexcept>
 #include <vector>

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/examples/primitives/batch_normalization.cpp
+++ b/examples/primitives/batch_normalization.cpp
@ -39,8 +39,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/binary.cpp
+++ b/examples/primitives/binary.cpp
@ -37,8 +37,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/concat.cpp
+++ b/examples/primitives/concat.cpp
@ -38,8 +38,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/convolution.cpp
+++ b/examples/primitives/convolution.cpp
@ -38,8 +38,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/eltwise.cpp
+++ b/examples/primitives/eltwise.cpp
@ -34,8 +34,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/inner_product.cpp
+++ b/examples/primitives/inner_product.cpp
@ -37,8 +37,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/layer_normalization.cpp
+++ b/examples/primitives/layer_normalization.cpp
@ -38,8 +38,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/logsoftmax.cpp
+++ b/examples/primitives/logsoftmax.cpp
@ -38,8 +38,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/lrn.cpp
+++ b/examples/primitives/lrn.cpp
@ -34,8 +34,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/lstm.cpp
+++ b/examples/primitives/lstm.cpp
@ -37,8 +37,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/matmul.cpp
+++ b/examples/primitives/matmul.cpp
@ -36,8 +36,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/pooling.cpp
+++ b/examples/primitives/pooling.cpp
@ -34,8 +34,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/reduction.cpp
+++ b/examples/primitives/reduction.cpp
@ -29,8 +29,8 @@

 #include <cmath>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/reorder.cpp
+++ b/examples/primitives/reorder.cpp
@ -36,8 +36,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/resampling.cpp
+++ b/examples/primitives/resampling.cpp
@ -34,8 +34,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/shuffle.cpp
+++ b/examples/primitives/shuffle.cpp
@ -36,8 +36,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/softmax.cpp
+++ b/examples/primitives/softmax.cpp
@ -38,8 +38,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/primitives/sum.cpp
+++ b/examples/primitives/sum.cpp
@ -36,8 +36,8 @@
 #include <string>
 #include <vector>

-#include "dnnl.hpp"
 #include "example_utils.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 using namespace dnnl;

--- a/examples/rnn_training_f32.cpp
+++ b/examples/rnn_training_f32.cpp
@ -27,7 +27,7 @@
 #include <numeric>
 #include <utility>

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/examples/tutorials/matmul/cpu_matmul_quantization.cpp
+++ b/examples/tutorials/matmul/cpu_matmul_quantization.cpp
@ -121,7 +121,7 @@
 #include <vector>
 #include <type_traits>

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/examples/tutorials/matmul/cpu_sgemm_and_matmul.cpp
+++ b/examples/tutorials/matmul/cpu_sgemm_and_matmul.cpp
@ -61,7 +61,7 @@
 #include <stdexcept>
 #include <vector>

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/examples/tutorials/matmul/inference_int8_matmul.cpp
+++ b/examples/tutorials/matmul/inference_int8_matmul.cpp
@ -62,7 +62,7 @@
 #include <stdexcept>
 #include <vector>

-#include "dnnl.hpp"
+#include "oneapi/dnnl/dnnl.hpp"

 #include "example_utils.hpp"

--- a/include/dnnl.h
+++ b/include/dnnl.h
--- a/include/dnnl.hpp
+++ b/include/dnnl.hpp
--- a/include/dnnl_config.h
+++ b/include/dnnl_config.h
@ -0,0 +1,22 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef DNNL_CONFIG_H
+#define DNNL_CONFIG_H
+
+#include "oneapi/dnnl/dnnl_config.h"
+
+#endif /* DNNL_CONFIG_H */
--- a/include/dnnl_debug.h
+++ b/include/dnnl_debug.h
@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018-2020 Intel Corporation
+* Copyright 2020 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@ -14,60 +14,9 @@
 * limitations under the License.
 *******************************************************************************/

-// DO NOT EDIT, AUTO-GENERATED
-
-// clang-format off
-
 #ifndef DNNL_DEBUG_H
 #define DNNL_DEBUG_H

-/// @file
-/// Debug capabilities
+#include "oneapi/dnnl/dnnl_debug.h"

-#include "dnnl_config.h"
-#include "dnnl_types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-const char DNNL_API *dnnl_status2str(dnnl_status_t v);
-const char DNNL_API *dnnl_dt2str(dnnl_data_type_t v);
-const char DNNL_API *dnnl_fmt_kind2str(dnnl_format_kind_t v);
-const char DNNL_API *dnnl_fmt_tag2str(dnnl_format_tag_t v);
-const char DNNL_API *dnnl_prop_kind2str(dnnl_prop_kind_t v);
-const char DNNL_API *dnnl_prim_kind2str(dnnl_primitive_kind_t v);
-const char DNNL_API *dnnl_alg_kind2str(dnnl_alg_kind_t v);
-const char DNNL_API *dnnl_rnn_flags2str(dnnl_rnn_flags_t v);
-const char DNNL_API *dnnl_rnn_direction2str(dnnl_rnn_direction_t v);
-const char DNNL_API *dnnl_engine_kind2str(dnnl_engine_kind_t v);
-const char DNNL_API *dnnl_scratchpad_mode2str(dnnl_scratchpad_mode_t v);
-const char DNNL_API *dnnl_cpu_isa2str(dnnl_cpu_isa_t v);
-
-const char DNNL_API *dnnl_runtime2str(unsigned v);
-
-/// Forms a format string for a given memory descriptor.
-///
-/// The format is defined as: 'dt:[p|o|0]:fmt_kind:fmt:extra'.
-/// Here:
-///  - dt       -- data type
-///  - p        -- indicates there is non-trivial padding
-///  - o        -- indicates there is non-trivial padding offset
-///  - 0        -- indicates there is non-trivial offset0
-///  - fmt_kind -- format kind (blocked, wino, etc...)
-///  - fmt      -- extended format string (format_kind specific)
-///  - extra    -- shows extra fields (underspecified)
-int DNNL_API dnnl_md2fmt_str(char *fmt_str, size_t fmt_str_len,
-        const dnnl_memory_desc_t *md);
-
-/// Forms a dimension string for a given memory descriptor.
-///
-/// The format is defined as: 'dim0xdim1x...xdimN
-int DNNL_API dnnl_md2dim_str(char *dim_str, size_t dim_str_len,
-        const dnnl_memory_desc_t *md);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
+#endif /* DNNL_DEBUG_H */
--- a/include/dnnl_ocl.h
+++ b/include/dnnl_ocl.h
@ -0,0 +1,22 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef DNNL_OCL_H
+#define DNNL_OCL_H
+
+#include "oneapi/dnnl/dnnl_ocl.h"
+
+#endif /* DNNL_OCL_H */
--- a/include/dnnl_ocl.hpp
+++ b/include/dnnl_ocl.hpp
@ -0,0 +1,22 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef DNNL_OCL_HPP
+#define DNNL_OCL_HPP
+
+#include "oneapi/dnnl/dnnl_ocl.hpp"
+
+#endif /* DNNL_OCL_HPP */
--- a/include/dnnl_threadpool.h
+++ b/include/dnnl_threadpool.h
@ -0,0 +1,22 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef DNNL_THREADPOOL_H
+#define DNNL_THREADPOOL_H
+
+#include "oneapi/dnnl/dnnl_threadpool.h"
+
+#endif /* DNNL_THREADPOOL_H */
--- a/include/dnnl_threadpool.hpp
+++ b/include/dnnl_threadpool.hpp
@ -0,0 +1,22 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef DNNL_THREADPOOL_HPP
+#define DNNL_THREADPOOL_HPP
+
+#include "oneapi/dnnl/dnnl_threadpool.hpp"
+
+#endif /* DNNL_THREADPOOL_HPP */
--- a/include/dnnl_threadpool_iface.hpp
+++ b/include/dnnl_threadpool_iface.hpp
@ -17,37 +17,6 @@
 #ifndef DNNL_THREADPOOL_IFACE_HPP
 #define DNNL_THREADPOOL_IFACE_HPP

-#include <functional>
+#include "oneapi/dnnl/dnnl_threadpool_iface.hpp"

-namespace dnnl {
-
-/// Abstract threadpool interface. The users are expected to subclass this
-/// interface and pass an object to the library during CPU stream creation or
-/// directly in case of BLAS functions.
-struct threadpool_iface {
-    /// Returns the number of worker threads.
-    virtual int get_num_threads() const = 0;
-
-    /// Returns true if the calling thread belongs to this threadpool.
-    virtual bool get_in_parallel() const = 0;
-
-    /// Submits n instances of a closure for execution in parallel:
-    ///
-    /// for (int i = 0; i < n; i++) fn(i, n);
-    ///
-    virtual void parallel_for(int n, const std::function<void(int, int)> &fn)
-            = 0;
-
-    /// Returns threadpool behavior flags bit mask (see below).
-    virtual uint64_t get_flags() const = 0;
-
-    /// If set, parallel_for() returns immediately and oneDNN needs implement
-    /// waiting for the submitted closures to finish execution on its own.
-    static constexpr uint64_t ASYNCHRONOUS = 1;
-
-    virtual ~threadpool_iface() {}
-};
-
-} // namespace dnnl
-
-#endif
+#endif /* DNNL_THREADPOOL_IFACE_HPP */
--- a/include/dnnl_types.h
+++ b/include/dnnl_types.h
--- a/include/dnnl_version.h
+++ b/include/dnnl_version.h
@ -0,0 +1,22 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef DNNL_VERSION_H
+#define DNNL_VERSION_H
+
+#include "oneapi/dnnl/dnnl_version.h"
+
+#endif /* DNNL_VERSION_H */
--- a/include/mkldnn_dnnl_mangling.h
+++ b/include/mkldnn_dnnl_mangling.h
@ -85,6 +85,8 @@
 #define MKLDNN_RUNTIME_OMP DNNL_RUNTIME_OMP
 #define MKLDNN_RUNTIME_SEQ DNNL_RUNTIME_SEQ
 #define MKLDNN_RUNTIME_TBB DNNL_RUNTIME_TBB
+#define MKLDNN_RUNTIME_SYCL DNNL_RUNTIME_SYCL
+#define MKLDNN_WITH_SYCL DNNL_WITH_SYCL
 #define MKLDNN_VERBOSE DNNL_VERBOSE
 #define MKLDNN_VERSION_HASH DNNL_VERSION_HASH
 #define MKLDNN_VERSION_MAJOR DNNL_VERSION_MAJOR
@ -388,12 +390,12 @@
 #define mkldnn_eltwise_tanh dnnl_eltwise_tanh
 #define mkldnn_engine dnnl_engine
 #define mkldnn_engine_create dnnl_engine_create
-#define mkldnn_engine_create_ocl dnnl_engine_create_ocl
+#define mkldnn_engine_create_ocl dnnl_ocl_interop_engine_create
 #define mkldnn_engine_destroy dnnl_engine_destroy
 #define mkldnn_engine_get_count dnnl_engine_get_count
 #define mkldnn_engine_get_kind dnnl_engine_get_kind
-#define mkldnn_engine_get_ocl_context dnnl_engine_get_ocl_context
-#define mkldnn_engine_get_ocl_device dnnl_engine_get_ocl_device
+#define mkldnn_engine_get_ocl_context dnnl_ocl_interop_engine_get_context
+#define mkldnn_engine_get_ocl_device dnnl_ocl_interop_get_device
 #define mkldnn_engine_kind2str dnnl_engine_kind2str
 #define mkldnn_engine_kind_t dnnl_engine_kind_t
 #define mkldnn_engine_t dnnl_engine_t
@ -542,10 +544,10 @@
 #define mkldnn_memory_get_data_handle dnnl_memory_get_data_handle
 #define mkldnn_memory_get_engine dnnl_memory_get_engine
 #define mkldnn_memory_get_memory_desc dnnl_memory_get_memory_desc
-#define mkldnn_memory_get_ocl_mem_object dnnl_memory_get_ocl_mem_object
+#define mkldnn_memory_get_ocl_mem_object dnnl_ocl_interop_memory_get_mem_object
 #define mkldnn_memory_map_data dnnl_memory_map_data
 #define mkldnn_memory_set_data_handle dnnl_memory_set_data_handle
-#define mkldnn_memory_set_ocl_mem_object dnnl_memory_set_ocl_mem_object
+#define mkldnn_memory_set_ocl_mem_object dnnl_ocl_interop_memory_set_mem_object
 #define mkldnn_memory_t dnnl_memory_t
 #define mkldnn_memory_unmap_data dnnl_memory_unmap_data
 #define mkldnn_nCdhw16c dnnl_nCdhw16c
@ -711,12 +713,12 @@
 #define mkldnn_status_t dnnl_status_t
 #define mkldnn_stream dnnl_stream
 #define mkldnn_stream_create dnnl_stream_create
-#define mkldnn_stream_create_ocl dnnl_stream_create_ocl
+#define mkldnn_stream_create_ocl dnnl_ocl_interop_stream_create
 #define mkldnn_stream_default_flags dnnl_stream_default_flags
-#define mkldnn_stream_default_order dnnl_stream_default_order
 #define mkldnn_stream_destroy dnnl_stream_destroy
 #define mkldnn_stream_flags_t dnnl_stream_flags_t
-#define mkldnn_stream_get_ocl_command_queue dnnl_stream_get_ocl_command_queue
+#define mkldnn_stream_get_ocl_command_queue \
+    dnnl_ocl_interop_stream_get_command_queue
 #define mkldnn_stream_in_order dnnl_stream_in_order
 #define mkldnn_stream_out_of_order dnnl_stream_out_of_order
 #define mkldnn_stream_t dnnl_stream_t
--- a/include/oneapi/dnnl/dnnl.h
+++ b/include/oneapi/dnnl/dnnl.h
--- a/include/oneapi/dnnl/dnnl.hpp
+++ b/include/oneapi/dnnl/dnnl.hpp
--- a/include/oneapi/dnnl/dnnl_config.h.in
+++ b/include/oneapi/dnnl/dnnl_config.h.in
@ -14,10 +14,10 @@
 * limitations under the License.
 *******************************************************************************/

-#ifndef DNNL_CONFIG_H
-#define DNNL_CONFIG_H
+#ifndef ONEAPI_DNNL_DNNL_CONFIG_H
+#define ONEAPI_DNNL_DNNL_CONFIG_H

-#include "dnnl_types.h"
+#include "oneapi/dnnl/dnnl_types.h"

 /// @cond DO_NOT_DOCUMENT_THIS

@ -74,11 +74,40 @@
 #error "Unexpected DNNL_CPU_RUNTIME"
 #endif
 #if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
-        && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_OCL)
+        && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_OCL) \
+        && (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL)
 #error "Unexpected DNNL_GPU_RUNTIME"
 #endif
 #else
 #error "BOTH DNNL_CPU_RUNTIME and DNNL_GPU_RUNTIME must be defined"
 #endif

+#if (DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL) \
+        || (DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL)
+#define DNNL_WITH_SYCL 1
+#else
+#define DNNL_WITH_SYCL 0
+#endif
+
+// For SYCL CPU, a primitive may be created and executed in different threads
+// hence the global scratchpad does not work. This enables concurrent execution
+// when CPU runtime is SYCL to avoid the issue.
+#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL
+#ifndef DNNL_ENABLE_CONCURRENT_EXEC
+#define DNNL_ENABLE_CONCURRENT_EXEC
+#endif
+#endif
+
+// When defined, oneAPI DPC++ Compiler is used.
+#cmakedefine DNNL_SYCL_DPCPP
+
+// When defined, ComputeCpp SYCL is used.
+#cmakedefine DNNL_SYCL_COMPUTECPP
+
+// When defined, Level Zero is supported.
+#cmakedefine DNNL_WITH_LEVEL_ZERO
+
+// When defined, SYCL CUDA backend is used.
+#cmakedefine DNNL_SYCL_CUDA
+
 #endif
--- a/include/oneapi/dnnl/dnnl_debug.h
+++ b/include/oneapi/dnnl/dnnl_debug.h
@ -0,0 +1,73 @@
+/*******************************************************************************
+* Copyright 2018-2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+// DO NOT EDIT, AUTO-GENERATED
+
+// clang-format off
+
+#ifndef ONEAPI_DNNL_DNNL_DEBUG_H
+#define ONEAPI_DNNL_DNNL_DEBUG_H
+
+/// @file
+/// Debug capabilities
+
+#include "oneapi/dnnl/dnnl_config.h"
+#include "oneapi/dnnl/dnnl_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+const char DNNL_API *dnnl_status2str(dnnl_status_t v);
+const char DNNL_API *dnnl_dt2str(dnnl_data_type_t v);
+const char DNNL_API *dnnl_fmt_kind2str(dnnl_format_kind_t v);
+const char DNNL_API *dnnl_fmt_tag2str(dnnl_format_tag_t v);
+const char DNNL_API *dnnl_prop_kind2str(dnnl_prop_kind_t v);
+const char DNNL_API *dnnl_prim_kind2str(dnnl_primitive_kind_t v);
+const char DNNL_API *dnnl_alg_kind2str(dnnl_alg_kind_t v);
+const char DNNL_API *dnnl_rnn_flags2str(dnnl_rnn_flags_t v);
+const char DNNL_API *dnnl_rnn_direction2str(dnnl_rnn_direction_t v);
+const char DNNL_API *dnnl_engine_kind2str(dnnl_engine_kind_t v);
+const char DNNL_API *dnnl_scratchpad_mode2str(dnnl_scratchpad_mode_t v);
+const char DNNL_API *dnnl_cpu_isa2str(dnnl_cpu_isa_t v);
+
+const char DNNL_API *dnnl_runtime2str(unsigned v);
+
+/// Forms a format string for a given memory descriptor.
+///
+/// The format is defined as: 'dt:[p|o|0]:fmt_kind:fmt:extra'.
+/// Here:
+///  - dt       -- data type
+///  - p        -- indicates there is non-trivial padding
+///  - o        -- indicates there is non-trivial padding offset
+///  - 0        -- indicates there is non-trivial offset0
+///  - fmt_kind -- format kind (blocked, wino, etc...)
+///  - fmt      -- extended format string (format_kind specific)
+///  - extra    -- shows extra fields (underspecified)
+int DNNL_API dnnl_md2fmt_str(char *fmt_str, size_t fmt_str_len,
+        const dnnl_memory_desc_t *md);
+
+/// Forms a dimension string for a given memory descriptor.
+///
+/// The format is defined as: 'dim0xdim1x...xdimN'.
+int DNNL_API dnnl_md2dim_str(char *dim_str, size_t dim_str_len,
+        const dnnl_memory_desc_t *md);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/include/oneapi/dnnl/dnnl_ocl.h
+++ b/include/oneapi/dnnl/dnnl_ocl.h
@ -0,0 +1,122 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_OCL_H
+#define ONEAPI_DNNL_DNNL_OCL_H
+
+#include "oneapi/dnnl/dnnl.h"
+
+/// @cond DO_NOT_DOCUMENT_THIS
+// Set target version for OpenCL explicitly to suppress a compiler warning.
+#ifndef CL_TARGET_OPENCL_VERSION
+#define CL_TARGET_OPENCL_VERSION 120
+#endif
+
+#include <CL/cl.h>
+/// @endcond
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_ocl_interop
+/// @{
+
+/// Returns an OpenCL memory object associated with a memory object.
+///
+/// @param memory Memory object.
+/// @param mem_object Output OpenCL memory object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_get_mem_object(
+        const_dnnl_memory_t memory, cl_mem *mem_object);
+
+/// Sets OpenCL memory object associated with a memory object.
+///
+/// For behavioral details, see dnnl_memory_set_data_handle().
+///
+/// @param memory Memory object.
+/// @param mem_object OpenCL memory object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_set_mem_object(
+        dnnl_memory_t memory, cl_mem mem_object);
+
+/// Creates an engine associated with an OpenCL device and an OpenCL context.
+///
+/// @param engine Output engine.
+/// @param device Underlying OpenCL device to use for the engine.
+/// @param context Underlying OpenCL context to use for the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create(
+        dnnl_engine_t *engine, cl_device_id device, cl_context context);
+
+/// Returns the OpenCL context associated with an engine.
+///
+/// @param engine Engine to query.
+/// @param context Output underlying OpenCL context of the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_context(
+        dnnl_engine_t engine, cl_context *context);
+
+/// Returns the OpenCL device associated with an engine.
+///
+/// @param engine Engine to query.
+/// @param device Output underlying OpenCL device of the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_get_device(
+        dnnl_engine_t engine, cl_device_id *device);
+
+/// Creates an execution stream for a given engine associated with
+/// an OpenCL command queue.
+///
+/// @param stream Output execution stream.
+/// @param engine Engine to create the execution stream on.
+/// @param queue OpenCL command queue to use.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_stream_create(
+        dnnl_stream_t *stream, dnnl_engine_t engine, cl_command_queue queue);
+
+/// Returns the OpenCL command queue associated with an execution stream.
+///
+/// @param stream Execution stream to query.
+/// @param queue Output OpenCL command queue.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_stream_get_command_queue(
+        dnnl_stream_t stream, cl_command_queue *queue);
+
+/// @} dnnl_api_ocl_interop
+
+/// @} dnnl_api_interop
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/include/oneapi/dnnl/dnnl_ocl.hpp
+++ b/include/oneapi/dnnl/dnnl_ocl.hpp
@ -0,0 +1,152 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_OCL_HPP
+#define ONEAPI_DNNL_DNNL_OCL_HPP
+
+#include "oneapi/dnnl/dnnl.hpp"
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <algorithm>
+#include <cstdlib>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include "oneapi/dnnl/dnnl_ocl.h"
+
+#include <CL/cl.h>
+/// @endcond
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_api_interop Runtime interoperability API
+/// API extensions to interact with the underlying run-time.
+/// @{
+
+/// @addtogroup dnnl_api_ocl_interop OpenCL interoperability API
+/// API extensions to interact with the underlying OpenCL run-time.
+///
+/// @sa @ref dev_guide_opencl_interoperability in developer guide
+/// @{
+
+/// OpenCL interoperability namespace
+namespace ocl_interop {
+
+/// Constructs an engine from OpenCL device and context objects.
+///
+/// @param device The OpenCL device that this engine will encapsulate.
+/// @param context The OpenCL context (containing the device) that this
+///     engine will use for all operations.
+/// @returns An engine.
+inline engine make_engine(cl_device_id device, cl_context context) {
+    dnnl_engine_t c_engine;
+    error::wrap_c_api(
+            dnnl_ocl_interop_engine_create(&c_engine, device, context),
+            "could not create an engine");
+    return engine(c_engine);
+}
+
+/// Returns OpenCL context associated with the engine.
+///
+/// @param aengine An engine.
+/// @returns Underlying OpenCL context.
+inline cl_context get_context(const engine &aengine) {
+    cl_context context = nullptr;
+    error::wrap_c_api(
+            dnnl_ocl_interop_engine_get_context(aengine.get(), &context),
+            "could not get an OpenCL context from an engine");
+    return context;
+}
+
+/// Returns OpenCL device associated with the engine.
+///
+/// @param aengine An engine.
+/// @returns Underlying OpenCL device.
+inline cl_device_id get_device(const engine &aengine) {
+    cl_device_id device = nullptr;
+    error::wrap_c_api(dnnl_ocl_interop_get_device(aengine.get(), &device),
+            "could not get an OpenCL device from an engine");
+    return device;
+}
+
+/// Constructs an execution stream for the specified engine and OpenCL queue.
+///
+/// @param aengine Engine to create the stream on.
+/// @param queue OpenCL queue to use for the stream.
+/// @returns An execution stream.
+inline stream make_stream(const engine &aengine, cl_command_queue queue) {
+    dnnl_stream_t c_stream;
+    error::wrap_c_api(
+            dnnl_ocl_interop_stream_create(&c_stream, aengine.get(), queue),
+            "could not create a stream");
+    return stream(c_stream);
+}
+
+/// Returns OpenCL queue object associated with the execution stream.
+///
+/// @param astream An execution stream.
+/// @returns Underlying OpenCL queue.
+inline cl_command_queue get_command_queue(const stream &astream) {
+    cl_command_queue queue = nullptr;
+    error::wrap_c_api(
+            dnnl_ocl_interop_stream_get_command_queue(astream.get(), &queue),
+            "could not get an OpenCL command queue from a stream");
+    return queue;
+}
+
+/// Returns the OpenCL memory object associated with the memory object.
+///
+/// @param amemory A memory object.
+/// @returns Underlying OpenCL memory object.
+inline cl_mem get_mem_object(const memory &amemory) {
+    cl_mem mem_object = nullptr; // same null start as the other getters
+    error::wrap_c_api(
+            dnnl_ocl_interop_memory_get_mem_object(amemory.get(), &mem_object),
+            "could not get OpenCL buffer object from a memory object");
+    return mem_object;
+}
+
+/// Sets the OpenCL memory object associated with the memory object.
+///
+/// For behavioral details see memory::set_data_handle().
+///
+/// @param amemory A memory object.
+/// @param mem_object OpenCL cl_mem object to use as the underlying
+///     storage. It must have at least get_desc().get_size() bytes
+///     allocated.
+inline void set_mem_object(memory &amemory, cl_mem mem_object) {
+    error::wrap_c_api(
+            dnnl_ocl_interop_memory_set_mem_object(amemory.get(), mem_object),
+            "could not set OpenCL buffer object from a memory object");
+}
+
+} // namespace ocl_interop
+
+/// @} dnnl_api_ocl_interop
+
+/// @} dnnl_api_interop
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
--- a/include/oneapi/dnnl/dnnl_threadpool.h
+++ b/include/oneapi/dnnl/dnnl_threadpool.h
@ -0,0 +1,98 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_THREADPOOL_H
+#define ONEAPI_DNNL_DNNL_THREADPOOL_H
+
+#include "oneapi/dnnl/dnnl_config.h"
+#include "oneapi/dnnl/dnnl_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_threadpool_interop
+/// @{
+
+/// Creates an execution stream with specified threadpool.
+///
+/// @sa @ref dev_guide_threadpool
+///
+/// @param stream Output execution stream.
+/// @param engine Engine to create the execution stream on.
+/// @param threadpool Pointer to an instance of a C++ class that implements
+///     dnnl::threadpool_interop::threadpool_iface interface.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_threadpool_interop_stream_create(
+        dnnl_stream_t *stream, dnnl_engine_t engine, void *threadpool);
+
+/// Returns a threadpool to be used by the execution stream.
+///
+/// @sa @ref dev_guide_threadpool
+///
+/// @param astream Execution stream.
+/// @param threadpool Output pointer to an instance of a C++ class that
+///     implements the dnnl::threadpool_interop::threadpool_iface interface.
+///     Set to NULL if the stream was created without threadpool.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_threadpool_interop_stream_get_threadpool(
+        dnnl_stream_t astream, void **threadpool);
+
+/// @copydoc dnnl_sgemm()
+/// @param threadpool A pointer to a threadpool interface (only when built with
+///     the THREADPOOL CPU runtime).
+dnnl_status_t DNNL_API dnnl_threadpool_interop_sgemm(char transa, char transb,
+        dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, float alpha, const float *A,
+        dnnl_dim_t lda, const float *B, dnnl_dim_t ldb, float beta, float *C,
+        dnnl_dim_t ldc, void *threadpool);
+
+/// @copydoc dnnl_gemm_u8s8s32()
+/// @param threadpool A pointer to a threadpool interface (only when built with
+///     the THREADPOOL CPU runtime).
+dnnl_status_t DNNL_API dnnl_threadpool_interop_gemm_u8s8s32(char transa,
+        char transb, char offsetc, dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K,
+        float alpha, const uint8_t *A, dnnl_dim_t lda, uint8_t ao,
+        const int8_t *B, dnnl_dim_t ldb, int8_t bo, float beta, int32_t *C,
+        dnnl_dim_t ldc, const int32_t *co, void *threadpool);
+
+/// @copydoc dnnl_gemm_s8s8s32()
+/// @param threadpool A pointer to a threadpool interface (only when built with
+///     the THREADPOOL CPU runtime).
+dnnl_status_t DNNL_API dnnl_threadpool_interop_gemm_s8s8s32(char transa,
+        char transb, char offsetc, dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K,
+        float alpha, const int8_t *A, dnnl_dim_t lda, int8_t ao,
+        const int8_t *B, dnnl_dim_t ldb, int8_t bo, float beta, int32_t *C,
+        dnnl_dim_t ldc, const int32_t *co, void *threadpool);
+
+/// @} dnnl_api_threadpool_interop
+
+/// @} dnnl_api_interop
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/include/oneapi/dnnl/dnnl_threadpool.hpp
+++ b/include/oneapi/dnnl/dnnl_threadpool.hpp
@ -0,0 +1,113 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_THREADPOOL_HPP
+#define ONEAPI_DNNL_DNNL_THREADPOOL_HPP
+
+#include "oneapi/dnnl/dnnl.hpp"
+#include "oneapi/dnnl/dnnl_threadpool.h"
+
+#include "oneapi/dnnl/dnnl_threadpool_iface.hpp"
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_threadpool_interop Threadpool interoperability API
+/// API extensions to interact with the underlying Threadpool run-time.
+/// @{
+
+/// Threadpool interoperability namespace
+namespace threadpool_interop {
+
+/// Constructs an execution stream for the specified engine and threadpool.
+///
+/// @sa @ref dev_guide_threadpool
+///
+/// @param aengine Engine to create the stream on.
+/// @param threadpool Pointer to an instance of a C++ class that implements
+///     dnnl::threadpool_interop::threadpool_iface interface.
+/// @returns An execution stream.
+inline dnnl::stream make_stream(
+        const dnnl::engine &aengine, threadpool_iface *threadpool) {
+    dnnl_stream_t c_stream;
+    dnnl::error::wrap_c_api(dnnl_threadpool_interop_stream_create(
+                                    &c_stream, aengine.get(), threadpool),
+            "could not create stream");
+    return dnnl::stream(c_stream);
+}
+
+/// Returns the pointer to a threadpool that is used by an execution stream.
+///
+/// @sa @ref dev_guide_threadpool
+///
+/// @param astream An execution stream.
+/// @returns Output pointer to an instance of a C++ class that implements
+///     dnnl::threadpool_interop::threadpool_iface interface or NULL if the
+///     stream was created without threadpool.
+inline threadpool_iface *get_threadpool(const dnnl::stream &astream) {
+    void *tp;
+    dnnl::error::wrap_c_api(
+            dnnl_threadpool_interop_stream_get_threadpool(astream.get(), &tp),
+            "could not get stream threadpool");
+    return static_cast<threadpool_iface *>(tp);
+}
+
+/// @copydoc dnnl_threadpool_interop_sgemm()
+inline status sgemm(char transa, char transb, dnnl_dim_t M, dnnl_dim_t N,
+        dnnl_dim_t K, float alpha, const float *A, dnnl_dim_t lda,
+        const float *B, dnnl_dim_t ldb, float beta, float *C, dnnl_dim_t ldc,
+        threadpool_iface *tp) {
+    return static_cast<status>(dnnl_threadpool_interop_sgemm(
+            transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, tp));
+}
+/// @copydoc dnnl_threadpool_interop_gemm_u8s8s32()
+inline status gemm_u8s8s32(char transa, char transb, char offsetc, dnnl_dim_t M,
+        dnnl_dim_t N, dnnl_dim_t K, float alpha, const uint8_t *A,
+        dnnl_dim_t lda, uint8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo,
+        float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co,
+        threadpool_iface *tp) {
+    return static_cast<status>(
+            dnnl_threadpool_interop_gemm_u8s8s32(transa, transb, offsetc, M, N,
+                    K, alpha, A, lda, ao, B, ldb, bo, beta, C, ldc, co, tp));
+}
+
+/// @copydoc dnnl_threadpool_interop_gemm_s8s8s32()
+inline status gemm_s8s8s32(char transa, char transb, char offsetc, dnnl_dim_t M,
+        dnnl_dim_t N, dnnl_dim_t K, float alpha, const int8_t *A,
+        dnnl_dim_t lda, int8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo,
+        float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co,
+        threadpool_iface *tp) {
+    return static_cast<status>(
+            dnnl_threadpool_interop_gemm_s8s8s32(transa, transb, offsetc, M, N,
+                    K, alpha, A, lda, ao, B, ldb, bo, beta, C, ldc, co, tp));
+}
+
+} // namespace threadpool_interop
+
+/// @} dnnl_api_threadpool_interop
+
+/// @} dnnl_api_interop
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
--- a/include/oneapi/dnnl/dnnl_threadpool_iface.hpp
+++ b/include/oneapi/dnnl/dnnl_threadpool_iface.hpp
@ -0,0 +1,72 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_THREADPOOL_IFACE_HPP
+#define ONEAPI_DNNL_DNNL_THREADPOOL_IFACE_HPP
+
+#include <functional>
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_threadpool_interop
+/// @{
+
+namespace threadpool_interop {
+
+/// Abstract threadpool interface. The users are expected to subclass this
+/// interface and pass an object to the library during CPU stream creation or
+/// directly in case of BLAS functions.
+struct threadpool_iface {
+    /// Returns the number of worker threads.
+    virtual int get_num_threads() const = 0;
+
+    /// Returns true if the calling thread belongs to this threadpool.
+    virtual bool get_in_parallel() const = 0;
+
+    /// Submits n instances of a closure for execution in parallel:
+    ///
+    /// for (int i = 0; i < n; i++) fn(i, n);
+    ///
+    virtual void parallel_for(int n, const std::function<void(int, int)> &fn)
+            = 0;
+
+    /// Returns threadpool behavior flags bit mask (see below).
+    virtual uint64_t get_flags() const = 0;
+
+    /// If set, parallel_for() returns immediately and oneDNN must itself wait
+    /// for the submitted closures to finish execution.
+    static constexpr uint64_t ASYNCHRONOUS = 1;
+
+    virtual ~threadpool_iface() {}
+};
+
+} // namespace threadpool_interop
+
+/// @} dnnl_api_threadpool_interop
+
+/// @} dnnl_api_interop
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
--- a/include/oneapi/dnnl/dnnl_types.h
+++ b/include/oneapi/dnnl/dnnl_types.h
--- a/include/oneapi/dnnl/dnnl_version.h.in
+++ b/include/oneapi/dnnl/dnnl_version.h.in
@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/

-#ifndef DNNL_VERSION_H
-#define DNNL_VERSION_H
+#ifndef ONEAPI_DNNL_DNNL_VERSION_H
+#define ONEAPI_DNNL_DNNL_VERSION_H

 // clang-format off

--- a/scripts/generate_dnnl_debug.py
+++ b/scripts/generate_dnnl_debug.py
@ -64,8 +64,8 @@ def header(body):
 /// @file
 /// Debug capabilities

-#include "dnnl_config.h"
-#include "dnnl_types.h"
+#include "oneapi/dnnl/dnnl_config.h"
+#include "oneapi/dnnl/dnnl_types.h"

 #ifdef __cplusplus
 extern "C" {
@ -106,8 +106,8 @@ def source(body):
    return '''\
 #include <assert.h>

-#include "dnnl_debug.h"
-#include "dnnl_types.h"
+#include "oneapi/dnnl/dnnl_debug.h"
+#include "oneapi/dnnl/dnnl_types.h"

 %s
 ''' % body
@ -118,7 +118,7 @@ def header_benchdnn(body):
 #ifndef DNNL_DEBUG_HPP
 #define DNNL_DEBUG_HPP

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 %s
 /* status */
@ -145,7 +145,8 @@ def source_benchdnn(body):
 #include <assert.h>
 #include <string.h>

-#include "dnnl_debug.h"
+#include "oneapi/dnnl/dnnl_debug.h"
+
 #include "dnnl_debug.hpp"

 #include "src/common/z_magic.hpp"
@ -306,7 +307,7 @@ def usage():
 Generates oneDNN debug header and source files with enum to string mapping.
 Input types.xml file can be obtained with CastXML[1]:
 $ castxml --castxml-cc-gnu-c clang --castxml-output=1 \\
-        include/dnnl_types.h -o types.xml
+        include/oneapi/dnnl/dnnl_types.h -o types.xml

 [1] https://github.com/CastXML/CastXML''' % sys.argv[0])
    sys.exit(1)
@ -321,7 +322,7 @@ script_root = os.path.dirname(os.path.realpath(__file__))
 ifile = sys.argv[1] if len(sys.argv) > 1 else usage()

 file_paths = (
-    '%s/../include/dnnl_debug.h' % script_root,
+    '%s/../include/oneapi/dnnl/dnnl_debug.h' % script_root,
    '%s/../src/common/dnnl_debug_autogenerated.cpp' % script_root,
    '%s/../tests/benchdnn/dnnl_debug.hpp' % script_root,
    '%s/../tests/benchdnn/dnnl_debug_autogenerated.cpp' % script_root)
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@ -21,6 +21,11 @@ file(GLOB_RECURSE SOURCES
    ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
    )

+if(NOT DNNL_CPU_RUNTIME STREQUAL "THREADPOOL")
+    # stream_threadpool.cpp compiles to an empty object for other runtimes
+    list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/stream_threadpool.cpp")
+endif()
+
 set(OBJ_LIB ${LIB_NAME}_common)
 add_library(${OBJ_LIB} OBJECT ${SOURCES})
 set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
--- a/src/common/batch_normalization.cpp
+++ b/src/common/batch_normalization.cpp
@ -15,7 +15,7 @@
 *******************************************************************************/

 #include <assert.h>
-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "type_helpers.hpp"
--- a/src/common/batch_normalization_pd.hpp
+++ b/src/common/batch_normalization_pd.hpp
@ -17,7 +17,7 @@
 #ifndef COMMON_BATCH_NORMALIZATION_PD_HPP
 #define COMMON_BATCH_NORMALIZATION_PD_HPP

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "primitive_desc.hpp"
--- a/src/common/bfloat16.hpp
+++ b/src/common/bfloat16.hpp
@ -26,7 +26,7 @@

 #include "common/bit_cast.hpp"

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 namespace dnnl {
 namespace impl {
--- a/src/common/binary.cpp
+++ b/src/common/binary.cpp
@ -16,7 +16,7 @@

 #include <assert.h>

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "type_helpers.hpp"
--- a/src/common/binary_pd.hpp
+++ b/src/common/binary_pd.hpp
@ -19,7 +19,7 @@

 #include <assert.h>

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "primitive_desc.hpp"
--- a/src/common/c_types_map.hpp
+++ b/src/common/c_types_map.hpp
@ -17,7 +17,8 @@
 #ifndef COMMON_C_TYPES_MAP_HPP
 #define COMMON_C_TYPES_MAP_HPP

-#include "dnnl_types.h"
+#include "oneapi/dnnl/dnnl_types.h"
+
 #include "gemm_types.hpp"
 #include "internal_desc_types.hpp"

--- a/src/common/concat.cpp
+++ b/src/common/concat.cpp
@ -16,7 +16,7 @@

 #include <assert.h>

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "concat_pd.hpp"
--- a/src/common/convolution.cpp
+++ b/src/common/convolution.cpp
@ -15,7 +15,7 @@
 *******************************************************************************/

 #include <assert.h>
-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "type_helpers.hpp"
--- a/src/common/convolution_pd.hpp
+++ b/src/common/convolution_pd.hpp
@ -17,7 +17,7 @@
 #ifndef COMMON_CONVOLUTION_PD_HPP
 #define COMMON_CONVOLUTION_PD_HPP

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "primitive_desc.hpp"
--- a/src/common/deconvolution.cpp
+++ b/src/common/deconvolution.cpp
@ -15,7 +15,7 @@
 *******************************************************************************/

 #include <assert.h>
-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "type_helpers.hpp"
--- a/src/common/deconvolution_pd.hpp
+++ b/src/common/deconvolution_pd.hpp
@ -17,7 +17,7 @@
 #ifndef COMMON_DECONVOLUTION_PD_HPP
 #define COMMON_DECONVOLUTION_PD_HPP

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "convolution_pd.hpp"
--- a/src/common/dnnl_debug.cpp
+++ b/src/common/dnnl_debug.cpp
@ -18,8 +18,8 @@
 #include <cinttypes>
 #include <stdio.h>

-#include "dnnl_debug.h"
-#include "dnnl_types.h"
+#include "oneapi/dnnl/dnnl_debug.h"
+#include "oneapi/dnnl/dnnl_types.h"

 #include "c_types_map.hpp"
 #include "type_helpers.hpp"
--- a/src/common/dnnl_debug_autogenerated.cpp
+++ b/src/common/dnnl_debug_autogenerated.cpp
@ -20,8 +20,8 @@

 #include <assert.h>

-#include "dnnl_debug.h"
-#include "dnnl_types.h"
+#include "oneapi/dnnl/dnnl_debug.h"
+#include "oneapi/dnnl/dnnl_types.h"

 const char *dnnl_status2str(dnnl_status_t v) {
    if (v == dnnl_success) return "success";
--- a/src/common/dnnl_thread.hpp
+++ b/src/common/dnnl_thread.hpp
@ -61,7 +61,7 @@ inline void dnnl_thr_barrier() {

 #elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
 #include <thread>
-#include "dnnl_threadpool_iface.hpp"
+#include "oneapi/dnnl/dnnl_threadpool_iface.hpp"
 #define DNNL_THR_SYNC 0

 namespace dnnl {
@ -75,14 +75,14 @@ namespace threadpool_utils {
 // Sets `tp` to be the active threadpool for the calling thread. This will
 // make all calls to `get_active_threadpool()` to return `tp` thus enabling
 // `parallel()` and `parallel_nd()` to submit work to `tp`.
-void activate_threadpool(threadpool_iface *tp);
+void activate_threadpool(dnnl::threadpool_interop::threadpool_iface *tp);

 // Resets the active threadpool for the calling thread to nullptr. After this
 // call `parallel()` and `parallel_nd()` would execute work sequentially.
 void deactivate_threadpool();

 // Returns the active threadpool for the calling thread.
-threadpool_iface *get_active_threadpool();
+dnnl::threadpool_interop::threadpool_iface *get_active_threadpool();

 } // namespace threadpool_utils
 } // namespace impl
@ -90,7 +90,7 @@ threadpool_iface *get_active_threadpool();

 inline int dnnl_get_max_threads() {
    using namespace dnnl::impl::threadpool_utils;
-    dnnl::threadpool_iface *tp = get_active_threadpool();
+    dnnl::threadpool_interop::threadpool_iface *tp = get_active_threadpool();
    // This is the maximum number of threads oneDNN would use
    int def_max_threads = std::thread::hardware_concurrency();
    assert(def_max_threads > 0);
@ -101,7 +101,7 @@ inline int dnnl_get_max_threads() {
 }
 inline int dnnl_in_parallel() {
    using namespace dnnl::impl::threadpool_utils;
-    dnnl::threadpool_iface *tp = get_active_threadpool();
+    dnnl::threadpool_interop::threadpool_iface *tp = get_active_threadpool();
    return tp ? tp->get_in_parallel() : 0;
 }
 inline void dnnl_thr_barrier() {
--- a/src/common/dnnl_thread_parallel_nd.hpp
+++ b/src/common/dnnl_thread_parallel_nd.hpp
@ -82,14 +82,15 @@ void parallel(int nthr, F f) {
            tbb::static_partitioner());
 #elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
    using namespace dnnl::impl::threadpool_utils;
-    dnnl::threadpool_iface *tp = get_active_threadpool();
+    dnnl::threadpool_interop::threadpool_iface *tp = get_active_threadpool();
    if (!tp || dnnl_in_parallel()) {
        threadpool_utils::deactivate_threadpool();
        for (int ithr = 0; ithr < nthr; ithr++)
            f(ithr, nthr);
        threadpool_utils::activate_threadpool(tp);
    } else {
-        bool async = tp->get_flags() & dnnl::threadpool_iface::ASYNCHRONOUS;
+        bool async = tp->get_flags()
+                & dnnl::threadpool_interop::threadpool_iface::ASYNCHRONOUS;
        counting_barrier_t b;
        if (async) b.init(nthr);
        tp->parallel_for(nthr, [tp, &f, &b, async](int ithr, int nthr) {
--- a/src/common/dnnl_traits.hpp
+++ b/src/common/dnnl_traits.hpp
@ -20,9 +20,10 @@
 #include <assert.h>
 #include <stdint.h>

+#include "oneapi/dnnl/dnnl.h"
+
 #include "bfloat16.hpp"
 #include "c_types_map.hpp"
-#include "dnnl.h"
 #include "float16.hpp"
 #include "nstl.hpp"
 #include "utils.hpp"
--- a/src/common/eltwise.cpp
+++ b/src/common/eltwise.cpp
@ -15,7 +15,7 @@
 *******************************************************************************/

 #include <assert.h>
-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "math_utils.hpp"
--- a/src/common/eltwise_pd.hpp
+++ b/src/common/eltwise_pd.hpp
@ -17,7 +17,7 @@
 #ifndef COMMON_ELTWISE_PD_HPP
 #define COMMON_ELTWISE_PD_HPP

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "primitive_desc.hpp"
--- a/src/common/engine.cpp
+++ b/src/common/engine.cpp
@ -16,12 +16,12 @@

 #include <memory>

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"
+
+#include "c_types_map.hpp"
 #include "engine.hpp"
 #include "nstl.hpp"
 #include "primitive.hpp"
-
-#include "c_types_map.hpp"
 #include "utils.hpp"

 #include "cpu/cpu_engine.hpp"
--- a/src/common/engine.hpp
+++ b/src/common/engine.hpp
@ -17,7 +17,11 @@
 #ifndef COMMON_ENGINE_HPP
 #define COMMON_ENGINE_HPP

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"
+
+#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
+#include "oneapi/dnnl/dnnl_threadpool_iface.hpp"
+#endif

 #include "c_types_map.hpp"
 #include "memory.hpp"
@ -57,10 +61,17 @@ struct dnnl_engine : public dnnl::impl::c_compatible {
    }

    /** create stream */
-    virtual dnnl::impl::status_t create_stream(dnnl::impl::stream_t **stream,
-            unsigned flags, const dnnl::impl::stream_attr_t *attr)
+    virtual dnnl::impl::status_t create_stream(
+            dnnl::impl::stream_t **stream, unsigned flags)
            = 0;

+#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
+    virtual dnnl::impl::status_t create_stream(dnnl::impl::stream_t **stream,
+            dnnl::threadpool_interop::threadpool_iface *threadpool) {
+        return dnnl::impl::status::invalid_arguments;
+    }
+#endif
+
    virtual dnnl::impl::status_t get_service_stream(
            dnnl::impl::stream_t *&stream) {
        stream = nullptr;
--- a/src/common/gemm_pd.hpp
+++ b/src/common/gemm_pd.hpp
@ -17,7 +17,7 @@
 #ifndef COMMON_GEMM_PD_HPP
 #define COMMON_GEMM_PD_HPP

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "common/c_types_map.hpp"
 #include "common/gemm_utils.hpp"
--- a/src/common/gemm_types.hpp
+++ b/src/common/gemm_types.hpp
@ -18,7 +18,8 @@
 #define COMMON_GEMM_TYPES_HPP

 #include <assert.h>
-#include "dnnl_types.h"
+
+#include "oneapi/dnnl/dnnl_types.h"

 namespace dnnl {
 namespace impl {
--- a/src/common/gemm_utils.hpp
+++ b/src/common/gemm_utils.hpp
@ -17,7 +17,7 @@
 #ifndef COMMON_GEMM_UTILS_HPP
 #define COMMON_GEMM_UTILS_HPP

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "common/c_types_map.hpp"
 #include "common/nstl.hpp"
--- a/src/common/inner_product.cpp
+++ b/src/common/inner_product.cpp
@ -15,7 +15,7 @@
 *******************************************************************************/

 #include <assert.h>
-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "type_helpers.hpp"
--- a/src/common/inner_product_pd.hpp
+++ b/src/common/inner_product_pd.hpp
@ -17,7 +17,7 @@
 #ifndef COMMON_INNER_PRODUCT_PD_HPP
 #define COMMON_INNER_PRODUCT_PD_HPP

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "primitive_desc.hpp"
--- a/src/common/internal_desc_types.hpp
+++ b/src/common/internal_desc_types.hpp
@ -18,7 +18,7 @@
 #define COMMON_INTERNAL_DESC_TYPES_HPP

 #include <vector>
-#include "dnnl_types.h"
+#include "oneapi/dnnl/dnnl_types.h"

 namespace dnnl {
 namespace impl {
--- a/src/common/layer_normalization.cpp
+++ b/src/common/layer_normalization.cpp
@ -15,7 +15,7 @@
 *******************************************************************************/

 #include <assert.h>
-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "type_helpers.hpp"
--- a/src/common/layer_normalization_pd.hpp
+++ b/src/common/layer_normalization_pd.hpp
@ -17,7 +17,7 @@
 #ifndef COMMON_LAYER_NORMALIZATION_PD_HPP
 #define COMMON_LAYER_NORMALIZATION_PD_HPP

-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "primitive_desc.hpp"
--- a/src/common/lrn.cpp
+++ b/src/common/lrn.cpp
@ -15,7 +15,7 @@
 *******************************************************************************/

 #include <assert.h>
-#include "dnnl.h"
+#include "oneapi/dnnl/dnnl.h"

 #include "c_types_map.hpp"
 #include "type_helpers.hpp"
--- a/Show More
+++ b/Show More