gpu: intel: add Level Zero backend

This commit is contained in:
Palicki, Stefan
2025-07-22 08:34:47 -07:00
committed by Stefan Palicki
parent 3a756b982b
commit 633a03d736
83 changed files with 4100 additions and 816 deletions

View File

@ -287,6 +287,7 @@ Runtime-specific dependencies:
| `ONEDNN_CPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`sycl.dll`), TBB (`tbb.dll`), OpenCL loader (`OpenCL.dll`)
| `ONEDNN_GPU_RUNTIME=OCL` | any | OpenCL loader (`OpenCL.dll`)
| `ONEDNN_GPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`sycl.dll`), OpenCL loader (`OpenCL.dll`), oneAPI Level Zero loader (`ze_loader.dll`)
| `ONEDNN_GPU_RUNTIME=L0` | any | oneAPI Level Zero loader (`ze_loader.dll`)
#### macOS

View File

@ -1,5 +1,5 @@
#===============================================================================
# Copyright 2019-2021 Intel Corporation
# Copyright 2019-2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -22,7 +22,7 @@ if(OpenCL_cmake_included)
endif()
set(OpenCL_cmake_included true)
if(DNNL_GPU_RUNTIME STREQUAL "OCL")
if("${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|L0)$")
message(STATUS "GPU support is enabled (OpenCL)")
else()
return()

View File

@ -283,7 +283,7 @@ set(DNNL_GPU_RUNTIME "NONE" CACHE STRING
Using OpenCL for GPU requires setting OPENCLROOT if the libraries are
installed in a non-standard location.")
if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL)$")
if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL|L0)$")
message(FATAL_ERROR "Unsupported GPU runtime: ${DNNL_GPU_RUNTIME}")
endif()

View File

@ -7,7 +7,7 @@ oneDNN supports the following build-time options.
|:--------------------------------|:----------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------|
| ONEDNN_LIBRARY_TYPE | **SHARED**, STATIC | Defines the resulting library type |
| ONEDNN_CPU_RUNTIME | NONE, **OMP**, TBB, SEQ, THREADPOOL, SYCL | Defines the threading runtime for CPU engines |
| ONEDNN_GPU_RUNTIME | **NONE**, OCL, SYCL | Defines the offload runtime for GPU engines |
| ONEDNN_GPU_RUNTIME | **NONE**, OCL, SYCL, L0 | Defines the offload runtime for GPU engines |
| ONEDNN_BUILD_DOC | **ON**, OFF | Controls building the documentation |
| ONEDNN_DOC_VERSIONS_JSON | **""**, *string* | Location of JSON file for [PyData Sphinx Theme version switcher]. Enables documentation version switcher when set. |
| ONEDNN_BUILD_EXAMPLES | **ON**, OFF | Controls building the examples |

View File

@ -136,6 +136,9 @@ foreach(f ${sources})
if(NOT DNNL_WITH_SYCL AND ${f_name} MATCHES "^sycl")
list(REMOVE_ITEM sources "${f}")
endif()
if(NOT DNNL_GPU_RUNTIME STREQUAL "L0" AND ${f_name} MATCHES ".*level_zero")
list(REMOVE_ITEM sources "${f}")
endif()
endforeach()
# In case of SYCL, skip CPU examples that directly work with raw pointers
@ -180,7 +183,7 @@ foreach(src ${sources})
endif()
else()
set(cpu_rt_pattern "(SEQ|OMP|TBB|SYCL|DPCPP)")
set(gpu_rt_pattern "(OCL|SYCL|DPCPP)")
set(gpu_rt_pattern "(OCL|L0|SYCL|DPCPP)")
if(${example_name} MATCHES "sycl.*")
set(cpu_rt_pattern "(SYCL|DPCPP)")
set(gpu_rt_pattern "(SYCL|DPCPP)")

View File

@ -200,7 +200,7 @@ foreach(src ${sources})
endif()
else()
set(cpu_rt_pattern "(SEQ|OMP|TBB|SYCL|DPCPP)")
set(gpu_rt_pattern "(OCL|SYCL|DPCPP)")
set(gpu_rt_pattern "(OCL|L0|SYCL|DPCPP)")
if(${example_name} MATCHES "sycl.*")
set(cpu_rt_pattern "(SYCL|DPCPP)")
set(gpu_rt_pattern "(SYCL|DPCPP)")

View File

@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -29,6 +29,9 @@
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "dnnl_ocl.h"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#include "dnnl_l0.h"
#endif
#define COMPLAIN_DNNL_ERROR_AND_EXIT(what, status) \
do { \
@ -160,7 +163,7 @@ static inline void write_to_dnnl_memory(void *handle, dnnl_memory_t mem) {
}
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
if (eng_kind == dnnl_gpu) {
void *mapped_ptr = NULL;
CHECK(dnnl_memory_map_data(mem, &mapped_ptr));

View File

@ -35,7 +35,11 @@
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "dnnl_ocl.hpp"
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#include "dnnl_l0.hpp"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#include "dnnl_sycl.hpp"
#endif
@ -228,7 +232,7 @@ inline void read_from_dnnl_memory(void *handle, dnnl::memory &mem) {
return;
}
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
if (eng.get_kind() == dnnl::engine::kind::gpu) {
void *mapped_ptr = mem.map_data();
if (mapped_ptr) std::memcpy(handle, mapped_ptr, size);
@ -287,7 +291,7 @@ inline void write_to_dnnl_memory(void *handle, dnnl::memory &mem) {
return;
}
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
if (eng.get_kind() == dnnl::engine::kind::gpu) {
void *mapped_ptr = mem.map_data();
if (mapped_ptr) std::memcpy(mapped_ptr, handle, size);

22
include/dnnl_l0.h Normal file
View File

@ -0,0 +1,22 @@
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef DNNL_L0_H
#define DNNL_L0_H
#include "oneapi/dnnl/dnnl_l0.h"
#endif /* DNNL_L0_H */

22
include/dnnl_l0.hpp Normal file
View File

@ -0,0 +1,22 @@
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef DNNL_L0_HPP
#define DNNL_L0_HPP
#include "oneapi/dnnl/dnnl_l0.hpp"
#endif /* DNNL_L0_HPP */

View File

@ -82,6 +82,9 @@
/// DPC++ runtime
#define DNNL_RUNTIME_DPCPP DNNL_RUNTIME_SYCL
/// L0 runtime
#define DNNL_RUNTIME_L0 1024u
/// No vendor (corresponding runtime is disabled)
#define DNNL_VENDOR_NONE 0u
@ -119,7 +122,8 @@
#endif
#if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
&& (DNNL_GPU_RUNTIME != DNNL_RUNTIME_OCL) \
&& (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL)
&& (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL) \
&& (DNNL_GPU_RUNTIME != DNNL_RUNTIME_L0)
#error "Unexpected DNNL_GPU_RUNTIME"
#endif
#if (DNNL_CPU_RUNTIME == DNNL_RUNTIME_NONE \
@ -145,9 +149,6 @@
// When defined, DPCPP is supported.
#cmakedefine DNNL_WITH_SYCL
// When defined, Level Zero is supported.
#cmakedefine DNNL_WITH_LEVEL_ZERO
// When defined, SYCL CUDA backend is used.
#cmakedefine DNNL_SYCL_CUDA

View File

@ -0,0 +1,203 @@
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef ONEAPI_DNNL_DNNL_L0_H
#define ONEAPI_DNNL_DNNL_L0_H
#include "oneapi/dnnl/dnnl.h"
/// @cond DO_NOT_DOCUMENT_THIS
#include "level_zero/ze_api.h"
/// @endcond
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/// @addtogroup dnnl_api
/// @{
/// @addtogroup dnnl_api_interop
/// @{
/// @addtogroup dnnl_api_l0_interop
/// @{
/// Creates an engine associated with a Level Zero driver, device and context.
///
/// @param engine Output engine.
/// @param driver Level Zero driver handle to use for the engine.
/// @param device Level Zero device handle to use for the engine.
/// @param context Level Zero context handle to use for the engine.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_engine_create(dnnl_engine_t *engine,
        ze_driver_handle_t driver, ze_device_handle_t device,
        ze_context_handle_t context);
/// Returns the Level Zero context associated with an engine.
///
/// @param engine Engine to query.
/// @param context Output pointer to the underlying Level Zero context of the
///     engine. Must be a valid pointer; the handle is written through it.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_context(
        dnnl_engine_t engine, ze_context_handle_t *context);
/// Returns the Level Zero device associated with an engine.
///
/// @param engine Engine to query.
/// @param device Output pointer to the underlying Level Zero device of the
///     engine. Must be a valid pointer; the handle is written through it.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_device(
        dnnl_engine_t engine, ze_device_handle_t *device);
/// Returns the Level Zero driver associated with an engine.
///
/// @param engine Engine to query.
/// @param driver Output pointer to the underlying Level Zero driver of the
///     engine. Must be a valid pointer; the handle is written through it.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_driver(
        dnnl_engine_t engine, ze_driver_handle_t *driver);
/// Creates an execution stream for a given engine associated with a Level Zero
/// command list.
///
/// @param stream Output execution stream.
/// @param engine Engine to create the execution stream on.
/// @param list Level Zero command list to use.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_stream_create(dnnl_stream_t *stream,
        dnnl_engine_t engine, ze_command_list_handle_t list);
/// Returns the Level Zero command list associated with an execution stream.
///
/// @param stream Execution stream to query.
/// @param list Output pointer to the Level Zero command list of the stream.
///     Must be a valid pointer; the handle is written through it.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_stream_get_list(
        dnnl_stream_t stream, ze_command_list_handle_t *list);
/// Creates a memory object.
///
/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
/// constructed memory object will have the underlying buffer set. In this
/// case, the buffer will be initialized as if
/// dnnl_memory_set_data_handle() had been called.
///
/// @param memory Output memory object.
/// @param memory_desc Memory descriptor.
/// @param engine Engine to use.
/// @param handle Handle of the memory buffer to use as an underlying storage.
///     - A USM pointer to the user-allocated buffer. In this case the library
///       doesn't own the buffer.
///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
///       allocate the buffer for the memory object. In this case the library
///       owns the buffer.
///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
///       create memory object without an underlying buffer.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_memory_create(dnnl_memory_t *memory,
        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
        void *handle);
/// Creates a memory object with multiple handles.
///
/// @param memory Output memory object.
/// @param memory_desc Memory descriptor.
/// @param engine Engine to use.
/// @param nhandles Number of handles.
/// @param handles Handles of the memory buffers to use as underlying storages.
///     For each element of the @p handles array the following applies:
///     - A USM pointer to the user-allocated buffer. In this case the library
///       doesn't own the buffer.
///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
///       allocate the buffer for the memory object. In this case the library
///       owns the buffer.
///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
///       create memory object without an underlying buffer.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_memory_create_v2(dnnl_memory_t *memory,
        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
        size_t nhandles, void **handles);
/// Returns a Level Zero memory object associated with a memory object.
///
/// @param memory Memory object.
/// @param mem_object Output Level Zero memory object.
/// @returns #dnnl_success on success and a status describing the error
/// otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_memory_get_mem_object(
const_dnnl_memory_t memory, void **mem_object);
/// Sets Level Zero memory object associated with a memory object.
///
/// For behavioral details, see dnnl_memory_set_data_handle().
///
/// @param memory Memory object.
/// @param mem_object Level Zero memory object.
/// @returns #dnnl_success on success and a status describing the error
/// otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_memory_set_mem_object(
dnnl_memory_t memory, void *mem_object);
/// Executes computations specified by the primitive in a specified stream and
/// returns a Level Zero event.
///
/// @param primitive Primitive to execute.
/// @param stream Stream to use.
/// @param nargs Number of arguments.
/// @param args Array of arguments. Each argument is an
/// <index, #dnnl_memory_t> pair. The index is one of the `DNNL_ARG_*`
/// values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see
/// #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory
/// descriptor as that returned by
/// #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index).
/// @param ndeps Number of dependencies.
/// @param deps A pointer to a vector of size @p ndeps that contains
/// dependencies.
/// @param return_event Output event.
/// @returns #dnnl_success on success and a status describing the error
/// otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_primitive_execute(
const_dnnl_primitive_t primitive, dnnl_stream_t stream, size_t nargs,
const dnnl_exec_arg_t *args, size_t ndeps,
const ze_event_handle_t *deps, ze_event_handle_t *return_event);
/// @} dnnl_api_l0_interop
/// @} dnnl_api_interop
/// @} dnnl_api
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // ONEAPI_DNNL_DNNL_L0_H

View File

@ -0,0 +1,259 @@
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef ONEAPI_DNNL_DNNL_L0_HPP
#define ONEAPI_DNNL_DNNL_L0_HPP
#include "oneapi/dnnl/dnnl.hpp"
/// @cond DO_NOT_DOCUMENT_THIS
#include <vector>
#include <unordered_map>
#include "oneapi/dnnl/dnnl_l0.h"
/// @endcond
/// @addtogroup dnnl_api
/// @{
namespace dnnl {
/// @addtogroup dnnl_api_interop
/// @{
/// @addtogroup dnnl_api_l0_interop Level Zero interoperability API
/// API extensions to interact with the underlying Level Zero run-time.
///
/// @sa @ref dev_guide_dpcpp_interoperability in developer guide
/// @{
/// Level Zero interoperability namespace
namespace l0_interop {
/// Constructs an engine from Level Zero driver, device and context objects.
///
/// @param adriver Level Zero driver.
/// @param adevice Level Zero device.
/// @param acontext Level Zero context.
///
/// @returns Created engine.
inline engine make_engine(const ze_driver_handle_t adriver,
        const ze_device_handle_t adevice, const ze_context_handle_t acontext) {
    dnnl_engine_t c_engine = nullptr;
    const dnnl_status_t status = dnnl_l0_interop_engine_create(
            &c_engine, adriver, adevice, acontext);
    error::wrap_c_api(status, "could not create an engine");
    return engine(c_engine);
}
/// Returns the Level Zero context associated with an engine.
///
/// @param aengine Engine to query.
///
/// @returns The underlying Level Zero context of the engine.
inline ze_context_handle_t get_context(const engine &aengine) {
    ze_context_handle_t ctx = nullptr;
    // Pass the address of the handle so the C API can write the queried
    // context into it; passing it by value would leave `ctx` null.
    error::wrap_c_api(dnnl_l0_interop_engine_get_context(aengine.get(), &ctx),
            "could not get a context handle");
    return ctx;
}
/// Returns the Level Zero device associated with an engine.
///
/// @param aengine Engine to query.
///
/// @returns The underlying Level Zero device of the engine.
inline ze_device_handle_t get_device(const engine &aengine) {
    ze_device_handle_t dev = nullptr;
    // Pass the address of the handle so the C API can write the queried
    // device into it; passing it by value would leave `dev` null.
    error::wrap_c_api(dnnl_l0_interop_engine_get_device(aengine.get(), &dev),
            "could not get a device handle");
    return dev;
}
/// Returns the Level Zero driver associated with an engine.
///
/// @param aengine Engine to query.
///
/// @returns The underlying Level Zero driver of the engine.
inline ze_driver_handle_t get_driver(const engine &aengine) {
    ze_driver_handle_t dri = nullptr;
    // Pass the address of the handle so the C API can write the queried
    // driver into it; passing it by value would leave `dri` null.
    error::wrap_c_api(dnnl_l0_interop_engine_get_driver(aengine.get(), &dri),
            "could not get a driver handle");
    return dri;
}
/// Creates an execution stream for a given engine associated with a Level Zero
/// immediate command list.
///
/// @param aengine Engine object to use for the stream.
/// @param alist Level Zero immediate command list to use for the stream.
///
/// @returns An execution stream.
inline stream make_stream(
        const engine &aengine, ze_command_list_handle_t alist) {
    dnnl_stream_t astream;
    error::wrap_c_api(
            dnnl_l0_interop_stream_create(&astream, aengine.get(), alist),
            "could not create a stream");
    return stream(astream);
}
/// Returns the Level Zero immediate command list associated with an execution
/// stream.
///
/// @param astream Execution stream to query.
///
/// @returns Level Zero immediate command list object.
inline ze_command_list_handle_t get_list(const stream &astream) {
    ze_command_list_handle_t list = nullptr;
    // Pass the address of the handle so the C API can write the queried
    // command list into it; passing it by value would leave `list` null.
    error::wrap_c_api(dnnl_l0_interop_stream_get_list(astream.get(), &list),
            "could not get a stream handle");
    return list;
}
/// Creates a memory object with multiple handles.
///
/// @param memory_desc Memory descriptor.
/// @param aengine Engine to use.
/// @param handles Handles of the memory buffers to use as underlying storages.
///     For each element of the @p handles array the following applies:
///     - A USM pointer to the user-allocated buffer. In this case the library
///       doesn't own the buffer.
///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
///       allocate the buffer for the memory object. In this case the library
///       owns the buffer.
///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
///       create memory object without an underlying buffer.
///
/// If the @p handles vector is not provided the library will allocate all
/// buffers as if all handles have the special value DNNL_MEMORY_ALLOCATE.
///
/// @returns Created memory object.
inline memory make_memory(const memory::desc &memory_desc,
        const engine &aengine, std::vector<void *> handles = {}) {
    if (handles.empty()) {
        // Default: let the library allocate every buffer it needs.
        const int nhandles = memory_desc.get_num_handles();
        handles.resize(nhandles, DNNL_MEMORY_ALLOCATE);
    }
    dnnl_memory_t c_memory;
    error::wrap_c_api(
            dnnl_l0_interop_memory_create_v2(&c_memory, memory_desc.get(),
                    aengine.get(), handles.size(), handles.data()),
            "could not create a memory");
    return memory(c_memory);
}
/// Creates a memory object.
///
/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
/// constructed memory object will have the underlying buffer set. In this
/// case, the buffer will be initialized as if
/// dnnl::memory::set_data_handle() had been called.
///
/// @param memory_desc Memory descriptor.
/// @param aengine Engine to use.
/// @param handle Handle of the memory buffer to use as an underlying storage.
///     - A USM pointer to the user-allocated buffer. In this case the library
///       doesn't own the buffer.
///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
///       allocate the buffer for the memory object. In this case the library
///       owns the buffer.
///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
///       create memory object without an underlying buffer.
///
/// @returns Created memory object.
inline memory make_memory(
        const memory::desc &memory_desc, const engine &aengine, void *handle) {
    // Delegate to the multi-handle overload with a single-element vector.
    return make_memory(memory_desc, aengine, std::vector<void *> {handle});
}
/// Returns the Level Zero memory object associated with the memory object.
///
/// @param amemory A memory object.
/// @returns Underlying Level Zero memory object.
inline void *get_mem_object(const memory &amemory) {
void *mem_object;
error::wrap_c_api(
dnnl_l0_interop_memory_get_mem_object(amemory.get(), &mem_object),
"could not get Level Zero buffer object from a memory object");
return mem_object;
}
/// Sets the Level Zero memory object associated with the memory object.
///
/// For behavioral details see memory::set_data_handle().
///
/// @param amemory A memory object.
/// @param mem_object Level Zero memory object to use as the underlying
///     storage. It must have at least get_desc().get_size() bytes
///     allocated.
inline void set_mem_object(memory &amemory, void *mem_object) {
    error::wrap_c_api(
            dnnl_l0_interop_memory_set_mem_object(amemory.get(), mem_object),
            "could not set Level Zero buffer object from a memory object");
}
/// Executes computations specified by the primitive in a specified stream and
/// returns a Level Zero event.
///
/// Arguments are passed via an arguments map containing
/// <index, memory object> pairs. The index must be one of the `DNNL_ARG_*`
/// values such as `DNNL_ARG_SRC`, and the memory must have a memory descriptor
/// matching the one returned by
/// #dnnl::primitive_desc::query_md(#query::exec_arg_md, index) unless using
/// dynamic shapes (see #DNNL_RUNTIME_DIM_VAL).
///
/// @param aprimitive Primitive to execute.
/// @param astream Stream object. The stream must belong to the same engine
///     as the primitive.
/// @param args Arguments map.
/// @param deps Optional vector with `ze_event_handle_t` dependencies.
///
/// @returns Output event.
inline ze_event_handle_t execute(const dnnl::primitive &aprimitive,
        const stream &astream, const std::unordered_map<int, memory> &args,
        const std::vector<ze_event_handle_t> &deps = {}) {
    // Flatten the arguments map into the C API argument array.
    std::vector<dnnl_exec_arg_t> c_args;
    c_args.reserve(args.size());
    for (auto it = args.begin(); it != args.end(); ++it)
        c_args.push_back({it->first, it->second.get()});

    // An empty dependency list is passed to the C API as a null pointer.
    const ze_event_handle_t *c_deps = nullptr;
    if (!deps.empty()) c_deps = deps.data();

    ze_event_handle_t return_event;
    error::wrap_c_api(dnnl_l0_interop_primitive_execute(aprimitive.get(),
                              astream.get(), c_args.size(), c_args.data(),
                              deps.size(), c_deps, &return_event),
            "could not execute a primitive");
    return return_event;
}
} // namespace l0_interop
/// @} dnnl_api_l0_interop
/// @} dnnl_api_interop
} // namespace dnnl
/// @} dnnl_api
#endif // ONEAPI_DNNL_DNNL_L0_HPP

View File

@ -284,7 +284,7 @@ if(DNNL_CPU_THREADING_RUNTIME STREQUAL "TBB")
endif()
endif()
if(DNNL_GPU_RUNTIME STREQUAL "OCL" OR (DNNL_GPU_SYCL AND DNNL_GPU_VENDOR STREQUAL "INTEL"))
if("${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|L0)$" OR (DNNL_GPU_SYCL AND DNNL_GPU_VENDOR STREQUAL "INTEL"))
install(FILES
"../cmake/FindOpenCL.cmake"
DESTINATION ${LIB_CONFIG_INSTALL_DIR})

View File

@ -1953,6 +1953,7 @@ enum runtime_kind_t {
dnnl_runtime_threadpool,
dnnl_runtime_ocl,
dnnl_runtime_sycl,
dnnl_runtime_l0,
};
namespace runtime_kind {
@ -1963,6 +1964,7 @@ const runtime_kind_t tbb = dnnl_runtime_tbb;
const runtime_kind_t threadpool = dnnl_runtime_threadpool;
const runtime_kind_t ocl = dnnl_runtime_ocl;
const runtime_kind_t sycl = dnnl_runtime_sycl;
const runtime_kind_t l0 = dnnl_runtime_l0;
} // namespace runtime_kind
using primitive_kind_t = dnnl_primitive_kind_t;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2016-2024 Intel Corporation
* Copyright 2016-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -29,12 +29,16 @@
#include "cpu/cpu_engine.hpp"
#endif
#ifdef DNNL_WITH_SYCL
#include "xpu/sycl/engine_factory.hpp"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "xpu/ocl/engine_factory.hpp"
#endif
#ifdef DNNL_WITH_SYCL
#include "xpu/sycl/engine_factory.hpp"
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#include "gpu/intel/l0/engine_factory.hpp"
#endif
namespace dnnl {
@ -42,23 +46,27 @@ namespace impl {
static inline std::unique_ptr<engine_factory_t> get_engine_factory(
engine_kind_t kind, runtime_kind_t runtime_kind) {
#if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE
if (kind == engine_kind::cpu && is_native_runtime(runtime_kind)) {
return std::unique_ptr<engine_factory_t>(
new cpu::cpu_engine_factory_t());
}
#endif
#ifdef DNNL_WITH_SYCL
if (runtime_kind == runtime_kind::sycl) {
return xpu::sycl::get_engine_factory(kind);
}
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
if (kind == engine_kind::gpu && runtime_kind == runtime_kind::ocl) {
return std::unique_ptr<engine_factory_t>(
new xpu::ocl::engine_factory_t(kind));
}
#endif
#ifdef DNNL_WITH_SYCL
if (runtime_kind == runtime_kind::sycl)
return xpu::sycl::get_engine_factory(kind);
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
if (kind == engine_kind::gpu && runtime_kind == runtime_kind::l0) {
return gpu::intel::l0::get_engine_factory(kind);
}
#endif
return nullptr;
}

View File

@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2016-2024 Intel Corporation
* Copyright 2016-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -185,6 +185,8 @@ inline runtime_kind_t get_default_runtime(engine_kind_t kind) {
if (kind == engine_kind::gpu) return runtime_kind::ocl;
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
if (kind == engine_kind::gpu) return runtime_kind::sycl;
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
if (kind == engine_kind::gpu) return runtime_kind::l0;
#endif
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SEQ
return runtime_kind::seq;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2018-2024 Intel Corporation
* Copyright 2018-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -291,7 +291,8 @@ std::string get_jit_profiling_jitdumpdir() {
bool is_destroying_cache_safe() {
#if defined(_WIN32) \
&& (defined(DNNL_WITH_SYCL) || DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL)
&& (defined(DNNL_WITH_SYCL) || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 \
|| DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL)
// The ntdll.dll library is located in system32, therefore setting
// additional environment is not required.
HMODULE handle = LoadLibraryExA(

View File

@ -58,9 +58,12 @@ add_subdirectory(jit)
if(DNNL_GPU_RUNTIME STREQUAL "OCL")
add_subdirectory(ocl)
elseif(DNNL_GPU_RUNTIME STREQUAL "L0")
add_subdirectory(l0)
elseif(DNNL_WITH_SYCL)
add_subdirectory(sycl)
add_subdirectory(ocl)
add_subdirectory(l0/utils)
endif()
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel)

View File

@ -16,16 +16,21 @@
#include "gpu/intel/compute/ukernels.hpp"
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/utils.hpp"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#include "gpu/intel/sycl/engine.hpp"
#include "gpu/intel/sycl/utils.hpp"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#include "gpu/intel/l0/engine.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
#endif
namespace dnnl {
namespace impl {
namespace gpu {
@ -51,6 +56,11 @@ bool mayiuse_microkernels(const engine_t *engine) {
auto mayiuse_mk = [](const engine_t *engine) {
switch (engine->runtime_kind()) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
case runtime_kind::sycl:
return sycl::mayiuse_microkernels(
utils::downcast<const sycl::engine_t *>(engine));
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
case runtime_kind::ocl: {
auto *ocl_engine
@ -60,10 +70,10 @@ bool mayiuse_microkernels(const engine_t *engine) {
cl_microkernels_check_kernel_code);
}
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
case runtime_kind::sycl:
return sycl::mayiuse_microkernels(
utils::downcast<const sycl::engine_t *>(engine));
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
case runtime_kind::l0:
return utils::downcast<const l0::engine_t *>(engine)
->mayiuse_microkernels();
#endif
default: return false;
}

View File

@ -0,0 +1,119 @@
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/compute/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace compute {
// Recursively expands quoted #include "..." directives in OpenCL kernel
// source, substituting headers from the kernel context's custom headers or
// the built-in kernel header registry. All other lines are copied verbatim.
// Returns invalid_arguments if a referenced header cannot be resolved.
status_t preprocess_headers(stringstream_t &pp_code, const char *code,
        const compute::kernel_ctx_t &kernel_ctx) {
    stringstream_t code_stream(code);
    for (std::string line; std::getline(code_stream, line);) {
        const size_t include_pos = line.find("#include");
        if (include_pos != std::string::npos) {
            static constexpr size_t include_len = 8; // strlen("#include")
            const size_t first_quote_pos
                    = line.find("\"", include_pos + include_len);
            const size_t second_quote_pos
                    = line.find("\"", first_quote_pos + 1);
            // Guard: pass through directives that are not of the quoted form
            // (e.g. `#include <...>` or malformed lines). The original code
            // indexed with npos here, which is undefined behavior.
            if (first_quote_pos == std::string::npos
                    || second_quote_pos == std::string::npos) {
                pp_code << line << std::endl;
                continue;
            }
            const size_t kernel_name_len
                    = second_quote_pos - first_quote_pos - 1;
            const auto header_name
                    = line.substr(first_quote_pos + 1, kernel_name_len);
            const char *header_source
                    = kernel_ctx.get_custom_header(header_name);
            if (!header_source) header_source = get_kernel_header(header_name);
            // Guard: recursing with a null source would crash. NOTE(review):
            // assumes get_kernel_header() may return null for unknown
            // headers - confirm its contract.
            if (!header_source) return status::invalid_arguments;
            CHECK(preprocess_headers(pp_code, header_source, kernel_ctx));
        } else {
            pp_code << line << std::endl;
        }
    }
    return status::success;
}
// Debug-only helper: at debuginfo verbosity >= 10 (Linux dev builds only),
// pipes the kernel source through `cpp -P ... | clang-format` with the given
// defines so a developer can inspect the fully preprocessed source, then
// prints the architecture-specific options.
void debugdump_processed_source(const std::string &source,
        const std::string &options, const std::string &cl_options) {
#if defined(__linux__) && defined(DNNL_DEV_MODE)
    if (get_verbose(verbose_t::debuginfo) >= 10) {
        // Extracts the "-D..." define arguments from a compiler option
        // string, escaping shell-special characters so the result can be
        // spliced into a shell command line.
        auto get_defines = [](const std::string &from) {
            std::string ret;
            size_t pos = 0;
            while (pos < from.length()) {
                // Find next define argument
                pos = from.find("-D", pos);
                // Generate argument, quotes are interpreted literally, but
                // other special shell characters need to be escaped. Does not
                // currently handle quotes with the ' character or nested
                // quotes.
                bool quote_parity = true;
                while (pos < from.length()) {
                    if (quote_parity
                            && utils::one_of(from[pos], '~', '#', '$', '&', '*',
                                    '(', ')', '\\', '|', '[', ']', '{', '}',
                                    ';', '\'', '<', '>', '/', '?', '!')) {
                        ret += '\\';
                    }
                    ret += from[pos];
                    if (from[pos] == '"') quote_parity ^= true;
                    if (from[pos] == ' ' && quote_parity) break;
                    pos++;
                }
            }
            return ret;
        };
        // Runs `cmd`, feeding `input` to its stdin through a pipe; the
        // command's own stdout is inherited from this process.
        auto execute_command = [](const std::string &cmd,
                                       const std::string &input) {
            std::string result;
            std::array<char, 256> buffer;
            FILE *pipe = popen(cmd.c_str(), "w");
            // Fix: only use the pipe if popen succeeded. The original wrote
            // to the pipe before the null check and called pclose on a null
            // pipe.
            if (pipe) {
                fputs(input.c_str(), pipe);
                while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
                    result += buffer.data();
                }
                pclose(pipe);
            }
            return result;
        };
        // Run utilities to evaluate preprocessor defines and format the file
        // Theoretically, we can accomplish this task with libclang, but it
        // seems more work than it is worth. Instead, wrapping this in OCL_DEBUG
        // so that calls to the system are not included in the default build.
        // Due to the use of a different C preprocessor, warnings should not be
        // ignored, as they may correspond to a different behavior in the OpenCL
        // C preprocessor
        auto o = get_defines(options) + get_defines(cl_options);
        std::string preprocess_cmd
                = std::string() + "cpp -P " + o + " | clang-format";
        execute_command(preprocess_cmd, source);
        std::cout << "OCL_ARCH_OPTIONS: " << cl_options << std::endl;
    }
#endif
}
} // namespace compute
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -17,14 +17,8 @@
#ifndef GPU_INTEL_COMPUTE_UTILS_HPP
#define GPU_INTEL_COMPUTE_UTILS_HPP
#include <array>
#include <cassert>
#include <sstream>
#include <tuple>
#include <vector>
#include "common/utils.hpp"
#include "gpu/intel/compute/device_info.hpp"
#include "gpu/intel/compute/kernel_ctx.hpp"
#include "gpu/intel/utils.hpp"
namespace dnnl {
@ -155,6 +149,12 @@ private:
range_t local_range_;
};
status_t preprocess_headers(stringstream_t &pp_code, const char *code,
const compute::kernel_ctx_t &kernel_ctx);
void debugdump_processed_source(const std::string &source,
const std::string &options, const std::string &ocl_options);
} // namespace compute
} // namespace intel
} // namespace gpu

View File

@ -46,7 +46,15 @@ GEMMSTONE_NAMESPACE_START
#ifndef GENERATOR_BASE
#define GENERATOR_SUPER(hw) ngen::OpenCLCodeGenerator<hw>
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#define FORWARD(hw) NGEN_FORWARD_SYCL(hw);
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#define FORWARD(hw) NGEN_FORWARD_OPENCL(hw)
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#define FORWARD(hw) NGEN_FORWARD_LEVEL_ZERO(hw);
#endif
#define GENERATOR_DEBUGINFO {__FILE__, __LINE__}
#define GENERATOR_BASE(hw) GENERATOR_SUPER(hw)

View File

@ -39,6 +39,16 @@
#define MAGICSIZEY 2
#define MAGICSIZEZ 1
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#define FORWARD(hw) NGEN_FORWARD_SYCL(hw);
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#define FORWARD(hw) NGEN_FORWARD_OPENCL(hw)
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#define FORWARD(hw) NGEN_FORWARD_LEVEL_ZERO(hw);
#endif
namespace dnnl {
namespace impl {
namespace gpu {
@ -49,7 +59,7 @@ using namespace ngen;
template <HW hw>
class binary_format_kernel_t : public generator_t<hw> {
NGEN_FORWARD_OPENCL(hw);
FORWARD(hw);
public:
binary_format_kernel_t()

View File

@ -37,6 +37,9 @@
#ifdef WITH_OPENCL_RUNTIME
#include "ngen_opencl.hpp"
#endif
#ifdef WITH_L0_RUNTIME
#include "ngen_level_zero.hpp"
#endif
namespace dnnl {
namespace impl {
@ -1816,6 +1819,29 @@ cl_kernel make_kernel(const kernel::iface_t &iface, const stmt_t &body,
}
#endif
#ifdef WITH_L0_RUNTIME
// Lowers an IR kernel body to a Level Zero module + kernel via the nGEN
// Level Zero code generator, dispatching on the target HW generation.
// Returns the (module, kernel) handle pair; null handles if no HW case
// matched. Ownership of the returned handles passes to the caller.
std::pair<ze_module_handle_t, ze_kernel_handle_t> make_kernel(
        const kernel::iface_t &iface, const stmt_t &body,
        const kernel::options_t &options, const ngen::DebugConfig &debug_cfg,
        ze_context_handle_t ctx, ze_device_handle_t dev) {
    ngen::NEOInterfaceHandler interface = generate_ngen_interface(
            iface, options, false, body);
// Instantiated once per HW generation by GPU_HW_SWITCH below; builds the
// generator, converts the IR and returns the compiled module/kernel pair.
#define GPU_HW_CASE(hw) \
    ir_to_ngen_generator_t<ngen::LevelZeroCodeGenerator<(hw)>> g( \
            iface, options, debug_cfg); \
    g.setInterface(std::move(interface)); \
    convert_ir_to_ngen(body, g); \
    auto module = g.getModule(ctx, dev); \
    auto kernel = g.getKernel(module); \
    return std::make_pair(module, kernel);
    GPU_HW_SWITCH(options.hw().ngen_hw());
#undef GPU_HW_CASE
    // Reached only when GPU_HW_SWITCH matched no case.
    return {};
}
#endif
} // namespace jit
} // namespace intel
} // namespace gpu

View File

@ -24,9 +24,13 @@
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#include <sycl/sycl.hpp>
#define WITH_SYCL_RUNTIME
#endif
#define WITH_OPENCL_RUNTIME
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include <CL/cl.h>
#define WITH_OPENCL_RUNTIME
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#include "level_zero/ze_api.h"
#define WITH_L0_RUNTIME
#endif
namespace dnnl {
namespace impl {
@ -44,6 +48,12 @@ cl_kernel make_kernel(const kernel::iface_t &iface, const stmt_t &body,
const kernel::options_t &options, const ngen::DebugConfig &debug_cfg,
cl_context ctx, cl_device_id dev);
#endif
#ifdef WITH_L0_RUNTIME
std::pair<ze_module_handle_t, ze_kernel_handle_t> make_kernel(
const kernel::iface_t &iface, const stmt_t &body,
const kernel::options_t &options, const ngen::DebugConfig &debug_cfg,
ze_context_handle_t ctx, ze_device_handle_t dev);
#endif
} // namespace jit
} // namespace intel

View File

@ -34,7 +34,7 @@
#define DNNL
#define MICROKERNEL_INTERFACE
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#define ZEBIN_OUTPUT
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#define OPENCL_OUTPUT

View File

@ -40,6 +40,14 @@ inline cl_kernel make_kernel(
kernel.debug_cfg, ctx, dev);
}
#endif
#ifdef WITH_L0_RUNTIME
inline std::pair<ze_module_handle_t, ze_kernel_handle_t> make_kernel(
const kernel_t &kernel, ze_context_handle_t ctx,
ze_device_handle_t dev) {
return make_kernel(kernel.iface, kernel.body, kernel.options,
kernel.debug_cfg, ctx, dev);
}
#endif
} // namespace dsl
} // namespace jit

View File

@ -44,6 +44,12 @@
#include "ngen_opencl.hpp"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#include "gpu/intel/l0/engine.hpp"
#include "gpu/intel/l0/kernel.hpp"
#include "ngen_level_zero.hpp"
#endif
namespace dnnl {
namespace impl {
namespace gpu {
@ -91,6 +97,11 @@ template <gpu_gen_t hw>
using ngen_code_generator_t = ngen::OpenCLCodeGenerator<hw>;
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
template <gpu_gen_t hw>
using ngen_code_generator_t = ngen::LevelZeroCodeGenerator<hw>;
#endif
void check_kernel_size(const std::string &kernel_name, size_t kernel_size,
const intel::engine_t *engine);
@ -123,6 +134,15 @@ public:
auto ocl_kernel = ngen_code_generator_t<hw>::getKernel(
ocl_engine->context(), ocl_engine->device());
return ocl::kernel_t::make(kernel, ocl_kernel, {});
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
auto *l0_engine = utils::downcast<const l0::engine_t *>(engine);
auto l0_module = std::make_shared<l0::module_wrapper_t>(
ngen_code_generator_t<hw>::getModule(
l0_engine->context(), l0_engine->device()));
auto l0_kernel
= ngen_code_generator_t<hw>::getKernel(*(l0_module.get()));
return l0::kernel_t::make(kernel, l0_module, l0_kernel, kernel_name());
#endif
}
};

View File

@ -0,0 +1,24 @@
#===============================================================================
# Copyright 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
file(GLOB_RECURSE SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp
)
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_l0)
add_library(${OBJ_LIB} OBJECT ${SOURCES})
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
$<TARGET_OBJECTS:${OBJ_LIB}>)

View File

@ -0,0 +1,73 @@
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "oneapi/dnnl/dnnl_l0.h"
#include "common/utils.hpp"
#include "gpu/intel/l0/engine.hpp"
#include "gpu/intel/l0/engine_factory.hpp"
using namespace dnnl::impl;
// Creates a oneDNN engine from user-provided Level Zero driver, device and
// context handles. All arguments must be non-null.
dnnl_status_t dnnl_l0_interop_engine_create(dnnl_engine_t *engine,
        const ze_driver_handle_t adriver, const ze_device_handle_t adevice,
        const ze_context_handle_t acontext) {
    if (utils::any_null(engine, adriver, adevice, acontext))
        return status::invalid_arguments;
    // Resolve the device's ordinal first so unknown devices fail early.
    size_t device_index;
    CHECK(gpu::intel::l0::get_device_index(adevice, &device_index));
    gpu::intel::l0::engine_factory_t factory(engine_kind::gpu);
    return factory.engine_create(
            engine, adriver, adevice, acontext, device_index);
}
// Returns the Level Zero context backing a oneDNN L0 engine.
// Fix: the original took `ze_context_handle_t context` by value and assigned
// to it, so the result never reached the caller; the getter must take a
// pointer out-parameter. NOTE(review): the declaration in dnnl_l0.h must be
// updated to match.
dnnl_status_t dnnl_l0_interop_engine_get_context(
        dnnl_engine_t engine, ze_context_handle_t *context) {
    bool args_ok = !utils::any_null(engine, context)
            && (engine->runtime_kind() == runtime_kind::l0);
    if (!args_ok) return status::invalid_arguments;
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    *context = l0_engine->context();
    return status::success;
}
// Returns the Level Zero device backing a oneDNN L0 engine.
// Fix: the original assigned the result to a by-value parameter, which is a
// no-op for the caller; use a pointer out-parameter instead. NOTE(review):
// the declaration in dnnl_l0.h must be updated to match.
dnnl_status_t dnnl_l0_interop_engine_get_device(
        dnnl_engine_t engine, ze_device_handle_t *device) {
    bool args_ok = !utils::any_null(engine, device)
            && (engine->runtime_kind() == runtime_kind::l0);
    if (!args_ok) return status::invalid_arguments;
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    *device = l0_engine->device();
    return status::success;
}
// Returns the Level Zero driver backing a oneDNN L0 engine.
// Fix: the original assigned the result to a by-value parameter, which is a
// no-op for the caller; use a pointer out-parameter instead. NOTE(review):
// the declaration in dnnl_l0.h must be updated to match.
dnnl_status_t dnnl_l0_interop_engine_get_driver(
        dnnl_engine_t engine, ze_driver_handle_t *driver) {
    bool args_ok = !utils::any_null(engine, driver)
            && (engine->runtime_kind() == runtime_kind::l0);
    if (!args_ok) return status::invalid_arguments;
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    *driver = l0_engine->driver();
    return status::success;
}

View File

@ -0,0 +1,129 @@
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "oneapi/dnnl/dnnl_l0.h"
#include "common/utils.hpp"
#include "gpu/intel/l0/memory_storage.hpp"
using namespace dnnl::impl;
// Creates a oneDNN memory object on a Level Zero engine from a user handle.
// `handle` may be DNNL_MEMORY_ALLOCATE (library allocates the buffer),
// DNNL_MEMORY_NONE, or a user-provided pointer.
dnnl_status_t DNNL_API dnnl_l0_interop_memory_create(dnnl_memory_t *memory,
        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
        void *handle) {
    bool ok = !utils::any_null(memory, memory_desc, engine)
            && engine->runtime_kind() == runtime_kind::l0;
    if (!ok) return status::invalid_arguments;
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    // Classify the user pointer (host/device/shared/unknown) via Level Zero.
    // NOTE(review): get_pointer_type() is also invoked for the special
    // DNNL_MEMORY_ALLOCATE/NONE sentinel values - confirm that is benign.
    auto kind = gpu::intel::l0::get_memory_storage_kind(
            gpu::intel::l0::get_pointer_type(l0_engine->context(), handle));
    // Reject pointers Level Zero does not recognize unless the engine
    // explicitly supports plain system allocations.
    if (handle != DNNL_MEMORY_NONE && handle != DNNL_MEMORY_ALLOCATE
            && kind == gpu::intel::l0::memory_storage_kind_t::unknown
            && !engine->mayiuse_system_memory_allocators())
        return status::invalid_arguments;
    const auto mdw = memory_desc_wrapper(memory_desc);
    if (mdw.format_any() || mdw.has_runtime_dims_or_strides())
        return status::invalid_arguments;
    // Translate the ALLOCATE sentinel into alloc flags + a null runtime ptr.
    unsigned flags = (handle == DNNL_MEMORY_ALLOCATE)
            ? memory_flags_t::alloc
            : memory_flags_t::use_runtime_ptr;
    handle = (handle == DNNL_MEMORY_ALLOCATE) ? nullptr : handle;
    std::unique_ptr<memory_storage_t> mem_storage;
    // NOTE(review): the storage is always created with `device` kind even
    // though `kind` was queried above - confirm this is intended.
    mem_storage.reset(new gpu::intel::l0::memory_storage_t(
            engine, gpu::intel::l0::memory_storage_kind_t::device));
    if (!mem_storage) return status::out_of_memory;
    CHECK(mem_storage->init(
            flags, dnnl_memory_desc_get_size(memory_desc), handle));
    return safe_ptr_assign(
            *memory, new memory_t(engine, memory_desc, std::move(mem_storage)));
}
// Multi-handle variant of dnnl_l0_interop_memory_create: creates one memory
// storage per user handle (one per memory-descriptor sub-buffer).
dnnl_status_t DNNL_API dnnl_l0_interop_memory_create_v2(dnnl_memory_t *memory,
        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
        size_t nhandles, void **handles) {
    bool ok = !utils::any_null(memory, memory_desc, engine, handles)
            && nhandles > 0 && engine->runtime_kind() == runtime_kind::l0;
    if (!ok) return status::invalid_arguments;
    const auto mdw = memory_desc_wrapper(memory_desc);
    if (mdw.format_any() || mdw.has_runtime_dims_or_strides())
        return status::invalid_arguments;
    // Translate the DNNL_MEMORY_ALLOCATE sentinel into alloc flags and a
    // null runtime pointer, per handle.
    std::vector<unsigned> flags_vec(nhandles);
    std::vector<void *> handles_vec(nhandles);
    for (size_t i = 0; i < nhandles; i++) {
        unsigned f = (handles[i] == DNNL_MEMORY_ALLOCATE)
                ? memory_flags_t::alloc
                : memory_flags_t::use_runtime_ptr;
        void *h = (handles[i] == DNNL_MEMORY_ALLOCATE) ? nullptr : handles[i];
        flags_vec[i] = f;
        handles_vec[i] = h;
    }
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    std::vector<std::unique_ptr<memory_storage_t>> mem_storages(nhandles);
    for (size_t i = 0; i < nhandles; i++) {
        // Reject user pointers Level Zero does not recognize unless the
        // engine supports plain system allocations.
        auto kind = gpu::intel::l0::get_memory_storage_kind(
                gpu::intel::l0::get_pointer_type(
                        l0_engine->context(), handles[i]));
        if (handles[i] != DNNL_MEMORY_NONE && handles[i] != DNNL_MEMORY_ALLOCATE
                && kind == gpu::intel::l0::memory_storage_kind_t::unknown
                && !engine->mayiuse_system_memory_allocators()) {
            return status::invalid_arguments;
        }
        size_t sz = dnnl_memory_desc_get_size_v2(
                memory_desc, static_cast<int>(i));
        // NOTE(review): the storage is always created with `device` kind even
        // though `kind` was queried above - confirm this is intended.
        mem_storages[i].reset(new gpu::intel::l0::memory_storage_t(
                engine, gpu::intel::l0::memory_storage_kind_t::device));
        if (!mem_storages[i]) return status::out_of_memory;
        CHECK(mem_storages[i]->init(flags_vec[i], sz, handles_vec[i]));
    }
    return safe_ptr_assign(*memory,
            new memory_t(engine, memory_desc, std::move(mem_storages)));
}
// Returns the underlying data handle of a oneDNN L0 memory object through
// `*mem_object`; a null `memory` yields a null handle.
// Fix: the original assigned to the `mem_object` parameter itself
// (`mem_object = nullptr` / `mem_object = &handle`) instead of writing
// through it, so the caller never received a value - and `&handle` was the
// address of a stack local.
dnnl_status_t DNNL_API dnnl_l0_interop_memory_get_mem_object(
        const memory_t *memory, void **mem_object) {
    if (utils::any_null(mem_object)) return status::invalid_arguments;
    if (!memory) {
        *mem_object = nullptr;
        return status::success;
    }
    bool args_ok = (memory->engine()->runtime_kind() == runtime_kind::l0);
    if (!args_ok) return status::invalid_arguments;
    void *handle;
    status_t status = memory->get_data_handle(&handle);
    // Store the handle value itself, not the address of the local.
    if (status == status::success) *mem_object = handle;
    return status;
}
// Replaces the underlying data handle of a oneDNN L0 memory object.
// Fix: guard against a null `memory` before dereferencing it (the getter
// above checks for null; the setter dereferenced unconditionally).
dnnl_status_t DNNL_API dnnl_l0_interop_memory_set_mem_object(
        memory_t *memory, void *mem_object) {
    bool args_ok = memory != nullptr
            && (memory->engine()->runtime_kind() == runtime_kind::l0);
    if (!args_ok) return status::invalid_arguments;
    return memory->set_data_handle(mem_object);
}

View File

@ -0,0 +1,67 @@
/*******************************************************************************
* Copyright 2023-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "oneapi/dnnl/dnnl_l0.h"
#include "common/primitive_desc_iface.hpp"
#include "common/primitive_iface.hpp"
#include "common/utils.hpp"
#include "gpu/intel/l0/stream.hpp"
using namespace dnnl::impl;
// Executes a primitive on a Level Zero stream with explicit event-based
// dependencies (L0 interop analogue of dnnl_primitive_execute).
// `deps` are events the primitive must wait on; `return_event`, if non-null,
// receives an event signaling completion (null for in-order streams, where
// ordering is implicit in the command list).
dnnl_status_t dnnl_l0_interop_primitive_execute(
        const primitive_iface_t *primitive_iface, dnnl_stream_t stream,
        size_t nargs, const dnnl_exec_arg_t *args, size_t ndeps,
        const ze_event_handle_t *deps, ze_event_handle_t *return_event) {
    const bool ok = !utils::any_null(primitive_iface, stream)
            && primitive_iface->engine() == stream->engine()
            && primitive_iface->engine()->runtime_kind() == runtime_kind::l0
            && IMPLICATION(nargs > 0, args != nullptr)
            && IMPLICATION(ndeps > 0, deps != nullptr);
    if (!ok) return status::invalid_arguments;
    auto *l0_stream = utils::downcast<gpu::intel::l0::stream_t *>(stream);
    stream->before_exec_hook();
    // Register the user-provided wait events on the stream context.
    if (deps != nullptr) {
        std::vector<ze_event_handle_t> events(ndeps);
        for (size_t i = 0; i < ndeps; i++)
            events[i] = deps[i];
        l0_stream->l0_ctx().set_deps(events);
    }
    // run primitive
    // NOTE(review): if either CHECK below fails, the function returns without
    // calling after_exec_hook() - confirm the hooks tolerate this imbalance.
    exec_args_t exec_args;
    CHECK(cvt_primitive_args(primitive_iface->pd()->impl().get(),
            static_cast<int>(nargs), args, exec_args));
    exec_ctx_t ctx(stream, std::move(exec_args));
    CHECK(primitive_execute(primitive_iface, ctx));
    // return output event
    if (return_event != nullptr) {
        if (l0_stream->impl()->flags() & stream_flags::in_order) {
            // In-order streams need no explicit completion event.
            *return_event = nullptr;
        } else {
            *return_event = l0_stream->get_output_event();
        }
    }
    stream->after_exec_hook();
    return status::success;
}

View File

@ -0,0 +1,51 @@
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "oneapi/dnnl/dnnl_l0.h"
#include "common/utils.hpp"
#include "gpu/intel/l0/stream.hpp"
using namespace dnnl::impl;
// Creates a oneDNN stream wrapping a user-provided Level Zero command list.
dnnl_status_t dnnl_l0_interop_stream_create(dnnl_stream_t *stream,
        dnnl_engine_t engine, ze_command_list_handle_t list) {
    bool args_ok = !utils::any_null(stream, engine, list)
            && engine->runtime_kind() == runtime_kind::l0;
    if (!args_ok) return status::invalid_arguments;
    std::unique_ptr<stream_impl_t> stream_impl(
            new gpu::intel::l0::stream_impl_t(
                    stream_flags::default_flags, list));
    if (!stream_impl) return status::out_of_memory;
    // release() only after create_stream succeeds so the impl is freed on
    // the CHECK early-return path. NOTE(review): assumes create_stream takes
    // ownership of the impl only on success - confirm.
    CHECK(engine->create_stream(stream, stream_impl.get()));
    stream_impl.release();
    return status::success;
}
// Returns the Level Zero command list backing a oneDNN L0 stream.
// Fix: the original assigned the result to a by-value parameter, which is a
// no-op for the caller; use a pointer out-parameter instead. NOTE(review):
// the declaration in dnnl_l0.h must be updated to match.
dnnl_status_t dnnl_l0_interop_stream_get_list(
        dnnl_stream_t stream, ze_command_list_handle_t *list) {
    bool args_ok = !utils::any_null(list, stream)
            && stream->engine()->runtime_kind() == runtime_kind::l0;
    if (!args_ok) return status::invalid_arguments;
    auto *l0_stream = utils::downcast<const gpu::intel::l0::stream_t *>(stream);
    *list = l0_stream->list();
    return status::success;
}

View File

@ -0,0 +1,189 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/compiler.hpp"
#ifdef _WIN32
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include "windows.h"
#else
#include <dlfcn.h>
#endif
#include "ocloc_api.h"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Resolves `symbol` from the ocloc (Intel GPU offline compiler) dynamic
// library; returns nullptr if the library or the symbol cannot be found.
// The library handle is never released, so returned pointers remain valid
// for the lifetime of the process.
inline void *find_ocloc_symbol(const char *symbol) {
#ifdef _WIN32
    // Use LOAD_LIBRARY_SEARCH_SYSTEM32 flag to avoid DLL hijacking issue.
    HMODULE handle = LoadLibraryExA(
            "ocloc64.dll", nullptr, LOAD_LIBRARY_SEARCH_SYSTEM32);
    if (!handle) return nullptr;
    return reinterpret_cast<void *>(GetProcAddress(handle, symbol));
#else
    void *handle = dlopen("libocloc.so", RTLD_NOW | RTLD_LOCAL);
    if (!handle) return nullptr;
    return dlsym(handle, symbol);
#endif
}
// Typed convenience wrapper over the raw symbol lookup above.
template <typename F>
F find_ocloc_symbol(const char *symbol) {
    // Use a named cast instead of the original C-style cast for the
    // object-pointer -> function-pointer conversion.
    return reinterpret_cast<F>(find_ocloc_symbol(symbol));
}
// Thin status_t wrapper over ocloc's oclocInvoke entry point.
// Returns runtime_error when the ocloc library is unavailable or when the
// invocation reports a non-zero exit code; parameters mirror oclocInvoke.
status_t ocloc_invoke(uint32_t NumArgs, const char *Argv[], uint32_t NumSources,
        const uint8_t **DataSources, const uint64_t *LenSources,
        const char **NameSources, uint32_t NumInputHeaders,
        const uint8_t **DataInputHeaders, const uint64_t *LenInputHeaders,
        const char **NameInputHeaders, uint32_t *NumOutputs,
        uint8_t ***DataOutputs, uint64_t **LenOutputs, char ***NameOutputs) {
    // `static` so the dynamic-library symbol lookup happens only once.
    static auto f = find_ocloc_symbol<decltype(&oclocInvoke)>("oclocInvoke");
    if (!f) return status::runtime_error;
    if (f(NumArgs, Argv, NumSources, DataSources, LenSources, NameSources,
                NumInputHeaders, DataInputHeaders, LenInputHeaders,
                NameInputHeaders, NumOutputs, DataOutputs, LenOutputs,
                NameOutputs))
        return status::runtime_error;
    return status::success;
}
// Releases output buffers previously allocated by oclocInvoke via ocloc's
// oclocFreeOutput entry point (symbol resolved lazily, once).
status_t ocloc_free(uint32_t *numOutputs, uint8_t ***dataOutputs,
        uint64_t **lenOutputs, char ***nameOutputs) {
    static auto f
            = find_ocloc_symbol<decltype(&oclocFreeOutput)>("oclocFreeOutput");
    if (!f) return status::runtime_error;
    if (f(numOutputs, dataOutputs, lenOutputs, nameOutputs))
        return status::runtime_error;
    return status::success;
}
// Queries the device extension string by running
// `ocloc query CL_DEVICE_EXTENSIONS` and copying ocloc's captured stdout
// into `extensions`. Leaves `extensions` untouched if no stdout output is
// produced.
status_t ocloc_get_extensions(std::string &extensions) {
    std::vector<const char *> args = {"ocloc", "query", "CL_DEVICE_EXTENSIONS"};
    uint32_t num_outputs = 0;
    uint8_t **data_outputs = nullptr;
    uint64_t *len_outputs = nullptr;
    char **name_outputs = nullptr;
    CHECK(ocloc_invoke(static_cast<uint32_t>(args.size()), args.data(), 0,
            nullptr, 0, nullptr, 0, nullptr, nullptr, nullptr, &num_outputs,
            &data_outputs, &len_outputs, &name_outputs));
    // ocloc reports the query result through its "stdout.log" output.
    for (uint32_t i = 0; i < num_outputs; i++) {
        if (!strcmp(name_outputs[i], "stdout.log")) {
            if (len_outputs[i] > 0) {
                // NOTE(review): assumes the stdout buffer is NUL-terminated -
                // confirm; otherwise construct from (data, len).
                extensions = std::string(
                        reinterpret_cast<const char *>(data_outputs[i]));
                break;
            }
        }
    }
    CHECK(ocloc_free(&num_outputs, &data_outputs, &len_outputs, &name_outputs));
    return status::success;
}
bool ocloc_mayiuse_microkernels(const std::string &kernel_code) {
std::vector<const char *> args
= {"ocloc", "compile", "-q", "-file", "test.cl"};
const uint8_t *data_sources[]
= {reinterpret_cast<const uint8_t *>(kernel_code.c_str())};
const uint64_t len_sources[] = {kernel_code.length() + 1};
const char *name_sources[] = {"test.cl"};
uint32_t num_outputs = 0;
uint8_t **data_outputs = nullptr;
uint64_t *len_outputs = nullptr;
char **name_outputs = nullptr;
bool compilation_successful = true;
if (ocloc_invoke(static_cast<uint32_t>(args.size()), args.data(), 1,
data_sources, len_sources, name_sources, 0, nullptr, nullptr,
nullptr, &num_outputs, &data_outputs, &len_outputs,
&name_outputs))
compilation_successful = false;
ocloc_free(&num_outputs, &data_outputs, &len_outputs, &name_outputs);
return compilation_successful;
}
// Compiles OpenCL C `kernel_code` into a zebin binary for device
// `ip_version` via ocloc. On compilation failure, frees ocloc outputs and
// throws std::runtime_error carrying ocloc's stdout log.
// NOTE(review): throwing from a status_t-returning function is unusual for
// this codebase - confirm callers expect the exception.
status_t ocloc_build_kernels(const std::string &kernel_code,
        const std::string &options, const std::string &ip_version,
        xpu::binary_t &binary) {
    std::vector<const char *> args = {"ocloc", "compile", "-q", "--format",
            "zebin", "-exclude_ir", "-output_no_suffix", "-file", "main.cl",
            "-device", ip_version.c_str(), "-options", options.c_str()};
    const uint8_t *data_sources[]
            = {reinterpret_cast<const uint8_t *>(kernel_code.c_str())};
    // +1 to include the terminating NUL in the source length.
    const uint64_t len_sources[] = {kernel_code.length() + 1};
    const char *name_sources[] = {"main.cl"};
    uint32_t num_outputs = 0;
    uint8_t **data_outputs = nullptr;
    uint64_t *len_outputs = nullptr;
    char **name_outputs = nullptr;
    status_t ret = ocloc_invoke(static_cast<uint32_t>(args.size()), args.data(),
            1, data_sources, len_sources, name_sources, 0, nullptr, nullptr,
            nullptr, &num_outputs, &data_outputs, &len_outputs, &name_outputs);
    if (ret != status::success) {
        // Collect the compiler log so it can be reported in the exception.
        std::string output_string;
        for (uint32_t i = 0; i < num_outputs; i++) {
            if (!strcmp(name_outputs[i], "stdout.log")) {
                if (len_outputs[i] > 0) {
                    output_string = std::string(
                            reinterpret_cast<const char *>(data_outputs[i]));
                }
            }
        }
        CHECK(ocloc_free(
                &num_outputs, &data_outputs, &len_outputs, &name_outputs));
        throw std::runtime_error(output_string);
    }
    // Extract the produced binary ("main.bin" due to -output_no_suffix).
    // NOTE(review): if "main.bin" is absent this returns success with an
    // empty `binary` - confirm callers handle that case.
    for (uint32_t i = 0; i < num_outputs; i++) {
        if (!strcmp(name_outputs[i], "main.bin")) {
            if (len_outputs[i] > 0) {
                binary.resize(len_outputs[i]);
                std::memcpy(binary.data(), data_outputs[i], len_outputs[i]);
                break;
            }
        }
    }
    CHECK(ocloc_free(&num_outputs, &data_outputs, &len_outputs, &name_outputs));
    return status::success;
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,40 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_COMPILER_HPP
#define GPU_INTEL_L0_COMPILER_HPP
#include "gpu/intel/l0/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
status_t ocloc_get_extensions(std::string &extensions);
bool ocloc_mayiuse_microkernels(const std::string &kernel_code);
status_t ocloc_build_kernels(const std::string &kernel_code,
const std::string &options, const std::string &ip_version,
xpu::binary_t &binary);
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_COMPILER_HPP

View File

@ -0,0 +1,28 @@
/*******************************************************************************
* Copyright 2023-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/context.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,103 @@
/*******************************************************************************
* Copyright 2023-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_CONTEXT_HPP
#define GPU_INTEL_L0_CONTEXT_HPP
#include "gpu/intel/l0/utils/utils.hpp"
#include "xpu/context.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Level Zero event-list wrapper implementing the generic xpu::event_t
// interface. Holds raw ze_event_handle_t values a computation depends on;
// the handles are not owned (their lifetime is managed by the caller).
struct event_t : public xpu::event_t {
    event_t() = default;
    event_t(const event_t &) = default;
    event_t(const std::vector<ze_event_handle_t> &event) : events_(event) {}
    event_t(std::vector<ze_event_handle_t> &&event)
        : events_(std::move(event)) {}
    // Single-event convenience constructor.
    event_t(ze_event_handle_t &&event) {
        events_.emplace_back(std::move(event));
    }
    ~event_t() override = default;
    // Move assignment implemented as a swap; the moved-from object remains
    // valid (holds this object's previous contents).
    event_t &operator=(event_t &&other) {
        std::swap(events_, other.events_);
        return *this;
    }
    event_t &operator=(const event_t &other) {
        events_ = other.events_;
        return *this;
    }
    const ze_event_handle_t &operator[](size_t i) const { return events_[i]; }
    ze_event_handle_t &operator[](size_t i) { return events_[i]; }
    size_t size() const { return events_.size(); }
    // Checked downcasts from the generic xpu event type.
    static event_t &from(xpu::event_t &event) {
        return *utils::downcast<event_t *>(&event);
    }
    static const event_t &from(const xpu::event_t &event) {
        return *utils::downcast<const event_t *>(&event);
    }
    std::unique_ptr<xpu::event_t> clone() const override {
        return std::unique_ptr<xpu::event_t>(new event_t(*this));
    }
    // Concatenates another event list onto this one.
    void append(const xpu::event_t &event) {
        auto &other = *utils::downcast<const event_t *>(&event);
        events_.insert(
                events_.end(), other.events_.begin(), other.events_.end());
    }
    // Dependency handles, in submission order.
    std::vector<ze_event_handle_t> events_;
};
// Level Zero execution context: tracks the set of events that subsequently
// submitted operations must wait on.
class context_t final : public xpu::context_t {
public:
    context_t() = default;
    ~context_t() override = default;
    context_t &operator=(const context_t &other) {
        events_ = other.events_;
        return *this;
    }
    // Replaces the dependency list, taking ownership of the vector.
    void set_deps(std::vector<ze_event_handle_t> &&event) {
        // Fix: move the rvalue-reference parameter into the event_t; the
        // original constructed from the lvalue name, forcing a vector copy.
        events_ = event_t(std::move(event));
    }
    void set_deps(event_t &&events) { events_ = std::move(events); }
    xpu::event_t &get_deps() override { return events_; }
    const xpu::event_t &get_deps() const override { return events_; }
    void append_deps(const xpu::event_t &event) override {
        events_.append(event);
    }
    // Produces an event representing the current dependencies.
    // NOTE(review): declared here but no definition is visible in
    // context.cpp in this commit - confirm it is defined elsewhere.
    status_t get_event(ze_event_handle_t *new_event);

private:
    event_t events_;
};
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_CONTEXT_HPP

View File

@ -0,0 +1,145 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/device_info.hpp"
#include "gpu/intel/l0/compiler.hpp"
#include "gpu/intel/l0/engine.hpp"
#include "ngen_level_zero.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Resolves HW architecture details (IP version, arch/product, native
// extensions, ngen and systolic capabilities) from the engine's Level
// Zero device and context.
status_t device_info_t::init_arch(impl::engine_t *engine) {
    const auto *e = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    return init_gpu_hw_info(engine, e->device(), e->context(), ip_version_,
            gpu_arch_, gpu_product_, native_extensions_, mayiuse_systolic_,
            mayiuse_ngen_kernels_);
}
// Queries the Level Zero device properties and stores the device name.
status_t device_info_t::init_device_name(impl::engine_t *engine) {
    const auto *e = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    ze_device_properties_t props = {};
    props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    props.pNext = nullptr;
    CHECK(l0::zeDeviceGetProperties(e->device(), &props));
    name_ = std::string(props.name);
    return status::success;
}
// Decodes the packed Level Zero driverVersion into major/minor/build.
status_t device_info_t::init_runtime_version(impl::engine_t *engine) {
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    auto driver = l0_engine->driver();
    ze_driver_properties_t driver_properties = {};
    driver_properties.stype = ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES;
    driver_properties.pNext = nullptr;
    // Propagate failures instead of silently decoding a zeroed struct
    // (the call's return value was previously ignored; other init_*
    // functions in this file CHECK their queries).
    CHECK(l0::zeDriverGetProperties(driver, &driver_properties));
    // driverVersion layout: major [31:24] . minor [23:16] . build [15:0].
    runtime_version_.major
            = (driver_properties.driverVersion & 0xFF000000) >> 24;
    runtime_version_.minor
            = (driver_properties.driverVersion & 0x00FF0000) >> 16;
    runtime_version_.build = driver_properties.driverVersion & 0x0000FFFF;
    return status::success;
}
// Builds the supported-extension bitmask by intersecting the extension
// string reported by the ocloc compiler with the known device-extension
// enum, then adds the architecture-implied "future" extensions.
status_t device_info_t::init_extensions(impl::engine_t *engine) {
    std::string ext_str;
    CHECK(ocloc_get_extensions(ext_str));
    for (uint64_t ext = 1; ext < (uint64_t)compute::device_ext_t::last;
            ext <<= 1) {
        const char *ext_name = ext2cl_str((compute::device_ext_t)ext);
        if (ext_name && ext_str.find(ext_name) != std::string::npos)
            extensions_ |= ext;
    }
    extensions_
            |= (uint64_t)get_future_extensions(gpu_arch(), mayiuse_systolic());
    return status::success;
}
// Queries EU count, maximum work-group size, cache size and memory-access
// capabilities of the Level Zero device.
status_t device_info_t::init_attributes(impl::engine_t *engine) {
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    auto device = l0_engine->device();
    // EU count = slices * subslices-per-slice * EUs-per-subslice.
    ze_device_properties_t device_properties = {};
    device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    device_properties.pNext = nullptr;
    CHECK(l0::zeDeviceGetProperties(device, &device_properties));
    eu_count_ = device_properties.numSlices
            * device_properties.numSubslicesPerSlice
            * device_properties.numEUsPerSubslice;
    ze_device_compute_properties_t device_compute_properties = {};
    device_compute_properties.stype
            = ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES;
    device_compute_properties.pNext = nullptr;
    CHECK(l0::zeDeviceGetComputeProperties(device, &device_compute_properties));
    max_wg_size_ = device_compute_properties.maxTotalGroupSize;
    // Two-call idiom: first query the number of cache-property entries,
    // then fetch them. Guard against a device reporting zero entries
    // (previously device_cache_properties[0] was read unconditionally).
    uint32_t device_cache_properties_count = 0;
    CHECK(l0::zeDeviceGetCacheProperties(
            device, &device_cache_properties_count, nullptr));
    if (device_cache_properties_count > 0) {
        std::vector<ze_device_cache_properties_t> device_cache_properties(
                device_cache_properties_count);
        for (ze_device_cache_properties_t &p : device_cache_properties) {
            p.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES;
            p.pNext = nullptr;
        }
        CHECK(l0::zeDeviceGetCacheProperties(device,
                &device_cache_properties_count,
                device_cache_properties.data()));
        l3_cache_size_ = device_cache_properties[0].cacheSize;
    }
    ze_device_memory_access_properties_t device_memory_access_properties = {};
    device_memory_access_properties.stype
            = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES;
    device_memory_access_properties.pNext = nullptr;
    // Propagate failures (this call's result was previously ignored).
    CHECK(l0::zeDeviceGetMemoryAccessProperties(
            device, &device_memory_access_properties));
    mayiuse_system_memory_allocators_
            = device_memory_access_properties.sharedSystemAllocCapabilities;
    return status::success;
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,44 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_DEVICE_INFO_HPP
#define GPU_INTEL_L0_DEVICE_INFO_HPP
#include "gpu/intel/compute/device_info.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Level Zero implementation of the GPU device-info queries. Each init_*
// override fills the corresponding fields of compute::device_info_t via
// the Level Zero driver/device property APIs (see device_info.cpp).
class device_info_t : public compute::device_info_t {
protected:
    // Reads the device name from ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES.
    status_t init_device_name(impl::engine_t *engine) override;
    // Resolves IP version, arch/product and ngen/systolic capabilities.
    status_t init_arch(impl::engine_t *engine) override;
    // Decodes the packed driverVersion into major/minor/build.
    status_t init_runtime_version(impl::engine_t *engine) override;
    // Derives the OpenCL-style extension mask via the ocloc compiler.
    status_t init_extensions(impl::engine_t *engine) override;
    // EU count, max work-group size, cache size, memory-access caps.
    status_t init_attributes(impl::engine_t *engine) override;
};
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_DEVICE_INFO_HPP

292
src/gpu/intel/l0/engine.cpp Normal file
View File

@ -0,0 +1,292 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/engine.hpp"
#include "gpu/intel/l0/compiler.hpp"
#include "gpu/intel/l0/device_info.hpp"
#include "gpu/intel/l0/kernel.hpp"
#include "gpu/intel/l0/memory_storage.hpp"
#include "gpu/intel/l0/stream.hpp"
#include "gpu/intel/compute/ukernels.hpp"
#include "gpu/intel/jit/dsl/runtime.hpp"
#include "gpu/intel/jit/generator.hpp"
#include "gpu/intel/microkernels/fuser.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Level Zero engine implementation. Holds the driver/device/context
// handles; the context handle is owned by this object and destroyed in
// the destructor, while driver and device handles are not owned.
class engine_impl_t : public impl::engine_impl_t {
public:
    engine_impl_t(engine_kind_t kind, const ze_driver_handle_t driver,
            const ze_device_handle_t device, const ze_context_handle_t context,
            size_t index)
        : impl::engine_impl_t(kind, runtime_kind::l0, index)
        , driver_(driver)
        , device_(device)
        , context_(context) {}
    // Owns context_: release it when the engine goes away.
    ~engine_impl_t() override { l0::zeContextDestroy(context_); }

    const ze_driver_handle_t driver() const { return driver_; }
    const ze_device_handle_t device() const { return device_; }
    const ze_context_handle_t context() const { return context_; }

    // Creates a Level Zero stream implementation bound to this engine's
    // context and device.
    status_t create_stream_impl(
            impl::stream_impl_t **stream_impl, unsigned flags) const override {
        auto *si = new stream_impl_t(flags, context_, device_);
        if (!si) return status::out_of_memory;
        *stream_impl = si;
        return status::success;
    }

    // Allocates a device-kind memory storage of `size` bytes (or adopts
    // `handle`, depending on `flags`).
    status_t create_memory_storage(impl::memory_storage_t **storage,
            impl::engine_t *engine, unsigned flags, size_t size,
            void *handle) const override {
        std::unique_ptr<memory_storage_t> _storage;
        _storage.reset(
                new memory_storage_t(engine, memory_storage_kind_t::device));
        if (!_storage) return status::out_of_memory;
        status_t status = _storage->init(flags, size, handle);
        if (status != status::success) return status;
        *storage = _storage.release();
        return status::success;
    }

    // Identity for engine caching: device + context (plus base
    // kind/runtime/index).
    engine_id_t engine_id() const override {
        return engine_id_t(new engine_id_impl_t(
                device(), context(), kind(), runtime_kind(), index()));
    }

    int get_buffer_alignment() const override { return 128; }

private:
    ze_driver_handle_t driver_;
    ze_device_handle_t device_;
    ze_context_handle_t context_;

    engine_impl_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(engine_impl_t);
};
// Factory helper: builds and initializes a Level Zero engine from raw
// driver/device/context handles. On failure the partially constructed
// engine is released via engine_deleter_t.
status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind,
        const ze_driver_handle_t dri, const ze_device_handle_t dev,
        const ze_context_handle_t ctx, size_t index) {
    std::unique_ptr<gpu::intel::l0::engine_t, engine_deleter_t> created(
            new gpu::intel::l0::engine_t(dri, dev, ctx, index));
    if (!created) return status::out_of_memory;
    CHECK(created->init());
    *engine = created.release();
    return status::success;
}
// Wraps the raw Level Zero handles into the pimpl engine implementation.
engine_t::engine_t(ze_driver_handle_t driver, ze_device_handle_t device,
        ze_context_handle_t context, size_t index)
    : gpu::intel::engine_t(new engine_impl_t(
            engine_kind::gpu, driver, device, context, index)) {}

// Two-phase init: the impl first, then the common intel::engine_t
// initialization (device info etc.).
status_t engine_t::init() {
    CHECK(init_impl());
    CHECK(gpu::intel::engine_t::init());
    return status::success;
}

// Builds a Level Zero stream on top of a previously created stream impl.
status_t engine_t::create_stream(
        impl::stream_t **stream, impl::stream_impl_t *stream_impl) {
    return gpu::intel::l0::stream_t::create_stream(stream, this, stream_impl);
}
// Creates a kernel from an nGEN jitter; the jitter produces the binary
// and registers it with this engine.
status_t engine_t::create_kernel(
        compute::kernel_t *kernel, jit::generator_base_t *jitter) const {
    if (kind() != engine_kind::gpu) {
        assert(!"not expected");
        return status::invalid_arguments;
    }
    return jitter->get_kernel(*kernel, this);
}

// Creates a kernel from a DSL kernel description. The resulting Level
// Zero module is shared via module_wrapper_t so its lifetime outlives the
// kernel handle.
status_t engine_t::create_kernel(
        compute::kernel_t &kernel, const jit::dsl::kernel_t &kernel_dsl) const {
    if (kind() != engine_kind::gpu) {
        assert(!"not expected");
        return status::invalid_arguments;
    }
    auto module_and_kernel
            = jit::dsl::make_kernel(kernel_dsl, context(), device());
    auto l0_module_ptr
            = std::make_shared<module_wrapper_t>(module_and_kernel.first);
    // Empty kernel name ({}): DSL kernels are not looked up by name here.
    return kernel_t::make(kernel, l0_module_ptr, module_and_kernel.second, {});
}
// Converts a device binary into Level Zero kernels: builds one module
// from `binary` and extracts a kernel handle per requested name. Entries
// whose handle comes back null are left as default-constructed (empty)
// compute kernels.
status_t engine_t::convert_to_l0(
        std::vector<gpu::intel::compute::kernel_t> &kernels,
        const std::vector<const char *> &kernel_names,
        xpu::binary_t &binary) const {
    ze_module_handle_t l0_module = nullptr;
    std::vector<ze_kernel_handle_t> l0_kernels;
    CHECK(gpu::intel::l0::create_kernels_from_binary(
            device(), context(), kernel_names, binary, &l0_module, l0_kernels));
    // One shared module keeps all created kernels valid.
    auto l0_module_ptr = std::make_shared<module_wrapper_t>(l0_module);
    kernels = std::vector<gpu::intel::compute::kernel_t>(kernel_names.size());
    for (size_t i = 0; i < kernel_names.size(); i++) {
        if (!l0_kernels[i]) continue;
        CHECK(kernel_t::make(
                kernels[i], l0_module_ptr, l0_kernels[i], kernel_names[i]));
    }
    return status::success;
}
// Compiles OpenCL C source for the requested kernels into a device binary
// (via ocloc), optionally fuses microkernels, then converts the result
// into Level Zero kernel objects.
status_t engine_t::create_kernels(std::vector<compute::kernel_t> *kernels,
        const std::vector<const char *> &kernel_names,
        const compute::kernel_ctx_t &kernel_ctx) const {
    if (kind() != engine_kind::gpu) {
        assert(!"not expected");
        return status::invalid_arguments;
    }
    // All kernels in one set share a single source; take the first
    // non-null lookup.
    const char *source = nullptr;
    for (size_t i = 0; source == nullptr && i < kernel_names.size(); i++)
        source = intel::get_kernel_source(kernel_names[i]);
    std::string options = kernel_ctx.options();
    auto *dev_info = utils::downcast<const device_info_t *>(device_info());
    options += " " + dev_info->get_cl_ext_options();
    stringstream_t code_ss;
    CHECK(compute::preprocess_headers(code_ss, source, kernel_ctx));
    std::string code = code_ss.str();
    gpu::intel::compute::program_src_t src(code);
    // "-g -s <file>" enables source-level debugging when a source dump
    // is available.
    if (src) { options += " -g -s " + std::string(src.name()); }
    compute::debugdump_processed_source(
            code, options, dev_info->get_cl_ext_options());
    xpu::binary_t binary;
    // NOTE(review): the #else branch (online compilation through the L0
    // driver) is disabled in favor of offline ocloc compilation; consider
    // replacing the hard-coded `#if 1` with a build/runtime switch.
#if 1
    CHECK(ocloc_build_kernels(
            code, options, std::to_string(dev_info->ip_version()), binary));
#else
    CHECK(gpu::intel::l0::compile_ocl_module(
            device(), context(), code, options, binary));
#endif
    const char *code_c = code.c_str();
    if (kernel_ctx.has_custom_headers() && micro::hasMicrokernels(code_c)) {
        try {
            micro::fuseMicrokernels(binary, code_c);
        } catch (...) { return status::runtime_error; }
    }
    CHECK(convert_to_l0(*kernels, kernel_names, binary));
    return status::success;
}
// Creates a single kernel from a prebuilt device binary. `src` is not
// used here (kept for interface parity with the other GPU runtimes).
status_t engine_t::create_kernel_from_binary(compute::kernel_t &kernel,
        const xpu::binary_t &binary, const char *kernel_name,
        const compute::program_src_t &src) const {
    std::vector<const char *> kernel_names = {kernel_name};
    ze_module_handle_t l0_module = nullptr;
    std::vector<ze_kernel_handle_t> l0_kernels;
    CHECK(gpu::intel::l0::create_kernels_from_binary(
            device(), context(), kernel_names, binary, &l0_module, l0_kernels));
    auto l0_module_ptr = std::make_shared<module_wrapper_t>(l0_module);
    CHECK(kernel_t::make(kernel, l0_module_ptr, l0_kernels[0], kernel_name));
    return status::success;
}

// Re-creates kernels from a persistent cache blob: each kernel's binary
// is read back from the blob and turned into a Level Zero kernel.
status_t engine_t::create_kernels_from_cache_blob(
        const cache_blob_t &cache_blob, std::vector<compute::kernel_t> &kernels,
        const std::vector<const char *> &kernel_names) const {
    if (kind() != engine_kind::gpu) {
        assert(!"not expected");
        return status::invalid_arguments;
    }
    kernels = std::vector<compute::kernel_t>(kernel_names.size());
    for (size_t i = 0; i < kernel_names.size(); i++) {
        // A null name in a multi-kernel set marks an unused slot.
        if (!kernel_names[i] && kernel_names.size() > 1) continue;
        // NOTE(review): kernel_name is unused below (kernel_names[i] is
        // passed directly); candidate for removal.
        std::string kernel_name(kernel_names[i] ? kernel_names[i] : "");
        const uint8_t *binary_data = nullptr;
        size_t binary_size = 0;
        CHECK(cache_blob.get_binary(&binary_data, &binary_size));
        xpu::binary_t binary(binary_data, binary_data + binary_size);
        CHECK(create_kernel_from_binary(kernels[i], binary, kernel_names[i],
                gpu::intel::compute::program_src_t()));
    }
    return status::success;
}
// Stable device identity: a leading constant plus the device UUID.
// NOTE(review): the meaning of the leading `1` tag is not visible here --
// confirm it distinguishes the L0 runtime from other backends' id formats.
gpu_utils::device_id_t engine_t::device_id() const {
    return std::tuple_cat(
            std::make_tuple(1), gpu::intel::l0::get_device_uuid(device()));
}

// Raw-handle accessors forwarding to the pimpl implementation.
const ze_driver_handle_t engine_t::driver() const {
    return static_cast<const engine_impl_t *>(impl())->driver();
}
const ze_device_handle_t engine_t::device() const {
    return static_cast<const engine_impl_t *>(impl())->device();
}
const ze_context_handle_t engine_t::context() const {
    return static_cast<const engine_impl_t *>(impl())->context();
}

// Probes microkernel support by compiling a check kernel through ocloc.
bool engine_t::mayiuse_microkernels() const {
    return ocloc_mayiuse_microkernels(
            std::string(compute::cl_microkernels_check_kernel_code));
}
// Populates device_info_ by querying the live device.
status_t engine_t::init_device_info() {
    device_info_ = std::make_shared<gpu::intel::l0::device_info_t>();
    CHECK(device_info_->init(this));
    return status::success;
}

// Cache-blob-based device info is not supported for the L0 runtime yet.
status_t engine_t::init_device_info(const std::vector<uint8_t> &cache_blob) {
    gpu_assert(false) << "unimplemented function init_device_info() called";
    return status::runtime_error;
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

117
src/gpu/intel/l0/engine.hpp Normal file
View File

@ -0,0 +1,117 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_ENGINE_HPP
#define GPU_INTEL_L0_ENGINE_HPP
// #include <list>
#include "gpu/intel/engine.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Engine identity for the Level Zero runtime: two engines compare equal
// iff they wrap the same device and context handles (kind/runtime/index
// are handled by the base class).
struct engine_id_impl_t : public impl::engine_id_impl_t {
    engine_id_impl_t(const ze_device_handle_t device,
            const ze_context_handle_t context, engine_kind_t kind,
            runtime_kind_t runtime_kind, size_t index)
        : impl::engine_id_impl_t(kind, runtime_kind, index)
        , device_(device)
        , context_(context) {}
    ~engine_id_impl_t() override = default;

private:
    // Raw-handle equality; the framework guarantees id_impl has the same
    // dynamic type before calling this.
    bool compare_resource(
            const impl::engine_id_impl_t *id_impl) const override {
        const auto *typed_id
                = utils::downcast<const engine_id_impl_t *>(id_impl);
        return device_ == typed_id->device_ && context_ == typed_id->context_;
    }
    size_t hash_resource() const override {
        size_t seed = 0;
        seed = hash_combine(seed, device_);
        seed = hash_combine(seed, context_);
        return seed;
    }

    ze_device_handle_t device_;
    ze_context_handle_t context_;

    engine_id_impl_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(engine_id_impl_t);
};
// Creates and initializes a Level Zero engine from raw handles (defined
// in engine.cpp). The created engine takes ownership of `ctx`.
status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind,
        const ze_driver_handle_t dri, const ze_device_handle_t dev,
        const ze_context_handle_t ctx, size_t index);

// GPU engine backed by the Level Zero runtime.
class engine_t : public intel::engine_t {
public:
    engine_t(ze_driver_handle_t driver, ze_device_handle_t device,
            ze_context_handle_t context, size_t index);
    ~engine_t() override = default;
    status_t init() override;
    status_t create_stream(
            impl::stream_t **stream, impl::stream_impl_t *stream_impl) override;
    // Kernel creation from an nGEN jitter.
    status_t create_kernel(compute::kernel_t *kernel,
            jit::generator_base_t *jitter) const override;
    // Kernel creation from a jit IR kernel.
    // NOTE(review): the definition in engine.cpp spells this parameter
    // jit::dsl::kernel_t -- presumably an alias of jit::kernel_t; confirm.
    status_t create_kernel(compute::kernel_t &kernel,
            const jit::kernel_t &kernel_ir) const override;
    // Compiles OpenCL C sources into Level Zero kernels.
    status_t create_kernels(std::vector<compute::kernel_t> *kernels,
            const std::vector<const char *> &kernel_names,
            const compute::kernel_ctx_t &kernel_ctx) const override;
    status_t create_kernel_from_binary(compute::kernel_t &kernel,
            const xpu::binary_t &binary, const char *kernel_name,
            const compute::program_src_t &src) const override;
    status_t create_kernels_from_cache_blob(const cache_blob_t &cache_blob,
            std::vector<compute::kernel_t> &kernels,
            const std::vector<const char *> &kernel_names) const override;
    gpu::intel::gpu_utils::device_id_t device_id() const override;
    // Raw Level Zero handle accessors.
    const ze_driver_handle_t driver() const;
    const ze_device_handle_t device() const;
    const ze_context_handle_t context() const;
    bool mayiuse_microkernels() const;

private:
    status_t init_device_info() override;
    status_t init_device_info(const std::vector<uint8_t> &cache_blob) override;
    // Turns a device binary into per-name Level Zero kernel objects.
    status_t convert_to_l0(std::vector<gpu::intel::compute::kernel_t> &kernels,
            const std::vector<const char *> &kernel_names,
            xpu::binary_t &binary) const;

    engine_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(engine_t);
};
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_ENGINE_HPP

View File

@ -0,0 +1,88 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/engine_factory.hpp"
#include "gpu/intel/l0/engine.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Only GPU engines are supported by the Level Zero backend.
engine_factory_t::engine_factory_t(engine_kind_t engine_kind)
    : engine_kind_(engine_kind) {
    assert(utils::one_of(engine_kind_, engine_kind::gpu));
}
// Returns the number of devices exposed by the first Level Zero driver,
// or 0 when no driver/device can be enumerated. The previous version
// indexed drivers[0] without checking driver_count, which is undefined
// behavior on systems with no Level Zero driver.
size_t engine_factory_t::count() const {
    uint32_t driver_count = 0;
    if (l0::zeDriverGet(&driver_count, nullptr) != status::success
            || driver_count == 0)
        return 0;
    std::vector<ze_driver_handle_t> drivers(driver_count);
    if (l0::zeDriverGet(&driver_count, drivers.data()) != status::success)
        return 0;
    uint32_t device_count = 0;
    if (l0::zeDeviceGet(drivers[0], &device_count, nullptr) != status::success)
        return 0;
    return device_count;
}
// Creates the engine for device `index`. Only the first Level Zero
// driver is considered (consistent with count()). A fresh context is
// created here; ownership passes to the engine implementation.
status_t engine_factory_t::engine_create(
        impl::engine_t **engine, size_t index) const {
    ze_driver_handle_t driver = nullptr;
    ze_device_handle_t device = nullptr;
    ze_context_handle_t context = nullptr;

    uint32_t driver_count = 0;
    CHECK(l0::zeDriverGet(&driver_count, nullptr));
    std::vector<ze_driver_handle_t> drivers(driver_count);
    CHECK(l0::zeDriverGet(&driver_count, drivers.data()));
    driver = drivers[0];

    uint32_t device_count = 0;
    CHECK(l0::zeDeviceGet(driver, &device_count, nullptr));
    VERROR_ENGINE(index < device_count, status::invalid_arguments,
            "asked for device %zu but only %u devices are found", index,
            device_count);
    std::vector<ze_device_handle_t> devices(device_count);
    CHECK(l0::zeDeviceGet(driver, &device_count, devices.data()));
    device = devices[index];

    ze_context_desc_t context_desc = {};
    context_desc.stype = ZE_STRUCTURE_TYPE_CONTEXT_DESC;
    context_desc.pNext = nullptr;
    context_desc.flags = 0;
    CHECK(l0::zeContextCreate(driver, &context_desc, &context));
    return engine_create(engine, driver, device, context, index);
}
// Thin forwarder to the namespace-level engine_create() using this
// factory's engine kind. The engine takes ownership of `context`.
status_t engine_factory_t::engine_create(impl::engine_t **engine,
        const ze_driver_handle_t driver, const ze_device_handle_t device,
        const ze_context_handle_t context, size_t index) const {
    return gpu::intel::l0::engine_create(
            engine, engine_kind_, driver, device, context, index);
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,59 @@
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_ENGINE_FACTORY_HPP
#define GPU_INTEL_L0_ENGINE_FACTORY_HPP
#include "common/engine.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Enumerates Level Zero devices and creates engines for them.
class engine_factory_t : public impl::engine_factory_t {
public:
    engine_factory_t(engine_kind_t engine_kind);
    ~engine_factory_t() override = default;
    // Number of devices exposed by the first Level Zero driver.
    size_t count() const override;
    // Creates the index-th device's engine (creates a fresh context).
    status_t engine_create(
            impl::engine_t **engine, size_t index) const override;
    // Creates an engine from caller-provided handles.
    status_t engine_create(impl::engine_t **engine,
            const ze_driver_handle_t adriver, const ze_device_handle_t adevice,
            const ze_context_handle_t acontext, size_t index) const;

private:
    engine_kind_t engine_kind_;

    engine_factory_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(engine_factory_t);
};
// Convenience helper returning a heap-allocated Level Zero engine
// factory for the given engine kind.
inline std::unique_ptr<engine_factory_t> get_engine_factory(
        engine_kind_t engine_kind) {
    return std::unique_ptr<engine_factory_t>(new engine_factory_t(engine_kind));
} // (stray trailing semicolon removed; it triggers -Wextra-semi)
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_ENGINE_FACTORY_HPP

215
src/gpu/intel/l0/kernel.cpp Normal file
View File

@ -0,0 +1,215 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/kernel.hpp"
#include "gpu/intel/l0/context.hpp"
#include "gpu/intel/l0/engine.hpp"
#include "gpu/intel/l0/memory_storage.hpp"
#include "gpu/intel/l0/stream.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// This class is to get around std::make_shared requirement to have a public
// constructor. We keep the original constructor as private but expose it here
// to use with std::make_shared.
class kernel_compat_t : public kernel_t {
public:
    template <typename... Args>
    kernel_compat_t(Args &&...args) : kernel_t(std::forward<Args>(args)...) {}
};

// Factory: wraps a module + kernel handle pair into a compute::kernel_t.
// The module pointer is shared so several kernels can keep it alive.
status_t kernel_t::make(compute::kernel_t &compute_kernel,
        const std::shared_ptr<module_wrapper_t> module_ptr,
        const ze_kernel_handle_t kernel_ptr, const std::string &kernel_name) {
    compute_kernel = compute::kernel_t(std::make_shared<kernel_compat_t>(
            module_ptr, kernel_ptr, kernel_name));
    return status::success;
}

kernel_t::kernel_t(const std::shared_ptr<module_wrapper_t> module_ptr,
        const ze_kernel_handle_t kernel_ptr, const std::string &kernel_name)
    : module_(module_ptr), kernel_(kernel_ptr), kernel_name_(kernel_name) {}

// Owns kernel_: release the Level Zero kernel handle on teardown (the
// module is released by the shared module_wrapper_t).
kernel_t::~kernel_t() {
    l0::zeKernelDestroy(kernel_);
}

// Verifies that every non-null global buffer argument satisfies the
// device's alignment requirement.
status_t kernel_t::check_alignment(
        const compute::kernel_arg_list_t &arg_list) const {
    for (int i = 0; i < arg_list.nargs(); ++i) {
        auto &arg = arg_list.get(i);
        if (!arg.is_global()) continue;
        auto *mem_storage = static_cast<const memory_storage_t *>(arg.value());
        if (!*mem_storage) continue;
        CHECK(compute::kernel_impl_t::check_alignment(
                mem_storage->data_handle(), i));
    }
    return status::success;
}

// Thin wrapper over zeKernelSetArgumentValue.
status_t kernel_t::set_arg(
        int arg_index, size_t arg_size, const void *arg_value) const {
    return l0::zeKernelSetArgumentValue(
            kernel_, arg_index, arg_size, arg_value);
}
// Binds arguments, resolves work-group sizes and appends the kernel
// launch to the stream's command list. `deps` supplies the wait list for
// the launch; the launch's signal event is also registered with the
// profiler when profiling is on.
status_t kernel_t::parallel_for(impl::stream_t &stream,
        const compute::nd_range_t &range,
        const compute::kernel_arg_list_t &arg_list, const xpu::event_t &deps,
        xpu::event_t &out_dep) {
    CHECK(check_scalar_arguments(arg_list));
    CHECK(check_alignment(arg_list));
    auto l0_stream = utils::downcast<stream_t *>(&stream);
    auto l0_engine = l0_stream->l0_engine();
    auto l0_device_info = l0_engine->device_info();
    // Each pointer argument occupies one device pointer in the kernel
    // parameter block; used for the max_kernel_param_size check below.
    const size_t pointer_size = l0_device_info->device_address_bits() / 8;
    size_t param_bytes = 0;
    for (int i = 0; i < arg_list.nargs(); ++i) {
        auto &arg = arg_list.get(i);
        if (arg.is_global()) {
            auto *mem_storage
                    = static_cast<const memory_storage_t *>(arg.value());
            if (!mem_storage->is_null()) {
                // Memory must belong to the same L0 context as the engine.
                auto memory_storage_ctx
                        = utils::downcast<engine_t *>(mem_storage->engine())
                                  ->context();
                if (l0_engine->context() != memory_storage_ctx) {
                    VERROR(primitive, gpu,
                            "mismatched Level Zero context for "
                            "primitive/memory");
                    return status::invalid_arguments;
                }
                void *ptr = mem_storage->ptr();
                CHECK(set_arg(i, pointer_size, &ptr));
                param_bytes += pointer_size;
            } else {
                // Null buffer: bind a null pointer argument.
                CHECK(set_arg(i, pointer_size, nullptr));
                param_bytes += pointer_size;
            }
        } else if (arg.is_local()) {
            // Local (SLM) argument: accounted as one pointer.
            CHECK(set_arg(i, arg.size(), arg.value()));
            param_bytes += pointer_size;
        } else {
            // Scalar argument: passed by value.
            CHECK(set_arg(i, arg.size(), arg.value()));
            param_bytes += arg.size();
        }
    }
    if (param_bytes > l0_device_info->max_kernel_param_size()) {
        VERROR(primitive, gpu,
                "parameter bytes requirements greater than device supports");
        return status::invalid_arguments;
    }
    // Zero-sized range: nothing to launch.
    if (range.is_zero()) { return status::success; }
    // Missing dimensions default to 1; switch cases intentionally fall
    // through to fill lower dimensions.
    std::vector<uint32_t> global_size(3, 1);
    switch (range.global_range().ndims()) {
        case 3: global_size[2] = static_cast<uint32_t>(range.global_range()[2]);
            // fall through
        case 2: global_size[1] = static_cast<uint32_t>(range.global_range()[1]);
            // fall through
        case 1:
            global_size[0] = static_cast<uint32_t>(range.global_range()[0]);
            break;
        default:
            VERROR(primitive, gpu,
                    "incorrect number of global range dimensions");
            return status::invalid_arguments;
    }
    std::vector<uint32_t> group_size(3, 1);
    if (range.local_range()) {
        switch (range.local_range().ndims()) {
            case 3:
                group_size[2] = static_cast<uint32_t>(range.local_range()[2]);
                // fall through
            case 2:
                group_size[1] = static_cast<uint32_t>(range.local_range()[1]);
                // fall through
            case 1:
                group_size[0] = static_cast<uint32_t>(range.local_range()[0]);
                break;
            default:
                VERROR(primitive, gpu,
                        "incorrect number of local range dimensions");
                return status::invalid_arguments;
        }
    } else {
        // No explicit local size: let the driver suggest one.
        CHECK(l0::zeKernelSuggestGroupSize(kernel_, global_size[0],
                global_size[1], global_size[2], &group_size[0], &group_size[1],
                &group_size[2]));
    }
    for (size_t i = 0; i < global_size.size(); i++) {
        if (global_size[i] % group_size[i] != 0) {
            VERROR(primitive, gpu, "only uniform work-groups are supported");
            return status::invalid_arguments;
        }
    }
    CHECK(l0::zeKernelSetGroupSize(
            kernel_, group_size[0], group_size[1], group_size[2]));
    ze_group_count_t group_count = {global_size[0] / group_size[0],
            global_size[1] / group_size[1], global_size[2] / group_size[2]};
    std::vector<ze_event_handle_t> l0_deps
            = utils::downcast<const event_t *>(&deps)->events_;
    // NOTE(review): l0_out_deps is a *copy* of out_dep's event list; the
    // push_back below never propagates back into out_dep, so the caller
    // does not receive the launch event through out_dep. Confirm whether
    // this is intentional.
    std::vector<ze_event_handle_t> l0_out_deps
            = utils::downcast<const event_t *>(&out_dep)->events_;
    // Keep the signal event alive on the kernel object for this dispatch.
    event_ = l0_stream->create_event();
    ze_event_handle_t out_event = *(event_.get());
    CHECK(l0::zeCommandListAppendLaunchKernel(l0_stream->list(), kernel_,
            &group_count, out_event, static_cast<uint32_t>(l0_deps.size()),
            l0_deps.size() ? l0_deps.data() : nullptr));
    if (out_event) l0_out_deps.push_back(out_event);
    if (stream.is_profiling_enabled()) {
        l0_stream->profiler().register_event(
                utils::make_unique<event_t>(std::move(out_event)));
    }
    return status::success;
}
// Reads back the device binary for this kernel (used for caching and
// debugging dumps).
status_t kernel_t::get_kernel_binary(xpu::binary_t &binary) const {
    return l0::get_kernel_binary(kernel_, binary);
}

std::string kernel_t::name() const {
    return kernel_name_;
}

// Writes the kernel binary to disk via the common dump helper.
status_t kernel_t::dump() const {
    xpu::binary_t binary;
    CHECK(get_kernel_binary(binary));
    return gpu_utils::dump_kernel_binary(binary, kernel_name_);
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,76 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_KERNEL_HPP
#define GPU_INTEL_L0_KERNEL_HPP
#include <thread>
#include "common/rw_mutex.hpp"
#include "gpu/intel/compute/kernel.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Level Zero implementation of a compute kernel. Owns a ze_kernel_handle_t
// and keeps the module that produced it alive through a shared_ptr.
class kernel_t : public compute::kernel_impl_t {
public:
    // Factory: wraps an existing Level Zero kernel handle (plus its module)
    // into a compute::kernel_t. The constructor is private; this is the only
    // public way to build a kernel_t.
    static status_t make(compute::kernel_t &compute_kernel,
            const std::shared_ptr<module_wrapper_t> module_ptr,
            const ze_kernel_handle_t kernel_ptr,
            const std::string &kernel_name);
    ~kernel_t() override;
    // Validates argument alignment requirements before dispatch.
    status_t check_alignment(
            const compute::kernel_arg_list_t &arg_list) const override;
    // Binds a single kernel argument by index.
    status_t set_arg(
            int arg_index, size_t arg_size, const void *arg_value) const;
    // Enqueues the kernel on the given stream with the given ND-range and
    // arguments; `deps` are input events, `out_dep` receives the out event.
    status_t parallel_for(impl::stream_t &stream,
            const compute::nd_range_t &range,
            const compute::kernel_arg_list_t &arg_list,
            const xpu::event_t &deps, xpu::event_t &out_dep) override;
    status_t get_kernel_binary(xpu::binary_t &binary) const override;
    std::string name() const override;
    status_t dump() const override;
private:
    friend class kernel_compat_t;
    kernel_t(const std::shared_ptr<module_wrapper_t> module_ptr,
            const ze_kernel_handle_t kernel_ptr,
            const std::string &kernel_name);
    // Module is held so it outlives the kernel handle created from it.
    std::shared_ptr<module_wrapper_t> module_;
    ze_kernel_handle_t kernel_;
    std::string kernel_name_;
    std::shared_ptr<ze_event_pool_handle_t> event_pool_;
    // Most recent output event of parallel_for (see kernel.cpp).
    std::shared_ptr<event_wrapper_t> event_;
    kernel_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(kernel_t);
};
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_KERNEL_HPP

View File

@ -0,0 +1,215 @@
/*******************************************************************************
* Copyright 2021-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/memory_storage.hpp"
#include "common/memory_map_manager.hpp"
#include "gpu/intel/l0/engine.hpp"
#include "gpu/intel/l0/stream.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Returns the raw USM pointer backing this storage (may be nullptr).
status_t memory_storage_t::get_data_handle(void **handle) const {
    *handle = ptr_.get();
    return status::success;
}
// Adopts an externally owned pointer: the no-op deleter means this storage
// never frees it. The storage kind is re-queried from the runtime so that
// host accessibility checks stay correct for the new pointer.
status_t memory_storage_t::set_data_handle(void *handle) {
    ptr_ = decltype(ptr_)(handle, [](void *) {});
    kind_ = get_memory_storage_kind(
            get_pointer_type(l0_engine()->context(), handle));
    return status::success;
}
// Host, shared, and unknown allocations are treated as directly readable
// from the host; only pure device allocations need an explicit map.
bool memory_storage_t::is_host_accessible() const {
    switch (kind_) {
        case memory_storage_kind_t::host:
        case memory_storage_kind_t::shared:
        case memory_storage_kind_t::unknown: return true;
        default: return false;
    }
}
struct map_usm_tag;
// Maps device memory for host access. Host-accessible memory is returned
// directly; device memory is staged through a temporary host allocation
// that is copied back and freed by unmap_data().
status_t memory_storage_t::map_data(
        void **mapped_ptr, impl::stream_t *stream, size_t size) const {
    if (is_host_accessible()) {
        *mapped_ptr = ptr();
        return status::success;
    }
    // Nothing to stage for an empty or unallocated storage.
    if (!ptr() || size == 0) {
        *mapped_ptr = nullptr;
        return status::success;
    }
    if (!stream) CHECK(engine()->get_service_stream(stream));
    void *host_ptr = malloc_host(size);
    if (!host_ptr) return status::out_of_memory;
    // Guard frees the staging buffer if the copy below fails; released
    // once ownership transfers to the map manager.
    auto leak_guard = decltype(ptr_)(host_ptr, [this](void *p) { free(p); });
    CHECK(memcpy(stream, host_ptr, ptr(), size));
    // memcpy() only enqueues the copy; wait for it to finish before the
    // host reads the staging buffer.
    CHECK(stream->wait());
    leak_guard.release();
    // Capture the device pointer now: ptr() could change before unmap.
    auto *usm_ptr_for_unmap = ptr();
    // Callback run by unmap_data(): copy the staging buffer back to the
    // device, synchronize, then release the staging buffer.
    auto unmap_callback = [size, usm_ptr_for_unmap, this](
                                  impl::stream_t *stream, void *mapped_ptr) {
        CHECK(memcpy(stream, usm_ptr_for_unmap, mapped_ptr, size));
        CHECK(stream->wait());
        free(mapped_ptr);
        return status::success;
    };
    auto &map_manager = memory_map_manager_t<map_usm_tag>::instance();
    *mapped_ptr = host_ptr;
    return map_manager.map(this, stream, *mapped_ptr, unmap_callback);
}
// Flushes a staging buffer created by map_data() back to the device and
// releases it. No-op for host-accessible memory or a null pointer.
status_t memory_storage_t::unmap_data(
        void *mapped_ptr, impl::stream_t *stream) const {
    const bool nothing_to_do = (mapped_ptr == nullptr) || is_host_accessible();
    if (nothing_to_do) return status::success;
    if (!stream) CHECK(engine()->get_service_stream(stream));
    return memory_map_manager_t<map_usm_tag>::instance().unmap(
            this, stream, mapped_ptr);
}
// Creates a non-owning view at `offset` into this storage. The sub-storage
// shares the same kind and does not free the underlying allocation.
std::unique_ptr<impl::memory_storage_t> memory_storage_t::get_sub_storage(
        size_t offset, size_t size) const {
    void *sub_ptr
            = ptr_ ? reinterpret_cast<uint8_t *>(ptr_.get()) + offset : nullptr;
    auto storage = utils::make_unique<memory_storage_t>(engine(), kind_);
    if (!storage) return nullptr;
    // use_runtime_ptr: the sub-storage adopts the pointer without owning it.
    auto status = storage->init(memory_flags_t::use_runtime_ptr, size, sub_ptr);
    if (status != status::success) return nullptr;
    // XXX: Clang has a bug that prevents implicit conversion.
    return std::unique_ptr<memory_storage_t>(storage.release());
}
// Creates a shallow, non-owning copy of this storage (same pointer, same
// kind, no-op deleter).
std::unique_ptr<impl::memory_storage_t> memory_storage_t::clone() const {
    auto storage = utils::make_unique<memory_storage_t>(engine(), kind_);
    if (!storage) return nullptr;
    auto status = storage->init(memory_flags_t::use_runtime_ptr, 0, nullptr);
    if (status != status::success) return nullptr;
    storage->ptr_ = decltype(ptr_)(ptr_.get(), [](void *) {});
    storage->kind_ = kind_;
    // XXX: Clang has a bug that prevents implicit conversion.
    return std::unique_ptr<memory_storage_t>(storage.release());
}
// Allocates USM memory of the requested size according to kind_. An
// unknown kind defaults to a device allocation. The resulting pointer is
// owned by this storage and released through free() when ptr_ is reset
// or destroyed.
status_t memory_storage_t::init_allocate(size_t size) {
    if (kind_ == memory_storage_kind_t::unknown)
        kind_ = memory_storage_kind_t::device;
    void *ptr_alloc = nullptr;
    switch (kind_) {
        case memory_storage_kind_t::host: ptr_alloc = malloc_host(size); break;
        case memory_storage_kind_t::device:
            ptr_alloc = malloc_device(size);
            break;
        case memory_storage_kind_t::shared:
            ptr_alloc = malloc_shared(size);
            break;
        default: break;
    }
    // malloc_* helpers return nullptr on failure.
    if (!ptr_alloc) return status::out_of_memory;
    // Capture `this` explicitly: the deleter is stored inside ptr_ and
    // outlives this scope, so a default by-reference capture ([&]) is
    // fragile; this also matches the explicit capture in map_data().
    ptr_ = decltype(ptr_)(ptr_alloc, [this](void *ptr) { free(ptr); });
    return status::success;
}
// Allocates host USM memory. Returns nullptr on failure: the ze call's
// status is intentionally ignored and failure is reported via the pointer.
void *memory_storage_t::malloc_host(size_t size) const {
    void *pptr = nullptr;
    ze_host_mem_alloc_desc_t host_mem_alloc_desc = {};
    host_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
    host_mem_alloc_desc.pNext = nullptr;
    host_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW;
    l0::zeMemAllocHost(
            l0_engine()->context(), &host_mem_alloc_desc, size, 0, &pptr);
    return pptr;
}
// Allocates device USM memory (ordinal 0). Returns nullptr on failure.
void *memory_storage_t::malloc_device(size_t size) const {
    void *pptr = nullptr;
    ze_device_mem_alloc_desc_t device_mem_alloc_desc = {};
    device_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
    device_mem_alloc_desc.pNext = nullptr;
    device_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW;
    device_mem_alloc_desc.ordinal = 0;
    l0::zeMemAllocDevice(l0_engine()->context(), &device_mem_alloc_desc, size,
            0, l0_engine()->device(), &pptr);
    return pptr;
}
// Allocates shared (migratable host/device) USM memory. Returns nullptr on
// failure.
void *memory_storage_t::malloc_shared(size_t size) const {
    void *pptr = nullptr;
    ze_device_mem_alloc_desc_t device_mem_alloc_desc = {};
    device_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
    device_mem_alloc_desc.pNext = nullptr;
    device_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW;
    device_mem_alloc_desc.ordinal = 0;
    ze_host_mem_alloc_desc_t host_mem_alloc_desc = {};
    host_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
    host_mem_alloc_desc.pNext = nullptr;
    host_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW;
    l0::zeMemAllocShared(l0_engine()->context(), &device_mem_alloc_desc,
            &host_mem_alloc_desc, size, 0, l0_engine()->device(), &pptr);
    return pptr;
}
// Releases a USM allocation obtained from any of the malloc_* helpers.
void memory_storage_t::free(void *ptr) const {
    l0::zeMemFree(l0_engine()->context(), ptr);
}
// Appends an asynchronous copy to the stream's command list. Callers that
// need completion must synchronize the stream themselves (see map_data).
status_t memory_storage_t::memcpy(
        impl::stream_t *stream, void *dst, const void *src, size_t size) const {
    auto *l0_stream = utils::downcast<stream_t *>(stream);
    return l0::zeCommandListAppendMemoryCopy(
            l0_stream->list(), dst, src, size, nullptr, 0, nullptr);
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,91 @@
/*******************************************************************************
* Copyright 2021-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_MEMORY_STORAGE_HPP
#define GPU_INTEL_L0_MEMORY_STORAGE_HPP
#include <functional>
#include "common/c_types_map.hpp"
#include "common/memory_storage.hpp"
#include "common/utils.hpp"
#include "gpu/intel/l0/engine.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
enum class memory_storage_kind_t { unknown, host, device, shared };
// Maps a Level Zero USM memory type to the backend's storage kind.
inline memory_storage_kind_t get_memory_storage_kind(ze_memory_type_t type) {
    switch (type) {
        case ZE_MEMORY_TYPE_HOST: return memory_storage_kind_t::host;
        case ZE_MEMORY_TYPE_DEVICE: return memory_storage_kind_t::device;
        case ZE_MEMORY_TYPE_SHARED: return memory_storage_kind_t::shared;
        default: return memory_storage_kind_t::unknown;
    }
}
// Level Zero USM-backed memory storage. Owns (or, after set_data_handle,
// merely views) a host/device/shared USM pointer.
class memory_storage_t : public impl::memory_storage_t {
public:
    memory_storage_t(impl::engine_t *engine, memory_storage_kind_t kind)
        : impl::memory_storage_t(engine), kind_(kind) {}
    // Raw underlying pointer; may be nullptr before init/allocation.
    void *ptr() const { return ptr_.get(); }
    status_t get_data_handle(void **handle) const override;
    status_t set_data_handle(void *handle) override;
    bool is_host_accessible() const override;
    status_t map_data(void **mapped_ptr, impl::stream_t *stream,
            size_t size) const override;
    status_t unmap_data(
            void *mapped_ptr, impl::stream_t *stream) const override;
    std::unique_ptr<impl::memory_storage_t> get_sub_storage(
            size_t offset, size_t size) const override;
    std::unique_ptr<impl::memory_storage_t> clone() const override;
private:
    status_t init_allocate(size_t size) override;
    gpu::intel::l0::engine_t *l0_engine() const {
        return utils::downcast<gpu::intel::l0::engine_t *>(engine());
    }
    // USM allocation helpers; each returns nullptr on failure.
    void *malloc_host(size_t size) const;
    void *malloc_device(size_t size) const;
    void *malloc_shared(size_t size) const;
    void free(void *ptr) const;
    // Enqueues an async copy on the stream (no completion guarantee).
    status_t memcpy(impl::stream_t *stream, void *dst, const void *src,
            size_t size) const;
    // Deleter encodes ownership: real free() for owned allocations, no-op
    // for adopted/cloned pointers.
    std::unique_ptr<void, std::function<void(void *)>> ptr_;
    memory_storage_kind_t kind_ = memory_storage_kind_t::unknown;
    DNNL_DISALLOW_COPY_AND_ASSIGN(memory_storage_t);
};
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_MEMORY_STORAGE_HPP

217
src/gpu/intel/l0/stream.cpp Normal file
View File

@ -0,0 +1,217 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/stream.hpp"
#include "gpu/intel/l0/engine.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Factory: wraps a stream implementation into an l0::stream_t and hands
// ownership to the caller via *stream.
status_t stream_t::create_stream(impl::stream_t **stream,
        impl::engine_t *engine, impl::stream_impl_t *stream_impl) {
    std::unique_ptr<intel::l0::stream_t> s(
            new intel::l0::stream_t(engine, stream_impl));
    // NOTE(review): `new` throws on failure, so this check is effectively
    // dead; kept for parity with other backends' factories.
    if (!s) return status::out_of_memory;
    *stream = s.release();
    return status::success;
}
// Called by the common stream code right before primitive execution.
void stream_t::before_exec_hook() {
    if (is_profiling_enabled()) profiler_->start_profiling();
}
// Called right after execution: clears the per-execution dependency list
// stored in the context and stops profiling collection.
void stream_t::after_exec_hook() {
    l0_ctx().set_deps(event_t());
    if (is_profiling_enabled()) profiler_->stop_profiling();
}
// Clears all accumulated profiling entries; only valid when the stream
// was created with profiling enabled.
status_t stream_t::reset_profiling() {
    if (!is_profiling_enabled()) return status::invalid_arguments;
    profiler_->reset();
    return status::success;
}
// Retrieves collected profiling data; only valid with profiling enabled.
status_t stream_t::get_profiling_data(profiling_data_kind_t data_kind,
        int *num_entries, uint64_t *data) const {
    if (!is_profiling_enabled()) return status::invalid_arguments;
    return profiler_->get_info(data_kind, num_entries, data);
}
// Adopts an existing (user-provided) command list; allocated_ = false so
// the destructor does not destroy it. The context is queried from the list.
stream_impl_t::stream_impl_t(unsigned flags, ze_command_list_handle_t list)
    : impl::stream_impl_t(flags)
    , allocated_(false)
    , list_(list)
    , event_pool_(nullptr) {
    l0::zeCommandListGetContextHandle(list_, &context_);
    // Events are only needed for out-of-order dependencies or profiling.
    if (flags & stream_flags::out_of_order || is_profiling_enabled())
        create_event_pool();
}
// Creates a stream-owned immediate, in-order command list on the given
// context/device; allocated_ = true so the destructor releases it.
stream_impl_t::stream_impl_t(
        unsigned flags, ze_context_handle_t context, ze_device_handle_t device)
    : impl::stream_impl_t(flags)
    , context_(context)
    , allocated_(true)
    , event_pool_(nullptr) {
    ze_command_queue_desc_t command_queue_desc = {};
    command_queue_desc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
    command_queue_desc.pNext = nullptr;
    command_queue_desc.ordinal = 0;
    command_queue_desc.index = 0;
    command_queue_desc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER;
    command_queue_desc.mode = ZE_COMMAND_QUEUE_MODE_DEFAULT;
    command_queue_desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
    // NOTE(review): if this call fails, list_ stays uninitialized -- the
    // return status is not checked here; confirm callers validate creation.
    l0::zeCommandListCreateImmediate(
            context_, device, &command_queue_desc, &list_);
    if (flags & stream_flags::out_of_order || is_profiling_enabled())
        create_event_pool();
}
// Creates the event pool used by create_event(). Host-visible events; the
// kernel-timestamp flag is added when profiling is on.
void stream_impl_t::create_event_pool() {
    ze_event_pool_desc_t event_pool_desc = {};
    event_pool_desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
    event_pool_desc.pNext = nullptr;
    event_pool_desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
    if (is_profiling_enabled())
        event_pool_desc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
    // Fixed capacity; event indices handed out by create_event() must stay
    // below this count.
    event_pool_desc.count = 16384;
    ze_event_pool_handle_t event_pool;
    l0::zeEventPoolCreate(context_, &event_pool_desc, 0, nullptr, &event_pool);
    event_pool_ = std::make_shared<event_pool_wrapper_t>(event_pool);
}
// Drains outstanding work, then destroys the command list only if this
// stream created it (allocated_ set by the owning constructor).
stream_impl_t::~stream_impl_t() {
    wait();
    if (allocated_) l0::zeCommandListDestroy(list_);
}
// Generic-context accessors required by the xpu interface; both forward to
// the Level Zero specific context.
xpu::context_t &stream_impl_t::ctx() {
    return l0_ctx();
}
const xpu::context_t &stream_impl_t::ctx() const {
    return l0_ctx();
}
// Non-const accessor implemented on top of the const one (standard
// const_cast delegation pattern to avoid duplicating the lookup).
context_t &stream_impl_t::l0_ctx() {
    const context_t &ctx = const_cast<const stream_impl_t *>(this)->l0_ctx();
    return *const_cast<context_t *>(&ctx);
}
// Returns the thread-local context, lazily default-constructed per thread.
const context_t &stream_impl_t::l0_ctx() const {
    static context_t empty_ctx;
    return ctx_.get(empty_ctx);
}
// Returns the Level Zero event recorded as the current output dependency,
// or nullptr when the context carries no dependencies.
ze_event_handle_t stream_impl_t::get_output_event() const {
    const auto &dep_events = event_t::from(ctx().get_deps()).events_;
    return dep_events.empty() ? nullptr : dep_events.front();
}
// Creates a host-visible event from the stream's pool and records it in
// events_ to keep it alive. Without a pool (in-order, no profiling) a
// null-wrapped event is returned so callers can pass it through unchanged.
std::shared_ptr<event_wrapper_t> stream_impl_t::create_event() {
    if (!event_pool_.get()) return std::make_shared<event_wrapper_t>(nullptr);
    ze_event_desc_t event_desc = {};
    event_desc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
    event_desc.pNext = nullptr;
    // NOTE(review): indices grow monotonically with events_ and are never
    // recycled; confirm they cannot exceed the pool's fixed count (16384).
    event_desc.index = static_cast<uint32_t>(events_.size());
    event_desc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
    event_desc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
    ze_event_handle_t event;
    l0::zeEventCreate(*(event_pool_.get()), &event_desc, &event);
    std::shared_ptr<event_wrapper_t> event_ptr
            = std::make_shared<event_wrapper_t>(event);
    events_.push_back(event_ptr);
    return event_ptr;
}
// Accessor for the stream's event pool (may hold nullptr for in-order,
// non-profiled streams).
std::shared_ptr<event_pool_wrapper_t> stream_impl_t::get_event_pool() {
    return event_pool_;
}
// Accessor for the underlying command list handle.
ze_command_list_handle_t stream_impl_t::list() {
    return list_;
}
// Blocks until all commands appended to the list have completed.
status_t stream_impl_t::wait() {
    CHECK(l0::zeCommandListHostSynchronize(list_, UINT64_MAX));
    return status::success;
}
// Appends an execution barrier ordering all prior commands before later ones.
status_t stream_impl_t::barrier() {
    CHECK(l0::zeCommandListAppendBarrier(list_, nullptr, 0, nullptr));
    return status::success;
}
// Enqueues an async device copy src -> dst; input events come from `deps`
// and the signaling event (if any) is appended to `out_dep`.
status_t stream_impl_t::copy(const impl::memory_storage_t &src,
        const impl::memory_storage_t &dst, size_t size,
        const xpu::event_t &deps, xpu::event_t &out_dep) {
    if (size == 0) return status::success;
    std::vector<ze_event_handle_t> l0_deps
            = utils::downcast<const event_t *>(&deps)->events_;
    // May be nullptr when the stream has no event pool.
    ze_event_handle_t out_event = *(create_event().get());
    CHECK(l0::zeCommandListAppendMemoryCopy(list_, dst.data_handle(),
            src.data_handle(), size, out_event,
            static_cast<uint32_t>(l0_deps.size()),
            l0_deps.size() ? l0_deps.data() : nullptr));
    if (out_event)
        utils::downcast<event_t *>(&out_dep)->events_.push_back(out_event);
    return status::success;
}
// Enqueues an async fill of dst with a repeated single-byte pattern; event
// handling mirrors copy() above.
status_t stream_impl_t::fill(const impl::memory_storage_t &dst, uint8_t pattern,
        size_t size, const xpu::event_t &deps, xpu::event_t &out_dep) {
    if (size == 0) return status::success;
    std::vector<ze_event_handle_t> l0_deps
            = utils::downcast<const event_t *>(&deps)->events_;
    ze_event_handle_t out_event = *(create_event().get());
    CHECK(l0::zeCommandListAppendMemoryFill(list_, dst.data_handle(), &pattern,
            sizeof(pattern), size, out_event,
            static_cast<uint32_t>(l0_deps.size()),
            l0_deps.size() ? l0_deps.data() : nullptr));
    if (out_event)
        utils::downcast<event_t *>(&out_dep)->events_.push_back(out_event);
    return status::success;
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

138
src/gpu/intel/l0/stream.hpp Normal file
View File

@ -0,0 +1,138 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_STREAM_HPP
#define GPU_INTEL_L0_STREAM_HPP
#include <list>
#include "common/thread_local_storage.hpp"
#include "gpu/intel/l0/context.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
#include "gpu/intel/stream.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Level Zero stream implementation: wraps an immediate command list (either
// adopted from the user or created by the stream) plus the event machinery
// needed for dependencies and profiling.
class stream_impl_t : public impl::stream_impl_t {
public:
    // Adopts an existing command list (not destroyed on teardown).
    stream_impl_t(unsigned flags, ze_command_list_handle_t list);
    // Creates and owns an immediate in-order command list.
    stream_impl_t(unsigned flags, ze_context_handle_t context,
            ze_device_handle_t device);
    ~stream_impl_t();
    context_t &l0_ctx();
    const context_t &l0_ctx() const;
    xpu::context_t &ctx();
    const xpu::context_t &ctx() const;
    // First event of the context's current dependency list, or nullptr.
    ze_event_handle_t get_output_event();
    std::shared_ptr<event_wrapper_t> create_event();
    std::shared_ptr<event_pool_wrapper_t> get_event_pool();
    ze_command_list_handle_t list();
    status_t wait();
    status_t barrier();
    status_t copy(const impl::memory_storage_t &src,
            const impl::memory_storage_t &dst, size_t size,
            const xpu::event_t &deps, xpu::event_t &out_dep);
    status_t fill(const impl::memory_storage_t &dst, uint8_t pattern,
            size_t size, const xpu::event_t &deps, xpu::event_t &out_dep);
private:
    void create_event_pool();
    ze_context_handle_t context_;
    // True when this stream created list_ and must destroy it.
    bool allocated_;
    ze_command_list_handle_t list_;
    std::shared_ptr<event_pool_wrapper_t> event_pool_;
    // Keeps created events alive; list size also drives event indices.
    std::list<std::shared_ptr<event_wrapper_t>> events_;
    // Per-thread execution context.
    mutable utils::thread_local_storage_t<context_t> ctx_;
    stream_impl_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(stream_impl_t);
};
// Public Level Zero stream type; thin facade forwarding to stream_impl_t.
class stream_t : public intel::stream_t {
public:
    // Factory: wraps a stream implementation; see stream.cpp.
    static status_t create_stream(impl::stream_t **stream,
            impl::engine_t *engine, impl::stream_impl_t *stream_impl);
    // Typed accessor for the backend implementation.
    stream_impl_t *impl() const {
        return static_cast<stream_impl_t *>(impl::stream_t::impl_.get());
    }
    engine_t *l0_engine() const {
        return utils::downcast<engine_t *>(engine());
    }
    context_t &l0_ctx() { return impl()->l0_ctx(); }
    const context_t &l0_ctx() const { return impl()->l0_ctx(); }
    xpu::context_t &ctx() override { return impl()->ctx(); }
    const xpu::context_t &ctx() const override { return impl()->ctx(); }
    ze_event_handle_t get_output_event() const {
        return impl()->get_output_event();
    }
    std::shared_ptr<event_wrapper_t> create_event() {
        return impl()->create_event();
    }
    std::shared_ptr<event_pool_wrapper_t> get_event_pool() {
        return impl()->get_event_pool();
    }
    // Top-level const on a by-value return is meaningless (and triggers
    // -Wignored-qualifiers), so the handle is returned unqualified,
    // matching stream_impl_t::list().
    ze_command_list_handle_t list() const { return impl()->list(); }
    status_t wait() override { return impl()->wait(); }
    status_t barrier() override { return impl()->barrier(); }
    void before_exec_hook() override;
    void after_exec_hook() override;
    status_t reset_profiling() override;
    status_t get_profiling_data(profiling_data_kind_t data_kind,
            int *num_entries, uint64_t *data) const override;
    status_t copy(const impl::memory_storage_t &src,
            const impl::memory_storage_t &dst, size_t size,
            const xpu::event_t &deps, xpu::event_t &out_dep) override {
        return impl()->copy(src, dst, size, deps, out_dep);
    }
    status_t fill(const impl::memory_storage_t &dst, uint8_t pattern,
            size_t size, const xpu::event_t &deps,
            xpu::event_t &out_dep) override {
        return impl()->fill(dst, pattern, size, deps, out_dep);
    }
private:
    stream_t(impl::engine_t *engine, impl::stream_impl_t *stream_impl)
        : gpu::intel::stream_t(engine, stream_impl) {}
    stream_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(stream_t);
};
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_STREAM_HPP

View File

@ -0,0 +1,24 @@
#===============================================================================
# Copyright 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
# Collect every Level Zero utility source in this directory tree and build
# it into an object library that is linked into the main oneDNN library.
file(GLOB_RECURSE SOURCES
    ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp
    )
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_l0_utils)
add_library(${OBJ_LIB} OBJECT ${SOURCES})
# Register the objects with the global list of oneDNN link dependencies.
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
    $<TARGET_OBJECTS:${OBJ_LIB}>)

View File

@ -0,0 +1,387 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/utils/utils.hpp"
#include "gpu/intel/jit/binary_format.hpp"
#include "gpu/intel/jit/utils/ngen_type_bridge.hpp"
#include "ngen_level_zero.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// RAII wrapper over a Level Zero event handle (may wrap nullptr).
event_wrapper_t::event_wrapper_t(ze_event_handle_t event) : event_(event) {}
// Waits for the event to signal before destroying it so the handle is
// never released while a command may still be using it.
event_wrapper_t::~event_wrapper_t() {
    if (event_) {
        l0::zeEventHostSynchronize(event_, UINT64_MAX);
        l0::zeEventDestroy(event_);
    }
}
// Implicit conversion to the underlying Level Zero handle.
event_wrapper_t::operator ze_event_handle_t() const {
    return event_;
}
// RAII wrapper over a Level Zero event pool handle (may wrap nullptr).
event_pool_wrapper_t::event_pool_wrapper_t(ze_event_pool_handle_t event_pool)
    : event_pool_(event_pool) {}
// Destroys the pool if one is owned.
event_pool_wrapper_t::~event_pool_wrapper_t() {
    if (event_pool_) l0::zeEventPoolDestroy(event_pool_);
}
// Implicit conversion to the underlying Level Zero handle.
event_pool_wrapper_t::operator ze_event_pool_handle_t() const {
    return event_pool_;
}
// RAII wrapper over a Level Zero module handle (may wrap nullptr).
// (Stray semicolons after the out-of-line ctor/dtor definitions removed.)
module_wrapper_t::module_wrapper_t(ze_module_handle_t module)
    : module_(module) {}
// Destroys the module if one is owned.
module_wrapper_t::~module_wrapper_t() {
    if (module_) l0::zeModuleDestroy(module_);
}
// Implicit conversion to the underlying Level Zero handle.
module_wrapper_t::operator ze_module_handle_t() const {
    return module_;
}
// Queries the device IP version via the IP-version extension chained onto
// the standard device-properties query.
status_t get_device_ip(ze_device_handle_t device, uint32_t &ip_version) {
    ze_device_ip_version_ext_t device_ip_version_ext = {};
    device_ip_version_ext.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT;
    device_ip_version_ext.pNext = nullptr;
    ze_device_properties_t device_properties = {};
    device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    // Chain the extension struct so one query fills both.
    device_properties.pNext = &device_ip_version_ext;
    CHECK(l0::zeDeviceGetProperties(device, &device_properties));
    ip_version = device_ip_version_ext.ipVersion;
    return status::success;
}
// Detects DPAS (systolic array) support through the Intel module DP
// extension chained onto the module-properties query.
status_t get_l0_device_enabled_systolic_intel(
        ze_device_handle_t device, bool &mayiuse_systolic) {
    ze_intel_device_module_dp_exp_properties_t
            intel_device_module_dp_exp_properties
            = {};
    intel_device_module_dp_exp_properties.stype
            = ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES;
    intel_device_module_dp_exp_properties.pNext = nullptr;
    ze_device_module_properties_t device_module_properties = {};
    device_module_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
    device_module_properties.pNext = &intel_device_module_dp_exp_properties;
    CHECK(l0::zeDeviceGetModuleProperties(device, &device_module_properties));
    mayiuse_systolic = intel_device_module_dp_exp_properties.flags
            & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS;
    return status::success;
}
// Queries native fp16/fp32/fp64 atomic capabilities and ORs the matching
// bits into `native_extensions`. A capability is reported only when both
// the global and local variants are supported.
status_t get_l0_device_enabled_native_float_atomics(
        ze_device_handle_t device, uint64_t &native_extensions) {
    using namespace gpu::intel::compute;
    ze_float_atomic_ext_properties_t float_atomic_ext_properties = {};
    float_atomic_ext_properties.stype
            = ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES;
    float_atomic_ext_properties.pNext = nullptr;
    ze_device_module_properties_t device_module_properties = {};
    device_module_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
    device_module_properties.pNext = &float_atomic_ext_properties;
    CHECK(l0::zeDeviceGetModuleProperties(device, &device_module_properties));
    // Combined global+local masks for each atomic operation class.
    ze_device_fp_atomic_ext_flags_t atomic_load_store
            = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_LOAD_STORE
            | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_LOAD_STORE;
    ze_device_fp_atomic_ext_flags_t atomic_add
            = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD
            | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_ADD;
    ze_device_fp_atomic_ext_flags_t atomic_min_max
            = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_MIN_MAX
            | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_MIN_MAX;
    if ((float_atomic_ext_properties.fp16Flags & atomic_load_store)
            == atomic_load_store)
        native_extensions |= (uint64_t)native_ext_t::fp16_atomic_load_store;
    if ((float_atomic_ext_properties.fp16Flags & atomic_add) == atomic_add)
        native_extensions |= (uint64_t)native_ext_t::fp16_atomic_add;
    if ((float_atomic_ext_properties.fp16Flags & atomic_min_max)
            == atomic_min_max)
        native_extensions |= (uint64_t)native_ext_t::fp16_atomic_min_max;
    if ((float_atomic_ext_properties.fp32Flags & atomic_load_store)
            == atomic_load_store)
        native_extensions |= (uint64_t)native_ext_t::fp32_atomic_load_store;
    if ((float_atomic_ext_properties.fp32Flags & atomic_add) == atomic_add)
        native_extensions |= (uint64_t)native_ext_t::fp32_atomic_add;
    if ((float_atomic_ext_properties.fp32Flags & atomic_min_max)
            == atomic_min_max)
        native_extensions |= (uint64_t)native_ext_t::fp32_atomic_min_max;
    if ((float_atomic_ext_properties.fp64Flags & atomic_load_store)
            == atomic_load_store)
        native_extensions |= (uint64_t)native_ext_t::fp64_atomic_load_store;
    if ((float_atomic_ext_properties.fp64Flags & atomic_add) == atomic_add)
        native_extensions |= (uint64_t)native_ext_t::fp64_atomic_add;
    if ((float_atomic_ext_properties.fp64Flags & atomic_min_max)
            == atomic_min_max)
        native_extensions |= (uint64_t)native_ext_t::fp64_atomic_min_max;
    return status::success;
}
// Queries the total EU count via the EU-count extension chained onto the
// standard device-properties query.
status_t get_l0_device_eu_count(ze_device_handle_t device, int &eu_count) {
    ze_eu_count_ext_t eu_count_ext = {};
    eu_count_ext.stype = ZE_STRUCTURE_TYPE_EU_COUNT_EXT;
    eu_count_ext.pNext = nullptr;
    ze_device_properties_t device_properties = {};
    device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    device_properties.pNext = &eu_count_ext;
    CHECK(l0::zeDeviceGetProperties(device, &device_properties));
    eu_count = eu_count_ext.numTotalEUs;
    return status::success;
}
// Populates GPU hardware info (arch, product, IP version, systolic and
// native-atomic capabilities, nGEN kernel support) for an L0 device.
status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device,
        ze_context_handle_t context, uint32_t &ip_version,
        compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product_,
        uint64_t &native_extensions, bool &mayiuse_systolic,
        bool &mayiuse_ngen_kernels) {
    using namespace ngen;
    ngen::Product product = LevelZeroCodeGenerator<HW::Unknown>::detectHWInfo(
            context, device);
    gpu_arch = jit::convert_ngen_arch_to_dnnl(ngen::getCore(product.family));
    // NOTE(review): assumes compute::gpu_product_t is layout-compatible
    // with ngen::Product -- confirm the raw memcpy stays valid.
    std::memcpy(&product_, &product, sizeof(ngen::Product));
    mayiuse_systolic = false;
    if (get_l0_device_enabled_systolic_intel(device, mayiuse_systolic)
            != status::success)
        mayiuse_systolic = false;
    /* Some old drivers do not report systolic availability. Manually override
   systolic availability based on product family. */
    switch (product.family) {
        case ProductFamily::DG2:
        case ProductFamily::ARL:
        case ProductFamily::PVC: mayiuse_systolic = true;
        // falls through to default (no break above is intentional)
        default: break;
    }
    CHECK(get_l0_device_enabled_native_float_atomics(
            device, native_extensions));
    auto status
            = jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine);
    if (status != status::success) mayiuse_ngen_kernels = false;
    ip_version = 0;
    return get_device_ip(device, ip_version);
}
// Packs the 16-byte Level Zero device UUID into a pair of 64-bit words
// (little-endian byte order within each word).
xpu::device_uuid_t get_device_uuid(const ze_device_handle_t device) {
    static_assert(ZE_MAX_DEVICE_UUID_SIZE == 16,
            "ZE_MAX_DEVICE_UUID_SIZE is expected to be 16");
    ze_device_properties_t device_properties = {};
    device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    device_properties.pNext = nullptr;
    auto status = l0::zeDeviceGetProperties(device, &device_properties);
    MAYBE_UNUSED(status);
    // Failure here is a programmer/runtime error; checked only in debug.
    assert(status == status::success);
    const auto &device_id = device_properties.uuid.id;
    uint64_t uuid[ZE_MAX_DEVICE_UUID_SIZE / sizeof(uint64_t)] = {};
    for (size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) {
        size_t shift = i % sizeof(uint64_t) * CHAR_BIT;
        uuid[i / sizeof(uint64_t)] |= (((uint64_t)device_id[i]) << shift);
    }
    return xpu::device_uuid_t(uuid[0], uuid[1]);
}
// Finds the position of `device` within the device list of the first
// Level Zero driver. Returns invalid_arguments when no driver is present
// or the device is not found.
// NOTE(review): only the first driver is searched; devices exposed by
// additional drivers will not be found -- confirm this matches how device
// indices are assigned elsewhere in the backend.
status_t get_device_index(const ze_device_handle_t device, size_t *index) {
    uint32_t driver_count = 0;
    CHECK(l0::zeDriverGet(&driver_count, nullptr));
    // Guard against an empty driver list: drivers[0] below would otherwise
    // be an out-of-bounds access.
    if (driver_count == 0) return status::invalid_arguments;
    std::vector<ze_driver_handle_t> drivers(driver_count);
    CHECK(l0::zeDriverGet(&driver_count, drivers.data()));
    uint32_t device_count = 0;
    CHECK(l0::zeDeviceGet(drivers[0], &device_count, nullptr));
    std::vector<ze_device_handle_t> devices(device_count);
    CHECK(l0::zeDeviceGet(drivers[0], &device_count, devices.data()));
    for (size_t i = 0; i < device_count; i++) {
        if (device == devices[i]) {
            *index = i;
            return status::success;
        }
    }
    return status::invalid_arguments;
}
// Queries a kernel's name using the two-call size/fill idiom. Statuses of
// the ze calls are intentionally ignored; on failure the returned string
// is empty.
std::string get_kernel_name(const ze_kernel_handle_t kernel) {
    std::string kernel_name;
    size_t kernel_name_size = 0;
    l0::zeKernelGetName(kernel, &kernel_name_size, nullptr);
    kernel_name.resize(kernel_name_size, 0);
    l0::zeKernelGetName(kernel, &kernel_name_size, &kernel_name[0]);
    // Remove the null terminator as std::string already includes it
    kernel_name.resize(kernel_name_size - 1);
    return kernel_name;
}
// Fetches the kernel's device binary using the two-call size/fill idiom
// of the kernel-binary experimental extension.
status_t get_kernel_binary(
        const ze_kernel_handle_t kernel, xpu::binary_t &binary) {
    size_t binary_size = 0;
    CHECK(l0::zeKernelGetBinaryExp(kernel, &binary_size, nullptr));
    binary.resize(binary_size);
    CHECK(l0::zeKernelGetBinaryExp(kernel, &binary_size, binary.data()));
    return status::success;
}
// Fetches the module's native (device-specific) binary using the two-call
// size/fill idiom.
status_t get_module_binary(
        const ze_module_handle_t module, xpu::binary_t &binary) {
    // Initialized to 0 (was previously left uninitialized); the first call
    // fills in the required size.
    size_t module_binary_size = 0;
    CHECK(l0::zeModuleGetNativeBinary(module, &module_binary_size, nullptr));
    binary.resize(module_binary_size);
    CHECK(l0::zeModuleGetNativeBinary(
            module, &module_binary_size, binary.data()));
    return status::success;
}
// OpenCL C source input format for zeModuleCreate (Intel extension value,
// not exposed by all public Level Zero headers).
#define ZE_MODULE_FORMAT_OCLC (ze_module_format_t)3U
// Compiles OpenCL C source `code` with build `options` into a native module
// binary stored in `binary`. On a build failure the build log is printed to
// stdout and status::runtime_error is returned.
status_t compile_ocl_module(const ze_device_handle_t device,
        const ze_context_handle_t context, std::string &code,
        std::string &options, xpu::binary_t &binary) {
    ze_module_desc_t module_desc = {};
    module_desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
    module_desc.pNext = nullptr;
    module_desc.format = ZE_MODULE_FORMAT_OCLC;
    module_desc.inputSize = code.size();
    module_desc.pInputModule = reinterpret_cast<const uint8_t *>(code.c_str());
    module_desc.pBuildFlags = options.c_str();
    module_desc.pConstants = nullptr;

    ze_module_handle_t module;
    ze_module_build_log_handle_t module_build_log;
    if (l0::zeModuleCreate(
                context, device, &module_desc, &module, &module_build_log)
            != status::success) {
        // Dump the build log to aid debugging. Use std::vector instead of a
        // raw new[]/delete[] pair so nothing leaks on early exit, and make
        // sure the log handle itself is destroyed before returning (it was
        // previously leaked on this path).
        size_t build_log_size = 0;
        if (l0::zeModuleBuildLogGetString(
                    module_build_log, &build_log_size, nullptr)
                        == status::success
                && build_log_size > 0) {
            std::vector<char> build_log(build_log_size);
            if (l0::zeModuleBuildLogGetString(module_build_log,
                        &build_log_size, build_log.data())
                    == status::success)
                std::cout << std::endl
                          << "Build log: " << build_log.data() << std::endl;
        }
        l0::zeModuleBuildLogDestroy(module_build_log);
        return status::runtime_error;
    }
    CHECK(l0::zeModuleBuildLogDestroy(module_build_log));

    // Destroy the module even if extracting the binary fails so the handle
    // does not leak; report the first failure.
    status_t status = get_module_binary(module, binary);
    CHECK(l0::zeModuleDestroy(module));
    return status;
}
// Builds a module from a previously generated native binary and creates one
// kernel per entry of `kernel_names`. Null entries produce null handles in
// the output vector, preserving positional correspondence with the names.
status_t create_kernels_from_binary(const ze_device_handle_t device,
        const ze_context_handle_t context,
        const std::vector<const char *> &kernel_names,
        const xpu::binary_t &binary, ze_module_handle_t *module,
        std::vector<ze_kernel_handle_t> &kernels) {
    ze_module_desc_t desc;
    desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
    desc.pNext = nullptr;
    desc.format = ZE_MODULE_FORMAT_NATIVE;
    desc.inputSize = binary.size();
    desc.pInputModule = binary.data();
    desc.pBuildFlags = "";
    desc.pConstants = nullptr;
    CHECK(l0::zeModuleCreate(context, device, &desc, module, nullptr));

    kernels.resize(kernel_names.size(), nullptr);
    for (size_t idx = 0; idx < kernel_names.size(); idx++) {
        const char *name = kernel_names[idx];
        if (!name) {
            // Placeholder slot: keep a null handle.
            kernels[idx] = nullptr;
            continue;
        }
        ze_kernel_desc_t kernel_desc = {};
        kernel_desc.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC;
        kernel_desc.pNext = nullptr;
        kernel_desc.flags = 0;
        kernel_desc.pKernelName = name;

        ze_kernel_handle_t handle;
        CHECK(l0::zeKernelCreate(*module, &kernel_desc, &handle));
        kernels[idx] = handle;
    }
    return status::success;
}
// Returns the Level Zero memory type (host/device/shared/unknown) of an
// allocation referenced by `ptr` within `context`.
ze_memory_type_t get_pointer_type(
        const ze_context_handle_t context, const void *ptr) {
    // Zero-initialize the struct: only stype/pNext were set before, so if
    // the query below failed, `.type` was read uninitialized. With {} it
    // reads as 0 (ZE_MEMORY_TYPE_UNKNOWN) instead.
    ze_memory_allocation_properties_t memory_allocation_properties = {};
    memory_allocation_properties.stype
            = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES;
    memory_allocation_properties.pNext = nullptr;
    l0::zeMemGetAllocProperties(
            context, ptr, &memory_allocation_properties, nullptr);
    return memory_allocation_properties.type;
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,238 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_UTILS_HPP
#define GPU_INTEL_L0_UTILS_HPP
#if defined(__linux__)
#include <dlfcn.h>
#elif defined(_WIN32)
#include "windows.h"
#else
#error "Level Zero is supported on Linux and Windows only"
#endif
#include "gpu/intel/compute/kernel.hpp"
#include "level_zero/ze_api.h"
#include "level_zero/ze_intel_gpu.h"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Translates a Level Zero result code into its enumerator name for error
// reporting; codes not covered by the switch are rendered numerically.
inline std::string to_string(ze_result_t r) {
#define ZE_STATUS_CASE(status) \
    case status: return #status
    switch (r) {
        ZE_STATUS_CASE(ZE_RESULT_SUCCESS);
        ZE_STATUS_CASE(ZE_RESULT_NOT_READY);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_LOST);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_LINK_FAILURE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_NOT_AVAILABLE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNINITIALIZED);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ARGUMENT);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SIZE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ENUMERATION);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNKNOWN);
        ZE_STATUS_CASE(ZE_RESULT_FORCE_UINT32);
        default: return std::to_string((int)r);
    }
#undef ZE_STATUS_CASE
} // stray ';' after this brace removed (-Wextra-semi at namespace scope)
#define ZE_CHECK(f) \
do { \
ze_result_t res_ = (f); \
if (res_ != ZE_RESULT_SUCCESS) { \
std::string err_str_ = to_string(res_); \
VERROR(common, level_zero, "errcode %s", err_str_.c_str()); \
return status::runtime_error; \
} \
} while (false)
// Resolves `symbol` from the Level Zero loader library; returns nullptr when
// the library or the symbol is unavailable. The library handle is opened
// once and cached in a static: the loader stays resident for the process
// lifetime anyway, and re-opening it on every lookup only bumped the OS
// reference count without ever releasing it.
inline void *find_symbol(const char *symbol) {
#if defined(_WIN32)
    // LOAD_LIBRARY_SEARCH_SYSTEM32 avoids DLL hijacking via the search path.
    static HMODULE handle = LoadLibraryExA(
            "ze_loader.dll", nullptr, LOAD_LIBRARY_SEARCH_SYSTEM32);
    if (!handle) return nullptr;
    return reinterpret_cast<void *>(GetProcAddress(handle, symbol));
#elif defined(__linux__)
    static void *handle
            = dlopen("libze_loader.so.1", RTLD_NOW | RTLD_LOCAL);
    if (!handle) return nullptr;
    return dlsym(handle, symbol);
#endif
}
template <typename F>
F find_ze_symbol(const char *symbol) {
auto f = (F)find_symbol(symbol);
if (!f) VERROR(common, level_zero, "cannot find symbol: %s", symbol);
return f;
}
#undef L0_LIB_NAME
// Generates a dnnl-status-returning wrapper `f` around the Level Zero API
// entry point `::f`. Symbols are resolved lazily from the loader and cached
// in function-local statics. zeInit is invoked once and its result cached
// (previously it was re-invoked on every wrapped call); null pointers from
// failed symbol lookups are now reported as runtime_error instead of being
// called (which crashed).
#define INDIRECT_L0_CALL(f) \
    template <typename... Args> \
    status_t f(Args &&...args) { \
        const ze_init_flags_t default_ze_flags = 0; \
        static auto init_ = find_ze_symbol<decltype(&::zeInit)>("zeInit"); \
        if (!init_) return status::runtime_error; \
        static const ze_result_t init_result_ = init_(default_ze_flags); \
        ZE_CHECK(init_result_); \
        static auto f_ = find_ze_symbol<decltype(&::f)>(#f); \
        if (!f_) return status::runtime_error; \
        ZE_CHECK(f_(std::forward<Args>(args)...)); \
        return status::success; \
    }
INDIRECT_L0_CALL(zeDriverGet)
INDIRECT_L0_CALL(zeDriverGetProperties)
INDIRECT_L0_CALL(zeDeviceGet)
INDIRECT_L0_CALL(zeDeviceGetProperties)
INDIRECT_L0_CALL(zeDeviceGetComputeProperties)
INDIRECT_L0_CALL(zeDeviceGetModuleProperties)
INDIRECT_L0_CALL(zeDeviceGetMemoryAccessProperties)
INDIRECT_L0_CALL(zeDeviceGetCacheProperties)
INDIRECT_L0_CALL(zeContextCreate)
INDIRECT_L0_CALL(zeContextDestroy)
INDIRECT_L0_CALL(zeCommandListCreateImmediate)
INDIRECT_L0_CALL(zeCommandListDestroy)
INDIRECT_L0_CALL(zeCommandListHostSynchronize)
INDIRECT_L0_CALL(zeCommandListGetContextHandle)
INDIRECT_L0_CALL(zeCommandListAppendBarrier)
INDIRECT_L0_CALL(zeCommandListAppendMemoryCopy)
INDIRECT_L0_CALL(zeCommandListAppendMemoryFill)
INDIRECT_L0_CALL(zeEventPoolCreate)
INDIRECT_L0_CALL(zeEventPoolDestroy)
INDIRECT_L0_CALL(zeEventCreate)
INDIRECT_L0_CALL(zeEventDestroy)
INDIRECT_L0_CALL(zeEventHostSynchronize)
INDIRECT_L0_CALL(zeMemAllocShared)
INDIRECT_L0_CALL(zeMemAllocDevice)
INDIRECT_L0_CALL(zeMemAllocHost)
INDIRECT_L0_CALL(zeMemFree)
INDIRECT_L0_CALL(zeMemGetAllocProperties)
INDIRECT_L0_CALL(zeModuleCreate)
INDIRECT_L0_CALL(zeModuleDestroy)
INDIRECT_L0_CALL(zeModuleBuildLogDestroy)
INDIRECT_L0_CALL(zeModuleBuildLogGetString)
INDIRECT_L0_CALL(zeModuleGetNativeBinary)
INDIRECT_L0_CALL(zeKernelCreate)
INDIRECT_L0_CALL(zeKernelDestroy)
INDIRECT_L0_CALL(zeKernelSetArgumentValue)
INDIRECT_L0_CALL(zeKernelGetName)
INDIRECT_L0_CALL(zeKernelGetBinaryExp)
INDIRECT_L0_CALL(zeKernelSetGroupSize)
INDIRECT_L0_CALL(zeKernelSuggestGroupSize)
INDIRECT_L0_CALL(zeCommandListAppendLaunchKernel)
#undef INDIRECT_L0_CALL
// RAII owner of a ze_event_handle_t; implicitly converts to the raw handle
// for use in Level Zero API calls. Non-copyable and not default-
// constructible. NOTE(review): destructor presumably destroys the event —
// the definition is out of line; confirm in the .cpp.
class event_wrapper_t {
public:
    event_wrapper_t(ze_event_handle_t event);
    ~event_wrapper_t();
    operator ze_event_handle_t() const;
private:
    ze_event_handle_t event_;
    event_wrapper_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(event_wrapper_t);
};
// RAII owner of a ze_event_pool_handle_t; implicitly converts to the raw
// handle for use in Level Zero API calls. Non-copyable and not default-
// constructible. NOTE(review): destructor presumably destroys the pool —
// the definition is out of line; confirm in the .cpp.
class event_pool_wrapper_t {
public:
    event_pool_wrapper_t(ze_event_pool_handle_t event_pool);
    ~event_pool_wrapper_t();
    operator ze_event_pool_handle_t() const;
private:
    ze_event_pool_handle_t event_pool_;
    event_pool_wrapper_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(event_pool_wrapper_t);
};
// RAII owner of a ze_module_handle_t; implicitly converts to the raw handle
// for use in Level Zero API calls. Non-copyable and not default-
// constructible. NOTE(review): destructor presumably destroys the module —
// the definition is out of line; confirm in the .cpp.
class module_wrapper_t {
public:
    module_wrapper_t(ze_module_handle_t module);
    ~module_wrapper_t();
    operator ze_module_handle_t() const;
private:
    ze_module_handle_t module_;
    module_wrapper_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(module_wrapper_t);
};
// Populates architecture/capability information for the engine's device
// (IP version, GPU arch/product, native extensions, systolic and ngen
// support). Implementation not visible here — see the corresponding .cpp.
status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device,
        ze_context_handle_t context, uint32_t &ip_version,
        compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product,
        uint64_t &native_extensions, bool &mayiuse_systolic,
        bool &mayiuse_ngen_kernels);
// Packs the 128-bit Level Zero device UUID (16 bytes) into two uint64_t.
xpu::device_uuid_t get_device_uuid(const ze_device_handle_t device);
// Stores in *index the position of `device` within the first driver's
// device list; invalid_arguments if not found.
status_t get_device_index(const ze_device_handle_t device, size_t *index);
// Returns the kernel's name (without the trailing null terminator).
std::string get_kernel_name(const ze_kernel_handle_t kernel);
// Retrieves the kernel's device binary via zeKernelGetBinaryExp.
status_t get_kernel_binary(
        const ze_kernel_handle_t kernel, xpu::binary_t &binary);
// Retrieves the module's native binary via zeModuleGetNativeBinary.
status_t get_module_binary(
        const ze_module_handle_t module, xpu::binary_t &binary);
// Compiles OpenCL C source into a native module binary; prints the build
// log to stdout on compilation failure.
status_t compile_ocl_module(const ze_device_handle_t device,
        const ze_context_handle_t context, std::string &code,
        std::string &options, xpu::binary_t &binary);
// Creates a module from a native binary plus one kernel per name; null
// names produce null handles at the matching positions in `kernels`.
status_t create_kernels_from_binary(const ze_device_handle_t device,
        const ze_context_handle_t context,
        const std::vector<const char *> &kernel_names,
        const xpu::binary_t &binary, ze_module_handle_t *module,
        std::vector<ze_kernel_handle_t> &kernels);
// Returns the Level Zero memory type of the allocation backing `ptr`.
ze_memory_type_t get_pointer_type(const ze_context_handle_t, const void *ptr);
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_UTILS_HPP

View File

@ -15,9 +15,7 @@
#===============================================================================
file(GLOB_RECURSE SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/*.h
${CMAKE_CURRENT_SOURCE_DIR}/*.hpp
${CMAKE_CURRENT_SOURCE_DIR}/*.c
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
)

View File

@ -22,7 +22,7 @@
#include <CL/cl.h>
#include "gpu/intel/compute/device_info.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {

View File

@ -32,7 +32,7 @@
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/kernel.hpp"
#include "gpu/intel/ocl/stream.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {
@ -40,33 +40,6 @@ namespace gpu {
namespace intel {
namespace ocl {
status_t preprocess_headers(stringstream_t &pp_code, const char *code,
const compute::kernel_ctx_t &kernel_ctx) {
stringstream_t code_stream(code);
for (std::string line; std::getline(code_stream, line);) {
const size_t include_pos = line.find("#include");
if (include_pos != std::string::npos) {
static constexpr size_t include_len = 8;
const size_t first_quote_pos
= line.find("\"", include_pos + include_len);
const size_t second_quote_pos
= line.find("\"", first_quote_pos + 1);
const size_t kernel_name_len
= second_quote_pos - first_quote_pos - 1;
const auto header_name
= line.substr(first_quote_pos + 1, kernel_name_len);
const char *header_source
= kernel_ctx.get_custom_header(header_name);
if (!header_source) header_source = get_kernel_header(header_name);
CHECK(preprocess_headers(pp_code, header_source, kernel_ctx));
} else {
pp_code << line << std::endl;
}
}
return status::success;
}
status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind,
cl_device_id dev, cl_context ctx, size_t index,
const std::vector<uint8_t> &cache_blob) {
@ -249,14 +222,14 @@ status_t engine_t::build_program_from_source(
// `clCompileProgram` `clBuildProgram` doesn't take headers. Because of
// that, a manual preprocessing of `include` header directives in the
// OpenCL kernels is required.
CHECK(preprocess_headers(pp_code, code_string, kernel_ctx));
CHECK(compute::preprocess_headers(pp_code, code_string, kernel_ctx));
std::string pp_code_str = pp_code.str();
const char *pp_code_str_ptr = pp_code_str.c_str();
src = {pp_code_str};
if (src) { options += " -g -s " + std::string(src.name()); }
debugdump_processed_source(
compute::debugdump_processed_source(
pp_code_str, options, dev_info->get_cl_ext_options());
auto ctx = context();

View File

@ -21,6 +21,7 @@
#include "common/utils.hpp"
#include "gpu/gpu_impl_list.hpp"
#include "gpu/intel/engine.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "xpu/ocl/engine_impl.hpp"
#include "xpu/utils.hpp"
@ -30,9 +31,6 @@ namespace gpu {
namespace intel {
namespace ocl {
status_t preprocess_headers(stringstream_t &pp_code, const char *code,
const compute::kernel_ctx_t &kernel_ctx);
status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind,
cl_device_id dev, cl_context ctx, size_t index,
const std::vector<uint8_t> &cache_blob);

View File

@ -15,7 +15,7 @@
*******************************************************************************/
#include "gpu/intel/ocl/hw_info.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "gpu/intel/jit/binary_format.hpp"
#include "gpu/intel/jit/generator.hpp"

View File

@ -31,7 +31,7 @@
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/stream.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {

View File

@ -28,7 +28,7 @@
#include <dlfcn.h>
#include <vector>
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "mdapi/metrics_discovery_api.h"
#ifndef CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL

View File

@ -26,7 +26,7 @@
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/stream.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {

View File

@ -26,9 +26,11 @@
#include "xpu/ocl/context.hpp"
#include "xpu/ocl/stream_impl.hpp"
#include "gpu/intel/ocl/mdapi_utils.hpp"
#include "gpu/intel/stream.hpp"
#include "gpu/intel/ocl/mdapi_utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {

View File

@ -0,0 +1,24 @@
#===============================================================================
# Copyright 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
file(GLOB_RECURSE SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp
)
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_ocl_utils)
add_library(${OBJ_LIB} OBJECT ${SOURCES})
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
$<TARGET_OBJECTS:${OBJ_LIB}>)

View File

@ -23,7 +23,7 @@
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/hw_info.hpp"
#include "gpu/intel/ocl/kernel.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "xpu/ocl/utils.hpp"
#ifndef CL_KERNEL_BINARY_PROGRAM_INTEL
@ -217,69 +217,6 @@ status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, xpu::binary_t &binary) {
return status::success;
}
void debugdump_processed_source(const std::string &source,
const std::string &options, const std::string &cl_options) {
#if defined(__linux__) && defined(DNNL_DEV_MODE)
if (get_verbose(verbose_t::debuginfo) >= 10) {
auto get_defines = [](const std::string &from) {
std::string ret;
size_t pos = 0;
while (pos < from.length()) {
// Find next define argument
pos = from.find("-D", pos);
// Generate argument, quotes are interpreted literally, but
// other special shell characters need escaped. Does not
// currently handle quotes with the ' character or nested quotes
char quote_parity = true;
while (pos < from.length()) {
if (quote_parity
&& utils::one_of(from[pos], '~', '#', '$', '&', '*',
'(', ')', '\\', '|', '[', ']', '{', '}',
';', '\'', '<', '>', '/', '?', '!')) {
ret += '\\';
}
ret += from[pos];
if (from[pos] == '"') quote_parity ^= true;
if (from[pos] == ' ' && quote_parity) break;
pos++;
}
}
return ret;
};
auto execute_command = [](const std::string &cmd,
const std::string &stdin) {
std::string result;
std::array<char, 256> buffer;
FILE *pipe = popen(cmd.c_str(), "w");
fputs(stdin.c_str(), pipe);
if (pipe) {
while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
result += buffer.data();
}
}
pclose(pipe);
return result;
};
// Run utilities to evaluate preprocessor defines and format the file
// Theoretically, we can accomplish this task with libclang, but it
// seems more work than it is worth. Instead, wrapping this in OCL_DEBUG
// so that calls to the system are not included in the default build.
// Due to the use of a different C preprocessor, warnings should not be
// ignored, as they may correspond to a different behavior in the OpenCL
// C preprocessor
auto o = get_defines(options) + get_defines(cl_options);
std::string preprocess_cmd
= std::string() + "cpp -P " + o + " | clang-format";
execute_command(preprocess_cmd, source);
std::cout << "OCL_ARCH_OPTIONS: " << cl_options << std::endl;
}
#endif
}
status_t get_kernel_arg_types(cl_kernel ocl_kernel,
std::vector<gpu::intel::compute::scalar_type_t> *arg_types) {
cl_uint nargs;

View File

@ -55,9 +55,6 @@ status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, xpu::binary_t &binary);
status_t get_ocl_program_binary_size(
cl_kernel kernel, cl_device_id device, size_t *size);
void debugdump_processed_source(const std::string &source,
const std::string &options, const std::string &ocl_options);
status_t get_kernel_arg_types(cl_kernel ocl_kernel,
std::vector<gpu::intel::compute::scalar_type_t> *arg_types);

View File

@ -35,7 +35,7 @@
#include "gpu/intel/compute/device_info.hpp"
#include "gpu/intel/sycl/compat.hpp"
#include "gpu/intel/sycl/engine.hpp"
#include "gpu/intel/sycl/l0/utils.hpp"
#include "gpu/intel/sycl/utils.hpp"
namespace dnnl {
namespace impl {

View File

@ -19,11 +19,11 @@
#include "gpu/intel/sycl/compat.hpp"
#include "gpu/intel/sycl/device_info.hpp"
#include "gpu/intel/sycl/engine.hpp"
#include "gpu/intel/sycl/l0/utils.hpp"
#include "gpu/intel/sycl/utils.hpp"
#include "gpu/intel/ocl/hw_info.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
namespace dnnl {
namespace impl {
@ -56,7 +56,7 @@ status_t device_info_t::init_arch(impl::engine_t *engine) {
auto ze_dev = xpu::sycl::compat::get_native<ze_device_handle_t>(device);
auto ze_ctx = xpu::sycl::compat::get_native<ze_context_handle_t>(ctx);
status = gpu::intel::sycl::init_gpu_hw_info(engine, ze_dev, ze_ctx,
status = gpu::intel::l0::init_gpu_hw_info(engine, ze_dev, ze_ctx,
ip_version_, gpu_arch_, gpu_product_, native_extensions_,
mayiuse_systolic_, mayiuse_ngen_kernels_);
} else {

View File

@ -114,18 +114,25 @@ status_t engine_t::create_kernels(
const char *source = nullptr;
for (size_t i = 0; source == nullptr && i < kernel_names.size(); i++)
source = ocl::get_kernel_source(kernel_names[i]);
source = get_kernel_source(kernel_names[i]);
VERROR_ENGINE(source, status::runtime_error,
"No OpenCL source was found for kernel");
stringstream_t pp_code;
CHECK(gpu::intel::ocl::preprocess_headers(pp_code, source, kernel_ctx));
CHECK(compute::preprocess_headers(pp_code, source, kernel_ctx));
std::string code_str = pp_code.str();
std::string build_options = kernel_ctx.options();
build_options += " " + device_info()->get_cl_ext_options();
gpu::intel::compute::program_src_t src(code_str);
if (src) { build_options += " -g -s " + std::string(src.name()); }
compute::debugdump_processed_source(
code_str, build_options, device_info()->get_cl_ext_options());
auto kb_src = syclex::create_kernel_bundle_from_source(
context(), syclex::source_language::opencl, pp_code.str());
context(), syclex::source_language::opencl, code_str);
auto kb_exe = syclex::build(
kb_src, syclex::properties {syclex::build_options(build_options)});
*kernels = std::vector<compute::kernel_t>(kernel_names.size());
@ -133,8 +140,7 @@ status_t engine_t::create_kernels(
if (!kernel_names[i]) continue;
CHECK(interop_kernel_t::make((*kernels)[i],
kb_exe.ext_oneapi_get_kernel(kernel_names[i]),
gpu::intel::compute::program_src_t(pp_code.str())));
kb_exe.ext_oneapi_get_kernel(kernel_names[i]), src));
}
return status::success;

View File

@ -31,7 +31,7 @@
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/kernel.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "gpu/intel/sycl/compat.hpp"
#include "gpu/intel/sycl/utils.hpp"

View File

@ -19,8 +19,7 @@
#include "common/verbose.hpp"
#include "gpu/intel/compute/types_interop.hpp"
#include "gpu/intel/compute/utils.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/sycl/l0/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "gpu/intel/sycl/stream.hpp"
#include "gpu/intel/sycl/utils.hpp"
#include "xpu/sycl/c_types_map.hpp"

View File

@ -1,436 +0,0 @@
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/sycl/l0/utils.hpp"
#include "oneapi/dnnl/dnnl_config.h"
#include "gpu/intel/jit/binary_format.hpp"
#include "gpu/intel/jit/utils/ngen_type_bridge.hpp"
#include "ngen_level_zero.hpp"
#if defined(__linux__)
#include <dlfcn.h>
#elif defined(_WIN32)
#include "windows.h"
#else
#error "Level Zero is supported on Linux and Windows only"
#endif
#include "level_zero/ze_api.h"
#include "level_zero/ze_intel_gpu.h"
#if !defined(__SYCL_COMPILER_VERSION)
#error "Unsupported compiler"
#endif
#if (__SYCL_COMPILER_VERSION < 20200818)
#error "Level Zero is not supported with this compiler version"
#endif
#include "common/c_types_map.hpp"
#include "common/verbose.hpp"
#include "gpu/intel/sycl/utils.hpp"
#include <sycl/ext/oneapi/backend/level_zero.hpp>
#include "gpu/intel/sycl/engine.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace sycl {
namespace {
std::string to_string(ze_result_t r) {
#define ZE_STATUS_CASE(status) \
case status: return #status
switch (r) {
ZE_STATUS_CASE(ZE_RESULT_SUCCESS);
ZE_STATUS_CASE(ZE_RESULT_NOT_READY);
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_LOST);
ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY);
ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY);
ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_LINK_FAILURE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET);
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
ZE_STATUS_CASE(ZE_RESULT_ERROR_NOT_AVAILABLE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNINITIALIZED);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ARGUMENT);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SIZE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ENUMERATION);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNKNOWN);
ZE_STATUS_CASE(ZE_RESULT_FORCE_UINT32);
default: return std::to_string((int)r);
}
#undef ZE_STATUS_CASE
};
#define ZE_CHECK_COMMON(f, retval) \
do { \
ze_result_t res_ = (f); \
if (res_ != ZE_RESULT_SUCCESS) { \
std::string err_str_ = to_string(res_); \
VERROR(common, level_zero, "errcode %s", err_str_.c_str()); \
return retval; \
} \
} while (false)
#define ZE_CHECK(f) ZE_CHECK_COMMON(f, status::runtime_error)
#define ZE_CHECK_VP(f) ZE_CHECK_COMMON(f, nullptr)
void *find_ze_symbol(const char *symbol) {
#if defined(__linux__)
void *handle = dlopen("libze_loader.so.1", RTLD_NOW | RTLD_LOCAL);
#elif defined(_WIN32)
// Use LOAD_LIBRARY_SEARCH_SYSTEM32 flag to avoid DLL hijacking issue.
HMODULE handle = LoadLibraryExA(
"ze_loader.dll", nullptr, LOAD_LIBRARY_SEARCH_SYSTEM32);
#endif
if (!handle) {
VERROR(common, level_zero, "cannot find loader library");
assert(!"not expected");
return nullptr;
}
using zeInit_decl_t = ze_result_t (*)(ze_init_flags_t flags);
const ze_init_flags_t default_ze_flags = 0;
#if defined(__linux__)
static const ze_result_t ze_result = reinterpret_cast<zeInit_decl_t>(
dlsym(handle, "zeInit"))(default_ze_flags);
void *f = reinterpret_cast<void *>(dlsym(handle, symbol));
#elif defined(_WIN32)
static const ze_result_t ze_result = reinterpret_cast<zeInit_decl_t>(
GetProcAddress(handle, "zeInit"))(default_ze_flags);
void *f = reinterpret_cast<void *>(GetProcAddress(handle, symbol));
#endif
ZE_CHECK_VP(ze_result);
if (!f) {
VERROR(common, level_zero, "cannot find symbol: %s", symbol);
assert(!"not expected");
}
return f;
}
template <typename F>
F find_ze_symbol(const char *symbol) {
return (F)find_ze_symbol(symbol);
}
status_t func_zeModuleCreate(ze_context_handle_t hContext,
ze_device_handle_t hDevice, const ze_module_desc_t *desc,
ze_module_handle_t *phModule,
ze_module_build_log_handle_t *phBuildLog) {
static auto f = find_ze_symbol<decltype(&zeModuleCreate)>("zeModuleCreate");
if (!f) return status::runtime_error;
ZE_CHECK(f(hContext, hDevice, desc, phModule, phBuildLog));
return status::success;
}
status_t func_zeDeviceGetProperties(
ze_device_handle_t hDevice, ze_device_properties_t *pDeviceProperties) {
static auto f = find_ze_symbol<decltype(&zeDeviceGetProperties)>(
"zeDeviceGetProperties");
if (!f) return status::runtime_error;
ZE_CHECK(f(hDevice, pDeviceProperties));
return status::success;
}
status_t func_zeDeviceGetModuleProperties(ze_device_handle_t hDevice,
ze_device_module_properties_t *pDeviceProperties) {
static auto f = find_ze_symbol<decltype(&zeDeviceGetModuleProperties)>(
"zeDeviceGetModuleProperties");
if (!f) {
VERROR(common, level_zero,
"failed to find systolic query extension (maybe update the "
"driver?)");
return status::runtime_error;
}
ZE_CHECK(f(hDevice, pDeviceProperties));
return status::success;
}
} // namespace
// This function is called from compatibility layer that ensures compatibility
// with SYCL 2017 API. Once the compatibility layer is removed this function
// can be moved to the anonymous namespace above and a function with SYCL
// data types in its interface can be created to call it.
status_t func_zeKernelCreate(ze_module_handle_t hModule,
const ze_kernel_desc_t *desc, ze_kernel_handle_t *phKernel) {
static auto f = find_ze_symbol<decltype(&zeKernelCreate)>("zeKernelCreate");
if (!f) return status::runtime_error;
ZE_CHECK(f(hModule, desc, phKernel));
return status::success;
}
#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
status_t func_zeGetKernelBinary(
ze_kernel_handle_t hKernel, size_t *pSize, uint8_t *pKernelBinary) {
static auto f = find_ze_symbol<decltype(&zeKernelGetBinaryExp)>(
"zeKernelGetBinaryExp");
if (!f) return status::runtime_error;
ZE_CHECK(f(hKernel, pSize, pKernelBinary));
return status::success;
}
#else
status_t func_zeModuleGetNativeBinary(ze_module_handle_t hModule, size_t *pSize,
uint8_t *pModuleNativeBinary) {
static auto f = find_ze_symbol<decltype(&zeModuleGetNativeBinary)>(
"zeModuleGetNativeBinary");
if (!f) return status::runtime_error;
ZE_CHECK(f(hModule, pSize, pModuleNativeBinary));
return status::success;
}
#endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
// FIXME: SYCL provides no portable API to query the device UUID, so it is
// fetched directly from Level Zero via zeDeviceGetProperties. The 16-byte
// UUID is packed little-endian into two uint64_t halves.
xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev) {
    static_assert(ZE_MAX_DEVICE_UUID_SIZE == 16,
            "ZE_MAX_DEVICE_UUID_SIZE is expected to be 16");

    auto props = ze_device_properties_t();
    props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;

    auto ze_dev = xpu::sycl::compat::get_native<ze_device_handle_t>(dev);
    auto st = func_zeDeviceGetProperties(ze_dev, &props);
    MAYBE_UNUSED(st);
    assert(st == status::success);

    // Fold the 16 UUID bytes into two 64-bit words, least significant byte
    // first within each word.
    const auto &uuid_bytes = props.uuid.id;
    uint64_t packed[ZE_MAX_DEVICE_UUID_SIZE / sizeof(uint64_t)] = {};
    for (size_t byte_idx = 0; byte_idx < ZE_MAX_DEVICE_UUID_SIZE; ++byte_idx) {
        const size_t word = byte_idx / sizeof(uint64_t);
        const size_t shift = (byte_idx % sizeof(uint64_t)) * CHAR_BIT;
        packed[word] |= static_cast<uint64_t>(uuid_bytes[byte_idx]) << shift;
    }
    return xpu::device_uuid_t(packed[0], packed[1]);
}
// Creates one SYCL kernel per entry of `kernel_names` from a pre-built
// native (device-specific) binary, going through Level Zero module/kernel
// handles and the SYCL Level Zero interop API.
// Entries of `kernel_names` that are null are skipped, leaving the
// corresponding `sycl_kernels` slot empty.
status_t sycl_create_kernels_with_level_zero(
        std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
        const std::vector<const char *> &kernel_names,
        const gpu::intel::sycl::engine_t *sycl_engine,
        const xpu::binary_t &binary) {
    // Describe the module as an already-compiled native binary: no build
    // flags and no specialization constants apply.
    auto desc = ze_module_desc_t();
    desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
    desc.format = ZE_MODULE_FORMAT_NATIVE;
    desc.inputSize = binary.size();
    desc.pInputModule = binary.data();
    desc.pBuildFlags = "";
    desc.pConstants = nullptr;
    ze_module_handle_t ze_module;
    auto ze_device = xpu::sycl::compat::get_native<ze_device_handle_t>(
            sycl_engine->device());
    auto ze_ctx = xpu::sycl::compat::get_native<ze_context_handle_t>(
            sycl_engine->context());
    // Last argument (build log handle) is not requested.
    CHECK(func_zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr));
    // NOTE(review): presumably make_kernel_bundle takes ownership of
    // ze_module (the default interop ownership transfer), which is why the
    // module is never destroyed explicitly here -- confirm against the SYCL
    // Level Zero backend specification.
    ::sycl::kernel_bundle<::sycl::bundle_state::executable> kernel_bundle
            = ::sycl::make_kernel_bundle<::sycl::backend::ext_oneapi_level_zero,
                    ::sycl::bundle_state::executable>(
                    {ze_module}, sycl_engine->context());
    sycl_kernels.resize(kernel_names.size());
    for (size_t i = 0; i < kernel_names.size(); i++) {
        // Null name means "no kernel requested for this slot".
        if (kernel_names[i] == nullptr) continue;
        ze_kernel_handle_t ze_kernel;
        ze_kernel_desc_t ze_kernel_desc {
                ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernel_names[i]};
        CHECK(func_zeKernelCreate(ze_module, &ze_kernel_desc, &ze_kernel));
        // Wrap the native kernel handle into a SYCL kernel object.
        auto k = ::sycl::make_kernel<::sycl::backend::ext_oneapi_level_zero>(
                {kernel_bundle, ze_kernel}, sycl_engine->context());
        sycl_kernels[i] = utils::make_unique<::sycl::kernel>(k);
    }
    return status::success;
}
// Two SYCL devices are considered identical iff they map to the same native
// Level Zero device handle.
bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) {
    return xpu::sycl::compat::get_native<ze_device_handle_t>(lhs)
            == xpu::sycl::compat::get_native<ze_device_handle_t>(rhs);
}
// Retrieves the device IP version through the Level Zero IP-version
// extension, chained into the device properties query via pNext.
status_t get_device_ip(ze_device_handle_t device, uint32_t &ip_version) {
    auto ip_props = ze_device_ip_version_ext_t();
    ip_props.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT;

    auto dev_props = ze_device_properties_t();
    dev_props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    dev_props.pNext = &ip_props;

    CHECK(func_zeDeviceGetProperties(device, &dev_props));
    ip_version = ip_props.ipVersion;
    return status::success;
}
// Queries whether the device exposes DPAS (systolic) instructions through
// the Intel device-module extension chained into the module properties
// query. Note: supported by Intel Driver 24.05 and onwards.
status_t get_l0_device_enabled_systolic_intel(
        ze_device_handle_t device, bool &mayiuse_systolic) {
    auto dp_props = ze_intel_device_module_dp_exp_properties_t();
    dp_props.stype = ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES;

    auto module_props = ze_device_module_properties_t();
    module_props.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
    module_props.pNext = &dp_props;

    CHECK(func_zeDeviceGetModuleProperties(device, &module_props));
    mayiuse_systolic
            = (dp_props.flags & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS) != 0;
    return status::success;
}
// Queries the Level Zero float-atomics extension capabilities of `device`
// and accumulates the corresponding native-extension bits (fp16/fp32/fp64
// load-store, add, min-max atomics) into `native_extensions`.
// An extension bit is set only when both the global-memory and local-memory
// variants of the capability are reported.
status_t get_l0_device_enabled_native_float_atomics(
        ze_device_handle_t device, uint64_t &native_extensions) {
    using namespace gpu::intel::compute;

    auto fltAtom = ze_float_atomic_ext_properties_t();
    fltAtom.stype = ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES;

    auto deviceProps = ze_device_module_properties_t();
    deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
    deviceProps.pNext = &fltAtom;
    CHECK(func_zeDeviceGetModuleProperties(device, &deviceProps));

    // Required capability masks: global + local variants must both be set.
    const ze_device_fp_atomic_ext_flags_t atomic_load_store
            = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_LOAD_STORE
            | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_LOAD_STORE;
    const ze_device_fp_atomic_ext_flags_t atomic_add
            = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD
            | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_ADD;
    const ze_device_fp_atomic_ext_flags_t atomic_min_max
            = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_MIN_MAX
            | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_MIN_MAX;

    // Sets `ext` in the output mask when all `required` capability bits are
    // present in `flags` for the corresponding precision.
    auto enable_if_supported = [&](ze_device_fp_atomic_ext_flags_t flags,
                                       ze_device_fp_atomic_ext_flags_t required,
                                       native_ext_t ext) {
        if ((flags & required) == required)
            native_extensions |= static_cast<uint64_t>(ext);
    };

    enable_if_supported(fltAtom.fp16Flags, atomic_load_store,
            native_ext_t::fp16_atomic_load_store);
    enable_if_supported(
            fltAtom.fp16Flags, atomic_add, native_ext_t::fp16_atomic_add);
    enable_if_supported(fltAtom.fp16Flags, atomic_min_max,
            native_ext_t::fp16_atomic_min_max);
    enable_if_supported(fltAtom.fp32Flags, atomic_load_store,
            native_ext_t::fp32_atomic_load_store);
    enable_if_supported(
            fltAtom.fp32Flags, atomic_add, native_ext_t::fp32_atomic_add);
    enable_if_supported(fltAtom.fp32Flags, atomic_min_max,
            native_ext_t::fp32_atomic_min_max);
    enable_if_supported(fltAtom.fp64Flags, atomic_load_store,
            native_ext_t::fp64_atomic_load_store);
    enable_if_supported(
            fltAtom.fp64Flags, atomic_add, native_ext_t::fp64_atomic_add);
    enable_if_supported(fltAtom.fp64Flags, atomic_min_max,
            native_ext_t::fp64_atomic_min_max);

    return status::success;
}
// Queries the total number of execution units (EUs) on the device via the
// Level Zero EU-count extension chained into the device properties query.
status_t get_l0_device_eu_count(ze_device_handle_t device, int &eu_count) {
    auto eucnt = ze_eu_count_ext_t();
    // The extension struct must carry its own stype so the driver can
    // recognize it while walking the pNext chain; value-initialization alone
    // leaves stype == 0, which is not a valid structure type and would make
    // the driver skip filling numTotalEUs. Every sibling query in this file
    // initializes the extension stype the same way.
    eucnt.stype = ZE_STRUCTURE_TYPE_EU_COUNT_EXT;

    auto deviceProps = ze_device_properties_t();
    deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    deviceProps.pNext = &eucnt;

    CHECK(func_zeDeviceGetProperties(device, &deviceProps));
    eu_count = static_cast<int>(eucnt.numTotalEUs);
    return status::success;
}
// Populates GPU hardware information for the given Level Zero device/context
// pair: architecture, product descriptor, IP version, native float-atomic
// extensions, and systolic/nGEN-kernel capability flags.
// Returns the status of the final IP-version query; earlier best-effort
// queries degrade gracefully (see inline comments).
status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device,
        ze_context_handle_t context, uint32_t &ip_version,
        compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product_,
        uint64_t &native_extensions, bool &mayiuse_systolic,
        bool &mayiuse_ngen_kernels) {
    using namespace ngen;
    // Detect the product through nGEN's Level Zero code generator.
    ngen::Product product = LevelZeroCodeGenerator<HW::Unknown>::detectHWInfo(
            context, device);
    gpu_arch = jit::convert_ngen_arch_to_dnnl(ngen::getCore(product.family));
    // NOTE(review): assumes compute::gpu_product_t is layout-compatible with
    // ngen::Product -- confirm; a static_assert on the sizes would make this
    // memcpy safer.
    std::memcpy(&product_, &product, sizeof(ngen::Product));
    // Best-effort query: on failure, systolic support stays disabled.
    mayiuse_systolic = false;
    if (get_l0_device_enabled_systolic_intel(device, mayiuse_systolic)
            != status::success)
        mayiuse_systolic = false;
    /* Some old drivers do not report systolic availability. Manually override
       systolic availability based on product family. */
    switch (product.family) {
        // Intentional fallthrough: every listed family supports systolic.
        case ProductFamily::DG2:
        case ProductFamily::ARL:
        case ProductFamily::PVC: mayiuse_systolic = true;
        default: break;
    }
    CHECK(get_l0_device_enabled_native_float_atomics(
            device, native_extensions));
    // nGEN binary-format support is optional; treat a failed query as "no".
    auto status
            = jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine);
    if (status != status::success) mayiuse_ngen_kernels = false;
    ip_version = 0;
    return get_device_ip(device, ip_version);
}
} // namespace sycl
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -1,65 +0,0 @@
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

// Helpers that bridge the SYCL runtime with the Level Zero (L0) driver API:
// device identity queries, kernel creation from native binaries, and
// hardware capability detection.

#ifndef GPU_INTEL_SYCL_L0_UTILS_HPP
#define GPU_INTEL_SYCL_L0_UTILS_HPP
#include <memory>
#include <string>
#include <vector>
#include "gpu/intel/compute/kernel.hpp"
#include "gpu/intel/sycl/compat.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace sycl {
class engine_t;
// Returns the 128-bit Level Zero device UUID packed into two uint64_t halves.
xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev);
// Creates one SYCL kernel per entry of `kernel_names` from a native binary;
// null names leave the corresponding output slot empty.
status_t sycl_create_kernels_with_level_zero(
        std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
        const std::vector<const char *> &kernel_names,
        const gpu::intel::sycl::engine_t *sycl_engine,
        const xpu::binary_t &binary);
// Compares two SYCL devices by their underlying native L0 device handles.
bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs);
#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
// Queries a single kernel's device binary (zeKernelGetBinaryExp); two-call
// size-then-data protocol.
status_t func_zeGetKernelBinary(
        ze_kernel_handle_t hKernel, size_t *pSize, uint8_t *pKernelBinary);
#else
// Queries a whole module's native binary (zeModuleGetNativeBinary); two-call
// size-then-data protocol.
status_t func_zeModuleGetNativeBinary(ze_module_handle_t hModule, size_t *pSize,
        uint8_t *pModuleNativeBinary);
#endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
// Populates GPU hardware information (architecture, product, IP version,
// native extensions, systolic/nGEN capability) for an L0 device/context.
status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device,
        ze_context_handle_t context, uint32_t &ip_version,
        compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product,
        uint64_t &native_extensions, bool &mayiuse_systolic,
        bool &mayiuse_ngen_kernels);
} // namespace sycl
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_SYCL_L0_UTILS_HPP

View File

@ -24,7 +24,7 @@
#include "gpu/intel/sycl/stream.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {

View File

@ -35,6 +35,8 @@
#include "gpu/intel/engine.hpp"
#include "gpu/intel/stream.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {

View File

@ -17,9 +17,9 @@
#include "gpu/intel/sycl/utils.hpp"
#include "gpu/intel/compute/ukernels.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "gpu/intel/sycl/engine.hpp"
#include "gpu/intel/sycl/l0/utils.hpp"
#include "xpu/ocl/engine_factory.hpp"
#include "xpu/ocl/utils.hpp"
#include "xpu/sycl/compat.hpp"
@ -32,6 +32,53 @@ namespace gpu {
namespace intel {
namespace sycl {
// FIXME: SYCL provides no portable API to query the device UUID, so it is
// obtained from Level Zero through the shared L0 helper, which packs the
// 16-byte UUID into two uint64_t halves.
xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev) {
    const auto ze_dev = xpu::sycl::compat::get_native<ze_device_handle_t>(dev);
    return gpu::intel::l0::get_device_uuid(ze_dev);
}
// Devices compare equal iff their native Level Zero handles are identical.
bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) {
    using xpu::sycl::compat::get_native;
    return get_native<ze_device_handle_t>(lhs)
            == get_native<ze_device_handle_t>(rhs);
}
// Creates one SYCL kernel per entry of `kernel_names` from a native binary:
// the shared L0 helper builds the module and kernel handles, which are then
// wrapped through the SYCL Level Zero interop API. Null names leave the
// corresponding `sycl_kernels` slot empty.
status_t sycl_create_kernels_with_level_zero(
        std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
        const std::vector<const char *> &kernel_names,
        const gpu::intel::sycl::engine_t *sycl_engine,
        const xpu::binary_t &binary) {
    auto ze_device = xpu::sycl::compat::get_native<ze_device_handle_t>(
            sycl_engine->device());
    auto ze_ctx = xpu::sycl::compat::get_native<ze_context_handle_t>(
            sycl_engine->context());
    ze_module_handle_t ze_module = nullptr;
    std::vector<ze_kernel_handle_t> ze_kernels;
    // NOTE(review): the status returned by create_kernels_from_binary looks
    // unchecked here -- confirm its return type and wrap the call in CHECK()
    // if it reports failures via status_t.
    gpu::intel::l0::create_kernels_from_binary(
            ze_device, ze_ctx, kernel_names, binary, &ze_module, ze_kernels);
    // NOTE(review): presumably make_kernel_bundle takes ownership of
    // ze_module (the default interop ownership transfer), so no explicit
    // module destroy is needed -- confirm against the SYCL L0 backend spec.
    ::sycl::kernel_bundle<::sycl::bundle_state::executable> kernel_bundle
            = ::sycl::make_kernel_bundle<::sycl::backend::ext_oneapi_level_zero,
                    ::sycl::bundle_state::executable>(
                    {ze_module}, sycl_engine->context());
    sycl_kernels.resize(kernel_names.size());
    for (size_t i = 0; i < kernel_names.size(); i++) {
        // Null name means "no kernel requested for this slot".
        if (kernel_names[i] == nullptr) continue;
        auto k = ::sycl::make_kernel<::sycl::backend::ext_oneapi_level_zero>(
                {kernel_bundle, ze_kernels[i]}, sycl_engine->context());
        sycl_kernels[i] = utils::make_unique<::sycl::kernel>(k);
    }
    return status::success;
}
::sycl::nd_range<3> to_sycl_nd_range(
const gpu::intel::compute::nd_range_t &range) {
const auto &local_range = range.local_range();
@ -150,7 +197,6 @@ status_t sycl_dev2ocl_dev(cl_device_id *ocl_dev, const ::sycl::device &dev) {
}
*ocl_dev = d;
return status::success;
}
@ -216,12 +262,7 @@ status_t get_kernel_binary(
#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
auto l0_kernel = ::sycl::get_native<
::sycl::backend::ext_oneapi_level_zero>(kernel);
size_t binary_size = 0;
CHECK(gpu::intel::sycl::func_zeGetKernelBinary(
l0_kernel, &binary_size, nullptr));
binary.resize(binary_size);
CHECK(gpu::intel::sycl::func_zeGetKernelBinary(
l0_kernel, &binary_size, binary.data()));
CHECK(gpu::intel::l0::get_kernel_binary(l0_kernel, binary));
#else
auto bundle = kernel.get_kernel_bundle();
auto module_vec = ::sycl::get_native<
@ -229,11 +270,7 @@ status_t get_kernel_binary(
auto module = module_vec[0];
size_t module_binary_size;
xpu::binary_t module_binary;
CHECK(gpu::intel::sycl::func_zeModuleGetNativeBinary(
module, &module_binary_size, nullptr));
module_binary.resize(module_binary_size);
CHECK(gpu::intel::sycl::func_zeModuleGetNativeBinary(
module, &module_binary_size, module_binary.data()));
CHECK(gpu::intel::l0::get_mobule_binary(module, binary));
{
std::unique_ptr<gpu::intel::ocl::engine_t, engine_deleter_t>
ocl_engine;

View File

@ -29,6 +29,16 @@ namespace sycl {
class engine_t;
xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev);
status_t sycl_create_kernels_with_level_zero(
std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
const std::vector<const char *> &kernel_names,
const gpu::intel::sycl::engine_t *sycl_engine,
const xpu::binary_t &binary);
bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs);
::sycl::nd_range<3> to_sycl_nd_range(
const gpu::intel::compute::nd_range_t &range);

View File

@ -93,7 +93,6 @@ public:
#ifdef DNNL_WITH_SYCL
void set_deps(::sycl::event event) { e_ = std::move(event); }
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
void set_deps(cl_event event) { ocl_e_ = event; }
#endif
@ -106,7 +105,6 @@ private:
#ifdef DNNL_WITH_SYCL
::sycl::event e_;
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
cl_event ocl_e_;
#endif

View File

@ -31,6 +31,6 @@
#endif
#endif
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#endif

View File

@ -24,7 +24,7 @@
#include "xpu/ocl/memory_storage_base.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {

View File

@ -32,131 +32,6 @@ namespace impl {
namespace xpu {
namespace ocl {
// Maps an OpenCL status code onto the closest oneDNN status:
// CL_SUCCESS -> success; allocation failures -> out_of_memory;
// driver/environment failures -> runtime_error; API misuse ->
// invalid_arguments; any unrecognized code falls back to runtime_error.
status_t convert_to_dnnl(cl_int cl_status) {
    switch (cl_status) {
        case CL_SUCCESS: return status::success;
        // Out-of-memory conditions.
        case CL_MEM_OBJECT_ALLOCATION_FAILURE:
        case CL_OUT_OF_RESOURCES:
        case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory;
        // Driver/environment failures outside the caller's control.
        case CL_DEVICE_NOT_FOUND:
        case CL_DEVICE_NOT_AVAILABLE:
        case CL_COMPILER_NOT_AVAILABLE:
        case CL_PROFILING_INFO_NOT_AVAILABLE:
        case CL_MEM_COPY_OVERLAP:
        case CL_IMAGE_FORMAT_MISMATCH:
        case CL_IMAGE_FORMAT_NOT_SUPPORTED:
        case CL_BUILD_PROGRAM_FAILURE:
        case CL_MAP_FAILURE:
        case CL_MISALIGNED_SUB_BUFFER_OFFSET:
        case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
        case CL_COMPILE_PROGRAM_FAILURE:
        case CL_LINKER_NOT_AVAILABLE:
        case CL_LINK_PROGRAM_FAILURE:
        case CL_DEVICE_PARTITION_FAILED:
        case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
        case CL_INVALID_PLATFORM:
        case CL_INVALID_DEVICE: return status::runtime_error;
        // Invalid arguments passed to an OpenCL API call.
        case CL_INVALID_VALUE:
        case CL_INVALID_DEVICE_TYPE:
        case CL_INVALID_CONTEXT:
        case CL_INVALID_QUEUE_PROPERTIES:
        case CL_INVALID_COMMAND_QUEUE:
        case CL_INVALID_HOST_PTR:
        case CL_INVALID_MEM_OBJECT:
        case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
        case CL_INVALID_IMAGE_SIZE:
        case CL_INVALID_SAMPLER:
        case CL_INVALID_BINARY:
        case CL_INVALID_BUILD_OPTIONS:
        case CL_INVALID_PROGRAM:
        case CL_INVALID_PROGRAM_EXECUTABLE:
        case CL_INVALID_KERNEL_NAME:
        case CL_INVALID_KERNEL_DEFINITION:
        case CL_INVALID_KERNEL:
        case CL_INVALID_ARG_INDEX:
        case CL_INVALID_ARG_VALUE:
        case CL_INVALID_ARG_SIZE:
        case CL_INVALID_KERNEL_ARGS:
        case CL_INVALID_WORK_DIMENSION:
        case CL_INVALID_WORK_GROUP_SIZE:
        case CL_INVALID_WORK_ITEM_SIZE:
        case CL_INVALID_GLOBAL_OFFSET:
        case CL_INVALID_EVENT_WAIT_LIST:
        case CL_INVALID_EVENT:
        case CL_INVALID_OPERATION:
        case CL_INVALID_GL_OBJECT:
        case CL_INVALID_BUFFER_SIZE:
        case CL_INVALID_MIP_LEVEL:
        case CL_INVALID_GLOBAL_WORK_SIZE: return status::invalid_arguments;
        default: return status::runtime_error;
    }
}
// Ordered by value as defined by opencl
// Returns the macro name of an OpenCL status code for diagnostics; unknown
// codes map to the literal "unknown macro name".
const char *convert_cl_int_to_str(cl_int cl_status) {
// Expands to a case that stringifies the status macro itself.
#define CL_STATUS_CASE(status) \
    case status: return #status
    switch (cl_status) {
        CL_STATUS_CASE(CL_SUCCESS);
        CL_STATUS_CASE(CL_DEVICE_NOT_FOUND);
        CL_STATUS_CASE(CL_DEVICE_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_COMPILER_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
        CL_STATUS_CASE(CL_OUT_OF_RESOURCES);
        CL_STATUS_CASE(CL_OUT_OF_HOST_MEMORY);
        CL_STATUS_CASE(CL_PROFILING_INFO_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_MEM_COPY_OVERLAP);
        CL_STATUS_CASE(CL_IMAGE_FORMAT_MISMATCH);
        CL_STATUS_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
        CL_STATUS_CASE(CL_BUILD_PROGRAM_FAILURE);
        CL_STATUS_CASE(CL_MAP_FAILURE);
        CL_STATUS_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
        CL_STATUS_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
        CL_STATUS_CASE(CL_COMPILE_PROGRAM_FAILURE);
        CL_STATUS_CASE(CL_LINKER_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_LINK_PROGRAM_FAILURE);
        CL_STATUS_CASE(CL_DEVICE_PARTITION_FAILED);
        CL_STATUS_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_INVALID_VALUE);
        CL_STATUS_CASE(CL_INVALID_DEVICE_TYPE);
        CL_STATUS_CASE(CL_INVALID_PLATFORM);
        CL_STATUS_CASE(CL_INVALID_DEVICE);
        CL_STATUS_CASE(CL_INVALID_CONTEXT);
        CL_STATUS_CASE(CL_INVALID_QUEUE_PROPERTIES);
        CL_STATUS_CASE(CL_INVALID_COMMAND_QUEUE);
        CL_STATUS_CASE(CL_INVALID_HOST_PTR);
        CL_STATUS_CASE(CL_INVALID_MEM_OBJECT);
        CL_STATUS_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
        CL_STATUS_CASE(CL_INVALID_IMAGE_SIZE);
        CL_STATUS_CASE(CL_INVALID_SAMPLER);
        CL_STATUS_CASE(CL_INVALID_BINARY);
        CL_STATUS_CASE(CL_INVALID_BUILD_OPTIONS);
        CL_STATUS_CASE(CL_INVALID_PROGRAM);
        CL_STATUS_CASE(CL_INVALID_PROGRAM_EXECUTABLE);
        CL_STATUS_CASE(CL_INVALID_KERNEL_NAME);
        CL_STATUS_CASE(CL_INVALID_KERNEL_DEFINITION);
        CL_STATUS_CASE(CL_INVALID_KERNEL);
        CL_STATUS_CASE(CL_INVALID_ARG_INDEX);
        CL_STATUS_CASE(CL_INVALID_ARG_VALUE);
        CL_STATUS_CASE(CL_INVALID_ARG_SIZE);
        CL_STATUS_CASE(CL_INVALID_KERNEL_ARGS);
        CL_STATUS_CASE(CL_INVALID_WORK_DIMENSION);
        CL_STATUS_CASE(CL_INVALID_WORK_GROUP_SIZE);
        CL_STATUS_CASE(CL_INVALID_WORK_ITEM_SIZE);
        CL_STATUS_CASE(CL_INVALID_GLOBAL_OFFSET);
        CL_STATUS_CASE(CL_INVALID_EVENT_WAIT_LIST);
        CL_STATUS_CASE(CL_INVALID_EVENT);
        CL_STATUS_CASE(CL_INVALID_OPERATION);
        CL_STATUS_CASE(CL_INVALID_GL_OBJECT);
        CL_STATUS_CASE(CL_INVALID_BUFFER_SIZE);
        CL_STATUS_CASE(CL_INVALID_MIP_LEVEL);
        CL_STATUS_CASE(CL_INVALID_GLOBAL_WORK_SIZE);
#undef CL_STATUS_CASE
        default: return "unknown macro name";
    }
}
template <typename T, typename F>
static std::string get_ocl_name(T obj, F get_func, cl_uint name_query) {
size_t name_size;

View File

@ -33,8 +33,130 @@ namespace impl {
namespace xpu {
namespace ocl {
status_t convert_to_dnnl(cl_int cl_status);
const char *convert_cl_int_to_str(cl_int cl_status);
// Maps an OpenCL status code onto the closest oneDNN status:
// CL_SUCCESS -> success; allocation failures -> out_of_memory;
// driver/environment failures -> runtime_error; API misuse ->
// invalid_arguments; any unrecognized code falls back to runtime_error.
// Defined inline so the header remains self-contained.
inline status_t convert_to_dnnl(cl_int cl_status) {
    switch (cl_status) {
        case CL_SUCCESS: return status::success;
        // Out-of-memory conditions.
        case CL_MEM_OBJECT_ALLOCATION_FAILURE:
        case CL_OUT_OF_RESOURCES:
        case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory;
        // Driver/environment failures outside the caller's control.
        case CL_DEVICE_NOT_FOUND:
        case CL_DEVICE_NOT_AVAILABLE:
        case CL_COMPILER_NOT_AVAILABLE:
        case CL_PROFILING_INFO_NOT_AVAILABLE:
        case CL_MEM_COPY_OVERLAP:
        case CL_IMAGE_FORMAT_MISMATCH:
        case CL_IMAGE_FORMAT_NOT_SUPPORTED:
        case CL_BUILD_PROGRAM_FAILURE:
        case CL_MAP_FAILURE:
        case CL_MISALIGNED_SUB_BUFFER_OFFSET:
        case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
        case CL_COMPILE_PROGRAM_FAILURE:
        case CL_LINKER_NOT_AVAILABLE:
        case CL_LINK_PROGRAM_FAILURE:
        case CL_DEVICE_PARTITION_FAILED:
        case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
        case CL_INVALID_PLATFORM:
        case CL_INVALID_DEVICE: return status::runtime_error;
        // Invalid arguments passed to an OpenCL API call.
        case CL_INVALID_VALUE:
        case CL_INVALID_DEVICE_TYPE:
        case CL_INVALID_CONTEXT:
        case CL_INVALID_QUEUE_PROPERTIES:
        case CL_INVALID_COMMAND_QUEUE:
        case CL_INVALID_HOST_PTR:
        case CL_INVALID_MEM_OBJECT:
        case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
        case CL_INVALID_IMAGE_SIZE:
        case CL_INVALID_SAMPLER:
        case CL_INVALID_BINARY:
        case CL_INVALID_BUILD_OPTIONS:
        case CL_INVALID_PROGRAM:
        case CL_INVALID_PROGRAM_EXECUTABLE:
        case CL_INVALID_KERNEL_NAME:
        case CL_INVALID_KERNEL_DEFINITION:
        case CL_INVALID_KERNEL:
        case CL_INVALID_ARG_INDEX:
        case CL_INVALID_ARG_VALUE:
        case CL_INVALID_ARG_SIZE:
        case CL_INVALID_KERNEL_ARGS:
        case CL_INVALID_WORK_DIMENSION:
        case CL_INVALID_WORK_GROUP_SIZE:
        case CL_INVALID_WORK_ITEM_SIZE:
        case CL_INVALID_GLOBAL_OFFSET:
        case CL_INVALID_EVENT_WAIT_LIST:
        case CL_INVALID_EVENT:
        case CL_INVALID_OPERATION:
        case CL_INVALID_GL_OBJECT:
        case CL_INVALID_BUFFER_SIZE:
        case CL_INVALID_MIP_LEVEL:
        case CL_INVALID_GLOBAL_WORK_SIZE: return status::invalid_arguments;
        default: return status::runtime_error;
    }
}
// Ordered by value as defined by opencl
// Returns the macro name of an OpenCL status code for diagnostics; unknown
// codes map to the literal "unknown macro name". Defined inline so the
// header remains self-contained.
inline const char *convert_cl_int_to_str(cl_int cl_status) {
// Expands to a case that stringifies the status macro itself.
#define CL_STATUS_CASE(status) \
    case status: return #status
    switch (cl_status) {
        CL_STATUS_CASE(CL_SUCCESS);
        CL_STATUS_CASE(CL_DEVICE_NOT_FOUND);
        CL_STATUS_CASE(CL_DEVICE_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_COMPILER_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
        CL_STATUS_CASE(CL_OUT_OF_RESOURCES);
        CL_STATUS_CASE(CL_OUT_OF_HOST_MEMORY);
        CL_STATUS_CASE(CL_PROFILING_INFO_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_MEM_COPY_OVERLAP);
        CL_STATUS_CASE(CL_IMAGE_FORMAT_MISMATCH);
        CL_STATUS_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
        CL_STATUS_CASE(CL_BUILD_PROGRAM_FAILURE);
        CL_STATUS_CASE(CL_MAP_FAILURE);
        CL_STATUS_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
        CL_STATUS_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
        CL_STATUS_CASE(CL_COMPILE_PROGRAM_FAILURE);
        CL_STATUS_CASE(CL_LINKER_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_LINK_PROGRAM_FAILURE);
        CL_STATUS_CASE(CL_DEVICE_PARTITION_FAILED);
        CL_STATUS_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_INVALID_VALUE);
        CL_STATUS_CASE(CL_INVALID_DEVICE_TYPE);
        CL_STATUS_CASE(CL_INVALID_PLATFORM);
        CL_STATUS_CASE(CL_INVALID_DEVICE);
        CL_STATUS_CASE(CL_INVALID_CONTEXT);
        CL_STATUS_CASE(CL_INVALID_QUEUE_PROPERTIES);
        CL_STATUS_CASE(CL_INVALID_COMMAND_QUEUE);
        CL_STATUS_CASE(CL_INVALID_HOST_PTR);
        CL_STATUS_CASE(CL_INVALID_MEM_OBJECT);
        CL_STATUS_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
        CL_STATUS_CASE(CL_INVALID_IMAGE_SIZE);
        CL_STATUS_CASE(CL_INVALID_SAMPLER);
        CL_STATUS_CASE(CL_INVALID_BINARY);
        CL_STATUS_CASE(CL_INVALID_BUILD_OPTIONS);
        CL_STATUS_CASE(CL_INVALID_PROGRAM);
        CL_STATUS_CASE(CL_INVALID_PROGRAM_EXECUTABLE);
        CL_STATUS_CASE(CL_INVALID_KERNEL_NAME);
        CL_STATUS_CASE(CL_INVALID_KERNEL_DEFINITION);
        CL_STATUS_CASE(CL_INVALID_KERNEL);
        CL_STATUS_CASE(CL_INVALID_ARG_INDEX);
        CL_STATUS_CASE(CL_INVALID_ARG_VALUE);
        CL_STATUS_CASE(CL_INVALID_ARG_SIZE);
        CL_STATUS_CASE(CL_INVALID_KERNEL_ARGS);
        CL_STATUS_CASE(CL_INVALID_WORK_DIMENSION);
        CL_STATUS_CASE(CL_INVALID_WORK_GROUP_SIZE);
        CL_STATUS_CASE(CL_INVALID_WORK_ITEM_SIZE);
        CL_STATUS_CASE(CL_INVALID_GLOBAL_OFFSET);
        CL_STATUS_CASE(CL_INVALID_EVENT_WAIT_LIST);
        CL_STATUS_CASE(CL_INVALID_EVENT);
        CL_STATUS_CASE(CL_INVALID_OPERATION);
        CL_STATUS_CASE(CL_INVALID_GL_OBJECT);
        CL_STATUS_CASE(CL_INVALID_BUFFER_SIZE);
        CL_STATUS_CASE(CL_INVALID_MIP_LEVEL);
        CL_STATUS_CASE(CL_INVALID_GLOBAL_WORK_SIZE);
#undef CL_STATUS_CASE
        default: return "unknown macro name";
    }
}
#define MAYBE_REPORT_ERROR(msg) \
do { \

View File

@ -24,7 +24,7 @@
#include "common/engine.hpp"
#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
#include "gpu/intel/sycl/l0/utils.hpp"
#include "gpu/intel/sycl/utils.hpp"
#endif
#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA

View File

@ -91,6 +91,7 @@ public:
explicit LevelZeroCodeGenerator(DebugConfig debugConfig) : LevelZeroCodeGenerator({genericProductFamily(hw), 0}, debugConfig) {}
inline ze_kernel_handle_t getKernel(ze_module_handle_t module);
inline ze_module_handle_t getModule(ze_context_handle_t context, ze_device_handle_t device, const std::string &options = "");
static inline HW detectHW(ze_context_handle_t context, ze_device_handle_t device);
static inline Product detectHWInfo(ze_context_handle_t context, ze_device_handle_t device);
@ -138,6 +139,17 @@ static inline std::vector<uint8_t> getDummyModuleBinary(ze_context_handle_t cont
}; /* namespace detail */
// Creates a Level Zero kernel from `module` using the generated kernel's
// external name from the ELF interface.
template <HW hw>
ze_kernel_handle_t LevelZeroCodeGenerator<hw>::getKernel(ze_module_handle_t module)
{
    // Bind the name through a const reference: lifetime extension keeps the
    // string alive for the duration of the create call whether
    // getExternalName() returns by value or by reference. The original
    // `auto name = ...getExternalName().c_str()` would dangle if the
    // function returns a std::string by value.
    const auto &kernelName = ELFCodeGenerator<hw>::interface_.getExternalName();
    ze_kernel_desc_t kernelDesc{
            ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernelName.c_str()};
    ze_kernel_handle_t kernelL0 = nullptr;
    detail::handleL0(dynamic::zeKernelCreate(module, &kernelDesc, &kernelL0));
    return kernelL0;
}
template <HW hw>
ze_module_handle_t LevelZeroCodeGenerator<hw>::getModule(ze_context_handle_t context, ze_device_handle_t device, const std::string &options)
{