gpu: intel: add Level Zero backend

This commit is contained in:
Palicki, Stefan
2025-07-22 08:34:47 -07:00
committed by Stefan Palicki
parent 3a756b982b
commit 633a03d736
83 changed files with 4100 additions and 816 deletions

View File

@ -287,6 +287,7 @@ Runtime-specific dependencies:
| `ONEDNN_CPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`sycl.dll`), TBB (`tbb.dll`), OpenCL loader (`OpenCL.dll`)
| `ONEDNN_GPU_RUNTIME=OCL` | any | OpenCL loader (`OpenCL.dll`)
| `ONEDNN_GPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`sycl.dll`), OpenCL loader (`OpenCL.dll`), oneAPI Level Zero loader (`ze_loader.dll`)
| `ONEDNN_GPU_RUNTIME=L0` | any | oneAPI Level Zero loader (`ze_loader.dll`)
#### macOS

View File

@ -1,5 +1,5 @@
#===============================================================================
# Copyright 2019-2021 Intel Corporation
# Copyright 2019-2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -22,7 +22,7 @@ if(OpenCL_cmake_included)
endif()
set(OpenCL_cmake_included true)
if(DNNL_GPU_RUNTIME STREQUAL "OCL")
if("${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|L0)$")
message(STATUS "GPU support is enabled (OpenCL)")
else()
return()

View File

@ -283,7 +283,7 @@ set(DNNL_GPU_RUNTIME "NONE" CACHE STRING
Using OpenCL for GPU requires setting OPENCLROOT if the libraries are
installed in a non-standard location.")
if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL)$")
if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL|L0)$")
message(FATAL_ERROR "Unsupported GPU runtime: ${DNNL_GPU_RUNTIME}")
endif()

View File

@ -7,7 +7,7 @@ oneDNN supports the following build-time options.
|:--------------------------------|:----------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------|
| ONEDNN_LIBRARY_TYPE | **SHARED**, STATIC | Defines the resulting library type |
| ONEDNN_CPU_RUNTIME | NONE, **OMP**, TBB, SEQ, THREADPOOL, SYCL | Defines the threading runtime for CPU engines |
| ONEDNN_GPU_RUNTIME | **NONE**, OCL, SYCL | Defines the offload runtime for GPU engines |
| ONEDNN_GPU_RUNTIME | **NONE**, OCL, SYCL, L0 | Defines the offload runtime for GPU engines |
| ONEDNN_BUILD_DOC | **ON**, OFF | Controls building the documentation |
| ONEDNN_DOC_VERSIONS_JSON | **""**, *string* | Location of JSON file for [PyData Sphinx Theme version switcher]. Enables documentation version switcher when set. |
| ONEDNN_BUILD_EXAMPLES | **ON**, OFF | Controls building the examples |

View File

@ -136,6 +136,9 @@ foreach(f ${sources})
if(NOT DNNL_WITH_SYCL AND ${f_name} MATCHES "^sycl")
list(REMOVE_ITEM sources "${f}")
endif()
if(NOT DNNL_GPU_RUNTIME STREQUAL "L0" AND ${f_name} MATCHES ".*level_zero")
list(REMOVE_ITEM sources "${f}")
endif()
endforeach()
# In case of SYCL, skip CPU examples that directly work with raw pointers
@ -180,7 +183,7 @@ foreach(src ${sources})
endif()
else()
set(cpu_rt_pattern "(SEQ|OMP|TBB|SYCL|DPCPP)")
set(gpu_rt_pattern "(OCL|SYCL|DPCPP)")
set(gpu_rt_pattern "(OCL|L0|SYCL|DPCPP)")
if(${example_name} MATCHES "sycl.*")
set(cpu_rt_pattern "(SYCL|DPCPP)")
set(gpu_rt_pattern "(SYCL|DPCPP)")

View File

@ -200,7 +200,7 @@ foreach(src ${sources})
endif()
else()
set(cpu_rt_pattern "(SEQ|OMP|TBB|SYCL|DPCPP)")
set(gpu_rt_pattern "(OCL|SYCL|DPCPP)")
set(gpu_rt_pattern "(OCL|L0|SYCL|DPCPP)")
if(${example_name} MATCHES "sycl.*")
set(cpu_rt_pattern "(SYCL|DPCPP)")
set(gpu_rt_pattern "(SYCL|DPCPP)")

View File

@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -29,6 +29,9 @@
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "dnnl_ocl.h"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#include "dnnl_l0.h"
#endif
#define COMPLAIN_DNNL_ERROR_AND_EXIT(what, status) \
do { \
@ -160,7 +163,7 @@ static inline void write_to_dnnl_memory(void *handle, dnnl_memory_t mem) {
}
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
if (eng_kind == dnnl_gpu) {
void *mapped_ptr = NULL;
CHECK(dnnl_memory_map_data(mem, &mapped_ptr));

View File

@ -35,7 +35,11 @@
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "dnnl_ocl.hpp"
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#include "dnnl_l0.hpp"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#include "dnnl_sycl.hpp"
#endif
@ -228,7 +232,7 @@ inline void read_from_dnnl_memory(void *handle, dnnl::memory &mem) {
return;
}
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
if (eng.get_kind() == dnnl::engine::kind::gpu) {
void *mapped_ptr = mem.map_data();
if (mapped_ptr) std::memcpy(handle, mapped_ptr, size);
@ -287,7 +291,7 @@ inline void write_to_dnnl_memory(void *handle, dnnl::memory &mem) {
return;
}
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
if (eng.get_kind() == dnnl::engine::kind::gpu) {
void *mapped_ptr = mem.map_data();
if (mapped_ptr) std::memcpy(mapped_ptr, handle, size);

22
include/dnnl_l0.h Normal file
View File

@ -0,0 +1,22 @@
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef DNNL_L0_H
#define DNNL_L0_H
#include "oneapi/dnnl/dnnl_l0.h"
#endif /* DNNL_L0_H */

22
include/dnnl_l0.hpp Normal file
View File

@ -0,0 +1,22 @@
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef DNNL_L0_HPP
#define DNNL_L0_HPP
#include "oneapi/dnnl/dnnl_l0.hpp"
#endif /* DNNL_L0_HPP */

View File

@ -82,6 +82,9 @@
/// DPC++ runtime
#define DNNL_RUNTIME_DPCPP DNNL_RUNTIME_SYCL
/// L0 runtime
#define DNNL_RUNTIME_L0 1024u
/// No vendor (corresponding runtime is disabled)
#define DNNL_VENDOR_NONE 0u
@ -119,7 +122,8 @@
#endif
#if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
&& (DNNL_GPU_RUNTIME != DNNL_RUNTIME_OCL) \
&& (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL)
&& (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL) \
&& (DNNL_GPU_RUNTIME != DNNL_RUNTIME_L0)
#error "Unexpected DNNL_GPU_RUNTIME"
#endif
#if (DNNL_CPU_RUNTIME == DNNL_RUNTIME_NONE \
@ -145,9 +149,6 @@
// When defined, DPCPP is supported.
#cmakedefine DNNL_WITH_SYCL
// When defined, Level Zero is supported.
#cmakedefine DNNL_WITH_LEVEL_ZERO
// When defined, SYCL CUDA backend is used.
#cmakedefine DNNL_SYCL_CUDA

View File

@ -0,0 +1,203 @@
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef ONEAPI_DNNL_DNNL_L0_H
#define ONEAPI_DNNL_DNNL_L0_H
#include "oneapi/dnnl/dnnl.h"
/// @cond DO_NOT_DOCUMENT_THIS
#include "level_zero/ze_api.h"
/// @endcond
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/// @addtogroup dnnl_api
/// @{
/// @addtogroup dnnl_api_interop
/// @{
/// @addtogroup dnnl_api_l0_interop
/// @{
/// Creates an engine associated with a Level Zero driver, device and context.
///
/// @param engine Output engine.
/// @param driver Level Zero driver handle to use for the engine.
/// @param device Level Zero device handle to use for the engine.
/// @param context Level Zero context handle to use for the engine.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_engine_create(dnnl_engine_t *engine,
        ze_driver_handle_t driver, ze_device_handle_t device,
        ze_context_handle_t context);
/// Returns the Level Zero context associated with an engine.
///
/// @param engine Engine to query.
/// @param context Output pointer to the underlying Level Zero context of the
///     engine. Must be a valid pointer; the handle is written through it.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_context(
        dnnl_engine_t engine, ze_context_handle_t *context);
/// Returns the Level Zero device associated with an engine.
///
/// @param engine Engine to query.
/// @param device Output pointer to the underlying Level Zero device of the
///     engine. Must be a valid pointer; the handle is written through it.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_device(
        dnnl_engine_t engine, ze_device_handle_t *device);
/// Returns the Level Zero driver associated with an engine.
///
/// @param engine Engine to query.
/// @param driver Output pointer to the underlying Level Zero driver of the
///     engine. Must be a valid pointer; the handle is written through it.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_driver(
        dnnl_engine_t engine, ze_driver_handle_t *driver);
/// Creates an execution stream for a given engine associated with a Level Zero
/// command list.
///
/// @param stream Output execution stream.
/// @param engine Engine to create the execution stream on.
/// @param list Level Zero command list to use.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_stream_create(dnnl_stream_t *stream,
        dnnl_engine_t engine, ze_command_list_handle_t list);
/// Returns the Level Zero command list associated with an execution stream.
///
/// @param stream Execution stream to query.
/// @param list Output pointer to the Level Zero command list of the stream.
///     Must be a valid pointer; the handle is written through it.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_stream_get_list(
        dnnl_stream_t stream, ze_command_list_handle_t *list);
/// Creates a memory object.
///
/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
/// constructed memory object will have the underlying buffer set. In this
/// case, the buffer will be initialized as if
/// dnnl_memory_set_data_handle() had been called.
///
/// @param memory Output memory object.
/// @param memory_desc Memory descriptor.
/// @param engine Engine to use.
/// @param handle Handle of the memory buffer to use as an underlying storage.
///     - A USM pointer to the user-allocated buffer. In this case the library
///       doesn't own the buffer.
///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
///       allocate the buffer for the memory object. In this case the library
///       owns the buffer.
///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
///       create memory object without an underlying buffer.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_memory_create(dnnl_memory_t *memory,
        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
        void *handle);
/// Creates a memory object with multiple handles.
///
/// @param memory Output memory object.
/// @param memory_desc Memory descriptor.
/// @param engine Engine to use.
/// @param nhandles Number of handles.
/// @param handles Handles of the memory buffers to use as underlying storages.
///     For each element of the @p handles array the following applies:
///     - A USM pointer to the user-allocated buffer. In this case the library
///       doesn't own the buffer.
///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
///       allocate the buffer for the memory object. In this case the library
///       owns the buffer.
///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
///       create memory object without an underlying buffer.
/// @returns #dnnl_success on success and a status describing the error
///     otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_memory_create_v2(dnnl_memory_t *memory,
        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
        size_t nhandles, void **handles);
/// Returns a Level Zero memory object associated with a memory object.
///
/// @param memory Memory object.
/// @param mem_object Output Level Zero memory object.
/// @returns #dnnl_success on success and a status describing the error
/// otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_memory_get_mem_object(
const_dnnl_memory_t memory, void **mem_object);
/// Sets Level Zero memory object associated with a memory object.
///
/// For behavioral details, see dnnl_memory_set_data_handle().
///
/// @param memory Memory object.
/// @param mem_object Level Zero memory object.
/// @returns #dnnl_success on success and a status describing the error
/// otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_memory_set_mem_object(
dnnl_memory_t memory, void *mem_object);
/// Executes computations specified by the primitive in a specified stream and
/// returns a Level Zero event.
///
/// @param primitive Primitive to execute.
/// @param stream Stream to use.
/// @param nargs Number of arguments.
/// @param args Array of arguments. Each argument is an
/// <index, #dnnl_memory_t> pair. The index is one of the `DNNL_ARG_*`
/// values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see
/// #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory
/// descriptor as that returned by
/// #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index).
/// @param ndeps Number of dependencies.
/// @param deps A pointer to a vector of size @p ndeps that contains
/// dependencies.
/// @param return_event Output event.
/// @returns #dnnl_success on success and a status describing the error
/// otherwise.
dnnl_status_t DNNL_API dnnl_l0_interop_primitive_execute(
const_dnnl_primitive_t primitive, dnnl_stream_t stream, size_t nargs,
const dnnl_exec_arg_t *args, size_t ndeps,
const ze_event_handle_t *deps, ze_event_handle_t *return_event);
/// @} dnnl_api_l0_interop
/// @} dnnl_api_interop
/// @} dnnl_api
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // ONEAPI_DNNL_DNNL_L0_H

View File

@ -0,0 +1,259 @@
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef ONEAPI_DNNL_DNNL_L0_HPP
#define ONEAPI_DNNL_DNNL_L0_HPP
#include "oneapi/dnnl/dnnl.hpp"
/// @cond DO_NOT_DOCUMENT_THIS
#include <vector>
#include <unordered_map>
#include "oneapi/dnnl/dnnl_l0.h"
/// @endcond
/// @addtogroup dnnl_api
/// @{
namespace dnnl {
/// @addtogroup dnnl_api_interop
/// @{
/// @addtogroup dnnl_api_l0_interop Level Zero interoperability API
/// API extensions to interact with the underlying Level Zero run-time.
///
/// @sa @ref dev_guide_dpcpp_interoperability in developer guide
/// @{
/// Level Zero interoperability namespace
namespace l0_interop {
/// Constructs an engine from Level Zero driver, device and context objects.
///
/// @param adriver Level Zero driver.
/// @param adevice Level Zero device.
/// @param acontext Level Zero context.
///
/// @returns Created engine.
inline engine make_engine(const ze_driver_handle_t adriver,
        const ze_device_handle_t adevice, const ze_context_handle_t acontext) {
    dnnl_engine_t c_engine = nullptr;
    const dnnl_status_t status = dnnl_l0_interop_engine_create(
            &c_engine, adriver, adevice, acontext);
    error::wrap_c_api(status, "could not create an engine");
    return engine(c_engine);
}
/// Returns the Level Zero context associated with an engine.
///
/// @param aengine Engine to query.
///
/// @returns The underlying Level Zero context of the engine.
inline ze_context_handle_t get_context(const engine &aengine) {
    ze_context_handle_t ctx = nullptr;
    // Pass the address of the handle so the C API can write the queried
    // context into it; passing it by value would leave `ctx` null.
    error::wrap_c_api(dnnl_l0_interop_engine_get_context(aengine.get(), &ctx),
            "could not get a context handle");
    return ctx;
}
/// Returns the Level Zero device associated with an engine.
///
/// @param aengine Engine to query.
///
/// @returns The underlying Level Zero device of the engine.
inline ze_device_handle_t get_device(const engine &aengine) {
    ze_device_handle_t dev = nullptr;
    // Pass the address of the handle so the C API can write the queried
    // device into it; passing it by value would leave `dev` null.
    error::wrap_c_api(dnnl_l0_interop_engine_get_device(aengine.get(), &dev),
            "could not get a device handle");
    return dev;
}
/// Returns the Level Zero driver associated with an engine.
///
/// @param aengine Engine to query.
///
/// @returns The underlying Level Zero driver of the engine.
inline ze_driver_handle_t get_driver(const engine &aengine) {
    ze_driver_handle_t dri = nullptr;
    // Pass the address of the handle so the C API can write the queried
    // driver into it; passing it by value would leave `dri` null.
    error::wrap_c_api(dnnl_l0_interop_engine_get_driver(aengine.get(), &dri),
            "could not get a driver handle");
    return dri;
}
/// Creates an execution stream for a given engine associated with a Level Zero
/// immediate command list.
///
/// @param aengine Engine object to use for the stream.
/// @param alist Level Zero immediate command list to use for the stream.
///
/// @returns An execution stream.
inline stream make_stream(
        const engine &aengine, ze_command_list_handle_t alist) {
    dnnl_stream_t astream;
    error::wrap_c_api(
            dnnl_l0_interop_stream_create(&astream, aengine.get(), alist),
            "could not create a stream");
    return stream(astream);
}
/// Returns the Level Zero immediate command list associated with an execution
/// stream.
///
/// @param astream Execution stream to query.
///
/// @returns Level Zero immediate command list object.
inline ze_command_list_handle_t get_list(const stream &astream) {
    ze_command_list_handle_t list = nullptr;
    // Pass the address of the handle so the C API can write the queried
    // command list into it; passing it by value would leave `list` null.
    error::wrap_c_api(dnnl_l0_interop_stream_get_list(astream.get(), &list),
            "could not get a stream handle");
    return list;
}
/// Creates a memory object with multiple handles.
///
/// @param memory_desc Memory descriptor.
/// @param aengine Engine to use.
/// @param handles Handles of the memory buffers to use as underlying storages.
///     For each element of the @p handles array the following applies:
///     - A USM pointer to the user-allocated buffer. In this case the library
///       doesn't own the buffer.
///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
///       allocate the buffer for the memory object. In this case the library
///       owns the buffer.
///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
///       create memory object without an underlying buffer.
///
/// If the @p handles vector is not provided the library will allocate all
/// buffers as if all handles have the special value DNNL_MEMORY_ALLOCATE.
///
/// @returns Created memory object.
inline memory make_memory(const memory::desc &memory_desc,
        const engine &aengine, std::vector<void *> handles = {}) {
    if (handles.empty()) {
        // Default: let the library allocate every buffer it needs.
        const int nhandles = memory_desc.get_num_handles();
        handles.resize(nhandles, DNNL_MEMORY_ALLOCATE);
    }
    dnnl_memory_t c_memory;
    error::wrap_c_api(
            dnnl_l0_interop_memory_create_v2(&c_memory, memory_desc.get(),
                    aengine.get(), handles.size(), handles.data()),
            "could not create a memory");
    return memory(c_memory);
}
/// Creates a memory object.
///
/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
/// constructed memory object will have the underlying buffer set. In this
/// case, the buffer will be initialized as if
/// dnnl::memory::set_data_handle() had been called.
///
/// @param memory_desc Memory descriptor.
/// @param aengine Engine to use.
/// @param handle Handle of the memory buffer to use as an underlying storage.
///     - A USM pointer to the user-allocated buffer. In this case the library
///       doesn't own the buffer.
///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
///       allocate the buffer for the memory object. In this case the library
///       owns the buffer.
///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
///       create memory object without an underlying buffer.
///
/// @returns Created memory object.
inline memory make_memory(
        const memory::desc &memory_desc, const engine &aengine, void *handle) {
    // Delegate to the multi-handle overload with a single-element vector.
    return make_memory(memory_desc, aengine, std::vector<void *> {handle});
}
/// Returns the Level Zero memory object associated with the memory object.
///
/// @param amemory A memory object.
/// @returns Underlying Level Zero memory object.
inline void *get_mem_object(const memory &amemory) {
void *mem_object;
error::wrap_c_api(
dnnl_l0_interop_memory_get_mem_object(amemory.get(), &mem_object),
"could not get Level Zero buffer object from a memory object");
return mem_object;
}
/// Sets the Level Zero memory object associated with the memory object.
///
/// For behavioral details see memory::set_data_handle().
///
/// @param amemory A memory object.
/// @param mem_object Level Zero memory object to use as the underlying
///     storage. It must have at least get_desc().get_size() bytes
///     allocated.
inline void set_mem_object(memory &amemory, void *mem_object) {
    error::wrap_c_api(
            dnnl_l0_interop_memory_set_mem_object(amemory.get(), mem_object),
            "could not set Level Zero buffer object from a memory object");
}
/// Executes computations specified by the primitive in a specified stream and
/// returns a Level Zero event.
///
/// Arguments are passed via an arguments map containing
/// <index, memory object> pairs. The index must be one of the `DNNL_ARG_*`
/// values such as `DNNL_ARG_SRC`, and the memory must have a memory descriptor
/// matching the one returned by
/// #dnnl::primitive_desc::query_md(#query::exec_arg_md, index) unless using
/// dynamic shapes (see #DNNL_RUNTIME_DIM_VAL).
///
/// @param aprimitive Primitive to execute.
/// @param astream Stream object. The stream must belong to the same engine
///     as the primitive.
/// @param args Arguments map.
/// @param deps Optional vector with `ze_event_handle_t` dependencies.
///
/// @returns Output event.
inline ze_event_handle_t execute(const dnnl::primitive &aprimitive,
        const stream &astream, const std::unordered_map<int, memory> &args,
        const std::vector<ze_event_handle_t> &deps = {}) {
    // Flatten the arguments map into the C API argument array.
    std::vector<dnnl_exec_arg_t> c_args;
    c_args.reserve(args.size());
    for (auto it = args.begin(); it != args.end(); ++it)
        c_args.push_back({it->first, it->second.get()});

    // An empty dependency list is passed to the C API as a null pointer.
    const ze_event_handle_t *c_deps = nullptr;
    if (!deps.empty()) c_deps = deps.data();

    ze_event_handle_t return_event;
    error::wrap_c_api(dnnl_l0_interop_primitive_execute(aprimitive.get(),
                              astream.get(), c_args.size(), c_args.data(),
                              deps.size(), c_deps, &return_event),
            "could not execute a primitive");
    return return_event;
}
} // namespace l0_interop
/// @} dnnl_api_l0_interop
/// @} dnnl_api_interop
} // namespace dnnl
/// @} dnnl_api
#endif // ONEAPI_DNNL_DNNL_L0_HPP

View File

@ -284,7 +284,7 @@ if(DNNL_CPU_THREADING_RUNTIME STREQUAL "TBB")
endif()
endif()
if(DNNL_GPU_RUNTIME STREQUAL "OCL" OR (DNNL_GPU_SYCL AND DNNL_GPU_VENDOR STREQUAL "INTEL"))
if("${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|L0)$" OR (DNNL_GPU_SYCL AND DNNL_GPU_VENDOR STREQUAL "INTEL"))
install(FILES
"../cmake/FindOpenCL.cmake"
DESTINATION ${LIB_CONFIG_INSTALL_DIR})

View File

@ -1953,6 +1953,7 @@ enum runtime_kind_t {
dnnl_runtime_threadpool,
dnnl_runtime_ocl,
dnnl_runtime_sycl,
dnnl_runtime_l0,
};
namespace runtime_kind {
@ -1963,6 +1964,7 @@ const runtime_kind_t tbb = dnnl_runtime_tbb;
const runtime_kind_t threadpool = dnnl_runtime_threadpool;
const runtime_kind_t ocl = dnnl_runtime_ocl;
const runtime_kind_t sycl = dnnl_runtime_sycl;
const runtime_kind_t l0 = dnnl_runtime_l0;
} // namespace runtime_kind
using primitive_kind_t = dnnl_primitive_kind_t;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2016-2024 Intel Corporation
* Copyright 2016-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -29,12 +29,16 @@
#include "cpu/cpu_engine.hpp"
#endif
#ifdef DNNL_WITH_SYCL
#include "xpu/sycl/engine_factory.hpp"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "xpu/ocl/engine_factory.hpp"
#endif
#ifdef DNNL_WITH_SYCL
#include "xpu/sycl/engine_factory.hpp"
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#include "gpu/intel/l0/engine_factory.hpp"
#endif
namespace dnnl {
@ -42,23 +46,27 @@ namespace impl {
static inline std::unique_ptr<engine_factory_t> get_engine_factory(
engine_kind_t kind, runtime_kind_t runtime_kind) {
#if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE
if (kind == engine_kind::cpu && is_native_runtime(runtime_kind)) {
return std::unique_ptr<engine_factory_t>(
new cpu::cpu_engine_factory_t());
}
#endif
#ifdef DNNL_WITH_SYCL
if (runtime_kind == runtime_kind::sycl) {
return xpu::sycl::get_engine_factory(kind);
}
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
if (kind == engine_kind::gpu && runtime_kind == runtime_kind::ocl) {
return std::unique_ptr<engine_factory_t>(
new xpu::ocl::engine_factory_t(kind));
}
#endif
#ifdef DNNL_WITH_SYCL
if (runtime_kind == runtime_kind::sycl)
return xpu::sycl::get_engine_factory(kind);
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
if (kind == engine_kind::gpu && runtime_kind == runtime_kind::l0) {
return gpu::intel::l0::get_engine_factory(kind);
}
#endif
return nullptr;
}

View File

@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2016-2024 Intel Corporation
* Copyright 2016-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -185,6 +185,8 @@ inline runtime_kind_t get_default_runtime(engine_kind_t kind) {
if (kind == engine_kind::gpu) return runtime_kind::ocl;
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
if (kind == engine_kind::gpu) return runtime_kind::sycl;
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
if (kind == engine_kind::gpu) return runtime_kind::l0;
#endif
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SEQ
return runtime_kind::seq;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2018-2024 Intel Corporation
* Copyright 2018-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -291,7 +291,8 @@ std::string get_jit_profiling_jitdumpdir() {
bool is_destroying_cache_safe() {
#if defined(_WIN32) \
&& (defined(DNNL_WITH_SYCL) || DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL)
&& (defined(DNNL_WITH_SYCL) || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 \
|| DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL)
// The ntdll.dll library is located in system32, therefore setting
// additional environment is not required.
HMODULE handle = LoadLibraryExA(

View File

@ -58,9 +58,12 @@ add_subdirectory(jit)
if(DNNL_GPU_RUNTIME STREQUAL "OCL")
add_subdirectory(ocl)
elseif(DNNL_GPU_RUNTIME STREQUAL "L0")
add_subdirectory(l0)
elseif(DNNL_WITH_SYCL)
add_subdirectory(sycl)
add_subdirectory(ocl)
add_subdirectory(l0/utils)
endif()
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel)

View File

@ -16,16 +16,21 @@
#include "gpu/intel/compute/ukernels.hpp"
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/utils.hpp"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#include "gpu/intel/sycl/engine.hpp"
#include "gpu/intel/sycl/utils.hpp"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#include "gpu/intel/l0/engine.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
#endif
namespace dnnl {
namespace impl {
namespace gpu {
@ -51,6 +56,11 @@ bool mayiuse_microkernels(const engine_t *engine) {
auto mayiuse_mk = [](const engine_t *engine) {
switch (engine->runtime_kind()) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
case runtime_kind::sycl:
return sycl::mayiuse_microkernels(
utils::downcast<const sycl::engine_t *>(engine));
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
case runtime_kind::ocl: {
auto *ocl_engine
@ -60,10 +70,10 @@ bool mayiuse_microkernels(const engine_t *engine) {
cl_microkernels_check_kernel_code);
}
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
case runtime_kind::sycl:
return sycl::mayiuse_microkernels(
utils::downcast<const sycl::engine_t *>(engine));
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
case runtime_kind::l0:
return utils::downcast<const l0::engine_t *>(engine)
->mayiuse_microkernels();
#endif
default: return false;
}

View File

@ -0,0 +1,119 @@
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/compute/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace compute {
// Recursively expands quoted #include "..." directives in OpenCL kernel
// source, substituting headers from the kernel context's custom headers or
// the built-in kernel header registry. All other lines are copied verbatim.
// Returns invalid_arguments if a referenced header cannot be resolved.
status_t preprocess_headers(stringstream_t &pp_code, const char *code,
        const compute::kernel_ctx_t &kernel_ctx) {
    stringstream_t code_stream(code);
    for (std::string line; std::getline(code_stream, line);) {
        const size_t include_pos = line.find("#include");
        if (include_pos != std::string::npos) {
            static constexpr size_t include_len = 8; // strlen("#include")
            const size_t first_quote_pos
                    = line.find("\"", include_pos + include_len);
            const size_t second_quote_pos
                    = line.find("\"", first_quote_pos + 1);
            // Guard: pass through directives that are not of the quoted form
            // (e.g. `#include <...>` or malformed lines). The original code
            // indexed with npos here, which is undefined behavior.
            if (first_quote_pos == std::string::npos
                    || second_quote_pos == std::string::npos) {
                pp_code << line << std::endl;
                continue;
            }
            const size_t kernel_name_len
                    = second_quote_pos - first_quote_pos - 1;
            const auto header_name
                    = line.substr(first_quote_pos + 1, kernel_name_len);
            const char *header_source
                    = kernel_ctx.get_custom_header(header_name);
            if (!header_source) header_source = get_kernel_header(header_name);
            // Guard: recursing with a null source would crash. NOTE(review):
            // assumes get_kernel_header() may return null for unknown
            // headers - confirm its contract.
            if (!header_source) return status::invalid_arguments;
            CHECK(preprocess_headers(pp_code, header_source, kernel_ctx));
        } else {
            pp_code << line << std::endl;
        }
    }
    return status::success;
}
// Debug-only helper: at debuginfo verbosity >= 10 (Linux dev builds only),
// pipes the kernel source through `cpp -P ... | clang-format` with the given
// defines so a developer can inspect the fully preprocessed source, then
// prints the architecture-specific options.
void debugdump_processed_source(const std::string &source,
        const std::string &options, const std::string &cl_options) {
#if defined(__linux__) && defined(DNNL_DEV_MODE)
    if (get_verbose(verbose_t::debuginfo) >= 10) {
        // Extracts the "-D..." define arguments from a compiler option
        // string, escaping shell-special characters so the result can be
        // spliced into a shell command line.
        auto get_defines = [](const std::string &from) {
            std::string ret;
            size_t pos = 0;
            while (pos < from.length()) {
                // Find next define argument
                pos = from.find("-D", pos);
                // Generate argument, quotes are interpreted literally, but
                // other special shell characters need to be escaped. Does not
                // currently handle quotes with the ' character or nested
                // quotes.
                bool quote_parity = true;
                while (pos < from.length()) {
                    if (quote_parity
                            && utils::one_of(from[pos], '~', '#', '$', '&', '*',
                                    '(', ')', '\\', '|', '[', ']', '{', '}',
                                    ';', '\'', '<', '>', '/', '?', '!')) {
                        ret += '\\';
                    }
                    ret += from[pos];
                    if (from[pos] == '"') quote_parity ^= true;
                    if (from[pos] == ' ' && quote_parity) break;
                    pos++;
                }
            }
            return ret;
        };
        // Runs `cmd`, feeding `input` to its stdin through a pipe; the
        // command's own stdout is inherited from this process.
        auto execute_command = [](const std::string &cmd,
                                       const std::string &input) {
            std::string result;
            std::array<char, 256> buffer;
            FILE *pipe = popen(cmd.c_str(), "w");
            // Fix: only use the pipe if popen succeeded. The original wrote
            // to the pipe before the null check and called pclose on a null
            // pipe.
            if (pipe) {
                fputs(input.c_str(), pipe);
                while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
                    result += buffer.data();
                }
                pclose(pipe);
            }
            return result;
        };
        // Run utilities to evaluate preprocessor defines and format the file
        // Theoretically, we can accomplish this task with libclang, but it
        // seems more work than it is worth. Instead, wrapping this in OCL_DEBUG
        // so that calls to the system are not included in the default build.
        // Due to the use of a different C preprocessor, warnings should not be
        // ignored, as they may correspond to a different behavior in the OpenCL
        // C preprocessor
        auto o = get_defines(options) + get_defines(cl_options);
        std::string preprocess_cmd
                = std::string() + "cpp -P " + o + " | clang-format";
        execute_command(preprocess_cmd, source);
        std::cout << "OCL_ARCH_OPTIONS: " << cl_options << std::endl;
    }
#endif
}
} // namespace compute
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -17,14 +17,8 @@
#ifndef GPU_INTEL_COMPUTE_UTILS_HPP
#define GPU_INTEL_COMPUTE_UTILS_HPP
#include <array>
#include <cassert>
#include <sstream>
#include <tuple>
#include <vector>
#include "common/utils.hpp"
#include "gpu/intel/compute/device_info.hpp"
#include "gpu/intel/compute/kernel_ctx.hpp"
#include "gpu/intel/utils.hpp"
namespace dnnl {
@ -155,6 +149,12 @@ private:
range_t local_range_;
};
status_t preprocess_headers(stringstream_t &pp_code, const char *code,
const compute::kernel_ctx_t &kernel_ctx);
void debugdump_processed_source(const std::string &source,
const std::string &options, const std::string &ocl_options);
} // namespace compute
} // namespace intel
} // namespace gpu

View File

@ -46,7 +46,15 @@ GEMMSTONE_NAMESPACE_START
#ifndef GENERATOR_BASE
#define GENERATOR_SUPER(hw) ngen::OpenCLCodeGenerator<hw>
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#define FORWARD(hw) NGEN_FORWARD_SYCL(hw);
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#define FORWARD(hw) NGEN_FORWARD_OPENCL(hw)
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#define FORWARD(hw) NGEN_FORWARD_LEVEL_ZERO(hw);
#endif
#define GENERATOR_DEBUGINFO {__FILE__, __LINE__}
#define GENERATOR_BASE(hw) GENERATOR_SUPER(hw)

View File

@ -39,6 +39,16 @@
#define MAGICSIZEY 2
#define MAGICSIZEZ 1
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#define FORWARD(hw) NGEN_FORWARD_SYCL(hw);
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#define FORWARD(hw) NGEN_FORWARD_OPENCL(hw)
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#define FORWARD(hw) NGEN_FORWARD_LEVEL_ZERO(hw);
#endif
namespace dnnl {
namespace impl {
namespace gpu {
@ -49,7 +59,7 @@ using namespace ngen;
template <HW hw>
class binary_format_kernel_t : public generator_t<hw> {
NGEN_FORWARD_OPENCL(hw);
FORWARD(hw);
public:
binary_format_kernel_t()

View File

@ -37,6 +37,9 @@
#ifdef WITH_OPENCL_RUNTIME
#include "ngen_opencl.hpp"
#endif
#ifdef WITH_L0_RUNTIME
#include "ngen_level_zero.hpp"
#endif
namespace dnnl {
namespace impl {
@ -1816,6 +1819,29 @@ cl_kernel make_kernel(const kernel::iface_t &iface, const stmt_t &body,
}
#endif
#ifdef WITH_L0_RUNTIME
// Lowers an IR kernel body to a Level Zero module + kernel via the nGEN
// Level Zero code generator, dispatching on the target HW generation.
// Returns the (module, kernel) handle pair; null handles if no HW case
// matched. Ownership of the returned handles passes to the caller.
std::pair<ze_module_handle_t, ze_kernel_handle_t> make_kernel(
        const kernel::iface_t &iface, const stmt_t &body,
        const kernel::options_t &options, const ngen::DebugConfig &debug_cfg,
        ze_context_handle_t ctx, ze_device_handle_t dev) {
    ngen::NEOInterfaceHandler interface = generate_ngen_interface(
            iface, options, false, body);
// Instantiated once per HW generation by GPU_HW_SWITCH below; builds the
// generator, converts the IR and returns the compiled module/kernel pair.
#define GPU_HW_CASE(hw) \
    ir_to_ngen_generator_t<ngen::LevelZeroCodeGenerator<(hw)>> g( \
            iface, options, debug_cfg); \
    g.setInterface(std::move(interface)); \
    convert_ir_to_ngen(body, g); \
    auto module = g.getModule(ctx, dev); \
    auto kernel = g.getKernel(module); \
    return std::make_pair(module, kernel);
    GPU_HW_SWITCH(options.hw().ngen_hw());
#undef GPU_HW_CASE
    // Reached only when GPU_HW_SWITCH matched no case.
    return {};
}
#endif
} // namespace jit
} // namespace intel
} // namespace gpu

View File

@ -24,9 +24,13 @@
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#include <sycl/sycl.hpp>
#define WITH_SYCL_RUNTIME
#endif
#define WITH_OPENCL_RUNTIME
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include <CL/cl.h>
#define WITH_OPENCL_RUNTIME
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#include "level_zero/ze_api.h"
#define WITH_L0_RUNTIME
#endif
namespace dnnl {
namespace impl {
@ -44,6 +48,12 @@ cl_kernel make_kernel(const kernel::iface_t &iface, const stmt_t &body,
const kernel::options_t &options, const ngen::DebugConfig &debug_cfg,
cl_context ctx, cl_device_id dev);
#endif
#ifdef WITH_L0_RUNTIME
std::pair<ze_module_handle_t, ze_kernel_handle_t> make_kernel(
const kernel::iface_t &iface, const stmt_t &body,
const kernel::options_t &options, const ngen::DebugConfig &debug_cfg,
ze_context_handle_t ctx, ze_device_handle_t dev);
#endif
} // namespace jit
} // namespace intel

View File

@ -34,7 +34,7 @@
#define DNNL
#define MICROKERNEL_INTERFACE
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#define ZEBIN_OUTPUT
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#define OPENCL_OUTPUT

View File

@ -40,6 +40,14 @@ inline cl_kernel make_kernel(
kernel.debug_cfg, ctx, dev);
}
#endif
#ifdef WITH_L0_RUNTIME
inline std::pair<ze_module_handle_t, ze_kernel_handle_t> make_kernel(
const kernel_t &kernel, ze_context_handle_t ctx,
ze_device_handle_t dev) {
return make_kernel(kernel.iface, kernel.body, kernel.options,
kernel.debug_cfg, ctx, dev);
}
#endif
} // namespace dsl
} // namespace jit

View File

@ -44,6 +44,12 @@
#include "ngen_opencl.hpp"
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
#include "gpu/intel/l0/engine.hpp"
#include "gpu/intel/l0/kernel.hpp"
#include "ngen_level_zero.hpp"
#endif
namespace dnnl {
namespace impl {
namespace gpu {
@ -91,6 +97,11 @@ template <gpu_gen_t hw>
using ngen_code_generator_t = ngen::OpenCLCodeGenerator<hw>;
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
template <gpu_gen_t hw>
using ngen_code_generator_t = ngen::LevelZeroCodeGenerator<hw>;
#endif
void check_kernel_size(const std::string &kernel_name, size_t kernel_size,
const intel::engine_t *engine);
@ -123,6 +134,15 @@ public:
auto ocl_kernel = ngen_code_generator_t<hw>::getKernel(
ocl_engine->context(), ocl_engine->device());
return ocl::kernel_t::make(kernel, ocl_kernel, {});
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
auto *l0_engine = utils::downcast<const l0::engine_t *>(engine);
auto l0_module = std::make_shared<l0::module_wrapper_t>(
ngen_code_generator_t<hw>::getModule(
l0_engine->context(), l0_engine->device()));
auto l0_kernel
= ngen_code_generator_t<hw>::getKernel(*(l0_module.get()));
return l0::kernel_t::make(kernel, l0_module, l0_kernel, kernel_name());
#endif
}
};

View File

@ -0,0 +1,24 @@
#===============================================================================
# Copyright 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
file(GLOB_RECURSE SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp
)
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_l0)
add_library(${OBJ_LIB} OBJECT ${SOURCES})
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
$<TARGET_OBJECTS:${OBJ_LIB}>)

View File

@ -0,0 +1,73 @@
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "oneapi/dnnl/dnnl_l0.h"
#include "common/utils.hpp"
#include "gpu/intel/l0/engine.hpp"
#include "gpu/intel/l0/engine_factory.hpp"
using namespace dnnl::impl;
// Creates a oneDNN engine from user-provided Level Zero driver, device and
// context handles. All arguments must be non-null.
dnnl_status_t dnnl_l0_interop_engine_create(dnnl_engine_t *engine,
        const ze_driver_handle_t adriver, const ze_device_handle_t adevice,
        const ze_context_handle_t acontext) {
    if (utils::any_null(engine, adriver, adevice, acontext))
        return status::invalid_arguments;
    // Resolve the device's ordinal first so unknown devices fail early.
    size_t device_index;
    CHECK(gpu::intel::l0::get_device_index(adevice, &device_index));
    gpu::intel::l0::engine_factory_t factory(engine_kind::gpu);
    return factory.engine_create(
            engine, adriver, adevice, acontext, device_index);
}
// Returns the Level Zero context backing a oneDNN L0 engine.
// Fix: the original took `ze_context_handle_t context` by value and assigned
// to it, so the result never reached the caller; the getter must take a
// pointer out-parameter. NOTE(review): the declaration in dnnl_l0.h must be
// updated to match.
dnnl_status_t dnnl_l0_interop_engine_get_context(
        dnnl_engine_t engine, ze_context_handle_t *context) {
    bool args_ok = !utils::any_null(engine, context)
            && (engine->runtime_kind() == runtime_kind::l0);
    if (!args_ok) return status::invalid_arguments;
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    *context = l0_engine->context();
    return status::success;
}
// Returns the Level Zero device backing a oneDNN L0 engine.
// Fix: the original assigned the result to a by-value parameter, which is a
// no-op for the caller; use a pointer out-parameter instead. NOTE(review):
// the declaration in dnnl_l0.h must be updated to match.
dnnl_status_t dnnl_l0_interop_engine_get_device(
        dnnl_engine_t engine, ze_device_handle_t *device) {
    bool args_ok = !utils::any_null(engine, device)
            && (engine->runtime_kind() == runtime_kind::l0);
    if (!args_ok) return status::invalid_arguments;
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    *device = l0_engine->device();
    return status::success;
}
// Returns the Level Zero driver backing a oneDNN L0 engine.
// Fix: the original assigned the result to a by-value parameter, which is a
// no-op for the caller; use a pointer out-parameter instead. NOTE(review):
// the declaration in dnnl_l0.h must be updated to match.
dnnl_status_t dnnl_l0_interop_engine_get_driver(
        dnnl_engine_t engine, ze_driver_handle_t *driver) {
    bool args_ok = !utils::any_null(engine, driver)
            && (engine->runtime_kind() == runtime_kind::l0);
    if (!args_ok) return status::invalid_arguments;
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    *driver = l0_engine->driver();
    return status::success;
}

View File

@ -0,0 +1,129 @@
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "oneapi/dnnl/dnnl_l0.h"
#include "common/utils.hpp"
#include "gpu/intel/l0/memory_storage.hpp"
using namespace dnnl::impl;
// Creates a oneDNN memory object on a Level Zero engine from a user handle.
// `handle` may be DNNL_MEMORY_ALLOCATE (library allocates the buffer),
// DNNL_MEMORY_NONE, or a user-provided pointer.
dnnl_status_t DNNL_API dnnl_l0_interop_memory_create(dnnl_memory_t *memory,
        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
        void *handle) {
    bool ok = !utils::any_null(memory, memory_desc, engine)
            && engine->runtime_kind() == runtime_kind::l0;
    if (!ok) return status::invalid_arguments;
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    // Classify the user pointer (host/device/shared/unknown) via Level Zero.
    // NOTE(review): get_pointer_type() is also invoked for the special
    // DNNL_MEMORY_ALLOCATE/NONE sentinel values - confirm that is benign.
    auto kind = gpu::intel::l0::get_memory_storage_kind(
            gpu::intel::l0::get_pointer_type(l0_engine->context(), handle));
    // Reject pointers Level Zero does not recognize unless the engine
    // explicitly supports plain system allocations.
    if (handle != DNNL_MEMORY_NONE && handle != DNNL_MEMORY_ALLOCATE
            && kind == gpu::intel::l0::memory_storage_kind_t::unknown
            && !engine->mayiuse_system_memory_allocators())
        return status::invalid_arguments;
    const auto mdw = memory_desc_wrapper(memory_desc);
    if (mdw.format_any() || mdw.has_runtime_dims_or_strides())
        return status::invalid_arguments;
    // Translate the ALLOCATE sentinel into alloc flags + a null runtime ptr.
    unsigned flags = (handle == DNNL_MEMORY_ALLOCATE)
            ? memory_flags_t::alloc
            : memory_flags_t::use_runtime_ptr;
    handle = (handle == DNNL_MEMORY_ALLOCATE) ? nullptr : handle;
    std::unique_ptr<memory_storage_t> mem_storage;
    // NOTE(review): the storage is always created with `device` kind even
    // though `kind` was queried above - confirm this is intended.
    mem_storage.reset(new gpu::intel::l0::memory_storage_t(
            engine, gpu::intel::l0::memory_storage_kind_t::device));
    if (!mem_storage) return status::out_of_memory;
    CHECK(mem_storage->init(
            flags, dnnl_memory_desc_get_size(memory_desc), handle));
    return safe_ptr_assign(
            *memory, new memory_t(engine, memory_desc, std::move(mem_storage)));
}
// Multi-handle variant of dnnl_l0_interop_memory_create: creates one memory
// storage per user handle (one per memory-descriptor sub-buffer).
dnnl_status_t DNNL_API dnnl_l0_interop_memory_create_v2(dnnl_memory_t *memory,
        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
        size_t nhandles, void **handles) {
    bool ok = !utils::any_null(memory, memory_desc, engine, handles)
            && nhandles > 0 && engine->runtime_kind() == runtime_kind::l0;
    if (!ok) return status::invalid_arguments;
    const auto mdw = memory_desc_wrapper(memory_desc);
    if (mdw.format_any() || mdw.has_runtime_dims_or_strides())
        return status::invalid_arguments;
    // Translate the DNNL_MEMORY_ALLOCATE sentinel into alloc flags and a
    // null runtime pointer, per handle.
    std::vector<unsigned> flags_vec(nhandles);
    std::vector<void *> handles_vec(nhandles);
    for (size_t i = 0; i < nhandles; i++) {
        unsigned f = (handles[i] == DNNL_MEMORY_ALLOCATE)
                ? memory_flags_t::alloc
                : memory_flags_t::use_runtime_ptr;
        void *h = (handles[i] == DNNL_MEMORY_ALLOCATE) ? nullptr : handles[i];
        flags_vec[i] = f;
        handles_vec[i] = h;
    }
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    std::vector<std::unique_ptr<memory_storage_t>> mem_storages(nhandles);
    for (size_t i = 0; i < nhandles; i++) {
        // Reject user pointers Level Zero does not recognize unless the
        // engine supports plain system allocations.
        auto kind = gpu::intel::l0::get_memory_storage_kind(
                gpu::intel::l0::get_pointer_type(
                        l0_engine->context(), handles[i]));
        if (handles[i] != DNNL_MEMORY_NONE && handles[i] != DNNL_MEMORY_ALLOCATE
                && kind == gpu::intel::l0::memory_storage_kind_t::unknown
                && !engine->mayiuse_system_memory_allocators()) {
            return status::invalid_arguments;
        }
        size_t sz = dnnl_memory_desc_get_size_v2(
                memory_desc, static_cast<int>(i));
        // NOTE(review): the storage is always created with `device` kind even
        // though `kind` was queried above - confirm this is intended.
        mem_storages[i].reset(new gpu::intel::l0::memory_storage_t(
                engine, gpu::intel::l0::memory_storage_kind_t::device));
        if (!mem_storages[i]) return status::out_of_memory;
        CHECK(mem_storages[i]->init(flags_vec[i], sz, handles_vec[i]));
    }
    return safe_ptr_assign(*memory,
            new memory_t(engine, memory_desc, std::move(mem_storages)));
}
// Returns the underlying data handle of a oneDNN L0 memory object through
// `*mem_object`; a null `memory` yields a null handle.
// Fix: the original assigned to the `mem_object` parameter itself
// (`mem_object = nullptr` / `mem_object = &handle`) instead of writing
// through it, so the caller never received a value - and `&handle` was the
// address of a stack local.
dnnl_status_t DNNL_API dnnl_l0_interop_memory_get_mem_object(
        const memory_t *memory, void **mem_object) {
    if (utils::any_null(mem_object)) return status::invalid_arguments;
    if (!memory) {
        *mem_object = nullptr;
        return status::success;
    }
    bool args_ok = (memory->engine()->runtime_kind() == runtime_kind::l0);
    if (!args_ok) return status::invalid_arguments;
    void *handle;
    status_t status = memory->get_data_handle(&handle);
    // Store the handle value itself, not the address of the local.
    if (status == status::success) *mem_object = handle;
    return status;
}
// Replaces the underlying data handle of a oneDNN L0 memory object.
// Fix: guard against a null `memory` before dereferencing it (the getter
// above checks for null; the setter dereferenced unconditionally).
dnnl_status_t DNNL_API dnnl_l0_interop_memory_set_mem_object(
        memory_t *memory, void *mem_object) {
    bool args_ok = memory != nullptr
            && (memory->engine()->runtime_kind() == runtime_kind::l0);
    if (!args_ok) return status::invalid_arguments;
    return memory->set_data_handle(mem_object);
}

View File

@ -0,0 +1,67 @@
/*******************************************************************************
* Copyright 2023-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "oneapi/dnnl/dnnl_l0.h"
#include "common/primitive_desc_iface.hpp"
#include "common/primitive_iface.hpp"
#include "common/utils.hpp"
#include "gpu/intel/l0/stream.hpp"
using namespace dnnl::impl;
// Executes a primitive on a Level Zero stream with explicit event-based
// dependencies (L0 interop analogue of dnnl_primitive_execute).
// `deps` are events the primitive must wait on; `return_event`, if non-null,
// receives an event signaling completion (null for in-order streams, where
// ordering is implicit in the command list).
dnnl_status_t dnnl_l0_interop_primitive_execute(
        const primitive_iface_t *primitive_iface, dnnl_stream_t stream,
        size_t nargs, const dnnl_exec_arg_t *args, size_t ndeps,
        const ze_event_handle_t *deps, ze_event_handle_t *return_event) {
    const bool ok = !utils::any_null(primitive_iface, stream)
            && primitive_iface->engine() == stream->engine()
            && primitive_iface->engine()->runtime_kind() == runtime_kind::l0
            && IMPLICATION(nargs > 0, args != nullptr)
            && IMPLICATION(ndeps > 0, deps != nullptr);
    if (!ok) return status::invalid_arguments;
    auto *l0_stream = utils::downcast<gpu::intel::l0::stream_t *>(stream);
    stream->before_exec_hook();
    // Register the user-provided wait events on the stream context.
    if (deps != nullptr) {
        std::vector<ze_event_handle_t> events(ndeps);
        for (size_t i = 0; i < ndeps; i++)
            events[i] = deps[i];
        l0_stream->l0_ctx().set_deps(events);
    }
    // run primitive
    // NOTE(review): if either CHECK below fails, the function returns without
    // calling after_exec_hook() - confirm the hooks tolerate this imbalance.
    exec_args_t exec_args;
    CHECK(cvt_primitive_args(primitive_iface->pd()->impl().get(),
            static_cast<int>(nargs), args, exec_args));
    exec_ctx_t ctx(stream, std::move(exec_args));
    CHECK(primitive_execute(primitive_iface, ctx));
    // return output event
    if (return_event != nullptr) {
        if (l0_stream->impl()->flags() & stream_flags::in_order) {
            // In-order streams need no explicit completion event.
            *return_event = nullptr;
        } else {
            *return_event = l0_stream->get_output_event();
        }
    }
    stream->after_exec_hook();
    return status::success;
}

View File

@ -0,0 +1,51 @@
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "oneapi/dnnl/dnnl_l0.h"
#include "common/utils.hpp"
#include "gpu/intel/l0/stream.hpp"
using namespace dnnl::impl;
// Creates a oneDNN stream wrapping a user-provided Level Zero command list.
dnnl_status_t dnnl_l0_interop_stream_create(dnnl_stream_t *stream,
        dnnl_engine_t engine, ze_command_list_handle_t list) {
    bool args_ok = !utils::any_null(stream, engine, list)
            && engine->runtime_kind() == runtime_kind::l0;
    if (!args_ok) return status::invalid_arguments;
    std::unique_ptr<stream_impl_t> stream_impl(
            new gpu::intel::l0::stream_impl_t(
                    stream_flags::default_flags, list));
    if (!stream_impl) return status::out_of_memory;
    // release() only after create_stream succeeds so the impl is freed on
    // the CHECK early-return path. NOTE(review): assumes create_stream takes
    // ownership of the impl only on success - confirm.
    CHECK(engine->create_stream(stream, stream_impl.get()));
    stream_impl.release();
    return status::success;
}
// Returns the Level Zero command list backing a oneDNN L0 stream.
// Fix: the original assigned the result to a by-value parameter, which is a
// no-op for the caller; use a pointer out-parameter instead. NOTE(review):
// the declaration in dnnl_l0.h must be updated to match.
dnnl_status_t dnnl_l0_interop_stream_get_list(
        dnnl_stream_t stream, ze_command_list_handle_t *list) {
    bool args_ok = !utils::any_null(list, stream)
            && stream->engine()->runtime_kind() == runtime_kind::l0;
    if (!args_ok) return status::invalid_arguments;
    auto *l0_stream = utils::downcast<const gpu::intel::l0::stream_t *>(stream);
    *list = l0_stream->list();
    return status::success;
}

View File

@ -0,0 +1,189 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/compiler.hpp"
#ifdef _WIN32
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include "windows.h"
#else
#include <dlfcn.h>
#endif
#include "ocloc_api.h"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Resolves `symbol` from the ocloc (Intel GPU offline compiler) dynamic
// library; returns nullptr if the library or the symbol cannot be found.
// The library handle is never released, so returned pointers remain valid
// for the lifetime of the process.
inline void *find_ocloc_symbol(const char *symbol) {
#ifdef _WIN32
    // Use LOAD_LIBRARY_SEARCH_SYSTEM32 flag to avoid DLL hijacking issue.
    HMODULE handle = LoadLibraryExA(
            "ocloc64.dll", nullptr, LOAD_LIBRARY_SEARCH_SYSTEM32);
    if (!handle) return nullptr;
    return reinterpret_cast<void *>(GetProcAddress(handle, symbol));
#else
    void *handle = dlopen("libocloc.so", RTLD_NOW | RTLD_LOCAL);
    if (!handle) return nullptr;
    return dlsym(handle, symbol);
#endif
}
// Typed convenience wrapper over the raw symbol lookup above.
template <typename F>
F find_ocloc_symbol(const char *symbol) {
    // Use a named cast instead of the original C-style cast for the
    // object-pointer -> function-pointer conversion.
    return reinterpret_cast<F>(find_ocloc_symbol(symbol));
}
// Thin status_t wrapper over ocloc's oclocInvoke entry point.
// Returns runtime_error when the ocloc library is unavailable or when the
// invocation reports a non-zero exit code; parameters mirror oclocInvoke.
status_t ocloc_invoke(uint32_t NumArgs, const char *Argv[], uint32_t NumSources,
        const uint8_t **DataSources, const uint64_t *LenSources,
        const char **NameSources, uint32_t NumInputHeaders,
        const uint8_t **DataInputHeaders, const uint64_t *LenInputHeaders,
        const char **NameInputHeaders, uint32_t *NumOutputs,
        uint8_t ***DataOutputs, uint64_t **LenOutputs, char ***NameOutputs) {
    // `static` so the dynamic-library symbol lookup happens only once.
    static auto f = find_ocloc_symbol<decltype(&oclocInvoke)>("oclocInvoke");
    if (!f) return status::runtime_error;
    if (f(NumArgs, Argv, NumSources, DataSources, LenSources, NameSources,
                NumInputHeaders, DataInputHeaders, LenInputHeaders,
                NameInputHeaders, NumOutputs, DataOutputs, LenOutputs,
                NameOutputs))
        return status::runtime_error;
    return status::success;
}
// Releases output buffers previously allocated by oclocInvoke via ocloc's
// oclocFreeOutput entry point (symbol resolved lazily, once).
status_t ocloc_free(uint32_t *numOutputs, uint8_t ***dataOutputs,
        uint64_t **lenOutputs, char ***nameOutputs) {
    static auto f
            = find_ocloc_symbol<decltype(&oclocFreeOutput)>("oclocFreeOutput");
    if (!f) return status::runtime_error;
    if (f(numOutputs, dataOutputs, lenOutputs, nameOutputs))
        return status::runtime_error;
    return status::success;
}
// Queries the device extension string by running
// `ocloc query CL_DEVICE_EXTENSIONS` and copying ocloc's captured stdout
// into `extensions`. Leaves `extensions` untouched if no stdout output is
// produced.
status_t ocloc_get_extensions(std::string &extensions) {
    std::vector<const char *> args = {"ocloc", "query", "CL_DEVICE_EXTENSIONS"};
    uint32_t num_outputs = 0;
    uint8_t **data_outputs = nullptr;
    uint64_t *len_outputs = nullptr;
    char **name_outputs = nullptr;
    CHECK(ocloc_invoke(static_cast<uint32_t>(args.size()), args.data(), 0,
            nullptr, 0, nullptr, 0, nullptr, nullptr, nullptr, &num_outputs,
            &data_outputs, &len_outputs, &name_outputs));
    // ocloc reports the query result through its "stdout.log" output.
    for (uint32_t i = 0; i < num_outputs; i++) {
        if (!strcmp(name_outputs[i], "stdout.log")) {
            if (len_outputs[i] > 0) {
                // NOTE(review): assumes the stdout buffer is NUL-terminated -
                // confirm; otherwise construct from (data, len).
                extensions = std::string(
                        reinterpret_cast<const char *>(data_outputs[i]));
                break;
            }
        }
    }
    CHECK(ocloc_free(&num_outputs, &data_outputs, &len_outputs, &name_outputs));
    return status::success;
}
bool ocloc_mayiuse_microkernels(const std::string &kernel_code) {
std::vector<const char *> args
= {"ocloc", "compile", "-q", "-file", "test.cl"};
const uint8_t *data_sources[]
= {reinterpret_cast<const uint8_t *>(kernel_code.c_str())};
const uint64_t len_sources[] = {kernel_code.length() + 1};
const char *name_sources[] = {"test.cl"};
uint32_t num_outputs = 0;
uint8_t **data_outputs = nullptr;
uint64_t *len_outputs = nullptr;
char **name_outputs = nullptr;
bool compilation_successful = true;
if (ocloc_invoke(static_cast<uint32_t>(args.size()), args.data(), 1,
data_sources, len_sources, name_sources, 0, nullptr, nullptr,
nullptr, &num_outputs, &data_outputs, &len_outputs,
&name_outputs))
compilation_successful = false;
ocloc_free(&num_outputs, &data_outputs, &len_outputs, &name_outputs);
return compilation_successful;
}
// Compiles OpenCL C `kernel_code` into a zebin binary for device
// `ip_version` via ocloc. On compilation failure, frees ocloc outputs and
// throws std::runtime_error carrying ocloc's stdout log.
// NOTE(review): throwing from a status_t-returning function is unusual for
// this codebase - confirm callers expect the exception.
status_t ocloc_build_kernels(const std::string &kernel_code,
        const std::string &options, const std::string &ip_version,
        xpu::binary_t &binary) {
    std::vector<const char *> args = {"ocloc", "compile", "-q", "--format",
            "zebin", "-exclude_ir", "-output_no_suffix", "-file", "main.cl",
            "-device", ip_version.c_str(), "-options", options.c_str()};
    const uint8_t *data_sources[]
            = {reinterpret_cast<const uint8_t *>(kernel_code.c_str())};
    // +1 to include the terminating NUL in the source length.
    const uint64_t len_sources[] = {kernel_code.length() + 1};
    const char *name_sources[] = {"main.cl"};
    uint32_t num_outputs = 0;
    uint8_t **data_outputs = nullptr;
    uint64_t *len_outputs = nullptr;
    char **name_outputs = nullptr;
    status_t ret = ocloc_invoke(static_cast<uint32_t>(args.size()), args.data(),
            1, data_sources, len_sources, name_sources, 0, nullptr, nullptr,
            nullptr, &num_outputs, &data_outputs, &len_outputs, &name_outputs);
    if (ret != status::success) {
        // Collect the compiler log so it can be reported in the exception.
        std::string output_string;
        for (uint32_t i = 0; i < num_outputs; i++) {
            if (!strcmp(name_outputs[i], "stdout.log")) {
                if (len_outputs[i] > 0) {
                    output_string = std::string(
                            reinterpret_cast<const char *>(data_outputs[i]));
                }
            }
        }
        CHECK(ocloc_free(
                &num_outputs, &data_outputs, &len_outputs, &name_outputs));
        throw std::runtime_error(output_string);
    }
    // Extract the produced binary ("main.bin" due to -output_no_suffix).
    // NOTE(review): if "main.bin" is absent this returns success with an
    // empty `binary` - confirm callers handle that case.
    for (uint32_t i = 0; i < num_outputs; i++) {
        if (!strcmp(name_outputs[i], "main.bin")) {
            if (len_outputs[i] > 0) {
                binary.resize(len_outputs[i]);
                std::memcpy(binary.data(), data_outputs[i], len_outputs[i]);
                break;
            }
        }
    }
    CHECK(ocloc_free(&num_outputs, &data_outputs, &len_outputs, &name_outputs));
    return status::success;
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,40 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_COMPILER_HPP
#define GPU_INTEL_L0_COMPILER_HPP
#include "gpu/intel/l0/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
status_t ocloc_get_extensions(std::string &extensions);
bool ocloc_mayiuse_microkernels(const std::string &kernel_code);
status_t ocloc_build_kernels(const std::string &kernel_code,
const std::string &options, const std::string &ip_version,
xpu::binary_t &binary);
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_COMPILER_HPP

View File

@ -0,0 +1,28 @@
/*******************************************************************************
* Copyright 2023-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/context.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,103 @@
/*******************************************************************************
* Copyright 2023-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_CONTEXT_HPP
#define GPU_INTEL_L0_CONTEXT_HPP
#include "gpu/intel/l0/utils/utils.hpp"
#include "xpu/context.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Level Zero event-list wrapper implementing the generic xpu::event_t
// interface. Holds raw ze_event_handle_t values a computation depends on;
// the handles are not owned (their lifetime is managed by the caller).
struct event_t : public xpu::event_t {
    event_t() = default;
    event_t(const event_t &) = default;
    event_t(const std::vector<ze_event_handle_t> &event) : events_(event) {}
    event_t(std::vector<ze_event_handle_t> &&event)
        : events_(std::move(event)) {}
    // Single-event convenience constructor.
    event_t(ze_event_handle_t &&event) {
        events_.emplace_back(std::move(event));
    }
    ~event_t() override = default;
    // Move assignment implemented as a swap; the moved-from object remains
    // valid (holds this object's previous contents).
    event_t &operator=(event_t &&other) {
        std::swap(events_, other.events_);
        return *this;
    }
    event_t &operator=(const event_t &other) {
        events_ = other.events_;
        return *this;
    }
    const ze_event_handle_t &operator[](size_t i) const { return events_[i]; }
    ze_event_handle_t &operator[](size_t i) { return events_[i]; }
    size_t size() const { return events_.size(); }
    // Checked downcasts from the generic xpu event type.
    static event_t &from(xpu::event_t &event) {
        return *utils::downcast<event_t *>(&event);
    }
    static const event_t &from(const xpu::event_t &event) {
        return *utils::downcast<const event_t *>(&event);
    }
    std::unique_ptr<xpu::event_t> clone() const override {
        return std::unique_ptr<xpu::event_t>(new event_t(*this));
    }
    // Concatenates another event list onto this one.
    void append(const xpu::event_t &event) {
        auto &other = *utils::downcast<const event_t *>(&event);
        events_.insert(
                events_.end(), other.events_.begin(), other.events_.end());
    }
    // Dependency handles, in submission order.
    std::vector<ze_event_handle_t> events_;
};
// Level Zero execution context: tracks the set of events that subsequently
// submitted operations must wait on.
class context_t final : public xpu::context_t {
public:
    context_t() = default;
    ~context_t() override = default;
    context_t &operator=(const context_t &other) {
        events_ = other.events_;
        return *this;
    }
    // Replaces the dependency list, taking ownership of the vector.
    void set_deps(std::vector<ze_event_handle_t> &&event) {
        // Fix: move the rvalue-reference parameter into the event_t; the
        // original constructed from the lvalue name, forcing a vector copy.
        events_ = event_t(std::move(event));
    }
    void set_deps(event_t &&events) { events_ = std::move(events); }
    xpu::event_t &get_deps() override { return events_; }
    const xpu::event_t &get_deps() const override { return events_; }
    void append_deps(const xpu::event_t &event) override {
        events_.append(event);
    }
    // Produces an event representing the current dependencies.
    // NOTE(review): declared here but no definition is visible in
    // context.cpp in this commit - confirm it is defined elsewhere.
    status_t get_event(ze_event_handle_t *new_event);

private:
    event_t events_;
};
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_CONTEXT_HPP

View File

@ -0,0 +1,145 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/device_info.hpp"
#include "gpu/intel/l0/compiler.hpp"
#include "gpu/intel/l0/engine.hpp"
#include "ngen_level_zero.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Resolves HW architecture details (IP version, arch/product, native
// extensions, ngen and systolic capabilities) from the engine's Level
// Zero device and context.
status_t device_info_t::init_arch(impl::engine_t *engine) {
    const auto *e = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    return init_gpu_hw_info(engine, e->device(), e->context(), ip_version_,
            gpu_arch_, gpu_product_, native_extensions_, mayiuse_systolic_,
            mayiuse_ngen_kernels_);
}
// Queries the Level Zero device properties and stores the device name.
status_t device_info_t::init_device_name(impl::engine_t *engine) {
    const auto *e = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    ze_device_properties_t props = {};
    props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    props.pNext = nullptr;
    CHECK(l0::zeDeviceGetProperties(e->device(), &props));
    name_ = std::string(props.name);
    return status::success;
}
// Decodes the packed Level Zero driverVersion into major/minor/build.
status_t device_info_t::init_runtime_version(impl::engine_t *engine) {
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    auto driver = l0_engine->driver();
    ze_driver_properties_t driver_properties = {};
    driver_properties.stype = ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES;
    driver_properties.pNext = nullptr;
    // Propagate failures instead of silently decoding a zeroed struct
    // (the call's return value was previously ignored; other init_*
    // functions in this file CHECK their queries).
    CHECK(l0::zeDriverGetProperties(driver, &driver_properties));
    // driverVersion layout: major [31:24] . minor [23:16] . build [15:0].
    runtime_version_.major
            = (driver_properties.driverVersion & 0xFF000000) >> 24;
    runtime_version_.minor
            = (driver_properties.driverVersion & 0x00FF0000) >> 16;
    runtime_version_.build = driver_properties.driverVersion & 0x0000FFFF;
    return status::success;
}
// Builds the supported-extension bitmask by intersecting the extension
// string reported by the ocloc compiler with the known device-extension
// enum, then adds the architecture-implied "future" extensions.
status_t device_info_t::init_extensions(impl::engine_t *engine) {
    std::string ext_str;
    CHECK(ocloc_get_extensions(ext_str));
    for (uint64_t ext = 1; ext < (uint64_t)compute::device_ext_t::last;
            ext <<= 1) {
        const char *ext_name = ext2cl_str((compute::device_ext_t)ext);
        if (ext_name && ext_str.find(ext_name) != std::string::npos)
            extensions_ |= ext;
    }
    extensions_
            |= (uint64_t)get_future_extensions(gpu_arch(), mayiuse_systolic());
    return status::success;
}
// Queries EU count, maximum work-group size, cache size and memory-access
// capabilities of the Level Zero device.
status_t device_info_t::init_attributes(impl::engine_t *engine) {
    auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
    auto device = l0_engine->device();
    // EU count = slices * subslices-per-slice * EUs-per-subslice.
    ze_device_properties_t device_properties = {};
    device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    device_properties.pNext = nullptr;
    CHECK(l0::zeDeviceGetProperties(device, &device_properties));
    eu_count_ = device_properties.numSlices
            * device_properties.numSubslicesPerSlice
            * device_properties.numEUsPerSubslice;
    ze_device_compute_properties_t device_compute_properties = {};
    device_compute_properties.stype
            = ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES;
    device_compute_properties.pNext = nullptr;
    CHECK(l0::zeDeviceGetComputeProperties(device, &device_compute_properties));
    max_wg_size_ = device_compute_properties.maxTotalGroupSize;
    // Two-call idiom: first query the number of cache-property entries,
    // then fetch them. Guard against a device reporting zero entries
    // (previously device_cache_properties[0] was read unconditionally).
    uint32_t device_cache_properties_count = 0;
    CHECK(l0::zeDeviceGetCacheProperties(
            device, &device_cache_properties_count, nullptr));
    if (device_cache_properties_count > 0) {
        std::vector<ze_device_cache_properties_t> device_cache_properties(
                device_cache_properties_count);
        for (ze_device_cache_properties_t &p : device_cache_properties) {
            p.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES;
            p.pNext = nullptr;
        }
        CHECK(l0::zeDeviceGetCacheProperties(device,
                &device_cache_properties_count,
                device_cache_properties.data()));
        l3_cache_size_ = device_cache_properties[0].cacheSize;
    }
    ze_device_memory_access_properties_t device_memory_access_properties = {};
    device_memory_access_properties.stype
            = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES;
    device_memory_access_properties.pNext = nullptr;
    // Propagate failures (this call's result was previously ignored).
    CHECK(l0::zeDeviceGetMemoryAccessProperties(
            device, &device_memory_access_properties));
    mayiuse_system_memory_allocators_
            = device_memory_access_properties.sharedSystemAllocCapabilities;
    return status::success;
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,44 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_DEVICE_INFO_HPP
#define GPU_INTEL_L0_DEVICE_INFO_HPP
#include "gpu/intel/compute/device_info.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Level Zero implementation of the GPU device-info queries. Each init_*
// override fills the corresponding fields of compute::device_info_t via
// the Level Zero driver/device property APIs (see device_info.cpp).
class device_info_t : public compute::device_info_t {
protected:
    // Reads the device name from ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES.
    status_t init_device_name(impl::engine_t *engine) override;
    // Resolves IP version, arch/product and ngen/systolic capabilities.
    status_t init_arch(impl::engine_t *engine) override;
    // Decodes the packed driverVersion into major/minor/build.
    status_t init_runtime_version(impl::engine_t *engine) override;
    // Derives the OpenCL-style extension mask via the ocloc compiler.
    status_t init_extensions(impl::engine_t *engine) override;
    // EU count, max work-group size, cache size, memory-access caps.
    status_t init_attributes(impl::engine_t *engine) override;
};
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_DEVICE_INFO_HPP

292
src/gpu/intel/l0/engine.cpp Normal file
View File

@ -0,0 +1,292 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/engine.hpp"
#include "gpu/intel/l0/compiler.hpp"
#include "gpu/intel/l0/device_info.hpp"
#include "gpu/intel/l0/kernel.hpp"
#include "gpu/intel/l0/memory_storage.hpp"
#include "gpu/intel/l0/stream.hpp"
#include "gpu/intel/compute/ukernels.hpp"
#include "gpu/intel/jit/dsl/runtime.hpp"
#include "gpu/intel/jit/generator.hpp"
#include "gpu/intel/microkernels/fuser.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Level Zero engine implementation. Holds the driver/device/context
// handles; the context handle is owned by this object and destroyed in
// the destructor, while driver and device handles are not owned.
class engine_impl_t : public impl::engine_impl_t {
public:
    engine_impl_t(engine_kind_t kind, const ze_driver_handle_t driver,
            const ze_device_handle_t device, const ze_context_handle_t context,
            size_t index)
        : impl::engine_impl_t(kind, runtime_kind::l0, index)
        , driver_(driver)
        , device_(device)
        , context_(context) {}
    // Owns context_: release it when the engine goes away.
    ~engine_impl_t() override { l0::zeContextDestroy(context_); }

    const ze_driver_handle_t driver() const { return driver_; }
    const ze_device_handle_t device() const { return device_; }
    const ze_context_handle_t context() const { return context_; }

    // Creates a Level Zero stream implementation bound to this engine's
    // context and device.
    status_t create_stream_impl(
            impl::stream_impl_t **stream_impl, unsigned flags) const override {
        auto *si = new stream_impl_t(flags, context_, device_);
        if (!si) return status::out_of_memory;
        *stream_impl = si;
        return status::success;
    }

    // Allocates a device-kind memory storage of `size` bytes (or adopts
    // `handle`, depending on `flags`).
    status_t create_memory_storage(impl::memory_storage_t **storage,
            impl::engine_t *engine, unsigned flags, size_t size,
            void *handle) const override {
        std::unique_ptr<memory_storage_t> _storage;
        _storage.reset(
                new memory_storage_t(engine, memory_storage_kind_t::device));
        if (!_storage) return status::out_of_memory;
        status_t status = _storage->init(flags, size, handle);
        if (status != status::success) return status;
        *storage = _storage.release();
        return status::success;
    }

    // Identity for engine caching: device + context (plus base
    // kind/runtime/index).
    engine_id_t engine_id() const override {
        return engine_id_t(new engine_id_impl_t(
                device(), context(), kind(), runtime_kind(), index()));
    }

    int get_buffer_alignment() const override { return 128; }

private:
    ze_driver_handle_t driver_;
    ze_device_handle_t device_;
    ze_context_handle_t context_;

    engine_impl_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(engine_impl_t);
};
// Factory helper: builds and initializes a Level Zero engine from raw
// driver/device/context handles. On failure the partially constructed
// engine is released via engine_deleter_t.
status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind,
        const ze_driver_handle_t dri, const ze_device_handle_t dev,
        const ze_context_handle_t ctx, size_t index) {
    std::unique_ptr<gpu::intel::l0::engine_t, engine_deleter_t> created(
            new gpu::intel::l0::engine_t(dri, dev, ctx, index));
    if (!created) return status::out_of_memory;
    CHECK(created->init());
    *engine = created.release();
    return status::success;
}
// Wraps the raw Level Zero handles into the pimpl engine implementation.
engine_t::engine_t(ze_driver_handle_t driver, ze_device_handle_t device,
        ze_context_handle_t context, size_t index)
    : gpu::intel::engine_t(new engine_impl_t(
            engine_kind::gpu, driver, device, context, index)) {}

// Two-phase init: the impl first, then the common intel::engine_t
// initialization (device info etc.).
status_t engine_t::init() {
    CHECK(init_impl());
    CHECK(gpu::intel::engine_t::init());
    return status::success;
}

// Builds a Level Zero stream on top of a previously created stream impl.
status_t engine_t::create_stream(
        impl::stream_t **stream, impl::stream_impl_t *stream_impl) {
    return gpu::intel::l0::stream_t::create_stream(stream, this, stream_impl);
}
// Creates a kernel from an nGEN jitter; the jitter produces the binary
// and registers it with this engine.
status_t engine_t::create_kernel(
        compute::kernel_t *kernel, jit::generator_base_t *jitter) const {
    if (kind() != engine_kind::gpu) {
        assert(!"not expected");
        return status::invalid_arguments;
    }
    return jitter->get_kernel(*kernel, this);
}

// Creates a kernel from a DSL kernel description. The resulting Level
// Zero module is shared via module_wrapper_t so its lifetime outlives the
// kernel handle.
status_t engine_t::create_kernel(
        compute::kernel_t &kernel, const jit::dsl::kernel_t &kernel_dsl) const {
    if (kind() != engine_kind::gpu) {
        assert(!"not expected");
        return status::invalid_arguments;
    }
    auto module_and_kernel
            = jit::dsl::make_kernel(kernel_dsl, context(), device());
    auto l0_module_ptr
            = std::make_shared<module_wrapper_t>(module_and_kernel.first);
    // Empty kernel name ({}): DSL kernels are not looked up by name here.
    return kernel_t::make(kernel, l0_module_ptr, module_and_kernel.second, {});
}
// Converts a device binary into Level Zero kernels: builds one module
// from `binary` and extracts a kernel handle per requested name. Entries
// whose handle comes back null are left as default-constructed (empty)
// compute kernels.
status_t engine_t::convert_to_l0(
        std::vector<gpu::intel::compute::kernel_t> &kernels,
        const std::vector<const char *> &kernel_names,
        xpu::binary_t &binary) const {
    ze_module_handle_t l0_module = nullptr;
    std::vector<ze_kernel_handle_t> l0_kernels;
    CHECK(gpu::intel::l0::create_kernels_from_binary(
            device(), context(), kernel_names, binary, &l0_module, l0_kernels));
    // One shared module keeps all created kernels valid.
    auto l0_module_ptr = std::make_shared<module_wrapper_t>(l0_module);
    kernels = std::vector<gpu::intel::compute::kernel_t>(kernel_names.size());
    for (size_t i = 0; i < kernel_names.size(); i++) {
        if (!l0_kernels[i]) continue;
        CHECK(kernel_t::make(
                kernels[i], l0_module_ptr, l0_kernels[i], kernel_names[i]));
    }
    return status::success;
}
// Compiles OpenCL C source for the requested kernels into a device binary
// (via ocloc), optionally fuses microkernels, then converts the result
// into Level Zero kernel objects.
status_t engine_t::create_kernels(std::vector<compute::kernel_t> *kernels,
        const std::vector<const char *> &kernel_names,
        const compute::kernel_ctx_t &kernel_ctx) const {
    if (kind() != engine_kind::gpu) {
        assert(!"not expected");
        return status::invalid_arguments;
    }
    // All kernels in one set share a single source; take the first
    // non-null lookup.
    const char *source = nullptr;
    for (size_t i = 0; source == nullptr && i < kernel_names.size(); i++)
        source = intel::get_kernel_source(kernel_names[i]);
    std::string options = kernel_ctx.options();
    auto *dev_info = utils::downcast<const device_info_t *>(device_info());
    options += " " + dev_info->get_cl_ext_options();
    stringstream_t code_ss;
    CHECK(compute::preprocess_headers(code_ss, source, kernel_ctx));
    std::string code = code_ss.str();
    gpu::intel::compute::program_src_t src(code);
    // "-g -s <file>" enables source-level debugging when a source dump
    // is available.
    if (src) { options += " -g -s " + std::string(src.name()); }
    compute::debugdump_processed_source(
            code, options, dev_info->get_cl_ext_options());
    xpu::binary_t binary;
    // NOTE(review): the #else branch (online compilation through the L0
    // driver) is disabled in favor of offline ocloc compilation; consider
    // replacing the hard-coded `#if 1` with a build/runtime switch.
#if 1
    CHECK(ocloc_build_kernels(
            code, options, std::to_string(dev_info->ip_version()), binary));
#else
    CHECK(gpu::intel::l0::compile_ocl_module(
            device(), context(), code, options, binary));
#endif
    const char *code_c = code.c_str();
    if (kernel_ctx.has_custom_headers() && micro::hasMicrokernels(code_c)) {
        try {
            micro::fuseMicrokernels(binary, code_c);
        } catch (...) { return status::runtime_error; }
    }
    CHECK(convert_to_l0(*kernels, kernel_names, binary));
    return status::success;
}
// Creates a single kernel from a prebuilt device binary. `src` is not
// used here (kept for interface parity with the other GPU runtimes).
status_t engine_t::create_kernel_from_binary(compute::kernel_t &kernel,
        const xpu::binary_t &binary, const char *kernel_name,
        const compute::program_src_t &src) const {
    std::vector<const char *> kernel_names = {kernel_name};
    ze_module_handle_t l0_module = nullptr;
    std::vector<ze_kernel_handle_t> l0_kernels;
    CHECK(gpu::intel::l0::create_kernels_from_binary(
            device(), context(), kernel_names, binary, &l0_module, l0_kernels));
    auto l0_module_ptr = std::make_shared<module_wrapper_t>(l0_module);
    CHECK(kernel_t::make(kernel, l0_module_ptr, l0_kernels[0], kernel_name));
    return status::success;
}

// Re-creates kernels from a persistent cache blob: each kernel's binary
// is read back from the blob and turned into a Level Zero kernel.
status_t engine_t::create_kernels_from_cache_blob(
        const cache_blob_t &cache_blob, std::vector<compute::kernel_t> &kernels,
        const std::vector<const char *> &kernel_names) const {
    if (kind() != engine_kind::gpu) {
        assert(!"not expected");
        return status::invalid_arguments;
    }
    kernels = std::vector<compute::kernel_t>(kernel_names.size());
    for (size_t i = 0; i < kernel_names.size(); i++) {
        // A null name in a multi-kernel set marks an unused slot.
        if (!kernel_names[i] && kernel_names.size() > 1) continue;
        // NOTE(review): kernel_name is unused below (kernel_names[i] is
        // passed directly); candidate for removal.
        std::string kernel_name(kernel_names[i] ? kernel_names[i] : "");
        const uint8_t *binary_data = nullptr;
        size_t binary_size = 0;
        CHECK(cache_blob.get_binary(&binary_data, &binary_size));
        xpu::binary_t binary(binary_data, binary_data + binary_size);
        CHECK(create_kernel_from_binary(kernels[i], binary, kernel_names[i],
                gpu::intel::compute::program_src_t()));
    }
    return status::success;
}
// Stable device identity: a leading constant plus the device UUID.
// NOTE(review): the meaning of the leading `1` tag is not visible here --
// confirm it distinguishes the L0 runtime from other backends' id formats.
gpu_utils::device_id_t engine_t::device_id() const {
    return std::tuple_cat(
            std::make_tuple(1), gpu::intel::l0::get_device_uuid(device()));
}

// Raw-handle accessors forwarding to the pimpl implementation.
const ze_driver_handle_t engine_t::driver() const {
    return static_cast<const engine_impl_t *>(impl())->driver();
}
const ze_device_handle_t engine_t::device() const {
    return static_cast<const engine_impl_t *>(impl())->device();
}
const ze_context_handle_t engine_t::context() const {
    return static_cast<const engine_impl_t *>(impl())->context();
}

// Probes microkernel support by compiling a check kernel through ocloc.
bool engine_t::mayiuse_microkernels() const {
    return ocloc_mayiuse_microkernels(
            std::string(compute::cl_microkernels_check_kernel_code));
}
// Populates device_info_ by querying the live device.
status_t engine_t::init_device_info() {
    device_info_ = std::make_shared<gpu::intel::l0::device_info_t>();
    CHECK(device_info_->init(this));
    return status::success;
}

// Cache-blob-based device info is not supported for the L0 runtime yet.
status_t engine_t::init_device_info(const std::vector<uint8_t> &cache_blob) {
    gpu_assert(false) << "unimplemented function init_device_info() called";
    return status::runtime_error;
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

117
src/gpu/intel/l0/engine.hpp Normal file
View File

@ -0,0 +1,117 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_ENGINE_HPP
#define GPU_INTEL_L0_ENGINE_HPP
// #include <list>
#include "gpu/intel/engine.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Engine identity for the Level Zero runtime: two engines compare equal
// iff they wrap the same device and context handles (kind/runtime/index
// are handled by the base class).
struct engine_id_impl_t : public impl::engine_id_impl_t {
    engine_id_impl_t(const ze_device_handle_t device,
            const ze_context_handle_t context, engine_kind_t kind,
            runtime_kind_t runtime_kind, size_t index)
        : impl::engine_id_impl_t(kind, runtime_kind, index)
        , device_(device)
        , context_(context) {}
    ~engine_id_impl_t() override = default;

private:
    // Raw-handle equality; the framework guarantees id_impl has the same
    // dynamic type before calling this.
    bool compare_resource(
            const impl::engine_id_impl_t *id_impl) const override {
        const auto *typed_id
                = utils::downcast<const engine_id_impl_t *>(id_impl);
        return device_ == typed_id->device_ && context_ == typed_id->context_;
    }
    size_t hash_resource() const override {
        size_t seed = 0;
        seed = hash_combine(seed, device_);
        seed = hash_combine(seed, context_);
        return seed;
    }

    ze_device_handle_t device_;
    ze_context_handle_t context_;

    engine_id_impl_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(engine_id_impl_t);
};
// Creates and initializes a Level Zero engine from raw handles (defined
// in engine.cpp). The created engine takes ownership of `ctx`.
status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind,
        const ze_driver_handle_t dri, const ze_device_handle_t dev,
        const ze_context_handle_t ctx, size_t index);

// GPU engine backed by the Level Zero runtime.
class engine_t : public intel::engine_t {
public:
    engine_t(ze_driver_handle_t driver, ze_device_handle_t device,
            ze_context_handle_t context, size_t index);
    ~engine_t() override = default;
    status_t init() override;
    status_t create_stream(
            impl::stream_t **stream, impl::stream_impl_t *stream_impl) override;
    // Kernel creation from an nGEN jitter.
    status_t create_kernel(compute::kernel_t *kernel,
            jit::generator_base_t *jitter) const override;
    // Kernel creation from a jit IR kernel.
    // NOTE(review): the definition in engine.cpp spells this parameter
    // jit::dsl::kernel_t -- presumably an alias of jit::kernel_t; confirm.
    status_t create_kernel(compute::kernel_t &kernel,
            const jit::kernel_t &kernel_ir) const override;
    // Compiles OpenCL C sources into Level Zero kernels.
    status_t create_kernels(std::vector<compute::kernel_t> *kernels,
            const std::vector<const char *> &kernel_names,
            const compute::kernel_ctx_t &kernel_ctx) const override;
    status_t create_kernel_from_binary(compute::kernel_t &kernel,
            const xpu::binary_t &binary, const char *kernel_name,
            const compute::program_src_t &src) const override;
    status_t create_kernels_from_cache_blob(const cache_blob_t &cache_blob,
            std::vector<compute::kernel_t> &kernels,
            const std::vector<const char *> &kernel_names) const override;
    gpu::intel::gpu_utils::device_id_t device_id() const override;
    // Raw Level Zero handle accessors.
    const ze_driver_handle_t driver() const;
    const ze_device_handle_t device() const;
    const ze_context_handle_t context() const;
    bool mayiuse_microkernels() const;

private:
    status_t init_device_info() override;
    status_t init_device_info(const std::vector<uint8_t> &cache_blob) override;
    // Turns a device binary into per-name Level Zero kernel objects.
    status_t convert_to_l0(std::vector<gpu::intel::compute::kernel_t> &kernels,
            const std::vector<const char *> &kernel_names,
            xpu::binary_t &binary) const;

    engine_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(engine_t);
};
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_ENGINE_HPP

View File

@ -0,0 +1,88 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/engine_factory.hpp"
#include "gpu/intel/l0/engine.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Only GPU engines are supported by the Level Zero backend.
engine_factory_t::engine_factory_t(engine_kind_t engine_kind)
    : engine_kind_(engine_kind) {
    assert(utils::one_of(engine_kind_, engine_kind::gpu));
}
// Returns the number of devices exposed by the first Level Zero driver,
// or 0 when no driver/device can be enumerated. The previous version
// indexed drivers[0] without checking driver_count, which is undefined
// behavior on systems with no Level Zero driver.
size_t engine_factory_t::count() const {
    uint32_t driver_count = 0;
    if (l0::zeDriverGet(&driver_count, nullptr) != status::success
            || driver_count == 0)
        return 0;
    std::vector<ze_driver_handle_t> drivers(driver_count);
    if (l0::zeDriverGet(&driver_count, drivers.data()) != status::success)
        return 0;
    uint32_t device_count = 0;
    if (l0::zeDeviceGet(drivers[0], &device_count, nullptr) != status::success)
        return 0;
    return device_count;
}
// Creates the engine for device `index`. Only the first Level Zero
// driver is considered (consistent with count()). A fresh context is
// created here; ownership passes to the engine implementation.
status_t engine_factory_t::engine_create(
        impl::engine_t **engine, size_t index) const {
    ze_driver_handle_t driver = nullptr;
    ze_device_handle_t device = nullptr;
    ze_context_handle_t context = nullptr;

    uint32_t driver_count = 0;
    CHECK(l0::zeDriverGet(&driver_count, nullptr));
    std::vector<ze_driver_handle_t> drivers(driver_count);
    CHECK(l0::zeDriverGet(&driver_count, drivers.data()));
    driver = drivers[0];

    uint32_t device_count = 0;
    CHECK(l0::zeDeviceGet(driver, &device_count, nullptr));
    VERROR_ENGINE(index < device_count, status::invalid_arguments,
            "asked for device %zu but only %u devices are found", index,
            device_count);
    std::vector<ze_device_handle_t> devices(device_count);
    CHECK(l0::zeDeviceGet(driver, &device_count, devices.data()));
    device = devices[index];

    ze_context_desc_t context_desc = {};
    context_desc.stype = ZE_STRUCTURE_TYPE_CONTEXT_DESC;
    context_desc.pNext = nullptr;
    context_desc.flags = 0;
    CHECK(l0::zeContextCreate(driver, &context_desc, &context));
    return engine_create(engine, driver, device, context, index);
}
// Thin forwarder to the namespace-level engine_create() using this
// factory's engine kind. The engine takes ownership of `context`.
status_t engine_factory_t::engine_create(impl::engine_t **engine,
        const ze_driver_handle_t driver, const ze_device_handle_t device,
        const ze_context_handle_t context, size_t index) const {
    return gpu::intel::l0::engine_create(
            engine, engine_kind_, driver, device, context, index);
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,59 @@
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_ENGINE_FACTORY_HPP
#define GPU_INTEL_L0_ENGINE_FACTORY_HPP
#include "common/engine.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Enumerates Level Zero devices and creates engines for them.
class engine_factory_t : public impl::engine_factory_t {
public:
    engine_factory_t(engine_kind_t engine_kind);
    ~engine_factory_t() override = default;
    // Number of devices exposed by the first Level Zero driver.
    size_t count() const override;
    // Creates the index-th device's engine (creates a fresh context).
    status_t engine_create(
            impl::engine_t **engine, size_t index) const override;
    // Creates an engine from caller-provided handles.
    status_t engine_create(impl::engine_t **engine,
            const ze_driver_handle_t adriver, const ze_device_handle_t adevice,
            const ze_context_handle_t acontext, size_t index) const;

private:
    engine_kind_t engine_kind_;

    engine_factory_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(engine_factory_t);
};
// Convenience helper returning a heap-allocated Level Zero engine
// factory for the given engine kind.
inline std::unique_ptr<engine_factory_t> get_engine_factory(
        engine_kind_t engine_kind) {
    return std::unique_ptr<engine_factory_t>(new engine_factory_t(engine_kind));
} // (stray trailing semicolon removed; it triggers -Wextra-semi)
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_ENGINE_FACTORY_HPP

215
src/gpu/intel/l0/kernel.cpp Normal file
View File

@ -0,0 +1,215 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/kernel.hpp"
#include "gpu/intel/l0/context.hpp"
#include "gpu/intel/l0/engine.hpp"
#include "gpu/intel/l0/memory_storage.hpp"
#include "gpu/intel/l0/stream.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// This class is to get around std::make_shared requirement to have a public
// constructor. We keep the original constructor as private but expose it here
// to use with std::make_shared.
class kernel_compat_t : public kernel_t {
public:
    template <typename... Args>
    kernel_compat_t(Args &&...args) : kernel_t(std::forward<Args>(args)...) {}
};

// Factory: wraps a module + kernel handle pair into a compute::kernel_t.
// The module pointer is shared so several kernels can keep it alive.
status_t kernel_t::make(compute::kernel_t &compute_kernel,
        const std::shared_ptr<module_wrapper_t> module_ptr,
        const ze_kernel_handle_t kernel_ptr, const std::string &kernel_name) {
    compute_kernel = compute::kernel_t(std::make_shared<kernel_compat_t>(
            module_ptr, kernel_ptr, kernel_name));
    return status::success;
}

kernel_t::kernel_t(const std::shared_ptr<module_wrapper_t> module_ptr,
        const ze_kernel_handle_t kernel_ptr, const std::string &kernel_name)
    : module_(module_ptr), kernel_(kernel_ptr), kernel_name_(kernel_name) {}

// Owns kernel_: release the Level Zero kernel handle on teardown (the
// module is released by the shared module_wrapper_t).
kernel_t::~kernel_t() {
    l0::zeKernelDestroy(kernel_);
}

// Verifies that every non-null global buffer argument satisfies the
// device's alignment requirement.
status_t kernel_t::check_alignment(
        const compute::kernel_arg_list_t &arg_list) const {
    for (int i = 0; i < arg_list.nargs(); ++i) {
        auto &arg = arg_list.get(i);
        if (!arg.is_global()) continue;
        auto *mem_storage = static_cast<const memory_storage_t *>(arg.value());
        if (!*mem_storage) continue;
        CHECK(compute::kernel_impl_t::check_alignment(
                mem_storage->data_handle(), i));
    }
    return status::success;
}

// Thin wrapper over zeKernelSetArgumentValue.
status_t kernel_t::set_arg(
        int arg_index, size_t arg_size, const void *arg_value) const {
    return l0::zeKernelSetArgumentValue(
            kernel_, arg_index, arg_size, arg_value);
}
// Binds arguments, resolves work-group sizes and appends the kernel
// launch to the stream's command list. `deps` supplies the wait list for
// the launch; the launch's signal event is also registered with the
// profiler when profiling is on.
status_t kernel_t::parallel_for(impl::stream_t &stream,
        const compute::nd_range_t &range,
        const compute::kernel_arg_list_t &arg_list, const xpu::event_t &deps,
        xpu::event_t &out_dep) {
    CHECK(check_scalar_arguments(arg_list));
    CHECK(check_alignment(arg_list));
    auto l0_stream = utils::downcast<stream_t *>(&stream);
    auto l0_engine = l0_stream->l0_engine();
    auto l0_device_info = l0_engine->device_info();
    // Each pointer argument occupies one device pointer in the kernel
    // parameter block; used for the max_kernel_param_size check below.
    const size_t pointer_size = l0_device_info->device_address_bits() / 8;
    size_t param_bytes = 0;
    for (int i = 0; i < arg_list.nargs(); ++i) {
        auto &arg = arg_list.get(i);
        if (arg.is_global()) {
            auto *mem_storage
                    = static_cast<const memory_storage_t *>(arg.value());
            if (!mem_storage->is_null()) {
                // Memory must belong to the same L0 context as the engine.
                auto memory_storage_ctx
                        = utils::downcast<engine_t *>(mem_storage->engine())
                                  ->context();
                if (l0_engine->context() != memory_storage_ctx) {
                    VERROR(primitive, gpu,
                            "mismatched Level Zero context for "
                            "primitive/memory");
                    return status::invalid_arguments;
                }
                void *ptr = mem_storage->ptr();
                CHECK(set_arg(i, pointer_size, &ptr));
                param_bytes += pointer_size;
            } else {
                // Null buffer: bind a null pointer argument.
                CHECK(set_arg(i, pointer_size, nullptr));
                param_bytes += pointer_size;
            }
        } else if (arg.is_local()) {
            // Local (SLM) argument: accounted as one pointer.
            CHECK(set_arg(i, arg.size(), arg.value()));
            param_bytes += pointer_size;
        } else {
            // Scalar argument: passed by value.
            CHECK(set_arg(i, arg.size(), arg.value()));
            param_bytes += arg.size();
        }
    }
    if (param_bytes > l0_device_info->max_kernel_param_size()) {
        VERROR(primitive, gpu,
                "parameter bytes requirements greater than device supports");
        return status::invalid_arguments;
    }
    // Zero-sized range: nothing to launch.
    if (range.is_zero()) { return status::success; }
    // Missing dimensions default to 1; switch cases intentionally fall
    // through to fill lower dimensions.
    std::vector<uint32_t> global_size(3, 1);
    switch (range.global_range().ndims()) {
        case 3: global_size[2] = static_cast<uint32_t>(range.global_range()[2]);
            // fall through
        case 2: global_size[1] = static_cast<uint32_t>(range.global_range()[1]);
            // fall through
        case 1:
            global_size[0] = static_cast<uint32_t>(range.global_range()[0]);
            break;
        default:
            VERROR(primitive, gpu,
                    "incorrect number of global range dimensions");
            return status::invalid_arguments;
    }
    std::vector<uint32_t> group_size(3, 1);
    if (range.local_range()) {
        switch (range.local_range().ndims()) {
            case 3:
                group_size[2] = static_cast<uint32_t>(range.local_range()[2]);
                // fall through
            case 2:
                group_size[1] = static_cast<uint32_t>(range.local_range()[1]);
                // fall through
            case 1:
                group_size[0] = static_cast<uint32_t>(range.local_range()[0]);
                break;
            default:
                VERROR(primitive, gpu,
                        "incorrect number of local range dimensions");
                return status::invalid_arguments;
        }
    } else {
        // No explicit local size: let the driver suggest one.
        CHECK(l0::zeKernelSuggestGroupSize(kernel_, global_size[0],
                global_size[1], global_size[2], &group_size[0], &group_size[1],
                &group_size[2]));
    }
    for (size_t i = 0; i < global_size.size(); i++) {
        if (global_size[i] % group_size[i] != 0) {
            VERROR(primitive, gpu, "only uniform work-groups are supported");
            return status::invalid_arguments;
        }
    }
    CHECK(l0::zeKernelSetGroupSize(
            kernel_, group_size[0], group_size[1], group_size[2]));
    ze_group_count_t group_count = {global_size[0] / group_size[0],
            global_size[1] / group_size[1], global_size[2] / group_size[2]};
    std::vector<ze_event_handle_t> l0_deps
            = utils::downcast<const event_t *>(&deps)->events_;
    // NOTE(review): l0_out_deps is a *copy* of out_dep's event list; the
    // push_back below never propagates back into out_dep, so the caller
    // does not receive the launch event through out_dep. Confirm whether
    // this is intentional.
    std::vector<ze_event_handle_t> l0_out_deps
            = utils::downcast<const event_t *>(&out_dep)->events_;
    // Keep the signal event alive on the kernel object for this dispatch.
    event_ = l0_stream->create_event();
    ze_event_handle_t out_event = *(event_.get());
    CHECK(l0::zeCommandListAppendLaunchKernel(l0_stream->list(), kernel_,
            &group_count, out_event, static_cast<uint32_t>(l0_deps.size()),
            l0_deps.size() ? l0_deps.data() : nullptr));
    if (out_event) l0_out_deps.push_back(out_event);
    if (stream.is_profiling_enabled()) {
        l0_stream->profiler().register_event(
                utils::make_unique<event_t>(std::move(out_event)));
    }
    return status::success;
}
// Reads back the device binary for this kernel (used for caching and
// debugging dumps).
status_t kernel_t::get_kernel_binary(xpu::binary_t &binary) const {
    return l0::get_kernel_binary(kernel_, binary);
}

std::string kernel_t::name() const {
    return kernel_name_;
}

// Writes the kernel binary to disk via the common dump helper.
status_t kernel_t::dump() const {
    xpu::binary_t binary;
    CHECK(get_kernel_binary(binary));
    return gpu_utils::dump_kernel_binary(binary, kernel_name_);
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,76 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_KERNEL_HPP
#define GPU_INTEL_L0_KERNEL_HPP
#include <thread>
#include "common/rw_mutex.hpp"
#include "gpu/intel/compute/kernel.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Level Zero implementation of a compute kernel. Owns a ze_kernel_handle_t
// and keeps the module that produced it alive through a shared_ptr.
class kernel_t : public compute::kernel_impl_t {
public:
    // Factory: wraps an existing Level Zero kernel handle (plus its module)
    // into a compute::kernel_t. The constructor is private; this is the only
    // public way to build a kernel_t.
    static status_t make(compute::kernel_t &compute_kernel,
            const std::shared_ptr<module_wrapper_t> module_ptr,
            const ze_kernel_handle_t kernel_ptr,
            const std::string &kernel_name);
    ~kernel_t() override;
    // Validates argument alignment requirements before dispatch.
    status_t check_alignment(
            const compute::kernel_arg_list_t &arg_list) const override;
    // Binds a single kernel argument by index.
    status_t set_arg(
            int arg_index, size_t arg_size, const void *arg_value) const;
    // Enqueues the kernel on the given stream with the given ND-range and
    // arguments; `deps` are input events, `out_dep` receives the out event.
    status_t parallel_for(impl::stream_t &stream,
            const compute::nd_range_t &range,
            const compute::kernel_arg_list_t &arg_list,
            const xpu::event_t &deps, xpu::event_t &out_dep) override;
    status_t get_kernel_binary(xpu::binary_t &binary) const override;
    std::string name() const override;
    status_t dump() const override;
private:
    friend class kernel_compat_t;
    kernel_t(const std::shared_ptr<module_wrapper_t> module_ptr,
            const ze_kernel_handle_t kernel_ptr,
            const std::string &kernel_name);
    // Module is held so it outlives the kernel handle created from it.
    std::shared_ptr<module_wrapper_t> module_;
    ze_kernel_handle_t kernel_;
    std::string kernel_name_;
    std::shared_ptr<ze_event_pool_handle_t> event_pool_;
    // Most recent output event of parallel_for (see kernel.cpp).
    std::shared_ptr<event_wrapper_t> event_;
    kernel_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(kernel_t);
};
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_KERNEL_HPP

View File

@ -0,0 +1,215 @@
/*******************************************************************************
* Copyright 2021-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/memory_storage.hpp"
#include "common/memory_map_manager.hpp"
#include "gpu/intel/l0/engine.hpp"
#include "gpu/intel/l0/stream.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Returns the raw USM pointer backing this storage (may be nullptr).
status_t memory_storage_t::get_data_handle(void **handle) const {
    *handle = ptr_.get();
    return status::success;
}
// Adopts an externally owned pointer: the no-op deleter means this storage
// never frees it. The storage kind is re-queried from the runtime so that
// host accessibility checks stay correct for the new pointer.
status_t memory_storage_t::set_data_handle(void *handle) {
    ptr_ = decltype(ptr_)(handle, [](void *) {});
    kind_ = get_memory_storage_kind(
            get_pointer_type(l0_engine()->context(), handle));
    return status::success;
}
// Host, shared, and unknown allocations are treated as directly readable
// from the host; only pure device allocations need an explicit map.
bool memory_storage_t::is_host_accessible() const {
    switch (kind_) {
        case memory_storage_kind_t::host:
        case memory_storage_kind_t::shared:
        case memory_storage_kind_t::unknown: return true;
        default: return false;
    }
}
struct map_usm_tag;
// Maps device memory for host access. Host-accessible memory is returned
// directly; device memory is staged through a temporary host allocation
// that is copied back and freed by unmap_data().
status_t memory_storage_t::map_data(
        void **mapped_ptr, impl::stream_t *stream, size_t size) const {
    if (is_host_accessible()) {
        *mapped_ptr = ptr();
        return status::success;
    }
    // Nothing to stage for an empty or unallocated storage.
    if (!ptr() || size == 0) {
        *mapped_ptr = nullptr;
        return status::success;
    }
    if (!stream) CHECK(engine()->get_service_stream(stream));
    void *host_ptr = malloc_host(size);
    if (!host_ptr) return status::out_of_memory;
    // Guard frees the staging buffer if the copy below fails; released
    // once ownership transfers to the map manager.
    auto leak_guard = decltype(ptr_)(host_ptr, [this](void *p) { free(p); });
    CHECK(memcpy(stream, host_ptr, ptr(), size));
    // memcpy() only enqueues the copy; wait for it to finish before the
    // host reads the staging buffer.
    CHECK(stream->wait());
    leak_guard.release();
    // Capture the device pointer now: ptr() could change before unmap.
    auto *usm_ptr_for_unmap = ptr();
    // Callback run by unmap_data(): copy the staging buffer back to the
    // device, synchronize, then release the staging buffer.
    auto unmap_callback = [size, usm_ptr_for_unmap, this](
                                  impl::stream_t *stream, void *mapped_ptr) {
        CHECK(memcpy(stream, usm_ptr_for_unmap, mapped_ptr, size));
        CHECK(stream->wait());
        free(mapped_ptr);
        return status::success;
    };
    auto &map_manager = memory_map_manager_t<map_usm_tag>::instance();
    *mapped_ptr = host_ptr;
    return map_manager.map(this, stream, *mapped_ptr, unmap_callback);
}
// Flushes a staging buffer created by map_data() back to the device and
// releases it. No-op for host-accessible memory or a null pointer.
status_t memory_storage_t::unmap_data(
        void *mapped_ptr, impl::stream_t *stream) const {
    const bool nothing_to_do = (mapped_ptr == nullptr) || is_host_accessible();
    if (nothing_to_do) return status::success;
    if (!stream) CHECK(engine()->get_service_stream(stream));
    return memory_map_manager_t<map_usm_tag>::instance().unmap(
            this, stream, mapped_ptr);
}
// Creates a non-owning view at `offset` into this storage. The sub-storage
// shares the same kind and does not free the underlying allocation.
std::unique_ptr<impl::memory_storage_t> memory_storage_t::get_sub_storage(
        size_t offset, size_t size) const {
    void *sub_ptr
            = ptr_ ? reinterpret_cast<uint8_t *>(ptr_.get()) + offset : nullptr;
    auto storage = utils::make_unique<memory_storage_t>(engine(), kind_);
    if (!storage) return nullptr;
    // use_runtime_ptr: the sub-storage adopts the pointer without owning it.
    auto status = storage->init(memory_flags_t::use_runtime_ptr, size, sub_ptr);
    if (status != status::success) return nullptr;
    // XXX: Clang has a bug that prevents implicit conversion.
    return std::unique_ptr<memory_storage_t>(storage.release());
}
// Creates a shallow, non-owning copy of this storage (same pointer, same
// kind, no-op deleter).
std::unique_ptr<impl::memory_storage_t> memory_storage_t::clone() const {
    auto storage = utils::make_unique<memory_storage_t>(engine(), kind_);
    if (!storage) return nullptr;
    auto status = storage->init(memory_flags_t::use_runtime_ptr, 0, nullptr);
    if (status != status::success) return nullptr;
    storage->ptr_ = decltype(ptr_)(ptr_.get(), [](void *) {});
    storage->kind_ = kind_;
    // XXX: Clang has a bug that prevents implicit conversion.
    return std::unique_ptr<memory_storage_t>(storage.release());
}
// Allocates USM memory of the requested size according to kind_. An
// unknown kind defaults to a device allocation. The resulting pointer is
// owned by this storage and released through free() when ptr_ is reset
// or destroyed.
status_t memory_storage_t::init_allocate(size_t size) {
    if (kind_ == memory_storage_kind_t::unknown)
        kind_ = memory_storage_kind_t::device;
    void *ptr_alloc = nullptr;
    switch (kind_) {
        case memory_storage_kind_t::host: ptr_alloc = malloc_host(size); break;
        case memory_storage_kind_t::device:
            ptr_alloc = malloc_device(size);
            break;
        case memory_storage_kind_t::shared:
            ptr_alloc = malloc_shared(size);
            break;
        default: break;
    }
    // malloc_* helpers return nullptr on failure.
    if (!ptr_alloc) return status::out_of_memory;
    // Capture `this` explicitly: the deleter is stored inside ptr_ and
    // outlives this scope, so a default by-reference capture ([&]) is
    // fragile; this also matches the explicit capture in map_data().
    ptr_ = decltype(ptr_)(ptr_alloc, [this](void *ptr) { free(ptr); });
    return status::success;
}
// Allocates host USM memory. Returns nullptr on failure: the ze call's
// status is intentionally ignored and failure is reported via the pointer.
void *memory_storage_t::malloc_host(size_t size) const {
    void *pptr = nullptr;
    ze_host_mem_alloc_desc_t host_mem_alloc_desc = {};
    host_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
    host_mem_alloc_desc.pNext = nullptr;
    host_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW;
    l0::zeMemAllocHost(
            l0_engine()->context(), &host_mem_alloc_desc, size, 0, &pptr);
    return pptr;
}
// Allocates device USM memory (ordinal 0). Returns nullptr on failure.
void *memory_storage_t::malloc_device(size_t size) const {
    void *pptr = nullptr;
    ze_device_mem_alloc_desc_t device_mem_alloc_desc = {};
    device_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
    device_mem_alloc_desc.pNext = nullptr;
    device_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW;
    device_mem_alloc_desc.ordinal = 0;
    l0::zeMemAllocDevice(l0_engine()->context(), &device_mem_alloc_desc, size,
            0, l0_engine()->device(), &pptr);
    return pptr;
}
// Allocates shared (migratable host/device) USM memory. Returns nullptr on
// failure.
void *memory_storage_t::malloc_shared(size_t size) const {
    void *pptr = nullptr;
    ze_device_mem_alloc_desc_t device_mem_alloc_desc = {};
    device_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
    device_mem_alloc_desc.pNext = nullptr;
    device_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW;
    device_mem_alloc_desc.ordinal = 0;
    ze_host_mem_alloc_desc_t host_mem_alloc_desc = {};
    host_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
    host_mem_alloc_desc.pNext = nullptr;
    host_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW;
    l0::zeMemAllocShared(l0_engine()->context(), &device_mem_alloc_desc,
            &host_mem_alloc_desc, size, 0, l0_engine()->device(), &pptr);
    return pptr;
}
// Releases a USM allocation obtained from any of the malloc_* helpers.
void memory_storage_t::free(void *ptr) const {
    l0::zeMemFree(l0_engine()->context(), ptr);
}
// Appends an asynchronous copy to the stream's command list. Callers that
// need completion must synchronize the stream themselves (see map_data).
status_t memory_storage_t::memcpy(
        impl::stream_t *stream, void *dst, const void *src, size_t size) const {
    auto *l0_stream = utils::downcast<stream_t *>(stream);
    return l0::zeCommandListAppendMemoryCopy(
            l0_stream->list(), dst, src, size, nullptr, 0, nullptr);
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,91 @@
/*******************************************************************************
* Copyright 2021-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_MEMORY_STORAGE_HPP
#define GPU_INTEL_L0_MEMORY_STORAGE_HPP
#include <functional>
#include "common/c_types_map.hpp"
#include "common/memory_storage.hpp"
#include "common/utils.hpp"
#include "gpu/intel/l0/engine.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
enum class memory_storage_kind_t { unknown, host, device, shared };
// Maps a Level Zero USM memory type to the backend's storage kind.
inline memory_storage_kind_t get_memory_storage_kind(ze_memory_type_t type) {
    switch (type) {
        case ZE_MEMORY_TYPE_HOST: return memory_storage_kind_t::host;
        case ZE_MEMORY_TYPE_DEVICE: return memory_storage_kind_t::device;
        case ZE_MEMORY_TYPE_SHARED: return memory_storage_kind_t::shared;
        default: return memory_storage_kind_t::unknown;
    }
}
// Level Zero USM-backed memory storage. Owns (or, after set_data_handle,
// merely views) a host/device/shared USM pointer.
class memory_storage_t : public impl::memory_storage_t {
public:
    memory_storage_t(impl::engine_t *engine, memory_storage_kind_t kind)
        : impl::memory_storage_t(engine), kind_(kind) {}
    // Raw underlying pointer; may be nullptr before init/allocation.
    void *ptr() const { return ptr_.get(); }
    status_t get_data_handle(void **handle) const override;
    status_t set_data_handle(void *handle) override;
    bool is_host_accessible() const override;
    status_t map_data(void **mapped_ptr, impl::stream_t *stream,
            size_t size) const override;
    status_t unmap_data(
            void *mapped_ptr, impl::stream_t *stream) const override;
    std::unique_ptr<impl::memory_storage_t> get_sub_storage(
            size_t offset, size_t size) const override;
    std::unique_ptr<impl::memory_storage_t> clone() const override;
private:
    status_t init_allocate(size_t size) override;
    gpu::intel::l0::engine_t *l0_engine() const {
        return utils::downcast<gpu::intel::l0::engine_t *>(engine());
    }
    // USM allocation helpers; each returns nullptr on failure.
    void *malloc_host(size_t size) const;
    void *malloc_device(size_t size) const;
    void *malloc_shared(size_t size) const;
    void free(void *ptr) const;
    // Enqueues an async copy on the stream (no completion guarantee).
    status_t memcpy(impl::stream_t *stream, void *dst, const void *src,
            size_t size) const;
    // Deleter encodes ownership: real free() for owned allocations, no-op
    // for adopted/cloned pointers.
    std::unique_ptr<void, std::function<void(void *)>> ptr_;
    memory_storage_kind_t kind_ = memory_storage_kind_t::unknown;
    DNNL_DISALLOW_COPY_AND_ASSIGN(memory_storage_t);
};
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_MEMORY_STORAGE_HPP

217
src/gpu/intel/l0/stream.cpp Normal file
View File

@ -0,0 +1,217 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/stream.hpp"
#include "gpu/intel/l0/engine.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Factory: wraps a stream implementation into an l0::stream_t and hands
// ownership to the caller via *stream.
status_t stream_t::create_stream(impl::stream_t **stream,
        impl::engine_t *engine, impl::stream_impl_t *stream_impl) {
    std::unique_ptr<intel::l0::stream_t> s(
            new intel::l0::stream_t(engine, stream_impl));
    // NOTE(review): `new` throws on failure, so this check is effectively
    // dead; kept for parity with other backends' factories.
    if (!s) return status::out_of_memory;
    *stream = s.release();
    return status::success;
}
// Called by the common stream code right before primitive execution.
void stream_t::before_exec_hook() {
    if (is_profiling_enabled()) profiler_->start_profiling();
}
// Called right after execution: clears the per-execution dependency list
// stored in the context and stops profiling collection.
void stream_t::after_exec_hook() {
    l0_ctx().set_deps(event_t());
    if (is_profiling_enabled()) profiler_->stop_profiling();
}
// Clears all accumulated profiling entries; only valid when the stream
// was created with profiling enabled.
status_t stream_t::reset_profiling() {
    if (!is_profiling_enabled()) return status::invalid_arguments;
    profiler_->reset();
    return status::success;
}
// Retrieves collected profiling data; only valid with profiling enabled.
status_t stream_t::get_profiling_data(profiling_data_kind_t data_kind,
        int *num_entries, uint64_t *data) const {
    if (!is_profiling_enabled()) return status::invalid_arguments;
    return profiler_->get_info(data_kind, num_entries, data);
}
// Adopts an existing (user-provided) command list; allocated_ = false so
// the destructor does not destroy it. The context is queried from the list.
stream_impl_t::stream_impl_t(unsigned flags, ze_command_list_handle_t list)
    : impl::stream_impl_t(flags)
    , allocated_(false)
    , list_(list)
    , event_pool_(nullptr) {
    l0::zeCommandListGetContextHandle(list_, &context_);
    // Events are only needed for out-of-order dependencies or profiling.
    if (flags & stream_flags::out_of_order || is_profiling_enabled())
        create_event_pool();
}
// Creates a stream-owned immediate, in-order command list on the given
// context/device; allocated_ = true so the destructor releases it.
stream_impl_t::stream_impl_t(
        unsigned flags, ze_context_handle_t context, ze_device_handle_t device)
    : impl::stream_impl_t(flags)
    , context_(context)
    , allocated_(true)
    , event_pool_(nullptr) {
    ze_command_queue_desc_t command_queue_desc = {};
    command_queue_desc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
    command_queue_desc.pNext = nullptr;
    command_queue_desc.ordinal = 0;
    command_queue_desc.index = 0;
    command_queue_desc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER;
    command_queue_desc.mode = ZE_COMMAND_QUEUE_MODE_DEFAULT;
    command_queue_desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
    // NOTE(review): if this call fails, list_ stays uninitialized -- the
    // return status is not checked here; confirm callers validate creation.
    l0::zeCommandListCreateImmediate(
            context_, device, &command_queue_desc, &list_);
    if (flags & stream_flags::out_of_order || is_profiling_enabled())
        create_event_pool();
}
// Creates the event pool used by create_event(). Host-visible events; the
// kernel-timestamp flag is added when profiling is on.
void stream_impl_t::create_event_pool() {
    ze_event_pool_desc_t event_pool_desc = {};
    event_pool_desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
    event_pool_desc.pNext = nullptr;
    event_pool_desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
    if (is_profiling_enabled())
        event_pool_desc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
    // Fixed capacity; event indices handed out by create_event() must stay
    // below this count.
    event_pool_desc.count = 16384;
    ze_event_pool_handle_t event_pool;
    l0::zeEventPoolCreate(context_, &event_pool_desc, 0, nullptr, &event_pool);
    event_pool_ = std::make_shared<event_pool_wrapper_t>(event_pool);
}
// Drains outstanding work, then destroys the command list only if this
// stream created it (allocated_ set by the owning constructor).
stream_impl_t::~stream_impl_t() {
    wait();
    if (allocated_) l0::zeCommandListDestroy(list_);
}
// Generic-context accessors required by the xpu interface; both forward to
// the Level Zero specific context.
xpu::context_t &stream_impl_t::ctx() {
    return l0_ctx();
}
const xpu::context_t &stream_impl_t::ctx() const {
    return l0_ctx();
}
// Non-const accessor implemented on top of the const one (standard
// const_cast delegation pattern to avoid duplicating the lookup).
context_t &stream_impl_t::l0_ctx() {
    const context_t &ctx = const_cast<const stream_impl_t *>(this)->l0_ctx();
    return *const_cast<context_t *>(&ctx);
}
// Returns the thread-local context, lazily default-constructed per thread.
const context_t &stream_impl_t::l0_ctx() const {
    static context_t empty_ctx;
    return ctx_.get(empty_ctx);
}
// Returns the Level Zero event recorded as the current output dependency,
// or nullptr when the context carries no dependencies.
ze_event_handle_t stream_impl_t::get_output_event() const {
    const auto &dep_events = event_t::from(ctx().get_deps()).events_;
    return dep_events.empty() ? nullptr : dep_events.front();
}
// Creates a host-visible event from the stream's pool and records it in
// events_ to keep it alive. Without a pool (in-order, no profiling) a
// null-wrapped event is returned so callers can pass it through unchanged.
std::shared_ptr<event_wrapper_t> stream_impl_t::create_event() {
    if (!event_pool_.get()) return std::make_shared<event_wrapper_t>(nullptr);
    ze_event_desc_t event_desc = {};
    event_desc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
    event_desc.pNext = nullptr;
    // NOTE(review): indices grow monotonically with events_ and are never
    // recycled; confirm they cannot exceed the pool's fixed count (16384).
    event_desc.index = static_cast<uint32_t>(events_.size());
    event_desc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
    event_desc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
    ze_event_handle_t event;
    l0::zeEventCreate(*(event_pool_.get()), &event_desc, &event);
    std::shared_ptr<event_wrapper_t> event_ptr
            = std::make_shared<event_wrapper_t>(event);
    events_.push_back(event_ptr);
    return event_ptr;
}
// Accessor for the stream's event pool (may hold nullptr for in-order,
// non-profiled streams).
std::shared_ptr<event_pool_wrapper_t> stream_impl_t::get_event_pool() {
    return event_pool_;
}
// Accessor for the underlying command list handle.
ze_command_list_handle_t stream_impl_t::list() {
    return list_;
}
// Blocks until all commands appended to the list have completed.
status_t stream_impl_t::wait() {
    CHECK(l0::zeCommandListHostSynchronize(list_, UINT64_MAX));
    return status::success;
}
// Appends an execution barrier ordering all prior commands before later ones.
status_t stream_impl_t::barrier() {
    CHECK(l0::zeCommandListAppendBarrier(list_, nullptr, 0, nullptr));
    return status::success;
}
// Enqueues an async device copy src -> dst; input events come from `deps`
// and the signaling event (if any) is appended to `out_dep`.
status_t stream_impl_t::copy(const impl::memory_storage_t &src,
        const impl::memory_storage_t &dst, size_t size,
        const xpu::event_t &deps, xpu::event_t &out_dep) {
    if (size == 0) return status::success;
    std::vector<ze_event_handle_t> l0_deps
            = utils::downcast<const event_t *>(&deps)->events_;
    // May be nullptr when the stream has no event pool.
    ze_event_handle_t out_event = *(create_event().get());
    CHECK(l0::zeCommandListAppendMemoryCopy(list_, dst.data_handle(),
            src.data_handle(), size, out_event,
            static_cast<uint32_t>(l0_deps.size()),
            l0_deps.size() ? l0_deps.data() : nullptr));
    if (out_event)
        utils::downcast<event_t *>(&out_dep)->events_.push_back(out_event);
    return status::success;
}
// Enqueues an async fill of dst with a repeated single-byte pattern; event
// handling mirrors copy() above.
status_t stream_impl_t::fill(const impl::memory_storage_t &dst, uint8_t pattern,
        size_t size, const xpu::event_t &deps, xpu::event_t &out_dep) {
    if (size == 0) return status::success;
    std::vector<ze_event_handle_t> l0_deps
            = utils::downcast<const event_t *>(&deps)->events_;
    ze_event_handle_t out_event = *(create_event().get());
    CHECK(l0::zeCommandListAppendMemoryFill(list_, dst.data_handle(), &pattern,
            sizeof(pattern), size, out_event,
            static_cast<uint32_t>(l0_deps.size()),
            l0_deps.size() ? l0_deps.data() : nullptr));
    if (out_event)
        utils::downcast<event_t *>(&out_dep)->events_.push_back(out_event);
    return status::success;
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

138
src/gpu/intel/l0/stream.hpp Normal file
View File

@ -0,0 +1,138 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_STREAM_HPP
#define GPU_INTEL_L0_STREAM_HPP
#include <list>
#include "common/thread_local_storage.hpp"
#include "gpu/intel/l0/context.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
#include "gpu/intel/stream.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Level Zero stream implementation: wraps an immediate command list (either
// adopted from the user or created by the stream) plus the event machinery
// needed for dependencies and profiling.
class stream_impl_t : public impl::stream_impl_t {
public:
    // Adopts an existing command list (not destroyed on teardown).
    stream_impl_t(unsigned flags, ze_command_list_handle_t list);
    // Creates and owns an immediate in-order command list.
    stream_impl_t(unsigned flags, ze_context_handle_t context,
            ze_device_handle_t device);
    ~stream_impl_t();
    context_t &l0_ctx();
    const context_t &l0_ctx() const;
    xpu::context_t &ctx();
    const xpu::context_t &ctx() const;
    // First event of the context's current dependency list, or nullptr.
    ze_event_handle_t get_output_event();
    std::shared_ptr<event_wrapper_t> create_event();
    std::shared_ptr<event_pool_wrapper_t> get_event_pool();
    ze_command_list_handle_t list();
    status_t wait();
    status_t barrier();
    status_t copy(const impl::memory_storage_t &src,
            const impl::memory_storage_t &dst, size_t size,
            const xpu::event_t &deps, xpu::event_t &out_dep);
    status_t fill(const impl::memory_storage_t &dst, uint8_t pattern,
            size_t size, const xpu::event_t &deps, xpu::event_t &out_dep);
private:
    void create_event_pool();
    ze_context_handle_t context_;
    // True when this stream created list_ and must destroy it.
    bool allocated_;
    ze_command_list_handle_t list_;
    std::shared_ptr<event_pool_wrapper_t> event_pool_;
    // Keeps created events alive; list size also drives event indices.
    std::list<std::shared_ptr<event_wrapper_t>> events_;
    // Per-thread execution context.
    mutable utils::thread_local_storage_t<context_t> ctx_;
    stream_impl_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(stream_impl_t);
};
// Public Level Zero stream type; thin facade forwarding to stream_impl_t.
class stream_t : public intel::stream_t {
public:
    // Factory: wraps a stream implementation; see stream.cpp.
    static status_t create_stream(impl::stream_t **stream,
            impl::engine_t *engine, impl::stream_impl_t *stream_impl);
    // Typed accessor for the backend implementation.
    stream_impl_t *impl() const {
        return static_cast<stream_impl_t *>(impl::stream_t::impl_.get());
    }
    engine_t *l0_engine() const {
        return utils::downcast<engine_t *>(engine());
    }
    context_t &l0_ctx() { return impl()->l0_ctx(); }
    const context_t &l0_ctx() const { return impl()->l0_ctx(); }
    xpu::context_t &ctx() override { return impl()->ctx(); }
    const xpu::context_t &ctx() const override { return impl()->ctx(); }
    ze_event_handle_t get_output_event() const {
        return impl()->get_output_event();
    }
    std::shared_ptr<event_wrapper_t> create_event() {
        return impl()->create_event();
    }
    std::shared_ptr<event_pool_wrapper_t> get_event_pool() {
        return impl()->get_event_pool();
    }
    // Top-level const on a by-value return is meaningless (and triggers
    // -Wignored-qualifiers), so the handle is returned unqualified,
    // matching stream_impl_t::list().
    ze_command_list_handle_t list() const { return impl()->list(); }
    status_t wait() override { return impl()->wait(); }
    status_t barrier() override { return impl()->barrier(); }
    void before_exec_hook() override;
    void after_exec_hook() override;
    status_t reset_profiling() override;
    status_t get_profiling_data(profiling_data_kind_t data_kind,
            int *num_entries, uint64_t *data) const override;
    status_t copy(const impl::memory_storage_t &src,
            const impl::memory_storage_t &dst, size_t size,
            const xpu::event_t &deps, xpu::event_t &out_dep) override {
        return impl()->copy(src, dst, size, deps, out_dep);
    }
    status_t fill(const impl::memory_storage_t &dst, uint8_t pattern,
            size_t size, const xpu::event_t &deps,
            xpu::event_t &out_dep) override {
        return impl()->fill(dst, pattern, size, deps, out_dep);
    }
private:
    stream_t(impl::engine_t *engine, impl::stream_impl_t *stream_impl)
        : gpu::intel::stream_t(engine, stream_impl) {}
    stream_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(stream_t);
};
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_STREAM_HPP

View File

@ -0,0 +1,24 @@
#===============================================================================
# Copyright 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
# Collect every Level Zero utility source in this directory tree and build
# it into an object library that is linked into the main oneDNN library.
file(GLOB_RECURSE SOURCES
    ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp
    )
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_l0_utils)
add_library(${OBJ_LIB} OBJECT ${SOURCES})
# Register the objects with the global list of oneDNN link dependencies.
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
    $<TARGET_OBJECTS:${OBJ_LIB}>)

View File

@ -0,0 +1,387 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/l0/utils/utils.hpp"
#include "gpu/intel/jit/binary_format.hpp"
#include "gpu/intel/jit/utils/ngen_type_bridge.hpp"
#include "ngen_level_zero.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// RAII wrapper over a Level Zero event handle (may wrap nullptr).
event_wrapper_t::event_wrapper_t(ze_event_handle_t event) : event_(event) {}
// Waits for the event to signal before destroying it so the handle is
// never released while a command may still be using it.
event_wrapper_t::~event_wrapper_t() {
    if (event_) {
        l0::zeEventHostSynchronize(event_, UINT64_MAX);
        l0::zeEventDestroy(event_);
    }
}
// Implicit conversion to the underlying Level Zero handle.
event_wrapper_t::operator ze_event_handle_t() const {
    return event_;
}
// RAII wrapper over a Level Zero event pool handle (may wrap nullptr).
event_pool_wrapper_t::event_pool_wrapper_t(ze_event_pool_handle_t event_pool)
    : event_pool_(event_pool) {}
// Destroys the pool if one is owned.
event_pool_wrapper_t::~event_pool_wrapper_t() {
    if (event_pool_) l0::zeEventPoolDestroy(event_pool_);
}
// Implicit conversion to the underlying Level Zero handle.
event_pool_wrapper_t::operator ze_event_pool_handle_t() const {
    return event_pool_;
}
// RAII wrapper over a Level Zero module handle (may wrap nullptr).
// (Stray semicolons after the out-of-line ctor/dtor definitions removed.)
module_wrapper_t::module_wrapper_t(ze_module_handle_t module)
    : module_(module) {}
// Destroys the module if one is owned.
module_wrapper_t::~module_wrapper_t() {
    if (module_) l0::zeModuleDestroy(module_);
}
// Implicit conversion to the underlying Level Zero handle.
module_wrapper_t::operator ze_module_handle_t() const {
    return module_;
}
// Queries the device IP version via the IP-version extension chained onto
// the standard device-properties query.
status_t get_device_ip(ze_device_handle_t device, uint32_t &ip_version) {
    ze_device_ip_version_ext_t device_ip_version_ext = {};
    device_ip_version_ext.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT;
    device_ip_version_ext.pNext = nullptr;
    ze_device_properties_t device_properties = {};
    device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    // Chain the extension struct so one query fills both.
    device_properties.pNext = &device_ip_version_ext;
    CHECK(l0::zeDeviceGetProperties(device, &device_properties));
    ip_version = device_ip_version_ext.ipVersion;
    return status::success;
}
// Detects DPAS (systolic array) support through the Intel module DP
// extension chained onto the module-properties query.
status_t get_l0_device_enabled_systolic_intel(
        ze_device_handle_t device, bool &mayiuse_systolic) {
    ze_intel_device_module_dp_exp_properties_t
            intel_device_module_dp_exp_properties
            = {};
    intel_device_module_dp_exp_properties.stype
            = ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES;
    intel_device_module_dp_exp_properties.pNext = nullptr;
    ze_device_module_properties_t device_module_properties = {};
    device_module_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
    device_module_properties.pNext = &intel_device_module_dp_exp_properties;
    CHECK(l0::zeDeviceGetModuleProperties(device, &device_module_properties));
    mayiuse_systolic = intel_device_module_dp_exp_properties.flags
            & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS;
    return status::success;
}
// Queries native fp16/fp32/fp64 atomic capabilities and ORs the matching
// bits into `native_extensions`. A capability is reported only when both
// the global and local variants are supported.
status_t get_l0_device_enabled_native_float_atomics(
        ze_device_handle_t device, uint64_t &native_extensions) {
    using namespace gpu::intel::compute;
    ze_float_atomic_ext_properties_t float_atomic_ext_properties = {};
    float_atomic_ext_properties.stype
            = ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES;
    float_atomic_ext_properties.pNext = nullptr;
    ze_device_module_properties_t device_module_properties = {};
    device_module_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
    device_module_properties.pNext = &float_atomic_ext_properties;
    CHECK(l0::zeDeviceGetModuleProperties(device, &device_module_properties));
    // Combined global+local masks for each atomic operation class.
    ze_device_fp_atomic_ext_flags_t atomic_load_store
            = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_LOAD_STORE
            | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_LOAD_STORE;
    ze_device_fp_atomic_ext_flags_t atomic_add
            = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD
            | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_ADD;
    ze_device_fp_atomic_ext_flags_t atomic_min_max
            = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_MIN_MAX
            | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_MIN_MAX;
    if ((float_atomic_ext_properties.fp16Flags & atomic_load_store)
            == atomic_load_store)
        native_extensions |= (uint64_t)native_ext_t::fp16_atomic_load_store;
    if ((float_atomic_ext_properties.fp16Flags & atomic_add) == atomic_add)
        native_extensions |= (uint64_t)native_ext_t::fp16_atomic_add;
    if ((float_atomic_ext_properties.fp16Flags & atomic_min_max)
            == atomic_min_max)
        native_extensions |= (uint64_t)native_ext_t::fp16_atomic_min_max;
    if ((float_atomic_ext_properties.fp32Flags & atomic_load_store)
            == atomic_load_store)
        native_extensions |= (uint64_t)native_ext_t::fp32_atomic_load_store;
    if ((float_atomic_ext_properties.fp32Flags & atomic_add) == atomic_add)
        native_extensions |= (uint64_t)native_ext_t::fp32_atomic_add;
    if ((float_atomic_ext_properties.fp32Flags & atomic_min_max)
            == atomic_min_max)
        native_extensions |= (uint64_t)native_ext_t::fp32_atomic_min_max;
    if ((float_atomic_ext_properties.fp64Flags & atomic_load_store)
            == atomic_load_store)
        native_extensions |= (uint64_t)native_ext_t::fp64_atomic_load_store;
    if ((float_atomic_ext_properties.fp64Flags & atomic_add) == atomic_add)
        native_extensions |= (uint64_t)native_ext_t::fp64_atomic_add;
    if ((float_atomic_ext_properties.fp64Flags & atomic_min_max)
            == atomic_min_max)
        native_extensions |= (uint64_t)native_ext_t::fp64_atomic_min_max;
    return status::success;
}
// Queries the total EU count via the EU-count extension chained onto the
// standard device-properties query.
status_t get_l0_device_eu_count(ze_device_handle_t device, int &eu_count) {
    ze_eu_count_ext_t eu_count_ext = {};
    eu_count_ext.stype = ZE_STRUCTURE_TYPE_EU_COUNT_EXT;
    eu_count_ext.pNext = nullptr;
    ze_device_properties_t device_properties = {};
    device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    device_properties.pNext = &eu_count_ext;
    CHECK(l0::zeDeviceGetProperties(device, &device_properties));
    eu_count = eu_count_ext.numTotalEUs;
    return status::success;
}
// Populates GPU hardware info (arch, product, IP version, systolic and
// native-atomic capabilities, nGEN kernel support) for an L0 device.
status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device,
        ze_context_handle_t context, uint32_t &ip_version,
        compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product_,
        uint64_t &native_extensions, bool &mayiuse_systolic,
        bool &mayiuse_ngen_kernels) {
    using namespace ngen;
    ngen::Product product = LevelZeroCodeGenerator<HW::Unknown>::detectHWInfo(
            context, device);
    gpu_arch = jit::convert_ngen_arch_to_dnnl(ngen::getCore(product.family));
    // NOTE(review): assumes compute::gpu_product_t is layout-compatible
    // with ngen::Product -- confirm the raw memcpy stays valid.
    std::memcpy(&product_, &product, sizeof(ngen::Product));
    mayiuse_systolic = false;
    if (get_l0_device_enabled_systolic_intel(device, mayiuse_systolic)
            != status::success)
        mayiuse_systolic = false;
    /* Some old drivers do not report systolic availability. Manually override
   systolic availability based on product family. */
    switch (product.family) {
        case ProductFamily::DG2:
        case ProductFamily::ARL:
        case ProductFamily::PVC: mayiuse_systolic = true;
        // falls through to default (no break above is intentional)
        default: break;
    }
    CHECK(get_l0_device_enabled_native_float_atomics(
            device, native_extensions));
    auto status
            = jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine);
    if (status != status::success) mayiuse_ngen_kernels = false;
    ip_version = 0;
    return get_device_ip(device, ip_version);
}
// Packs the 16-byte Level Zero device UUID into a pair of 64-bit words
// (little-endian byte order within each word).
xpu::device_uuid_t get_device_uuid(const ze_device_handle_t device) {
    static_assert(ZE_MAX_DEVICE_UUID_SIZE == 16,
            "ZE_MAX_DEVICE_UUID_SIZE is expected to be 16");
    ze_device_properties_t device_properties = {};
    device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    device_properties.pNext = nullptr;
    auto status = l0::zeDeviceGetProperties(device, &device_properties);
    MAYBE_UNUSED(status);
    // Failure here is a programmer/runtime error; checked only in debug.
    assert(status == status::success);
    const auto &device_id = device_properties.uuid.id;
    uint64_t uuid[ZE_MAX_DEVICE_UUID_SIZE / sizeof(uint64_t)] = {};
    for (size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) {
        size_t shift = i % sizeof(uint64_t) * CHAR_BIT;
        uuid[i / sizeof(uint64_t)] |= (((uint64_t)device_id[i]) << shift);
    }
    return xpu::device_uuid_t(uuid[0], uuid[1]);
}
// Finds the position of `device` within the device list of the first
// Level Zero driver. Returns invalid_arguments when no driver is present
// or the device is not found.
// NOTE(review): only the first driver is searched; devices exposed by
// additional drivers will not be found -- confirm this matches how device
// indices are assigned elsewhere in the backend.
status_t get_device_index(const ze_device_handle_t device, size_t *index) {
    uint32_t driver_count = 0;
    CHECK(l0::zeDriverGet(&driver_count, nullptr));
    // Guard against an empty driver list: drivers[0] below would otherwise
    // be an out-of-bounds access.
    if (driver_count == 0) return status::invalid_arguments;
    std::vector<ze_driver_handle_t> drivers(driver_count);
    CHECK(l0::zeDriverGet(&driver_count, drivers.data()));
    uint32_t device_count = 0;
    CHECK(l0::zeDeviceGet(drivers[0], &device_count, nullptr));
    std::vector<ze_device_handle_t> devices(device_count);
    CHECK(l0::zeDeviceGet(drivers[0], &device_count, devices.data()));
    for (size_t i = 0; i < device_count; i++) {
        if (device == devices[i]) {
            *index = i;
            return status::success;
        }
    }
    return status::invalid_arguments;
}
// Queries a kernel's name using the two-call size/fill idiom. Statuses of
// the ze calls are intentionally ignored; on failure the returned string
// is empty.
std::string get_kernel_name(const ze_kernel_handle_t kernel) {
    std::string kernel_name;
    size_t kernel_name_size = 0;
    l0::zeKernelGetName(kernel, &kernel_name_size, nullptr);
    kernel_name.resize(kernel_name_size, 0);
    l0::zeKernelGetName(kernel, &kernel_name_size, &kernel_name[0]);
    // Remove the null terminator as std::string already includes it
    kernel_name.resize(kernel_name_size - 1);
    return kernel_name;
}
// Fetches the kernel's device binary using the two-call size/fill idiom
// of the kernel-binary experimental extension.
status_t get_kernel_binary(
        const ze_kernel_handle_t kernel, xpu::binary_t &binary) {
    size_t binary_size = 0;
    CHECK(l0::zeKernelGetBinaryExp(kernel, &binary_size, nullptr));
    binary.resize(binary_size);
    CHECK(l0::zeKernelGetBinaryExp(kernel, &binary_size, binary.data()));
    return status::success;
}
// Fetches the module's native (device-specific) binary using the two-call
// size/fill idiom.
status_t get_module_binary(
        const ze_module_handle_t module, xpu::binary_t &binary) {
    // Initialized to 0 (was previously left uninitialized); the first call
    // fills in the required size.
    size_t module_binary_size = 0;
    CHECK(l0::zeModuleGetNativeBinary(module, &module_binary_size, nullptr));
    binary.resize(module_binary_size);
    CHECK(l0::zeModuleGetNativeBinary(
            module, &module_binary_size, binary.data()));
    return status::success;
}
// OpenCL C source input format for zeModuleCreate (Intel extension value,
// not exposed by all public Level Zero headers).
#define ZE_MODULE_FORMAT_OCLC (ze_module_format_t)3U
// Compiles OpenCL C source `code` with build `options` into a native module
// binary stored in `binary`. On a build failure the build log is printed to
// stdout and status::runtime_error is returned.
status_t compile_ocl_module(const ze_device_handle_t device,
        const ze_context_handle_t context, std::string &code,
        std::string &options, xpu::binary_t &binary) {
    ze_module_desc_t module_desc = {};
    module_desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
    module_desc.pNext = nullptr;
    module_desc.format = ZE_MODULE_FORMAT_OCLC;
    module_desc.inputSize = code.size();
    module_desc.pInputModule = reinterpret_cast<const uint8_t *>(code.c_str());
    module_desc.pBuildFlags = options.c_str();
    module_desc.pConstants = nullptr;

    ze_module_handle_t module;
    ze_module_build_log_handle_t module_build_log;
    if (l0::zeModuleCreate(
                context, device, &module_desc, &module, &module_build_log)
            != status::success) {
        // Dump the build log to aid debugging. Use std::vector instead of a
        // raw new[]/delete[] pair so nothing leaks on early exit, and make
        // sure the log handle itself is destroyed before returning (it was
        // previously leaked on this path).
        size_t build_log_size = 0;
        if (l0::zeModuleBuildLogGetString(
                    module_build_log, &build_log_size, nullptr)
                        == status::success
                && build_log_size > 0) {
            std::vector<char> build_log(build_log_size);
            if (l0::zeModuleBuildLogGetString(module_build_log,
                        &build_log_size, build_log.data())
                    == status::success)
                std::cout << std::endl
                          << "Build log: " << build_log.data() << std::endl;
        }
        l0::zeModuleBuildLogDestroy(module_build_log);
        return status::runtime_error;
    }
    CHECK(l0::zeModuleBuildLogDestroy(module_build_log));

    // Destroy the module even if extracting the binary fails so the handle
    // does not leak; report the first failure.
    status_t status = get_module_binary(module, binary);
    CHECK(l0::zeModuleDestroy(module));
    return status;
}
// Builds a module from a previously generated native binary and creates one
// kernel per entry of `kernel_names`. Null entries produce null handles in
// the output vector, preserving positional correspondence with the names.
status_t create_kernels_from_binary(const ze_device_handle_t device,
        const ze_context_handle_t context,
        const std::vector<const char *> &kernel_names,
        const xpu::binary_t &binary, ze_module_handle_t *module,
        std::vector<ze_kernel_handle_t> &kernels) {
    ze_module_desc_t desc;
    desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
    desc.pNext = nullptr;
    desc.format = ZE_MODULE_FORMAT_NATIVE;
    desc.inputSize = binary.size();
    desc.pInputModule = binary.data();
    desc.pBuildFlags = "";
    desc.pConstants = nullptr;
    CHECK(l0::zeModuleCreate(context, device, &desc, module, nullptr));

    kernels.resize(kernel_names.size(), nullptr);
    for (size_t idx = 0; idx < kernel_names.size(); idx++) {
        const char *name = kernel_names[idx];
        if (!name) {
            // Placeholder slot: keep a null handle.
            kernels[idx] = nullptr;
            continue;
        }
        ze_kernel_desc_t kernel_desc = {};
        kernel_desc.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC;
        kernel_desc.pNext = nullptr;
        kernel_desc.flags = 0;
        kernel_desc.pKernelName = name;

        ze_kernel_handle_t handle;
        CHECK(l0::zeKernelCreate(*module, &kernel_desc, &handle));
        kernels[idx] = handle;
    }
    return status::success;
}
// Returns the Level Zero memory type (host/device/shared/unknown) of an
// allocation referenced by `ptr` within `context`.
ze_memory_type_t get_pointer_type(
        const ze_context_handle_t context, const void *ptr) {
    // Zero-initialize the struct: only stype/pNext were set before, so if
    // the query below failed, `.type` was read uninitialized. With {} it
    // reads as 0 (ZE_MEMORY_TYPE_UNKNOWN) instead.
    ze_memory_allocation_properties_t memory_allocation_properties = {};
    memory_allocation_properties.stype
            = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES;
    memory_allocation_properties.pNext = nullptr;
    l0::zeMemGetAllocProperties(
            context, ptr, &memory_allocation_properties, nullptr);
    return memory_allocation_properties.type;
}
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -0,0 +1,238 @@
/*******************************************************************************
* Copyright 2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_L0_UTILS_HPP
#define GPU_INTEL_L0_UTILS_HPP
#if defined(__linux__)
#include <dlfcn.h>
#elif defined(_WIN32)
#include "windows.h"
#else
#error "Level Zero is supported on Linux and Windows only"
#endif
#include "gpu/intel/compute/kernel.hpp"
#include "level_zero/ze_api.h"
#include "level_zero/ze_intel_gpu.h"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace l0 {
// Translates a Level Zero result code into its enumerator name for error
// reporting; codes not covered by the switch are rendered numerically.
inline std::string to_string(ze_result_t r) {
#define ZE_STATUS_CASE(status) \
    case status: return #status
    switch (r) {
        ZE_STATUS_CASE(ZE_RESULT_SUCCESS);
        ZE_STATUS_CASE(ZE_RESULT_NOT_READY);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_LOST);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_LINK_FAILURE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_NOT_AVAILABLE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNINITIALIZED);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ARGUMENT);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SIZE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ENUMERATION);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS);
        ZE_STATUS_CASE(ZE_RESULT_ERROR_UNKNOWN);
        ZE_STATUS_CASE(ZE_RESULT_FORCE_UINT32);
        default: return std::to_string((int)r);
    }
#undef ZE_STATUS_CASE
} // stray ';' after this brace removed (-Wextra-semi at namespace scope)
#define ZE_CHECK(f) \
do { \
ze_result_t res_ = (f); \
if (res_ != ZE_RESULT_SUCCESS) { \
std::string err_str_ = to_string(res_); \
VERROR(common, level_zero, "errcode %s", err_str_.c_str()); \
return status::runtime_error; \
} \
} while (false)
// Resolves `symbol` from the Level Zero loader library; returns nullptr when
// the library or the symbol is unavailable. The library handle is opened
// once and cached in a static: the loader stays resident for the process
// lifetime anyway, and re-opening it on every lookup only bumped the OS
// reference count without ever releasing it.
inline void *find_symbol(const char *symbol) {
#if defined(_WIN32)
    // LOAD_LIBRARY_SEARCH_SYSTEM32 avoids DLL hijacking via the search path.
    static HMODULE handle = LoadLibraryExA(
            "ze_loader.dll", nullptr, LOAD_LIBRARY_SEARCH_SYSTEM32);
    if (!handle) return nullptr;
    return reinterpret_cast<void *>(GetProcAddress(handle, symbol));
#elif defined(__linux__)
    static void *handle
            = dlopen("libze_loader.so.1", RTLD_NOW | RTLD_LOCAL);
    if (!handle) return nullptr;
    return dlsym(handle, symbol);
#endif
}
template <typename F>
F find_ze_symbol(const char *symbol) {
auto f = (F)find_symbol(symbol);
if (!f) VERROR(common, level_zero, "cannot find symbol: %s", symbol);
return f;
}
#undef L0_LIB_NAME
// Generates a dnnl-status-returning wrapper `f` around the Level Zero API
// entry point `::f`. Symbols are resolved lazily from the loader and cached
// in function-local statics. zeInit is invoked once and its result cached
// (previously it was re-invoked on every wrapped call); null pointers from
// failed symbol lookups are now reported as runtime_error instead of being
// called (which crashed).
#define INDIRECT_L0_CALL(f) \
    template <typename... Args> \
    status_t f(Args &&...args) { \
        const ze_init_flags_t default_ze_flags = 0; \
        static auto init_ = find_ze_symbol<decltype(&::zeInit)>("zeInit"); \
        if (!init_) return status::runtime_error; \
        static const ze_result_t init_result_ = init_(default_ze_flags); \
        ZE_CHECK(init_result_); \
        static auto f_ = find_ze_symbol<decltype(&::f)>(#f); \
        if (!f_) return status::runtime_error; \
        ZE_CHECK(f_(std::forward<Args>(args)...)); \
        return status::success; \
    }
INDIRECT_L0_CALL(zeDriverGet)
INDIRECT_L0_CALL(zeDriverGetProperties)
INDIRECT_L0_CALL(zeDeviceGet)
INDIRECT_L0_CALL(zeDeviceGetProperties)
INDIRECT_L0_CALL(zeDeviceGetComputeProperties)
INDIRECT_L0_CALL(zeDeviceGetModuleProperties)
INDIRECT_L0_CALL(zeDeviceGetMemoryAccessProperties)
INDIRECT_L0_CALL(zeDeviceGetCacheProperties)
INDIRECT_L0_CALL(zeContextCreate)
INDIRECT_L0_CALL(zeContextDestroy)
INDIRECT_L0_CALL(zeCommandListCreateImmediate)
INDIRECT_L0_CALL(zeCommandListDestroy)
INDIRECT_L0_CALL(zeCommandListHostSynchronize)
INDIRECT_L0_CALL(zeCommandListGetContextHandle)
INDIRECT_L0_CALL(zeCommandListAppendBarrier)
INDIRECT_L0_CALL(zeCommandListAppendMemoryCopy)
INDIRECT_L0_CALL(zeCommandListAppendMemoryFill)
INDIRECT_L0_CALL(zeEventPoolCreate)
INDIRECT_L0_CALL(zeEventPoolDestroy)
INDIRECT_L0_CALL(zeEventCreate)
INDIRECT_L0_CALL(zeEventDestroy)
INDIRECT_L0_CALL(zeEventHostSynchronize)
INDIRECT_L0_CALL(zeMemAllocShared)
INDIRECT_L0_CALL(zeMemAllocDevice)
INDIRECT_L0_CALL(zeMemAllocHost)
INDIRECT_L0_CALL(zeMemFree)
INDIRECT_L0_CALL(zeMemGetAllocProperties)
INDIRECT_L0_CALL(zeModuleCreate)
INDIRECT_L0_CALL(zeModuleDestroy)
INDIRECT_L0_CALL(zeModuleBuildLogDestroy)
INDIRECT_L0_CALL(zeModuleBuildLogGetString)
INDIRECT_L0_CALL(zeModuleGetNativeBinary)
INDIRECT_L0_CALL(zeKernelCreate)
INDIRECT_L0_CALL(zeKernelDestroy)
INDIRECT_L0_CALL(zeKernelSetArgumentValue)
INDIRECT_L0_CALL(zeKernelGetName)
INDIRECT_L0_CALL(zeKernelGetBinaryExp)
INDIRECT_L0_CALL(zeKernelSetGroupSize)
INDIRECT_L0_CALL(zeKernelSuggestGroupSize)
INDIRECT_L0_CALL(zeCommandListAppendLaunchKernel)
#undef INDIRECT_L0_CALL
// RAII owner of a ze_event_handle_t; implicitly converts to the raw handle
// for use in Level Zero API calls. Non-copyable and not default-
// constructible. NOTE(review): destructor presumably destroys the event —
// the definition is out of line; confirm in the .cpp.
class event_wrapper_t {
public:
    event_wrapper_t(ze_event_handle_t event);
    ~event_wrapper_t();
    operator ze_event_handle_t() const;
private:
    ze_event_handle_t event_;
    event_wrapper_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(event_wrapper_t);
};
// RAII owner of a ze_event_pool_handle_t; implicitly converts to the raw
// handle for use in Level Zero API calls. Non-copyable and not default-
// constructible. NOTE(review): destructor presumably destroys the pool —
// the definition is out of line; confirm in the .cpp.
class event_pool_wrapper_t {
public:
    event_pool_wrapper_t(ze_event_pool_handle_t event_pool);
    ~event_pool_wrapper_t();
    operator ze_event_pool_handle_t() const;
private:
    ze_event_pool_handle_t event_pool_;
    event_pool_wrapper_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(event_pool_wrapper_t);
};
// RAII owner of a ze_module_handle_t; implicitly converts to the raw handle
// for use in Level Zero API calls. Non-copyable and not default-
// constructible. NOTE(review): destructor presumably destroys the module —
// the definition is out of line; confirm in the .cpp.
class module_wrapper_t {
public:
    module_wrapper_t(ze_module_handle_t module);
    ~module_wrapper_t();
    operator ze_module_handle_t() const;
private:
    ze_module_handle_t module_;
    module_wrapper_t() = delete;
    DNNL_DISALLOW_COPY_AND_ASSIGN(module_wrapper_t);
};
// Populates architecture/capability information for the engine's device
// (IP version, GPU arch/product, native extensions, systolic and ngen
// support). Implementation not visible here — see the corresponding .cpp.
status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device,
        ze_context_handle_t context, uint32_t &ip_version,
        compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product,
        uint64_t &native_extensions, bool &mayiuse_systolic,
        bool &mayiuse_ngen_kernels);
// Packs the 128-bit Level Zero device UUID (16 bytes) into two uint64_t.
xpu::device_uuid_t get_device_uuid(const ze_device_handle_t device);
// Stores in *index the position of `device` within the first driver's
// device list; invalid_arguments if not found.
status_t get_device_index(const ze_device_handle_t device, size_t *index);
// Returns the kernel's name (without the trailing null terminator).
std::string get_kernel_name(const ze_kernel_handle_t kernel);
// Retrieves the kernel's device binary via zeKernelGetBinaryExp.
status_t get_kernel_binary(
        const ze_kernel_handle_t kernel, xpu::binary_t &binary);
// Retrieves the module's native binary via zeModuleGetNativeBinary.
status_t get_module_binary(
        const ze_module_handle_t module, xpu::binary_t &binary);
// Compiles OpenCL C source into a native module binary; prints the build
// log to stdout on compilation failure.
status_t compile_ocl_module(const ze_device_handle_t device,
        const ze_context_handle_t context, std::string &code,
        std::string &options, xpu::binary_t &binary);
// Creates a module from a native binary plus one kernel per name; null
// names produce null handles at the matching positions in `kernels`.
status_t create_kernels_from_binary(const ze_device_handle_t device,
        const ze_context_handle_t context,
        const std::vector<const char *> &kernel_names,
        const xpu::binary_t &binary, ze_module_handle_t *module,
        std::vector<ze_kernel_handle_t> &kernels);
// Returns the Level Zero memory type of the allocation backing `ptr`.
ze_memory_type_t get_pointer_type(const ze_context_handle_t, const void *ptr);
} // namespace l0
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_L0_UTILS_HPP

View File

@ -15,9 +15,7 @@
#===============================================================================
file(GLOB_RECURSE SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/*.h
${CMAKE_CURRENT_SOURCE_DIR}/*.hpp
${CMAKE_CURRENT_SOURCE_DIR}/*.c
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
)

View File

@ -22,7 +22,7 @@
#include <CL/cl.h>
#include "gpu/intel/compute/device_info.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {

View File

@ -32,7 +32,7 @@
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/kernel.hpp"
#include "gpu/intel/ocl/stream.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {
@ -40,33 +40,6 @@ namespace gpu {
namespace intel {
namespace ocl {
status_t preprocess_headers(stringstream_t &pp_code, const char *code,
const compute::kernel_ctx_t &kernel_ctx) {
stringstream_t code_stream(code);
for (std::string line; std::getline(code_stream, line);) {
const size_t include_pos = line.find("#include");
if (include_pos != std::string::npos) {
static constexpr size_t include_len = 8;
const size_t first_quote_pos
= line.find("\"", include_pos + include_len);
const size_t second_quote_pos
= line.find("\"", first_quote_pos + 1);
const size_t kernel_name_len
= second_quote_pos - first_quote_pos - 1;
const auto header_name
= line.substr(first_quote_pos + 1, kernel_name_len);
const char *header_source
= kernel_ctx.get_custom_header(header_name);
if (!header_source) header_source = get_kernel_header(header_name);
CHECK(preprocess_headers(pp_code, header_source, kernel_ctx));
} else {
pp_code << line << std::endl;
}
}
return status::success;
}
status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind,
cl_device_id dev, cl_context ctx, size_t index,
const std::vector<uint8_t> &cache_blob) {
@ -249,14 +222,14 @@ status_t engine_t::build_program_from_source(
// `clCompileProgram` `clBuildProgram` doesn't take headers. Because of
// that, a manual preprocessing of `include` header directives in the
// OpenCL kernels is required.
CHECK(preprocess_headers(pp_code, code_string, kernel_ctx));
CHECK(compute::preprocess_headers(pp_code, code_string, kernel_ctx));
std::string pp_code_str = pp_code.str();
const char *pp_code_str_ptr = pp_code_str.c_str();
src = {pp_code_str};
if (src) { options += " -g -s " + std::string(src.name()); }
debugdump_processed_source(
compute::debugdump_processed_source(
pp_code_str, options, dev_info->get_cl_ext_options());
auto ctx = context();

View File

@ -21,6 +21,7 @@
#include "common/utils.hpp"
#include "gpu/gpu_impl_list.hpp"
#include "gpu/intel/engine.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "xpu/ocl/engine_impl.hpp"
#include "xpu/utils.hpp"
@ -30,9 +31,6 @@ namespace gpu {
namespace intel {
namespace ocl {
status_t preprocess_headers(stringstream_t &pp_code, const char *code,
const compute::kernel_ctx_t &kernel_ctx);
status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind,
cl_device_id dev, cl_context ctx, size_t index,
const std::vector<uint8_t> &cache_blob);

View File

@ -15,7 +15,7 @@
*******************************************************************************/
#include "gpu/intel/ocl/hw_info.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "gpu/intel/jit/binary_format.hpp"
#include "gpu/intel/jit/generator.hpp"

View File

@ -31,7 +31,7 @@
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/stream.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {

View File

@ -28,7 +28,7 @@
#include <dlfcn.h>
#include <vector>
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "mdapi/metrics_discovery_api.h"
#ifndef CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL

View File

@ -26,7 +26,7 @@
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/stream.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {

View File

@ -26,9 +26,11 @@
#include "xpu/ocl/context.hpp"
#include "xpu/ocl/stream_impl.hpp"
#include "gpu/intel/ocl/mdapi_utils.hpp"
#include "gpu/intel/stream.hpp"
#include "gpu/intel/ocl/mdapi_utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {

View File

@ -0,0 +1,24 @@
#===============================================================================
# Copyright 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================
file(GLOB_RECURSE SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp
)
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_ocl_utils)
add_library(${OBJ_LIB} OBJECT ${SOURCES})
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
$<TARGET_OBJECTS:${OBJ_LIB}>)

View File

@ -23,7 +23,7 @@
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/hw_info.hpp"
#include "gpu/intel/ocl/kernel.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "xpu/ocl/utils.hpp"
#ifndef CL_KERNEL_BINARY_PROGRAM_INTEL
@ -217,69 +217,6 @@ status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, xpu::binary_t &binary) {
return status::success;
}
void debugdump_processed_source(const std::string &source,
const std::string &options, const std::string &cl_options) {
#if defined(__linux__) && defined(DNNL_DEV_MODE)
if (get_verbose(verbose_t::debuginfo) >= 10) {
auto get_defines = [](const std::string &from) {
std::string ret;
size_t pos = 0;
while (pos < from.length()) {
// Find next define argument
pos = from.find("-D", pos);
// Generate argument, quotes are interpreted literally, but
// other special shell characters need escaped. Does not
// currently handle quotes with the ' character or nested quotes
char quote_parity = true;
while (pos < from.length()) {
if (quote_parity
&& utils::one_of(from[pos], '~', '#', '$', '&', '*',
'(', ')', '\\', '|', '[', ']', '{', '}',
';', '\'', '<', '>', '/', '?', '!')) {
ret += '\\';
}
ret += from[pos];
if (from[pos] == '"') quote_parity ^= true;
if (from[pos] == ' ' && quote_parity) break;
pos++;
}
}
return ret;
};
auto execute_command = [](const std::string &cmd,
const std::string &stdin) {
std::string result;
std::array<char, 256> buffer;
FILE *pipe = popen(cmd.c_str(), "w");
fputs(stdin.c_str(), pipe);
if (pipe) {
while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
result += buffer.data();
}
}
pclose(pipe);
return result;
};
// Run utilities to evaluate preprocessor defines and format the file
// Theoretically, we can accomplish this task with libclang, but it
// seems more work than it is worth. Instead, wrapping this in OCL_DEBUG
// so that calls to the system are not included in the default build.
// Due to the use of a different C preprocessor, warnings should not be
// ignored, as they may correspond to a different behavior in the OpenCL
// C preprocessor
auto o = get_defines(options) + get_defines(cl_options);
std::string preprocess_cmd
= std::string() + "cpp -P " + o + " | clang-format";
execute_command(preprocess_cmd, source);
std::cout << "OCL_ARCH_OPTIONS: " << cl_options << std::endl;
}
#endif
}
status_t get_kernel_arg_types(cl_kernel ocl_kernel,
std::vector<gpu::intel::compute::scalar_type_t> *arg_types) {
cl_uint nargs;

View File

@ -55,9 +55,6 @@ status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, xpu::binary_t &binary);
status_t get_ocl_program_binary_size(
cl_kernel kernel, cl_device_id device, size_t *size);
void debugdump_processed_source(const std::string &source,
const std::string &options, const std::string &ocl_options);
status_t get_kernel_arg_types(cl_kernel ocl_kernel,
std::vector<gpu::intel::compute::scalar_type_t> *arg_types);

View File

@ -35,7 +35,7 @@
#include "gpu/intel/compute/device_info.hpp"
#include "gpu/intel/sycl/compat.hpp"
#include "gpu/intel/sycl/engine.hpp"
#include "gpu/intel/sycl/l0/utils.hpp"
#include "gpu/intel/sycl/utils.hpp"
namespace dnnl {
namespace impl {

View File

@ -19,11 +19,11 @@
#include "gpu/intel/sycl/compat.hpp"
#include "gpu/intel/sycl/device_info.hpp"
#include "gpu/intel/sycl/engine.hpp"
#include "gpu/intel/sycl/l0/utils.hpp"
#include "gpu/intel/sycl/utils.hpp"
#include "gpu/intel/ocl/hw_info.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
namespace dnnl {
namespace impl {
@ -56,7 +56,7 @@ status_t device_info_t::init_arch(impl::engine_t *engine) {
auto ze_dev = xpu::sycl::compat::get_native<ze_device_handle_t>(device);
auto ze_ctx = xpu::sycl::compat::get_native<ze_context_handle_t>(ctx);
status = gpu::intel::sycl::init_gpu_hw_info(engine, ze_dev, ze_ctx,
status = gpu::intel::l0::init_gpu_hw_info(engine, ze_dev, ze_ctx,
ip_version_, gpu_arch_, gpu_product_, native_extensions_,
mayiuse_systolic_, mayiuse_ngen_kernels_);
} else {

View File

@ -114,18 +114,25 @@ status_t engine_t::create_kernels(
const char *source = nullptr;
for (size_t i = 0; source == nullptr && i < kernel_names.size(); i++)
source = ocl::get_kernel_source(kernel_names[i]);
source = get_kernel_source(kernel_names[i]);
VERROR_ENGINE(source, status::runtime_error,
"No OpenCL source was found for kernel");
stringstream_t pp_code;
CHECK(gpu::intel::ocl::preprocess_headers(pp_code, source, kernel_ctx));
CHECK(compute::preprocess_headers(pp_code, source, kernel_ctx));
std::string code_str = pp_code.str();
std::string build_options = kernel_ctx.options();
build_options += " " + device_info()->get_cl_ext_options();
gpu::intel::compute::program_src_t src(code_str);
if (src) { build_options += " -g -s " + std::string(src.name()); }
compute::debugdump_processed_source(
code_str, build_options, device_info()->get_cl_ext_options());
auto kb_src = syclex::create_kernel_bundle_from_source(
context(), syclex::source_language::opencl, pp_code.str());
context(), syclex::source_language::opencl, code_str);
auto kb_exe = syclex::build(
kb_src, syclex::properties {syclex::build_options(build_options)});
*kernels = std::vector<compute::kernel_t>(kernel_names.size());
@ -133,8 +140,7 @@ status_t engine_t::create_kernels(
if (!kernel_names[i]) continue;
CHECK(interop_kernel_t::make((*kernels)[i],
kb_exe.ext_oneapi_get_kernel(kernel_names[i]),
gpu::intel::compute::program_src_t(pp_code.str())));
kb_exe.ext_oneapi_get_kernel(kernel_names[i]), src));
}
return status::success;

View File

@ -31,7 +31,7 @@
#include "gpu/intel/ocl/engine.hpp"
#include "gpu/intel/ocl/kernel.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "gpu/intel/sycl/compat.hpp"
#include "gpu/intel/sycl/utils.hpp"

View File

@ -19,8 +19,7 @@
#include "common/verbose.hpp"
#include "gpu/intel/compute/types_interop.hpp"
#include "gpu/intel/compute/utils.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/sycl/l0/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "gpu/intel/sycl/stream.hpp"
#include "gpu/intel/sycl/utils.hpp"
#include "xpu/sycl/c_types_map.hpp"

View File

@ -1,436 +0,0 @@
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "gpu/intel/sycl/l0/utils.hpp"
#include "oneapi/dnnl/dnnl_config.h"
#include "gpu/intel/jit/binary_format.hpp"
#include "gpu/intel/jit/utils/ngen_type_bridge.hpp"
#include "ngen_level_zero.hpp"
#if defined(__linux__)
#include <dlfcn.h>
#elif defined(_WIN32)
#include "windows.h"
#else
#error "Level Zero is supported on Linux and Windows only"
#endif
#include "level_zero/ze_api.h"
#include "level_zero/ze_intel_gpu.h"
#if !defined(__SYCL_COMPILER_VERSION)
#error "Unsupported compiler"
#endif
#if (__SYCL_COMPILER_VERSION < 20200818)
#error "Level Zero is not supported with this compiler version"
#endif
#include "common/c_types_map.hpp"
#include "common/verbose.hpp"
#include "gpu/intel/sycl/utils.hpp"
#include <sycl/ext/oneapi/backend/level_zero.hpp>
#include "gpu/intel/sycl/engine.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace sycl {
namespace {
std::string to_string(ze_result_t r) {
#define ZE_STATUS_CASE(status) \
case status: return #status
switch (r) {
ZE_STATUS_CASE(ZE_RESULT_SUCCESS);
ZE_STATUS_CASE(ZE_RESULT_NOT_READY);
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_LOST);
ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY);
ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY);
ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_LINK_FAILURE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET);
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
ZE_STATUS_CASE(ZE_RESULT_ERROR_NOT_AVAILABLE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNINITIALIZED);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ARGUMENT);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SIZE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ENUMERATION);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED);
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE);
ZE_STATUS_CASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS);
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNKNOWN);
ZE_STATUS_CASE(ZE_RESULT_FORCE_UINT32);
default: return std::to_string((int)r);
}
#undef ZE_STATUS_CASE
};
#define ZE_CHECK_COMMON(f, retval) \
do { \
ze_result_t res_ = (f); \
if (res_ != ZE_RESULT_SUCCESS) { \
std::string err_str_ = to_string(res_); \
VERROR(common, level_zero, "errcode %s", err_str_.c_str()); \
return retval; \
} \
} while (false)
#define ZE_CHECK(f) ZE_CHECK_COMMON(f, status::runtime_error)
#define ZE_CHECK_VP(f) ZE_CHECK_COMMON(f, nullptr)
void *find_ze_symbol(const char *symbol) {
#if defined(__linux__)
void *handle = dlopen("libze_loader.so.1", RTLD_NOW | RTLD_LOCAL);
#elif defined(_WIN32)
// Use LOAD_LIBRARY_SEARCH_SYSTEM32 flag to avoid DLL hijacking issue.
HMODULE handle = LoadLibraryExA(
"ze_loader.dll", nullptr, LOAD_LIBRARY_SEARCH_SYSTEM32);
#endif
if (!handle) {
VERROR(common, level_zero, "cannot find loader library");
assert(!"not expected");
return nullptr;
}
using zeInit_decl_t = ze_result_t (*)(ze_init_flags_t flags);
const ze_init_flags_t default_ze_flags = 0;
#if defined(__linux__)
static const ze_result_t ze_result = reinterpret_cast<zeInit_decl_t>(
dlsym(handle, "zeInit"))(default_ze_flags);
void *f = reinterpret_cast<void *>(dlsym(handle, symbol));
#elif defined(_WIN32)
static const ze_result_t ze_result = reinterpret_cast<zeInit_decl_t>(
GetProcAddress(handle, "zeInit"))(default_ze_flags);
void *f = reinterpret_cast<void *>(GetProcAddress(handle, symbol));
#endif
ZE_CHECK_VP(ze_result);
if (!f) {
VERROR(common, level_zero, "cannot find symbol: %s", symbol);
assert(!"not expected");
}
return f;
}
template <typename F>
F find_ze_symbol(const char *symbol) {
return (F)find_ze_symbol(symbol);
}
status_t func_zeModuleCreate(ze_context_handle_t hContext,
ze_device_handle_t hDevice, const ze_module_desc_t *desc,
ze_module_handle_t *phModule,
ze_module_build_log_handle_t *phBuildLog) {
static auto f = find_ze_symbol<decltype(&zeModuleCreate)>("zeModuleCreate");
if (!f) return status::runtime_error;
ZE_CHECK(f(hContext, hDevice, desc, phModule, phBuildLog));
return status::success;
}
status_t func_zeDeviceGetProperties(
ze_device_handle_t hDevice, ze_device_properties_t *pDeviceProperties) {
static auto f = find_ze_symbol<decltype(&zeDeviceGetProperties)>(
"zeDeviceGetProperties");
if (!f) return status::runtime_error;
ZE_CHECK(f(hDevice, pDeviceProperties));
return status::success;
}
status_t func_zeDeviceGetModuleProperties(ze_device_handle_t hDevice,
ze_device_module_properties_t *pDeviceProperties) {
static auto f = find_ze_symbol<decltype(&zeDeviceGetModuleProperties)>(
"zeDeviceGetModuleProperties");
if (!f) {
VERROR(common, level_zero,
"failed to find systolic query extension (maybe update the "
"driver?)");
return status::runtime_error;
}
ZE_CHECK(f(hDevice, pDeviceProperties));
return status::success;
}
} // namespace
// This function is called from compatibility layer that ensures compatibility
// with SYCL 2017 API. Once the compatibility layer is removed this function
// can be moved to the anonymous namespace above and a function with SYCL
// data types in its interface can be created to call it.
status_t func_zeKernelCreate(ze_module_handle_t hModule,
const ze_kernel_desc_t *desc, ze_kernel_handle_t *phKernel) {
static auto f = find_ze_symbol<decltype(&zeKernelCreate)>("zeKernelCreate");
if (!f) return status::runtime_error;
ZE_CHECK(f(hModule, desc, phKernel));
return status::success;
}
#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
status_t func_zeGetKernelBinary(
ze_kernel_handle_t hKernel, size_t *pSize, uint8_t *pKernelBinary) {
static auto f = find_ze_symbol<decltype(&zeKernelGetBinaryExp)>(
"zeKernelGetBinaryExp");
if (!f) return status::runtime_error;
ZE_CHECK(f(hKernel, pSize, pKernelBinary));
return status::success;
}
#else
status_t func_zeModuleGetNativeBinary(ze_module_handle_t hModule, size_t *pSize,
uint8_t *pModuleNativeBinary) {
static auto f = find_ze_symbol<decltype(&zeModuleGetNativeBinary)>(
"zeModuleGetNativeBinary");
if (!f) return status::runtime_error;
ZE_CHECK(f(hModule, pSize, pModuleNativeBinary));
return status::success;
}
#endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
// FIXME: SYCL provides no portable API to query the device UUID, so it is
// fetched directly from Level Zero via zeDeviceGetProperties. The 16-byte
// UUID is packed little-endian into two uint64_t halves.
xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev) {
    static_assert(ZE_MAX_DEVICE_UUID_SIZE == 16,
            "ZE_MAX_DEVICE_UUID_SIZE is expected to be 16");

    auto props = ze_device_properties_t();
    props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;

    auto ze_dev = xpu::sycl::compat::get_native<ze_device_handle_t>(dev);
    auto st = func_zeDeviceGetProperties(ze_dev, &props);
    MAYBE_UNUSED(st);
    assert(st == status::success);

    // Fold the 16 UUID bytes into two 64-bit words, least significant byte
    // first within each word.
    const auto &uuid_bytes = props.uuid.id;
    uint64_t packed[ZE_MAX_DEVICE_UUID_SIZE / sizeof(uint64_t)] = {};
    for (size_t byte_idx = 0; byte_idx < ZE_MAX_DEVICE_UUID_SIZE; ++byte_idx) {
        const size_t word = byte_idx / sizeof(uint64_t);
        const size_t shift = (byte_idx % sizeof(uint64_t)) * CHAR_BIT;
        packed[word] |= static_cast<uint64_t>(uuid_bytes[byte_idx]) << shift;
    }
    return xpu::device_uuid_t(packed[0], packed[1]);
}
// Creates one SYCL kernel per entry of `kernel_names` from a pre-built
// native (device-specific) binary, going through Level Zero module/kernel
// handles and the SYCL Level Zero interop API.
// Entries of `kernel_names` that are null are skipped, leaving the
// corresponding `sycl_kernels` slot empty.
status_t sycl_create_kernels_with_level_zero(
        std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
        const std::vector<const char *> &kernel_names,
        const gpu::intel::sycl::engine_t *sycl_engine,
        const xpu::binary_t &binary) {
    // Describe the module as an already-compiled native binary: no build
    // flags and no specialization constants apply.
    auto desc = ze_module_desc_t();
    desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
    desc.format = ZE_MODULE_FORMAT_NATIVE;
    desc.inputSize = binary.size();
    desc.pInputModule = binary.data();
    desc.pBuildFlags = "";
    desc.pConstants = nullptr;
    ze_module_handle_t ze_module;
    auto ze_device = xpu::sycl::compat::get_native<ze_device_handle_t>(
            sycl_engine->device());
    auto ze_ctx = xpu::sycl::compat::get_native<ze_context_handle_t>(
            sycl_engine->context());
    // Last argument (build log handle) is not requested.
    CHECK(func_zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr));
    // NOTE(review): presumably make_kernel_bundle takes ownership of
    // ze_module (the default interop ownership transfer), which is why the
    // module is never destroyed explicitly here -- confirm against the SYCL
    // Level Zero backend specification.
    ::sycl::kernel_bundle<::sycl::bundle_state::executable> kernel_bundle
            = ::sycl::make_kernel_bundle<::sycl::backend::ext_oneapi_level_zero,
                    ::sycl::bundle_state::executable>(
                    {ze_module}, sycl_engine->context());
    sycl_kernels.resize(kernel_names.size());
    for (size_t i = 0; i < kernel_names.size(); i++) {
        // Null name means "no kernel requested for this slot".
        if (kernel_names[i] == nullptr) continue;
        ze_kernel_handle_t ze_kernel;
        ze_kernel_desc_t ze_kernel_desc {
                ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernel_names[i]};
        CHECK(func_zeKernelCreate(ze_module, &ze_kernel_desc, &ze_kernel));
        // Wrap the native kernel handle into a SYCL kernel object.
        auto k = ::sycl::make_kernel<::sycl::backend::ext_oneapi_level_zero>(
                {kernel_bundle, ze_kernel}, sycl_engine->context());
        sycl_kernels[i] = utils::make_unique<::sycl::kernel>(k);
    }
    return status::success;
}
// Two SYCL devices are considered identical iff they map to the same native
// Level Zero device handle.
bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) {
    return xpu::sycl::compat::get_native<ze_device_handle_t>(lhs)
            == xpu::sycl::compat::get_native<ze_device_handle_t>(rhs);
}
// Retrieves the device IP version through the Level Zero IP-version
// extension, chained into the device properties query via pNext.
status_t get_device_ip(ze_device_handle_t device, uint32_t &ip_version) {
    auto ip_props = ze_device_ip_version_ext_t();
    ip_props.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT;

    auto dev_props = ze_device_properties_t();
    dev_props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    dev_props.pNext = &ip_props;

    CHECK(func_zeDeviceGetProperties(device, &dev_props));
    ip_version = ip_props.ipVersion;
    return status::success;
}
// Queries whether the device exposes DPAS (systolic) instructions through
// the Intel device-module extension chained into the module properties
// query. Note: supported by Intel Driver 24.05 and onwards.
status_t get_l0_device_enabled_systolic_intel(
        ze_device_handle_t device, bool &mayiuse_systolic) {
    auto dp_props = ze_intel_device_module_dp_exp_properties_t();
    dp_props.stype = ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES;

    auto module_props = ze_device_module_properties_t();
    module_props.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
    module_props.pNext = &dp_props;

    CHECK(func_zeDeviceGetModuleProperties(device, &module_props));
    mayiuse_systolic
            = (dp_props.flags & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS) != 0;
    return status::success;
}
// Queries the Level Zero float-atomics extension capabilities of `device`
// and accumulates the corresponding native-extension bits (fp16/fp32/fp64
// load-store, add, min-max atomics) into `native_extensions`.
// An extension bit is set only when both the global-memory and local-memory
// variants of the capability are reported.
status_t get_l0_device_enabled_native_float_atomics(
        ze_device_handle_t device, uint64_t &native_extensions) {
    using namespace gpu::intel::compute;

    auto fltAtom = ze_float_atomic_ext_properties_t();
    fltAtom.stype = ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES;

    auto deviceProps = ze_device_module_properties_t();
    deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
    deviceProps.pNext = &fltAtom;
    CHECK(func_zeDeviceGetModuleProperties(device, &deviceProps));

    // Required capability masks: global + local variants must both be set.
    const ze_device_fp_atomic_ext_flags_t atomic_load_store
            = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_LOAD_STORE
            | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_LOAD_STORE;
    const ze_device_fp_atomic_ext_flags_t atomic_add
            = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD
            | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_ADD;
    const ze_device_fp_atomic_ext_flags_t atomic_min_max
            = ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_MIN_MAX
            | ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_MIN_MAX;

    // Sets `ext` in the output mask when all `required` capability bits are
    // present in `flags` for the corresponding precision.
    auto enable_if_supported = [&](ze_device_fp_atomic_ext_flags_t flags,
                                       ze_device_fp_atomic_ext_flags_t required,
                                       native_ext_t ext) {
        if ((flags & required) == required)
            native_extensions |= static_cast<uint64_t>(ext);
    };

    enable_if_supported(fltAtom.fp16Flags, atomic_load_store,
            native_ext_t::fp16_atomic_load_store);
    enable_if_supported(
            fltAtom.fp16Flags, atomic_add, native_ext_t::fp16_atomic_add);
    enable_if_supported(fltAtom.fp16Flags, atomic_min_max,
            native_ext_t::fp16_atomic_min_max);
    enable_if_supported(fltAtom.fp32Flags, atomic_load_store,
            native_ext_t::fp32_atomic_load_store);
    enable_if_supported(
            fltAtom.fp32Flags, atomic_add, native_ext_t::fp32_atomic_add);
    enable_if_supported(fltAtom.fp32Flags, atomic_min_max,
            native_ext_t::fp32_atomic_min_max);
    enable_if_supported(fltAtom.fp64Flags, atomic_load_store,
            native_ext_t::fp64_atomic_load_store);
    enable_if_supported(
            fltAtom.fp64Flags, atomic_add, native_ext_t::fp64_atomic_add);
    enable_if_supported(fltAtom.fp64Flags, atomic_min_max,
            native_ext_t::fp64_atomic_min_max);

    return status::success;
}
// Queries the total number of execution units (EUs) on the device via the
// Level Zero EU-count extension chained into the device properties query.
status_t get_l0_device_eu_count(ze_device_handle_t device, int &eu_count) {
    auto eucnt = ze_eu_count_ext_t();
    // The extension struct must carry its own stype so the driver can
    // recognize it while walking the pNext chain; value-initialization alone
    // leaves stype == 0, which is not a valid structure type and would make
    // the driver skip filling numTotalEUs. Every sibling query in this file
    // initializes the extension stype the same way.
    eucnt.stype = ZE_STRUCTURE_TYPE_EU_COUNT_EXT;

    auto deviceProps = ze_device_properties_t();
    deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    deviceProps.pNext = &eucnt;

    CHECK(func_zeDeviceGetProperties(device, &deviceProps));
    eu_count = static_cast<int>(eucnt.numTotalEUs);
    return status::success;
}
// Populates GPU hardware information for the given Level Zero device/context
// pair: architecture, product descriptor, IP version, native float-atomic
// extensions, and systolic/nGEN-kernel capability flags.
// Returns the status of the final IP-version query; earlier best-effort
// queries degrade gracefully (see inline comments).
status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device,
        ze_context_handle_t context, uint32_t &ip_version,
        compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product_,
        uint64_t &native_extensions, bool &mayiuse_systolic,
        bool &mayiuse_ngen_kernels) {
    using namespace ngen;
    // Detect the product through nGEN's Level Zero code generator.
    ngen::Product product = LevelZeroCodeGenerator<HW::Unknown>::detectHWInfo(
            context, device);
    gpu_arch = jit::convert_ngen_arch_to_dnnl(ngen::getCore(product.family));
    // NOTE(review): assumes compute::gpu_product_t is layout-compatible with
    // ngen::Product -- confirm; a static_assert on the sizes would make this
    // memcpy safer.
    std::memcpy(&product_, &product, sizeof(ngen::Product));
    // Best-effort query: on failure, systolic support stays disabled.
    mayiuse_systolic = false;
    if (get_l0_device_enabled_systolic_intel(device, mayiuse_systolic)
            != status::success)
        mayiuse_systolic = false;
    /* Some old drivers do not report systolic availability. Manually override
       systolic availability based on product family. */
    switch (product.family) {
        // Intentional fallthrough: every listed family supports systolic.
        case ProductFamily::DG2:
        case ProductFamily::ARL:
        case ProductFamily::PVC: mayiuse_systolic = true;
        default: break;
    }
    CHECK(get_l0_device_enabled_native_float_atomics(
            device, native_extensions));
    // nGEN binary-format support is optional; treat a failed query as "no".
    auto status
            = jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine);
    if (status != status::success) mayiuse_ngen_kernels = false;
    ip_version = 0;
    return get_device_ip(device, ip_version);
}
} // namespace sycl
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl

View File

@ -1,65 +0,0 @@
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

// Helpers that bridge the SYCL runtime with the Level Zero (L0) driver API:
// device identity queries, kernel creation from native binaries, and
// hardware capability detection.

#ifndef GPU_INTEL_SYCL_L0_UTILS_HPP
#define GPU_INTEL_SYCL_L0_UTILS_HPP
#include <memory>
#include <string>
#include <vector>
#include "gpu/intel/compute/kernel.hpp"
#include "gpu/intel/sycl/compat.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace sycl {
class engine_t;
// Returns the 128-bit Level Zero device UUID packed into two uint64_t halves.
xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev);
// Creates one SYCL kernel per entry of `kernel_names` from a native binary;
// null names leave the corresponding output slot empty.
status_t sycl_create_kernels_with_level_zero(
        std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
        const std::vector<const char *> &kernel_names,
        const gpu::intel::sycl::engine_t *sycl_engine,
        const xpu::binary_t &binary);
// Compares two SYCL devices by their underlying native L0 device handles.
bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs);
#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
// Queries a single kernel's device binary (zeKernelGetBinaryExp); two-call
// size-then-data protocol.
status_t func_zeGetKernelBinary(
        ze_kernel_handle_t hKernel, size_t *pSize, uint8_t *pKernelBinary);
#else
// Queries a whole module's native binary (zeModuleGetNativeBinary); two-call
// size-then-data protocol.
status_t func_zeModuleGetNativeBinary(ze_module_handle_t hModule, size_t *pSize,
        uint8_t *pModuleNativeBinary);
#endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
// Populates GPU hardware information (architecture, product, IP version,
// native extensions, systolic/nGEN capability) for an L0 device/context.
status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device,
        ze_context_handle_t context, uint32_t &ip_version,
        compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product,
        uint64_t &native_extensions, bool &mayiuse_systolic,
        bool &mayiuse_ngen_kernels);
} // namespace sycl
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_SYCL_L0_UTILS_HPP

View File

@ -24,7 +24,7 @@
#include "gpu/intel/sycl/stream.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {

View File

@ -35,6 +35,8 @@
#include "gpu/intel/engine.hpp"
#include "gpu/intel/stream.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {

View File

@ -17,9 +17,9 @@
#include "gpu/intel/sycl/utils.hpp"
#include "gpu/intel/compute/ukernels.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/l0/utils/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#include "gpu/intel/sycl/engine.hpp"
#include "gpu/intel/sycl/l0/utils.hpp"
#include "xpu/ocl/engine_factory.hpp"
#include "xpu/ocl/utils.hpp"
#include "xpu/sycl/compat.hpp"
@ -32,6 +32,53 @@ namespace gpu {
namespace intel {
namespace sycl {
// FIXME: SYCL provides no portable API to query the device UUID, so it is
// obtained from Level Zero through the shared L0 helper, which packs the
// 16-byte UUID into two uint64_t halves.
xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev) {
    const auto ze_dev = xpu::sycl::compat::get_native<ze_device_handle_t>(dev);
    return gpu::intel::l0::get_device_uuid(ze_dev);
}
// Devices compare equal iff their native Level Zero handles are identical.
bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) {
    using xpu::sycl::compat::get_native;
    return get_native<ze_device_handle_t>(lhs)
            == get_native<ze_device_handle_t>(rhs);
}
// Creates one SYCL kernel per entry of `kernel_names` from a native binary:
// the shared L0 helper builds the module and kernel handles, which are then
// wrapped through the SYCL Level Zero interop API. Null names leave the
// corresponding `sycl_kernels` slot empty.
status_t sycl_create_kernels_with_level_zero(
        std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
        const std::vector<const char *> &kernel_names,
        const gpu::intel::sycl::engine_t *sycl_engine,
        const xpu::binary_t &binary) {
    auto ze_device = xpu::sycl::compat::get_native<ze_device_handle_t>(
            sycl_engine->device());
    auto ze_ctx = xpu::sycl::compat::get_native<ze_context_handle_t>(
            sycl_engine->context());
    ze_module_handle_t ze_module = nullptr;
    std::vector<ze_kernel_handle_t> ze_kernels;
    // NOTE(review): the status returned by create_kernels_from_binary looks
    // unchecked here -- confirm its return type and wrap the call in CHECK()
    // if it reports failures via status_t.
    gpu::intel::l0::create_kernels_from_binary(
            ze_device, ze_ctx, kernel_names, binary, &ze_module, ze_kernels);
    // NOTE(review): presumably make_kernel_bundle takes ownership of
    // ze_module (the default interop ownership transfer), so no explicit
    // module destroy is needed -- confirm against the SYCL L0 backend spec.
    ::sycl::kernel_bundle<::sycl::bundle_state::executable> kernel_bundle
            = ::sycl::make_kernel_bundle<::sycl::backend::ext_oneapi_level_zero,
                    ::sycl::bundle_state::executable>(
                    {ze_module}, sycl_engine->context());
    sycl_kernels.resize(kernel_names.size());
    for (size_t i = 0; i < kernel_names.size(); i++) {
        // Null name means "no kernel requested for this slot".
        if (kernel_names[i] == nullptr) continue;
        auto k = ::sycl::make_kernel<::sycl::backend::ext_oneapi_level_zero>(
                {kernel_bundle, ze_kernels[i]}, sycl_engine->context());
        sycl_kernels[i] = utils::make_unique<::sycl::kernel>(k);
    }
    return status::success;
}
::sycl::nd_range<3> to_sycl_nd_range(
const gpu::intel::compute::nd_range_t &range) {
const auto &local_range = range.local_range();
@ -150,7 +197,6 @@ status_t sycl_dev2ocl_dev(cl_device_id *ocl_dev, const ::sycl::device &dev) {
}
*ocl_dev = d;
return status::success;
}
@ -216,12 +262,7 @@ status_t get_kernel_binary(
#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
auto l0_kernel = ::sycl::get_native<
::sycl::backend::ext_oneapi_level_zero>(kernel);
size_t binary_size = 0;
CHECK(gpu::intel::sycl::func_zeGetKernelBinary(
l0_kernel, &binary_size, nullptr));
binary.resize(binary_size);
CHECK(gpu::intel::sycl::func_zeGetKernelBinary(
l0_kernel, &binary_size, binary.data()));
CHECK(gpu::intel::l0::get_kernel_binary(l0_kernel, binary));
#else
auto bundle = kernel.get_kernel_bundle();
auto module_vec = ::sycl::get_native<
@ -229,11 +270,7 @@ status_t get_kernel_binary(
auto module = module_vec[0];
size_t module_binary_size;
xpu::binary_t module_binary;
CHECK(gpu::intel::sycl::func_zeModuleGetNativeBinary(
module, &module_binary_size, nullptr));
module_binary.resize(module_binary_size);
CHECK(gpu::intel::sycl::func_zeModuleGetNativeBinary(
module, &module_binary_size, module_binary.data()));
CHECK(gpu::intel::l0::get_mobule_binary(module, binary));
{
std::unique_ptr<gpu::intel::ocl::engine_t, engine_deleter_t>
ocl_engine;

View File

@ -29,6 +29,16 @@ namespace sycl {
class engine_t;
xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev);
status_t sycl_create_kernels_with_level_zero(
std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
const std::vector<const char *> &kernel_names,
const gpu::intel::sycl::engine_t *sycl_engine,
const xpu::binary_t &binary);
bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs);
::sycl::nd_range<3> to_sycl_nd_range(
const gpu::intel::compute::nd_range_t &range);

View File

@ -93,7 +93,6 @@ public:
#ifdef DNNL_WITH_SYCL
void set_deps(::sycl::event event) { e_ = std::move(event); }
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
void set_deps(cl_event event) { ocl_e_ = event; }
#endif
@ -106,7 +105,6 @@ private:
#ifdef DNNL_WITH_SYCL
::sycl::event e_;
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
cl_event ocl_e_;
#endif

View File

@ -31,6 +31,6 @@
#endif
#endif
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
#endif

View File

@ -24,7 +24,7 @@
#include "xpu/ocl/memory_storage_base.hpp"
#include "gpu/intel/ocl/utils.hpp"
#include "gpu/intel/ocl/utils/utils.hpp"
namespace dnnl {
namespace impl {

View File

@ -32,131 +32,6 @@ namespace impl {
namespace xpu {
namespace ocl {
// Maps an OpenCL status code onto the closest oneDNN status:
// CL_SUCCESS -> success; allocation failures -> out_of_memory;
// driver/environment failures -> runtime_error; API misuse ->
// invalid_arguments; any unrecognized code falls back to runtime_error.
status_t convert_to_dnnl(cl_int cl_status) {
    switch (cl_status) {
        case CL_SUCCESS: return status::success;
        // Out-of-memory conditions.
        case CL_MEM_OBJECT_ALLOCATION_FAILURE:
        case CL_OUT_OF_RESOURCES:
        case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory;
        // Driver/environment failures outside the caller's control.
        case CL_DEVICE_NOT_FOUND:
        case CL_DEVICE_NOT_AVAILABLE:
        case CL_COMPILER_NOT_AVAILABLE:
        case CL_PROFILING_INFO_NOT_AVAILABLE:
        case CL_MEM_COPY_OVERLAP:
        case CL_IMAGE_FORMAT_MISMATCH:
        case CL_IMAGE_FORMAT_NOT_SUPPORTED:
        case CL_BUILD_PROGRAM_FAILURE:
        case CL_MAP_FAILURE:
        case CL_MISALIGNED_SUB_BUFFER_OFFSET:
        case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
        case CL_COMPILE_PROGRAM_FAILURE:
        case CL_LINKER_NOT_AVAILABLE:
        case CL_LINK_PROGRAM_FAILURE:
        case CL_DEVICE_PARTITION_FAILED:
        case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
        case CL_INVALID_PLATFORM:
        case CL_INVALID_DEVICE: return status::runtime_error;
        // Invalid arguments passed to an OpenCL API call.
        case CL_INVALID_VALUE:
        case CL_INVALID_DEVICE_TYPE:
        case CL_INVALID_CONTEXT:
        case CL_INVALID_QUEUE_PROPERTIES:
        case CL_INVALID_COMMAND_QUEUE:
        case CL_INVALID_HOST_PTR:
        case CL_INVALID_MEM_OBJECT:
        case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
        case CL_INVALID_IMAGE_SIZE:
        case CL_INVALID_SAMPLER:
        case CL_INVALID_BINARY:
        case CL_INVALID_BUILD_OPTIONS:
        case CL_INVALID_PROGRAM:
        case CL_INVALID_PROGRAM_EXECUTABLE:
        case CL_INVALID_KERNEL_NAME:
        case CL_INVALID_KERNEL_DEFINITION:
        case CL_INVALID_KERNEL:
        case CL_INVALID_ARG_INDEX:
        case CL_INVALID_ARG_VALUE:
        case CL_INVALID_ARG_SIZE:
        case CL_INVALID_KERNEL_ARGS:
        case CL_INVALID_WORK_DIMENSION:
        case CL_INVALID_WORK_GROUP_SIZE:
        case CL_INVALID_WORK_ITEM_SIZE:
        case CL_INVALID_GLOBAL_OFFSET:
        case CL_INVALID_EVENT_WAIT_LIST:
        case CL_INVALID_EVENT:
        case CL_INVALID_OPERATION:
        case CL_INVALID_GL_OBJECT:
        case CL_INVALID_BUFFER_SIZE:
        case CL_INVALID_MIP_LEVEL:
        case CL_INVALID_GLOBAL_WORK_SIZE: return status::invalid_arguments;
        default: return status::runtime_error;
    }
}
// Ordered by value as defined by opencl
// Returns the macro name of an OpenCL status code for diagnostics; unknown
// codes map to the literal "unknown macro name".
const char *convert_cl_int_to_str(cl_int cl_status) {
// Expands to a case that stringifies the status macro itself.
#define CL_STATUS_CASE(status) \
    case status: return #status
    switch (cl_status) {
        CL_STATUS_CASE(CL_SUCCESS);
        CL_STATUS_CASE(CL_DEVICE_NOT_FOUND);
        CL_STATUS_CASE(CL_DEVICE_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_COMPILER_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
        CL_STATUS_CASE(CL_OUT_OF_RESOURCES);
        CL_STATUS_CASE(CL_OUT_OF_HOST_MEMORY);
        CL_STATUS_CASE(CL_PROFILING_INFO_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_MEM_COPY_OVERLAP);
        CL_STATUS_CASE(CL_IMAGE_FORMAT_MISMATCH);
        CL_STATUS_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
        CL_STATUS_CASE(CL_BUILD_PROGRAM_FAILURE);
        CL_STATUS_CASE(CL_MAP_FAILURE);
        CL_STATUS_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
        CL_STATUS_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
        CL_STATUS_CASE(CL_COMPILE_PROGRAM_FAILURE);
        CL_STATUS_CASE(CL_LINKER_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_LINK_PROGRAM_FAILURE);
        CL_STATUS_CASE(CL_DEVICE_PARTITION_FAILED);
        CL_STATUS_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_INVALID_VALUE);
        CL_STATUS_CASE(CL_INVALID_DEVICE_TYPE);
        CL_STATUS_CASE(CL_INVALID_PLATFORM);
        CL_STATUS_CASE(CL_INVALID_DEVICE);
        CL_STATUS_CASE(CL_INVALID_CONTEXT);
        CL_STATUS_CASE(CL_INVALID_QUEUE_PROPERTIES);
        CL_STATUS_CASE(CL_INVALID_COMMAND_QUEUE);
        CL_STATUS_CASE(CL_INVALID_HOST_PTR);
        CL_STATUS_CASE(CL_INVALID_MEM_OBJECT);
        CL_STATUS_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
        CL_STATUS_CASE(CL_INVALID_IMAGE_SIZE);
        CL_STATUS_CASE(CL_INVALID_SAMPLER);
        CL_STATUS_CASE(CL_INVALID_BINARY);
        CL_STATUS_CASE(CL_INVALID_BUILD_OPTIONS);
        CL_STATUS_CASE(CL_INVALID_PROGRAM);
        CL_STATUS_CASE(CL_INVALID_PROGRAM_EXECUTABLE);
        CL_STATUS_CASE(CL_INVALID_KERNEL_NAME);
        CL_STATUS_CASE(CL_INVALID_KERNEL_DEFINITION);
        CL_STATUS_CASE(CL_INVALID_KERNEL);
        CL_STATUS_CASE(CL_INVALID_ARG_INDEX);
        CL_STATUS_CASE(CL_INVALID_ARG_VALUE);
        CL_STATUS_CASE(CL_INVALID_ARG_SIZE);
        CL_STATUS_CASE(CL_INVALID_KERNEL_ARGS);
        CL_STATUS_CASE(CL_INVALID_WORK_DIMENSION);
        CL_STATUS_CASE(CL_INVALID_WORK_GROUP_SIZE);
        CL_STATUS_CASE(CL_INVALID_WORK_ITEM_SIZE);
        CL_STATUS_CASE(CL_INVALID_GLOBAL_OFFSET);
        CL_STATUS_CASE(CL_INVALID_EVENT_WAIT_LIST);
        CL_STATUS_CASE(CL_INVALID_EVENT);
        CL_STATUS_CASE(CL_INVALID_OPERATION);
        CL_STATUS_CASE(CL_INVALID_GL_OBJECT);
        CL_STATUS_CASE(CL_INVALID_BUFFER_SIZE);
        CL_STATUS_CASE(CL_INVALID_MIP_LEVEL);
        CL_STATUS_CASE(CL_INVALID_GLOBAL_WORK_SIZE);
#undef CL_STATUS_CASE
        default: return "unknown macro name";
    }
}
template <typename T, typename F>
static std::string get_ocl_name(T obj, F get_func, cl_uint name_query) {
size_t name_size;

View File

@ -33,8 +33,130 @@ namespace impl {
namespace xpu {
namespace ocl {
status_t convert_to_dnnl(cl_int cl_status);
const char *convert_cl_int_to_str(cl_int cl_status);
// Maps an OpenCL status code onto the closest oneDNN status:
// CL_SUCCESS -> success; allocation failures -> out_of_memory;
// driver/environment failures -> runtime_error; API misuse ->
// invalid_arguments; any unrecognized code falls back to runtime_error.
// Defined inline so the header remains self-contained.
inline status_t convert_to_dnnl(cl_int cl_status) {
    switch (cl_status) {
        case CL_SUCCESS: return status::success;
        // Out-of-memory conditions.
        case CL_MEM_OBJECT_ALLOCATION_FAILURE:
        case CL_OUT_OF_RESOURCES:
        case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory;
        // Driver/environment failures outside the caller's control.
        case CL_DEVICE_NOT_FOUND:
        case CL_DEVICE_NOT_AVAILABLE:
        case CL_COMPILER_NOT_AVAILABLE:
        case CL_PROFILING_INFO_NOT_AVAILABLE:
        case CL_MEM_COPY_OVERLAP:
        case CL_IMAGE_FORMAT_MISMATCH:
        case CL_IMAGE_FORMAT_NOT_SUPPORTED:
        case CL_BUILD_PROGRAM_FAILURE:
        case CL_MAP_FAILURE:
        case CL_MISALIGNED_SUB_BUFFER_OFFSET:
        case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
        case CL_COMPILE_PROGRAM_FAILURE:
        case CL_LINKER_NOT_AVAILABLE:
        case CL_LINK_PROGRAM_FAILURE:
        case CL_DEVICE_PARTITION_FAILED:
        case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
        case CL_INVALID_PLATFORM:
        case CL_INVALID_DEVICE: return status::runtime_error;
        // Invalid arguments passed to an OpenCL API call.
        case CL_INVALID_VALUE:
        case CL_INVALID_DEVICE_TYPE:
        case CL_INVALID_CONTEXT:
        case CL_INVALID_QUEUE_PROPERTIES:
        case CL_INVALID_COMMAND_QUEUE:
        case CL_INVALID_HOST_PTR:
        case CL_INVALID_MEM_OBJECT:
        case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
        case CL_INVALID_IMAGE_SIZE:
        case CL_INVALID_SAMPLER:
        case CL_INVALID_BINARY:
        case CL_INVALID_BUILD_OPTIONS:
        case CL_INVALID_PROGRAM:
        case CL_INVALID_PROGRAM_EXECUTABLE:
        case CL_INVALID_KERNEL_NAME:
        case CL_INVALID_KERNEL_DEFINITION:
        case CL_INVALID_KERNEL:
        case CL_INVALID_ARG_INDEX:
        case CL_INVALID_ARG_VALUE:
        case CL_INVALID_ARG_SIZE:
        case CL_INVALID_KERNEL_ARGS:
        case CL_INVALID_WORK_DIMENSION:
        case CL_INVALID_WORK_GROUP_SIZE:
        case CL_INVALID_WORK_ITEM_SIZE:
        case CL_INVALID_GLOBAL_OFFSET:
        case CL_INVALID_EVENT_WAIT_LIST:
        case CL_INVALID_EVENT:
        case CL_INVALID_OPERATION:
        case CL_INVALID_GL_OBJECT:
        case CL_INVALID_BUFFER_SIZE:
        case CL_INVALID_MIP_LEVEL:
        case CL_INVALID_GLOBAL_WORK_SIZE: return status::invalid_arguments;
        default: return status::runtime_error;
    }
}
// Ordered by value as defined by opencl
// Returns the macro name of an OpenCL status code for diagnostics; unknown
// codes map to the literal "unknown macro name". Defined inline so the
// header remains self-contained.
inline const char *convert_cl_int_to_str(cl_int cl_status) {
// Expands to a case that stringifies the status macro itself.
#define CL_STATUS_CASE(status) \
    case status: return #status
    switch (cl_status) {
        CL_STATUS_CASE(CL_SUCCESS);
        CL_STATUS_CASE(CL_DEVICE_NOT_FOUND);
        CL_STATUS_CASE(CL_DEVICE_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_COMPILER_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
        CL_STATUS_CASE(CL_OUT_OF_RESOURCES);
        CL_STATUS_CASE(CL_OUT_OF_HOST_MEMORY);
        CL_STATUS_CASE(CL_PROFILING_INFO_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_MEM_COPY_OVERLAP);
        CL_STATUS_CASE(CL_IMAGE_FORMAT_MISMATCH);
        CL_STATUS_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
        CL_STATUS_CASE(CL_BUILD_PROGRAM_FAILURE);
        CL_STATUS_CASE(CL_MAP_FAILURE);
        CL_STATUS_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
        CL_STATUS_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
        CL_STATUS_CASE(CL_COMPILE_PROGRAM_FAILURE);
        CL_STATUS_CASE(CL_LINKER_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_LINK_PROGRAM_FAILURE);
        CL_STATUS_CASE(CL_DEVICE_PARTITION_FAILED);
        CL_STATUS_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
        CL_STATUS_CASE(CL_INVALID_VALUE);
        CL_STATUS_CASE(CL_INVALID_DEVICE_TYPE);
        CL_STATUS_CASE(CL_INVALID_PLATFORM);
        CL_STATUS_CASE(CL_INVALID_DEVICE);
        CL_STATUS_CASE(CL_INVALID_CONTEXT);
        CL_STATUS_CASE(CL_INVALID_QUEUE_PROPERTIES);
        CL_STATUS_CASE(CL_INVALID_COMMAND_QUEUE);
        CL_STATUS_CASE(CL_INVALID_HOST_PTR);
        CL_STATUS_CASE(CL_INVALID_MEM_OBJECT);
        CL_STATUS_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
        CL_STATUS_CASE(CL_INVALID_IMAGE_SIZE);
        CL_STATUS_CASE(CL_INVALID_SAMPLER);
        CL_STATUS_CASE(CL_INVALID_BINARY);
        CL_STATUS_CASE(CL_INVALID_BUILD_OPTIONS);
        CL_STATUS_CASE(CL_INVALID_PROGRAM);
        CL_STATUS_CASE(CL_INVALID_PROGRAM_EXECUTABLE);
        CL_STATUS_CASE(CL_INVALID_KERNEL_NAME);
        CL_STATUS_CASE(CL_INVALID_KERNEL_DEFINITION);
        CL_STATUS_CASE(CL_INVALID_KERNEL);
        CL_STATUS_CASE(CL_INVALID_ARG_INDEX);
        CL_STATUS_CASE(CL_INVALID_ARG_VALUE);
        CL_STATUS_CASE(CL_INVALID_ARG_SIZE);
        CL_STATUS_CASE(CL_INVALID_KERNEL_ARGS);
        CL_STATUS_CASE(CL_INVALID_WORK_DIMENSION);
        CL_STATUS_CASE(CL_INVALID_WORK_GROUP_SIZE);
        CL_STATUS_CASE(CL_INVALID_WORK_ITEM_SIZE);
        CL_STATUS_CASE(CL_INVALID_GLOBAL_OFFSET);
        CL_STATUS_CASE(CL_INVALID_EVENT_WAIT_LIST);
        CL_STATUS_CASE(CL_INVALID_EVENT);
        CL_STATUS_CASE(CL_INVALID_OPERATION);
        CL_STATUS_CASE(CL_INVALID_GL_OBJECT);
        CL_STATUS_CASE(CL_INVALID_BUFFER_SIZE);
        CL_STATUS_CASE(CL_INVALID_MIP_LEVEL);
        CL_STATUS_CASE(CL_INVALID_GLOBAL_WORK_SIZE);
#undef CL_STATUS_CASE
        default: return "unknown macro name";
    }
}
#define MAYBE_REPORT_ERROR(msg) \
do { \

View File

@ -24,7 +24,7 @@
#include "common/engine.hpp"
#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
#include "gpu/intel/sycl/l0/utils.hpp"
#include "gpu/intel/sycl/utils.hpp"
#endif
#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA

View File

@ -91,6 +91,7 @@ public:
explicit LevelZeroCodeGenerator(DebugConfig debugConfig) : LevelZeroCodeGenerator({genericProductFamily(hw), 0}, debugConfig) {}
inline ze_kernel_handle_t getKernel(ze_module_handle_t module);
inline ze_module_handle_t getModule(ze_context_handle_t context, ze_device_handle_t device, const std::string &options = "");
static inline HW detectHW(ze_context_handle_t context, ze_device_handle_t device);
static inline Product detectHWInfo(ze_context_handle_t context, ze_device_handle_t device);
@ -138,6 +139,17 @@ static inline std::vector<uint8_t> getDummyModuleBinary(ze_context_handle_t cont
}; /* namespace detail */
// Creates a Level Zero kernel from `module` using the generated kernel's
// external name from the ELF interface.
template <HW hw>
ze_kernel_handle_t LevelZeroCodeGenerator<hw>::getKernel(ze_module_handle_t module)
{
    // Bind the name through a const reference: lifetime extension keeps the
    // string alive for the duration of the create call whether
    // getExternalName() returns by value or by reference. The original
    // `auto name = ...getExternalName().c_str()` would dangle if the
    // function returns a std::string by value.
    const auto &kernelName = ELFCodeGenerator<hw>::interface_.getExternalName();
    ze_kernel_desc_t kernelDesc{
            ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernelName.c_str()};
    ze_kernel_handle_t kernelL0 = nullptr;
    detail::handleL0(dynamic::zeKernelCreate(module, &kernelDesc, &kernelL0));
    return kernelL0;
}
template <HW hw>
ze_module_handle_t LevelZeroCodeGenerator<hw>::getModule(ze_context_handle_t context, ze_device_handle_t device, const std::string &options)
{