mirror of
https://github.com/uxlfoundation/oneDNN.git
synced 2025-10-20 18:43:49 +08:00
gpu: intel: add Level Zero backend
This commit is contained in:
committed by
Stefan Palicki
parent
3a756b982b
commit
633a03d736
@ -287,6 +287,7 @@ Runtime-specific dependencies:
|
||||
| `ONEDNN_CPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`sycl.dll`), TBB (`tbb.dll`), OpenCL loader (`OpenCL.dll`)
|
||||
| `ONEDNN_GPU_RUNTIME=OCL` | any | OpenCL loader (`OpenCL.dll`)
|
||||
| `ONEDNN_GPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`sycl.dll`), OpenCL loader (`OpenCL.dll`), oneAPI Level Zero loader (`ze_loader.dll`)
|
||||
| `ONEDNN_GPU_RUNTIME=L0` | any | oneAPI Level Zero loader (`ze_loader.dll`)
|
||||
|
||||
#### macOS
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
#===============================================================================
|
||||
# Copyright 2019-2021 Intel Corporation
|
||||
# Copyright 2019-2025 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@ -22,7 +22,7 @@ if(OpenCL_cmake_included)
|
||||
endif()
|
||||
set(OpenCL_cmake_included true)
|
||||
|
||||
if(DNNL_GPU_RUNTIME STREQUAL "OCL")
|
||||
if("${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|L0)$")
|
||||
message(STATUS "GPU support is enabled (OpenCL)")
|
||||
else()
|
||||
return()
|
||||
|
@ -283,7 +283,7 @@ set(DNNL_GPU_RUNTIME "NONE" CACHE STRING
|
||||
|
||||
Using OpenCL for GPU requires setting OPENCLROOT if the libraries are
|
||||
installed in a non-standard location.")
|
||||
if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL)$")
|
||||
if(NOT "${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|NONE|DPCPP|SYCL|L0)$")
|
||||
message(FATAL_ERROR "Unsupported GPU runtime: ${DNNL_GPU_RUNTIME}")
|
||||
endif()
|
||||
|
||||
|
2
doc/build/build_options.md
vendored
2
doc/build/build_options.md
vendored
@ -7,7 +7,7 @@ oneDNN supports the following build-time options.
|
||||
|:--------------------------------|:----------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------|
|
||||
| ONEDNN_LIBRARY_TYPE | **SHARED**, STATIC | Defines the resulting library type |
|
||||
| ONEDNN_CPU_RUNTIME | NONE, **OMP**, TBB, SEQ, THREADPOOL, SYCL | Defines the threading runtime for CPU engines |
|
||||
| ONEDNN_GPU_RUNTIME | **NONE**, OCL, SYCL | Defines the offload runtime for GPU engines |
|
||||
| ONEDNN_GPU_RUNTIME | **NONE**, OCL, SYCL, L0 | Defines the offload runtime for GPU engines |
|
||||
| ONEDNN_BUILD_DOC | **ON**, OFF | Controls building the documentation |
|
||||
| ONEDNN_DOC_VERSIONS_JSON | **""**, *string* | Location of JSON file for [PyData Sphinx Theme version switcher]. Enables documentation version switcher when set. |
|
||||
| ONEDNN_BUILD_EXAMPLES | **ON**, OFF | Controls building the examples |
|
||||
|
@ -136,6 +136,9 @@ foreach(f ${sources})
|
||||
if(NOT DNNL_WITH_SYCL AND ${f_name} MATCHES "^sycl")
|
||||
list(REMOVE_ITEM sources "${f}")
|
||||
endif()
|
||||
if(NOT DNNL_GPU_RUNTIME STREQUAL "L0" AND ${f_name} MATCHES ".*level_zero")
|
||||
list(REMOVE_ITEM sources "${f}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
# In case of SYCL, skip CPU examples that directly work with raw pointers
|
||||
@ -180,7 +183,7 @@ foreach(src ${sources})
|
||||
endif()
|
||||
else()
|
||||
set(cpu_rt_pattern "(SEQ|OMP|TBB|SYCL|DPCPP)")
|
||||
set(gpu_rt_pattern "(OCL|SYCL|DPCPP)")
|
||||
set(gpu_rt_pattern "(OCL|L0|SYCL|DPCPP)")
|
||||
if(${example_name} MATCHES "sycl.*")
|
||||
set(cpu_rt_pattern "(SYCL|DPCPP)")
|
||||
set(gpu_rt_pattern "(SYCL|DPCPP)")
|
||||
|
@ -200,7 +200,7 @@ foreach(src ${sources})
|
||||
endif()
|
||||
else()
|
||||
set(cpu_rt_pattern "(SEQ|OMP|TBB|SYCL|DPCPP)")
|
||||
set(gpu_rt_pattern "(OCL|SYCL|DPCPP)")
|
||||
set(gpu_rt_pattern "(OCL|L0|SYCL|DPCPP)")
|
||||
if(${example_name} MATCHES "sycl.*")
|
||||
set(cpu_rt_pattern "(SYCL|DPCPP)")
|
||||
set(gpu_rt_pattern "(SYCL|DPCPP)")
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2019-2022 Intel Corporation
|
||||
* Copyright 2019-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@ -29,6 +29,9 @@
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
#include "dnnl_ocl.h"
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
#include "dnnl_l0.h"
|
||||
#endif
|
||||
|
||||
#define COMPLAIN_DNNL_ERROR_AND_EXIT(what, status) \
|
||||
do { \
|
||||
@ -160,7 +163,7 @@ static inline void write_to_dnnl_memory(void *handle, dnnl_memory_t mem) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
if (eng_kind == dnnl_gpu) {
|
||||
void *mapped_ptr = NULL;
|
||||
CHECK(dnnl_memory_map_data(mem, &mapped_ptr));
|
||||
|
@ -35,7 +35,11 @@
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
#include "dnnl_ocl.hpp"
|
||||
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
#include "dnnl_l0.hpp"
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
|
||||
#include "dnnl_sycl.hpp"
|
||||
#endif
|
||||
|
||||
@ -228,7 +232,7 @@ inline void read_from_dnnl_memory(void *handle, dnnl::memory &mem) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
if (eng.get_kind() == dnnl::engine::kind::gpu) {
|
||||
void *mapped_ptr = mem.map_data();
|
||||
if (mapped_ptr) std::memcpy(handle, mapped_ptr, size);
|
||||
@ -287,7 +291,7 @@ inline void write_to_dnnl_memory(void *handle, dnnl::memory &mem) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
if (eng.get_kind() == dnnl::engine::kind::gpu) {
|
||||
void *mapped_ptr = mem.map_data();
|
||||
if (mapped_ptr) std::memcpy(mapped_ptr, handle, size);
|
||||
|
22
include/dnnl_l0.h
Normal file
22
include/dnnl_l0.h
Normal file
@ -0,0 +1,22 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2020-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef DNNL_L0_H
|
||||
#define DNNL_L0_H
|
||||
|
||||
#include "oneapi/dnnl/dnnl_l0.h"
|
||||
|
||||
#endif /* DNNL_L0_H */
|
22
include/dnnl_l0.hpp
Normal file
22
include/dnnl_l0.hpp
Normal file
@ -0,0 +1,22 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2020-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef DNNL_L0_HPP
|
||||
#define DNNL_L0_HPP
|
||||
|
||||
#include "oneapi/dnnl/dnnl_l0.hpp"
|
||||
|
||||
#endif /* DNNL_L0_HPP */
|
@ -82,6 +82,9 @@
|
||||
/// DPC++ runtime
|
||||
#define DNNL_RUNTIME_DPCPP DNNL_RUNTIME_SYCL
|
||||
|
||||
/// L0 runtime
|
||||
#define DNNL_RUNTIME_L0 1024u
|
||||
|
||||
/// No vendor (corresponding runtime is disabled)
|
||||
#define DNNL_VENDOR_NONE 0u
|
||||
|
||||
@ -119,7 +122,8 @@
|
||||
#endif
|
||||
#if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
|
||||
&& (DNNL_GPU_RUNTIME != DNNL_RUNTIME_OCL) \
|
||||
&& (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL)
|
||||
&& (DNNL_GPU_RUNTIME != DNNL_RUNTIME_SYCL) \
|
||||
&& (DNNL_GPU_RUNTIME != DNNL_RUNTIME_L0)
|
||||
#error "Unexpected DNNL_GPU_RUNTIME"
|
||||
#endif
|
||||
#if (DNNL_CPU_RUNTIME == DNNL_RUNTIME_NONE \
|
||||
@ -145,9 +149,6 @@
|
||||
// When defined, DPCPP is supported.
|
||||
#cmakedefine DNNL_WITH_SYCL
|
||||
|
||||
// When defined, Level Zero is supported.
|
||||
#cmakedefine DNNL_WITH_LEVEL_ZERO
|
||||
|
||||
// When defined, SYCL CUDA backend is used.
|
||||
#cmakedefine DNNL_SYCL_CUDA
|
||||
|
||||
|
203
include/oneapi/dnnl/dnnl_l0.h
Normal file
203
include/oneapi/dnnl/dnnl_l0.h
Normal file
@ -0,0 +1,203 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2020-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef ONEAPI_DNNL_DNNL_L0_H
|
||||
#define ONEAPI_DNNL_DNNL_L0_H
|
||||
|
||||
#include "oneapi/dnnl/dnnl.h"
|
||||
|
||||
/// @cond DO_NOT_DOCUMENT_THIS
|
||||
#include "level_zero/ze_api.h"
|
||||
/// @endcond
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif // __cplusplus
|
||||
|
||||
/// @addtogroup dnnl_api
|
||||
/// @{
|
||||
|
||||
/// @addtogroup dnnl_api_interop
|
||||
/// @{
|
||||
|
||||
/// @addtogroup dnnl_api_l0_interop
|
||||
/// @{
|
||||
|
||||
/// Creates an engine associated with a Level Zero device and a Level Zero context.
|
||||
///
|
||||
/// @param engine Output engine.
|
||||
/// @param driver Pointer to the Level Zero driver to use for the engine.
|
||||
/// @param device Pointer to the Level Zero device to use for the engine.
|
||||
/// @param context Pointer to the Level Zero context to use for the engine.
|
||||
/// @returns #dnnl_success on success and a status describing the error
|
||||
/// otherwise.
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_engine_create(dnnl_engine_t *engine,
|
||||
const ze_driver_handle_t adriver, const ze_device_handle_t adevice,
|
||||
const ze_context_handle_t acontext);
|
||||
|
||||
/// Returns the Level Zero context associated with an engine.
|
||||
///
|
||||
/// @param engine Engine to query.
|
||||
/// @param context Pointer to the underlying Level Zero context of the engine.
|
||||
/// @returns #dnnl_success on success and a status describing the error
|
||||
/// otherwise.
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_context(
|
||||
dnnl_engine_t engine, ze_context_handle_t context);
|
||||
|
||||
/// Returns the Level Zero device associated with an engine.
|
||||
///
|
||||
/// @param engine Engine to query.
|
||||
/// @param device Pointer to the underlying Level Zero device of the engine.
|
||||
/// @returns #dnnl_success on success and a status describing the error
|
||||
/// otherwise.
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_device(
|
||||
dnnl_engine_t engine, ze_device_handle_t device);
|
||||
|
||||
/// Returns the Level Zero driver associated with an engine.
|
||||
///
|
||||
/// @param engine Engine to query.
|
||||
/// @param device Pointer to the underlying Level Zero driver of the engine.
|
||||
/// @returns #dnnl_success on success and a status describing the error
|
||||
/// otherwise.
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_engine_get_driver(
|
||||
dnnl_engine_t engine, ze_driver_handle_t driver);
|
||||
|
||||
/// Creates an execution stream for a given engine associated with a Level Zero
|
||||
/// queue.
|
||||
///
|
||||
/// @param stream Output execution stream.
|
||||
/// @param engine Engine to create the execution stream on.
|
||||
/// @param queue Level Zero command queue to use.
|
||||
/// @param list Level Zero command list to use.
|
||||
/// @returns #dnnl_success on success and a status describing the error
|
||||
/// otherwise.
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_stream_create(dnnl_stream_t *stream,
|
||||
dnnl_engine_t engine, ze_command_list_handle_t list);
|
||||
|
||||
/// Returns the Level Zero command list associated with an execution stream.
|
||||
///
|
||||
/// @param stream Execution stream to query.
|
||||
/// @param list Output Level Zero command list.
|
||||
/// @returns #dnnl_success on success and a status describing the error
|
||||
/// otherwise.
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_stream_get_list(
|
||||
dnnl_stream_t stream, ze_command_list_handle_t list);
|
||||
|
||||
/// Creates a memory object.
|
||||
///
|
||||
/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
|
||||
/// constructed memory object will have the underlying buffer set. In this
|
||||
/// case, the buffer will be initialized as if:
|
||||
/// - dnnl_memory_set_data_handle() had been called, if @p memory_kind is equal
|
||||
/// to dnnl_l0_interop_usm, or
|
||||
/// - dnnl_l0_interop_memory_set_buffer() has been called, if @p memory_kind
|
||||
/// is equal to dnnl_l0_interop_buffer.
|
||||
///
|
||||
/// @param memory Output memory object.
|
||||
/// @param memory_desc Memory descriptor.
|
||||
/// @param engine Engine to use.
|
||||
/// @param handle Handle of the memory buffer to use as an underlying storage.
|
||||
/// - A USM pointer to the user-allocated buffer. In this case the library
|
||||
/// doesn't own the buffer. Requires @p memory_kind to be equal to
|
||||
/// dnnl::l0_interop::memory_kind::usm.
|
||||
/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
|
||||
/// allocate the buffer for the memory object. In this case the library
|
||||
/// owns the buffer.
|
||||
/// - The DNNL_MEMORY_NONE specific value. Instructs the library to
|
||||
/// create memory object without an underlying buffer.
|
||||
/// @returns #dnnl_success on success and a status describing the error
|
||||
/// otherwise.
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_memory_create(dnnl_memory_t *memory,
|
||||
const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
|
||||
void *handle);
|
||||
|
||||
/// Creates a memory object with multiple handles.
|
||||
///
|
||||
/// @param memory Output memory object.
|
||||
/// @param memory_desc Memory descriptor.
|
||||
/// @param engine Engine to use.
|
||||
/// @param memory_kind Memory allocation kind to specify the type of handles.
|
||||
/// @param nhandles Number of handles.
|
||||
/// @param handles Handles of the memory buffers to use as underlying storages.
|
||||
/// For each element of the @p handles array the following applies:
|
||||
/// - A USM pointer to the user-allocated buffer. In this case the library
|
||||
/// doesn't own the buffer. Requires @p memory_kind to be equal to
|
||||
/// dnnl::l0_interop::memory_kind::usm.
|
||||
/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
|
||||
/// allocate the buffer for the memory object. In this case the library
|
||||
/// owns the buffer.
|
||||
/// - The DNNL_MEMORY_NONE specific value. Instructs the library to
|
||||
/// create memory object without an underlying buffer.
|
||||
/// @returns #dnnl_success on success and a status describing the error
|
||||
/// otherwise.
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_memory_create_v2(dnnl_memory_t *memory,
|
||||
const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
|
||||
size_t nhandles, void **handles);
|
||||
|
||||
/// Returns an Level Zero memory object associated with a memory object.
|
||||
///
|
||||
/// @param memory Memory object.
|
||||
/// @param mem_object Output Level Zero memory object.
|
||||
/// @returns #dnnl_success on success and a status describing the error
|
||||
/// otherwise.
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_memory_get_mem_object(
|
||||
const_dnnl_memory_t memory, void **mem_object);
|
||||
|
||||
/// Sets Level Zero memory object associated with a memory object.
|
||||
///
|
||||
/// For behavioral details, see dnnl_memory_set_data_handle().
|
||||
///
|
||||
/// @param memory Memory object.
|
||||
/// @param mem_object Level Zero memory object.
|
||||
/// @returns #dnnl_success on success and a status describing the error
|
||||
/// otherwise.
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_memory_set_mem_object(
|
||||
dnnl_memory_t memory, void *mem_object);
|
||||
|
||||
/// Executes computations specified by the primitive in a specified stream and
|
||||
/// returns a Level Zero event.
|
||||
///
|
||||
/// @param primitive Primitive to execute.
|
||||
/// @param stream Stream to use.
|
||||
/// @param nargs Number of arguments.
|
||||
/// @param args Array of arguments. Each argument is an
|
||||
/// <index, #dnnl_memory_t> pair. The index is one of the `DNNL_ARG_*`
|
||||
/// values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see
|
||||
/// #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory
|
||||
/// descriptor as that returned by
|
||||
/// #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index).
|
||||
/// @param ndeps Number of dependencies.
|
||||
/// @param deps A pointer to a vector of size @p ndeps that contains
|
||||
/// dependencies.
|
||||
/// @param return_event Output event.
|
||||
/// @returns #dnnl_success on success and a status describing the error
|
||||
/// otherwise.
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_primitive_execute(
|
||||
const_dnnl_primitive_t primitive, dnnl_stream_t stream, size_t nargs,
|
||||
const dnnl_exec_arg_t *args, size_t ndeps,
|
||||
const ze_event_handle_t *deps, ze_event_handle_t *return_event);
|
||||
|
||||
/// @} dnnl_api_l0_interop
|
||||
|
||||
/// @} dnnl_api_interop
|
||||
|
||||
/// @} dnnl_api
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif // ONEAPI_DNNL_DNNL_L0_H
|
259
include/oneapi/dnnl/dnnl_l0.hpp
Normal file
259
include/oneapi/dnnl/dnnl_l0.hpp
Normal file
@ -0,0 +1,259 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2020-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef ONEAPI_DNNL_DNNL_L0_HPP
|
||||
#define ONEAPI_DNNL_DNNL_L0_HPP
|
||||
|
||||
#include "oneapi/dnnl/dnnl.hpp"
|
||||
|
||||
/// @cond DO_NOT_DOCUMENT_THIS
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "oneapi/dnnl/dnnl_l0.h"
|
||||
/// @endcond
|
||||
|
||||
/// @addtogroup dnnl_api
|
||||
/// @{
|
||||
|
||||
namespace dnnl {
|
||||
|
||||
/// @addtogroup dnnl_api_interop
|
||||
/// @{
|
||||
|
||||
/// @addtogroup dnnl_api_l0_interop Level Zero interoperability API
|
||||
/// API extensions to interact with the underlying Level Zero run-time.
|
||||
///
|
||||
/// @sa @ref dev_guide_dpcpp_interoperability in developer guide
|
||||
/// @{
|
||||
|
||||
/// Level Zero interoperability namespace
|
||||
namespace l0_interop {
|
||||
|
||||
/// Constructs an engine from Level Zero device and context objects.
|
||||
///
|
||||
/// @param adriver Level Zero driver.
|
||||
/// @param adevice Level Zero device.
|
||||
/// @param acontext Level Zero context.
|
||||
///
|
||||
/// @returns Created engine.
|
||||
inline engine make_engine(const ze_driver_handle_t adriver,
|
||||
const ze_device_handle_t adevice, const ze_context_handle_t acontext) {
|
||||
dnnl_engine_t aengine;
|
||||
error::wrap_c_api(
|
||||
dnnl_l0_interop_engine_create(&aengine, adriver, adevice, acontext),
|
||||
"could not create an engine");
|
||||
return engine(aengine);
|
||||
}
|
||||
|
||||
/// Returns the Level Zero context associated with an engine.
|
||||
///
|
||||
/// @param aengine Engine to query.
|
||||
///
|
||||
/// @returns The underlying Level Zero device of the engine.
|
||||
inline ze_context_handle_t get_context(const engine &aengine) {
|
||||
ze_context_handle_t ctx = nullptr;
|
||||
error::wrap_c_api(dnnl_l0_interop_engine_get_context(aengine.get(), ctx),
|
||||
"could not get a context handle");
|
||||
return ctx;
|
||||
}
|
||||
|
||||
/// Returns the Level Zero device associated with an engine.
|
||||
///
|
||||
/// @param aengine Engine to query.
|
||||
///
|
||||
/// @returns The underlying Level Zero context of the engine.
|
||||
inline ze_device_handle_t get_device(const engine &aengine) {
|
||||
ze_device_handle_t dev = nullptr;
|
||||
error::wrap_c_api(dnnl_l0_interop_engine_get_device(aengine.get(), dev),
|
||||
"could not get a device handle");
|
||||
return dev;
|
||||
}
|
||||
|
||||
/// Returns the Level Zero driver associated with an engine.
|
||||
///
|
||||
/// @param aengine Engine to query.
|
||||
///
|
||||
/// @returns The underlying Level Zero driver of the engine.
|
||||
inline ze_driver_handle_t get_driver(const engine &aengine) {
|
||||
ze_driver_handle_t dri = nullptr;
|
||||
error::wrap_c_api(dnnl_l0_interop_engine_get_driver(aengine.get(), dri),
|
||||
"could not get a driver handle");
|
||||
return dri;
|
||||
}
|
||||
|
||||
/// Creates an execution stream for a given engine associated with a Level Zero
|
||||
/// queue.
|
||||
///
|
||||
/// @param aengine Engine object to use for the stream.
|
||||
/// @param alist Level Zero immediate command list to use for the stream.
|
||||
///
|
||||
/// @returns An execution stream.
|
||||
inline stream make_stream(
|
||||
const engine &aengine, ze_command_list_handle_t alist) {
|
||||
dnnl_stream_t astream;
|
||||
error::wrap_c_api(
|
||||
dnnl_l0_interop_stream_create(&astream, aengine.get(), alist),
|
||||
"could not create a stream");
|
||||
return stream(astream);
|
||||
}
|
||||
|
||||
/// Returns the Level Zero immediate command list associated with an execution stream.
|
||||
///
|
||||
/// @param astream Execution stream to query.
|
||||
///
|
||||
/// @returns Level Zero immediate command list object.
|
||||
inline ze_command_list_handle_t get_list(const stream &astream) {
|
||||
ze_command_list_handle_t list = nullptr;
|
||||
error::wrap_c_api(dnnl_l0_interop_stream_get_list(astream.get(), list),
|
||||
"could not get a stream handle");
|
||||
return list;
|
||||
}
|
||||
|
||||
/// Creates a memory object with multiple handles.
|
||||
///
|
||||
/// @param memory_desc Memory descriptor.
|
||||
/// @param aengine Engine to use.
|
||||
/// @param handles Handles of the memory buffers to use as underlying storages.
|
||||
/// For each element of the @p handles array the following applies:
|
||||
/// - A USM pointer to the user-allocated buffer. In this case the library
|
||||
/// doesn't own the buffer. Requires @p memory_kind to be equal to
|
||||
/// dnnl::l0_interop::memory_kind::usm.
|
||||
/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
|
||||
/// allocate the buffer for the memory object. In this case the library
|
||||
/// owns the buffer.
|
||||
/// - The DNNL_MEMORY_NONE specific value. Instructs the library to
|
||||
/// create memory object without an underlying buffer.
|
||||
///
|
||||
/// If the @p handles vector is not provided the library will allocate all
|
||||
/// buffers as if all handles have the special value DNNL_MEMORY_ALLOCATE.
|
||||
///
|
||||
/// @returns Created memory object.
|
||||
inline memory make_memory(const memory::desc &memory_desc,
|
||||
const engine &aengine, std::vector<void *> handles = {}) {
|
||||
if (handles.empty()) {
|
||||
const int nhandles = memory_desc.get_num_handles();
|
||||
handles.resize(nhandles, DNNL_MEMORY_ALLOCATE);
|
||||
}
|
||||
|
||||
dnnl_memory_t c_memory;
|
||||
error::wrap_c_api(
|
||||
dnnl_l0_interop_memory_create_v2(&c_memory, memory_desc.get(),
|
||||
aengine.get(), handles.size(), handles.data()),
|
||||
"could not create a memory");
|
||||
return memory(c_memory);
|
||||
}
|
||||
|
||||
/// Creates a memory object.
|
||||
///
|
||||
/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
|
||||
/// constructed memory object will have the underlying buffer set. In this
|
||||
/// case, the buffer will be initialized as if:
|
||||
/// - dnnl::memory::set_data_handle() had been called, if @p memory_kind is
|
||||
/// equal to dnnl::l0_interop::memory_kind::usm, or
|
||||
/// - dnnl::l0_interop::set_buffer() has been called, if @p memory_kind is
|
||||
/// equal to dnnl::l0_interop::memory_kind::buffer.
|
||||
///
|
||||
/// @param memory_desc Memory descriptor.
|
||||
/// @param aengine Engine to use.
|
||||
/// @param handle Handle of the memory buffer to use as an underlying storage.
|
||||
/// - A USM pointer to the user-allocated buffer. In this case the library
|
||||
/// doesn't own the buffer. Requires @p memory_kind to be equal to
|
||||
/// dnnl::l0_interop::memory_kind::usm.
|
||||
/// - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
|
||||
/// allocate the buffer for the memory object. In this case the library
|
||||
/// owns the buffer.
|
||||
/// - The DNNL_MEMORY_NONE specific value. Instructs the library to
|
||||
/// create memory object without an underlying buffer.
|
||||
///
|
||||
/// @returns Created memory object.
|
||||
inline memory make_memory(
|
||||
const memory::desc &memory_desc, const engine &aengine, void *handle) {
|
||||
return make_memory(memory_desc, aengine, std::vector<void *> {handle});
|
||||
}
|
||||
|
||||
/// Returns the Level Zero memory object associated with the memory object.
|
||||
///
|
||||
/// @param amemory A memory object.
|
||||
/// @returns Underlying Level Zero memory object.
|
||||
inline void *get_mem_object(const memory &amemory) {
|
||||
void *mem_object;
|
||||
error::wrap_c_api(
|
||||
dnnl_l0_interop_memory_get_mem_object(amemory.get(), &mem_object),
|
||||
"could not get Level Zero buffer object from a memory object");
|
||||
return mem_object;
|
||||
}
|
||||
|
||||
/// Sets the Level Zero memory object associated with the memory object.
|
||||
///
|
||||
/// For behavioral details see memory::set_data_handle().
|
||||
///
|
||||
/// @param amemory A memory object.
|
||||
/// @param mem_object Level Zero cl_mem object to use as the underlying
|
||||
/// storage. It must have at least get_desc().get_size() bytes
|
||||
/// allocated.
|
||||
inline void set_mem_object(memory &amemory, void *mem_object) {
|
||||
error::wrap_c_api(
|
||||
dnnl_l0_interop_memory_set_mem_object(amemory.get(), mem_object),
|
||||
"could not set Level Zero buffer object from a memory object");
|
||||
}
|
||||
|
||||
/// Executes computations specified by the primitive in a specified stream and
|
||||
/// returns a Level Zero event.
|
||||
///
|
||||
/// Arguments are passed via an arguments map containing
|
||||
/// <index, memory object> pairs. The index must be one of the `DNNL_ARG_*`
|
||||
/// values such as `DNNL_ARG_SRC`, and the memory must have a memory descriptor
|
||||
/// matching the one returned by
|
||||
/// #dnnl::primitive_desc::query_md(#query::exec_arg_md, index) unless using
|
||||
/// dynamic shapes (see #DNNL_RUNTIME_DIM_VAL).
|
||||
///
|
||||
/// @param aprimitive Primitive to execute.
|
||||
/// @param astream Stream object. The stream must belong to the same engine
|
||||
/// as the primitive.
|
||||
/// @param args Arguments map.
|
||||
/// @param deps Optional vector with `ze_event_handle_t` dependencies.
|
||||
///
|
||||
/// @returns Output event.
|
||||
inline ze_event_handle_t execute(const dnnl::primitive &aprimitive,
|
||||
const stream &astream, const std::unordered_map<int, memory> &args,
|
||||
const std::vector<ze_event_handle_t> &deps = {}) {
|
||||
std::vector<dnnl_exec_arg_t> c_args;
|
||||
c_args.reserve(args.size());
|
||||
for (const auto &a : args)
|
||||
c_args.push_back({a.first, a.second.get()});
|
||||
|
||||
const ze_event_handle_t *c_deps = deps.empty() ? nullptr : deps.data();
|
||||
|
||||
ze_event_handle_t return_event;
|
||||
error::wrap_c_api(dnnl_l0_interop_primitive_execute(aprimitive.get(),
|
||||
astream.get(), c_args.size(), c_args.data(),
|
||||
deps.size(), c_deps, &return_event),
|
||||
"could not execute a primitive");
|
||||
return return_event;
|
||||
}
|
||||
|
||||
} // namespace l0_interop
|
||||
|
||||
/// @} dnnl_api_l0_interop
|
||||
|
||||
/// @} dnnl_api_interop
|
||||
|
||||
} // namespace dnnl
|
||||
|
||||
/// @} dnnl_api
|
||||
|
||||
#endif // ONEAPI_DNNL_DNNL_L0_HPP
|
@ -284,7 +284,7 @@ if(DNNL_CPU_THREADING_RUNTIME STREQUAL "TBB")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(DNNL_GPU_RUNTIME STREQUAL "OCL" OR (DNNL_GPU_SYCL AND DNNL_GPU_VENDOR STREQUAL "INTEL"))
|
||||
if("${DNNL_GPU_RUNTIME}" MATCHES "^(OCL|L0)$" OR (DNNL_GPU_SYCL AND DNNL_GPU_VENDOR STREQUAL "INTEL"))
|
||||
install(FILES
|
||||
"../cmake/FindOpenCL.cmake"
|
||||
DESTINATION ${LIB_CONFIG_INSTALL_DIR})
|
||||
|
@ -1953,6 +1953,7 @@ enum runtime_kind_t {
|
||||
dnnl_runtime_threadpool,
|
||||
dnnl_runtime_ocl,
|
||||
dnnl_runtime_sycl,
|
||||
dnnl_runtime_l0,
|
||||
};
|
||||
|
||||
namespace runtime_kind {
|
||||
@ -1963,6 +1964,7 @@ const runtime_kind_t tbb = dnnl_runtime_tbb;
|
||||
const runtime_kind_t threadpool = dnnl_runtime_threadpool;
|
||||
const runtime_kind_t ocl = dnnl_runtime_ocl;
|
||||
const runtime_kind_t sycl = dnnl_runtime_sycl;
|
||||
const runtime_kind_t l0 = dnnl_runtime_l0;
|
||||
} // namespace runtime_kind
|
||||
|
||||
using primitive_kind_t = dnnl_primitive_kind_t;
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2016-2024 Intel Corporation
|
||||
* Copyright 2016-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@ -29,12 +29,16 @@
|
||||
#include "cpu/cpu_engine.hpp"
|
||||
#endif
|
||||
|
||||
#ifdef DNNL_WITH_SYCL
|
||||
#include "xpu/sycl/engine_factory.hpp"
|
||||
#endif
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
#include "xpu/ocl/engine_factory.hpp"
|
||||
#endif
|
||||
|
||||
#ifdef DNNL_WITH_SYCL
|
||||
#include "xpu/sycl/engine_factory.hpp"
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
#include "gpu/intel/l0/engine_factory.hpp"
|
||||
#endif
|
||||
|
||||
namespace dnnl {
|
||||
@ -42,23 +46,27 @@ namespace impl {
|
||||
|
||||
static inline std::unique_ptr<engine_factory_t> get_engine_factory(
|
||||
engine_kind_t kind, runtime_kind_t runtime_kind) {
|
||||
|
||||
#if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE
|
||||
if (kind == engine_kind::cpu && is_native_runtime(runtime_kind)) {
|
||||
return std::unique_ptr<engine_factory_t>(
|
||||
new cpu::cpu_engine_factory_t());
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef DNNL_WITH_SYCL
|
||||
if (runtime_kind == runtime_kind::sycl) {
|
||||
return xpu::sycl::get_engine_factory(kind);
|
||||
}
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
if (kind == engine_kind::gpu && runtime_kind == runtime_kind::ocl) {
|
||||
return std::unique_ptr<engine_factory_t>(
|
||||
new xpu::ocl::engine_factory_t(kind));
|
||||
}
|
||||
#endif
|
||||
#ifdef DNNL_WITH_SYCL
|
||||
if (runtime_kind == runtime_kind::sycl)
|
||||
return xpu::sycl::get_engine_factory(kind);
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
if (kind == engine_kind::gpu && runtime_kind == runtime_kind::l0) {
|
||||
return gpu::intel::l0::get_engine_factory(kind);
|
||||
}
|
||||
#endif
|
||||
return nullptr;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2016-2024 Intel Corporation
|
||||
* Copyright 2016-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@ -185,6 +185,8 @@ inline runtime_kind_t get_default_runtime(engine_kind_t kind) {
|
||||
if (kind == engine_kind::gpu) return runtime_kind::ocl;
|
||||
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
|
||||
if (kind == engine_kind::gpu) return runtime_kind::sycl;
|
||||
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
if (kind == engine_kind::gpu) return runtime_kind::l0;
|
||||
#endif
|
||||
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SEQ
|
||||
return runtime_kind::seq;
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2018-2024 Intel Corporation
|
||||
* Copyright 2018-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@ -291,7 +291,8 @@ std::string get_jit_profiling_jitdumpdir() {
|
||||
|
||||
bool is_destroying_cache_safe() {
|
||||
#if defined(_WIN32) \
|
||||
&& (defined(DNNL_WITH_SYCL) || DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL)
|
||||
&& (defined(DNNL_WITH_SYCL) || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0 \
|
||||
|| DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL)
|
||||
// The ntdll.dll library is located in system32, therefore setting
|
||||
// additional environment is not required.
|
||||
HMODULE handle = LoadLibraryExA(
|
||||
|
@ -58,9 +58,12 @@ add_subdirectory(jit)
|
||||
|
||||
if(DNNL_GPU_RUNTIME STREQUAL "OCL")
|
||||
add_subdirectory(ocl)
|
||||
elseif(DNNL_GPU_RUNTIME STREQUAL "L0")
|
||||
add_subdirectory(l0)
|
||||
elseif(DNNL_WITH_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
add_subdirectory(ocl)
|
||||
add_subdirectory(l0/utils)
|
||||
endif()
|
||||
|
||||
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel)
|
||||
|
@ -16,16 +16,21 @@
|
||||
|
||||
#include "gpu/intel/compute/ukernels.hpp"
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
#include "gpu/intel/ocl/engine.hpp"
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#endif
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
|
||||
#include "gpu/intel/sycl/engine.hpp"
|
||||
#include "gpu/intel/sycl/utils.hpp"
|
||||
#endif
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
#include "gpu/intel/ocl/engine.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
#endif
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
#include "gpu/intel/l0/engine.hpp"
|
||||
#include "gpu/intel/l0/utils/utils.hpp"
|
||||
#endif
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
@ -51,6 +56,11 @@ bool mayiuse_microkernels(const engine_t *engine) {
|
||||
|
||||
auto mayiuse_mk = [](const engine_t *engine) {
|
||||
switch (engine->runtime_kind()) {
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
|
||||
case runtime_kind::sycl:
|
||||
return sycl::mayiuse_microkernels(
|
||||
utils::downcast<const sycl::engine_t *>(engine));
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
case runtime_kind::ocl: {
|
||||
auto *ocl_engine
|
||||
@ -60,10 +70,10 @@ bool mayiuse_microkernels(const engine_t *engine) {
|
||||
cl_microkernels_check_kernel_code);
|
||||
}
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
|
||||
case runtime_kind::sycl:
|
||||
return sycl::mayiuse_microkernels(
|
||||
utils::downcast<const sycl::engine_t *>(engine));
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
case runtime_kind::l0:
|
||||
return utils::downcast<const l0::engine_t *>(engine)
|
||||
->mayiuse_microkernels();
|
||||
#endif
|
||||
default: return false;
|
||||
}
|
||||
|
119
src/gpu/intel/compute/utils.cpp
Normal file
119
src/gpu/intel/compute/utils.cpp
Normal file
@ -0,0 +1,119 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2019-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "gpu/intel/compute/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace compute {
|
||||
|
||||
status_t preprocess_headers(stringstream_t &pp_code, const char *code,
|
||||
const compute::kernel_ctx_t &kernel_ctx) {
|
||||
stringstream_t code_stream(code);
|
||||
|
||||
for (std::string line; std::getline(code_stream, line);) {
|
||||
const size_t include_pos = line.find("#include");
|
||||
if (include_pos != std::string::npos) {
|
||||
static constexpr size_t include_len = 8;
|
||||
const size_t first_quote_pos
|
||||
= line.find("\"", include_pos + include_len);
|
||||
const size_t second_quote_pos
|
||||
= line.find("\"", first_quote_pos + 1);
|
||||
const size_t kernel_name_len
|
||||
= second_quote_pos - first_quote_pos - 1;
|
||||
const auto header_name
|
||||
= line.substr(first_quote_pos + 1, kernel_name_len);
|
||||
const char *header_source
|
||||
= kernel_ctx.get_custom_header(header_name);
|
||||
if (!header_source) header_source = get_kernel_header(header_name);
|
||||
CHECK(preprocess_headers(pp_code, header_source, kernel_ctx));
|
||||
} else {
|
||||
pp_code << line << std::endl;
|
||||
}
|
||||
}
|
||||
return status::success;
|
||||
}
|
||||
|
||||
void debugdump_processed_source(const std::string &source,
|
||||
const std::string &options, const std::string &cl_options) {
|
||||
#if defined(__linux__) && defined(DNNL_DEV_MODE)
|
||||
if (get_verbose(verbose_t::debuginfo) >= 10) {
|
||||
auto get_defines = [](const std::string &from) {
|
||||
std::string ret;
|
||||
size_t pos = 0;
|
||||
while (pos < from.length()) {
|
||||
// Find next define argument
|
||||
pos = from.find("-D", pos);
|
||||
|
||||
// Generate argument, quotes are interpreted literally, but
|
||||
// other special shell characters need escaped. Does not
|
||||
// currently handle quotes with the ' character or nested quotes
|
||||
char quote_parity = true;
|
||||
while (pos < from.length()) {
|
||||
if (quote_parity
|
||||
&& utils::one_of(from[pos], '~', '#', '$', '&', '*',
|
||||
'(', ')', '\\', '|', '[', ']', '{', '}',
|
||||
';', '\'', '<', '>', '/', '?', '!')) {
|
||||
ret += '\\';
|
||||
}
|
||||
ret += from[pos];
|
||||
if (from[pos] == '"') quote_parity ^= true;
|
||||
if (from[pos] == ' ' && quote_parity) break;
|
||||
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
auto execute_command = [](const std::string &cmd,
|
||||
const std::string &stdin) {
|
||||
std::string result;
|
||||
std::array<char, 256> buffer;
|
||||
FILE *pipe = popen(cmd.c_str(), "w");
|
||||
fputs(stdin.c_str(), pipe);
|
||||
if (pipe) {
|
||||
while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
|
||||
result += buffer.data();
|
||||
}
|
||||
}
|
||||
pclose(pipe);
|
||||
return result;
|
||||
};
|
||||
|
||||
// Run utilities to evaluate preprocessor defines and format the file
|
||||
// Theoretically, we can accomplish this task with libclang, but it
|
||||
// seems more work than it is worth. Instead, wrapping this in OCL_DEBUG
|
||||
// so that calls to the system are not included in the default build.
|
||||
|
||||
// Due to the use of a different C preprocessor, warnings should not be
|
||||
// ignored, as they may correspond to a different behavior in the OpenCL
|
||||
// C preprocessor
|
||||
auto o = get_defines(options) + get_defines(cl_options);
|
||||
std::string preprocess_cmd
|
||||
= std::string() + "cpp -P " + o + " | clang-format";
|
||||
execute_command(preprocess_cmd, source);
|
||||
std::cout << "OCL_ARCH_OPTIONS: " << cl_options << std::endl;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace compute
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
@ -17,14 +17,8 @@
|
||||
#ifndef GPU_INTEL_COMPUTE_UTILS_HPP
|
||||
#define GPU_INTEL_COMPUTE_UTILS_HPP
|
||||
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "common/utils.hpp"
|
||||
#include "gpu/intel/compute/device_info.hpp"
|
||||
#include "gpu/intel/compute/kernel_ctx.hpp"
|
||||
#include "gpu/intel/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
@ -155,6 +149,12 @@ private:
|
||||
range_t local_range_;
|
||||
};
|
||||
|
||||
status_t preprocess_headers(stringstream_t &pp_code, const char *code,
|
||||
const compute::kernel_ctx_t &kernel_ctx);
|
||||
|
||||
void debugdump_processed_source(const std::string &source,
|
||||
const std::string &options, const std::string &ocl_options);
|
||||
|
||||
} // namespace compute
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
|
@ -46,7 +46,15 @@ GEMMSTONE_NAMESPACE_START
|
||||
#ifndef GENERATOR_BASE
|
||||
|
||||
#define GENERATOR_SUPER(hw) ngen::OpenCLCodeGenerator<hw>
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
|
||||
#define FORWARD(hw) NGEN_FORWARD_SYCL(hw);
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
#define FORWARD(hw) NGEN_FORWARD_OPENCL(hw)
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
#define FORWARD(hw) NGEN_FORWARD_LEVEL_ZERO(hw);
|
||||
#endif
|
||||
#define GENERATOR_DEBUGINFO {__FILE__, __LINE__}
|
||||
|
||||
#define GENERATOR_BASE(hw) GENERATOR_SUPER(hw)
|
||||
|
@ -39,6 +39,16 @@
|
||||
#define MAGICSIZEY 2
|
||||
#define MAGICSIZEZ 1
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
|
||||
#define FORWARD(hw) NGEN_FORWARD_SYCL(hw);
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
#define FORWARD(hw) NGEN_FORWARD_OPENCL(hw)
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
#define FORWARD(hw) NGEN_FORWARD_LEVEL_ZERO(hw);
|
||||
#endif
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
@ -49,7 +59,7 @@ using namespace ngen;
|
||||
|
||||
template <HW hw>
|
||||
class binary_format_kernel_t : public generator_t<hw> {
|
||||
NGEN_FORWARD_OPENCL(hw);
|
||||
FORWARD(hw);
|
||||
|
||||
public:
|
||||
binary_format_kernel_t()
|
||||
|
@ -37,6 +37,9 @@
|
||||
#ifdef WITH_OPENCL_RUNTIME
|
||||
#include "ngen_opencl.hpp"
|
||||
#endif
|
||||
#ifdef WITH_L0_RUNTIME
|
||||
#include "ngen_level_zero.hpp"
|
||||
#endif
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
@ -1816,6 +1819,29 @@ cl_kernel make_kernel(const kernel::iface_t &iface, const stmt_t &body,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef WITH_L0_RUNTIME
|
||||
std::pair<ze_module_handle_t, ze_kernel_handle_t> make_kernel(
|
||||
const kernel::iface_t &iface, const stmt_t &body,
|
||||
const kernel::options_t &options, const ngen::DebugConfig &debug_cfg,
|
||||
ze_context_handle_t ctx, ze_device_handle_t dev) {
|
||||
ngen::NEOInterfaceHandler interface = generate_ngen_interface(
|
||||
iface, options, false, body);
|
||||
|
||||
#define GPU_HW_CASE(hw) \
|
||||
ir_to_ngen_generator_t<ngen::LevelZeroCodeGenerator<(hw)>> g( \
|
||||
iface, options, debug_cfg); \
|
||||
g.setInterface(std::move(interface)); \
|
||||
convert_ir_to_ngen(body, g); \
|
||||
auto module = g.getModule(ctx, dev); \
|
||||
auto kernel = g.getKernel(module); \
|
||||
return std::make_pair(module, kernel);
|
||||
|
||||
GPU_HW_SWITCH(options.hw().ngen_hw());
|
||||
#undef GPU_HW_CASE
|
||||
return {};
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace jit
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
|
@ -24,9 +24,13 @@
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
|
||||
#include <sycl/sycl.hpp>
|
||||
#define WITH_SYCL_RUNTIME
|
||||
#endif
|
||||
#define WITH_OPENCL_RUNTIME
|
||||
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
#include <CL/cl.h>
|
||||
#define WITH_OPENCL_RUNTIME
|
||||
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
#include "level_zero/ze_api.h"
|
||||
#define WITH_L0_RUNTIME
|
||||
#endif
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
@ -44,6 +48,12 @@ cl_kernel make_kernel(const kernel::iface_t &iface, const stmt_t &body,
|
||||
const kernel::options_t &options, const ngen::DebugConfig &debug_cfg,
|
||||
cl_context ctx, cl_device_id dev);
|
||||
#endif
|
||||
#ifdef WITH_L0_RUNTIME
|
||||
std::pair<ze_module_handle_t, ze_kernel_handle_t> make_kernel(
|
||||
const kernel::iface_t &iface, const stmt_t &body,
|
||||
const kernel::options_t &options, const ngen::DebugConfig &debug_cfg,
|
||||
ze_context_handle_t ctx, ze_device_handle_t dev);
|
||||
#endif
|
||||
|
||||
} // namespace jit
|
||||
} // namespace intel
|
||||
|
@ -34,7 +34,7 @@
|
||||
#define DNNL
|
||||
#define MICROKERNEL_INTERFACE
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL || DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
#define ZEBIN_OUTPUT
|
||||
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
#define OPENCL_OUTPUT
|
||||
|
@ -40,6 +40,14 @@ inline cl_kernel make_kernel(
|
||||
kernel.debug_cfg, ctx, dev);
|
||||
}
|
||||
#endif
|
||||
#ifdef WITH_L0_RUNTIME
|
||||
inline std::pair<ze_module_handle_t, ze_kernel_handle_t> make_kernel(
|
||||
const kernel_t &kernel, ze_context_handle_t ctx,
|
||||
ze_device_handle_t dev) {
|
||||
return make_kernel(kernel.iface, kernel.body, kernel.options,
|
||||
kernel.debug_cfg, ctx, dev);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace dsl
|
||||
} // namespace jit
|
||||
|
@ -44,6 +44,12 @@
|
||||
#include "ngen_opencl.hpp"
|
||||
#endif
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
#include "gpu/intel/l0/engine.hpp"
|
||||
#include "gpu/intel/l0/kernel.hpp"
|
||||
#include "ngen_level_zero.hpp"
|
||||
#endif
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
@ -91,6 +97,11 @@ template <gpu_gen_t hw>
|
||||
using ngen_code_generator_t = ngen::OpenCLCodeGenerator<hw>;
|
||||
#endif
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
template <gpu_gen_t hw>
|
||||
using ngen_code_generator_t = ngen::LevelZeroCodeGenerator<hw>;
|
||||
#endif
|
||||
|
||||
void check_kernel_size(const std::string &kernel_name, size_t kernel_size,
|
||||
const intel::engine_t *engine);
|
||||
|
||||
@ -123,6 +134,15 @@ public:
|
||||
auto ocl_kernel = ngen_code_generator_t<hw>::getKernel(
|
||||
ocl_engine->context(), ocl_engine->device());
|
||||
return ocl::kernel_t::make(kernel, ocl_kernel, {});
|
||||
#endif
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_L0
|
||||
auto *l0_engine = utils::downcast<const l0::engine_t *>(engine);
|
||||
auto l0_module = std::make_shared<l0::module_wrapper_t>(
|
||||
ngen_code_generator_t<hw>::getModule(
|
||||
l0_engine->context(), l0_engine->device()));
|
||||
auto l0_kernel
|
||||
= ngen_code_generator_t<hw>::getKernel(*(l0_module.get()));
|
||||
return l0::kernel_t::make(kernel, l0_module, l0_kernel, kernel_name());
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
24
src/gpu/intel/l0/CMakeLists.txt
Normal file
24
src/gpu/intel/l0/CMakeLists.txt
Normal file
@ -0,0 +1,24 @@
|
||||
#===============================================================================
|
||||
# Copyright 2025 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#===============================================================================
|
||||
|
||||
file(GLOB_RECURSE SOURCES
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp
|
||||
)
|
||||
|
||||
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_l0)
|
||||
add_library(${OBJ_LIB} OBJECT ${SOURCES})
|
||||
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
|
||||
$<TARGET_OBJECTS:${OBJ_LIB}>)
|
73
src/gpu/intel/l0/capi/engine.cpp
Normal file
73
src/gpu/intel/l0/capi/engine.cpp
Normal file
@ -0,0 +1,73 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2019-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "oneapi/dnnl/dnnl_l0.h"
|
||||
|
||||
#include "common/utils.hpp"
|
||||
#include "gpu/intel/l0/engine.hpp"
|
||||
#include "gpu/intel/l0/engine_factory.hpp"
|
||||
|
||||
using namespace dnnl::impl;
|
||||
|
||||
dnnl_status_t dnnl_l0_interop_engine_create(dnnl_engine_t *engine,
|
||||
const ze_driver_handle_t adriver, const ze_device_handle_t adevice,
|
||||
const ze_context_handle_t acontext) {
|
||||
bool args_ok = !utils::any_null(engine, adriver, adevice, acontext);
|
||||
if (!args_ok) return status::invalid_arguments;
|
||||
|
||||
gpu::intel::l0::engine_factory_t f(engine_kind::gpu);
|
||||
|
||||
size_t index;
|
||||
CHECK(gpu::intel::l0::get_device_index(adevice, &index));
|
||||
|
||||
return f.engine_create(engine, adriver, adevice, acontext, index);
|
||||
}
|
||||
|
||||
dnnl_status_t dnnl_l0_interop_engine_get_context(
|
||||
dnnl_engine_t engine, ze_context_handle_t context) {
|
||||
bool args_ok = !utils::any_null(engine, context)
|
||||
&& (engine->runtime_kind() == runtime_kind::l0);
|
||||
if (!args_ok) return status::invalid_arguments;
|
||||
|
||||
auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
|
||||
context = l0_engine->context();
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
dnnl_status_t dnnl_l0_interop_engine_get_device(
|
||||
dnnl_engine_t engine, ze_device_handle_t device) {
|
||||
bool args_ok = !utils::any_null(engine, device)
|
||||
&& (engine->runtime_kind() == runtime_kind::l0);
|
||||
if (!args_ok) return status::invalid_arguments;
|
||||
|
||||
auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
|
||||
device = l0_engine->device();
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
dnnl_status_t dnnl_l0_interop_engine_get_driver(
|
||||
dnnl_engine_t engine, ze_driver_handle_t driver) {
|
||||
bool args_ok = !utils::any_null(engine, driver)
|
||||
&& (engine->runtime_kind() == runtime_kind::l0);
|
||||
if (!args_ok) return status::invalid_arguments;
|
||||
|
||||
auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
|
||||
driver = l0_engine->driver();
|
||||
|
||||
return status::success;
|
||||
}
|
129
src/gpu/intel/l0/capi/memory.cpp
Normal file
129
src/gpu/intel/l0/capi/memory.cpp
Normal file
@ -0,0 +1,129 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2019-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "oneapi/dnnl/dnnl_l0.h"
|
||||
|
||||
#include "common/utils.hpp"
|
||||
#include "gpu/intel/l0/memory_storage.hpp"
|
||||
|
||||
using namespace dnnl::impl;
|
||||
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_memory_create(dnnl_memory_t *memory,
|
||||
const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
|
||||
void *handle) {
|
||||
bool ok = !utils::any_null(memory, memory_desc, engine)
|
||||
&& engine->runtime_kind() == runtime_kind::l0;
|
||||
if (!ok) return status::invalid_arguments;
|
||||
|
||||
auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
|
||||
auto kind = gpu::intel::l0::get_memory_storage_kind(
|
||||
gpu::intel::l0::get_pointer_type(l0_engine->context(), handle));
|
||||
if (handle != DNNL_MEMORY_NONE && handle != DNNL_MEMORY_ALLOCATE
|
||||
&& kind == gpu::intel::l0::memory_storage_kind_t::unknown
|
||||
&& !engine->mayiuse_system_memory_allocators())
|
||||
return status::invalid_arguments;
|
||||
|
||||
const auto mdw = memory_desc_wrapper(memory_desc);
|
||||
if (mdw.format_any() || mdw.has_runtime_dims_or_strides())
|
||||
return status::invalid_arguments;
|
||||
|
||||
unsigned flags = (handle == DNNL_MEMORY_ALLOCATE)
|
||||
? memory_flags_t::alloc
|
||||
: memory_flags_t::use_runtime_ptr;
|
||||
handle = (handle == DNNL_MEMORY_ALLOCATE) ? nullptr : handle;
|
||||
|
||||
std::unique_ptr<memory_storage_t> mem_storage;
|
||||
mem_storage.reset(new gpu::intel::l0::memory_storage_t(
|
||||
engine, gpu::intel::l0::memory_storage_kind_t::device));
|
||||
if (!mem_storage) return status::out_of_memory;
|
||||
|
||||
CHECK(mem_storage->init(
|
||||
flags, dnnl_memory_desc_get_size(memory_desc), handle));
|
||||
|
||||
return safe_ptr_assign(
|
||||
*memory, new memory_t(engine, memory_desc, std::move(mem_storage)));
|
||||
}
|
||||
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_memory_create_v2(dnnl_memory_t *memory,
|
||||
const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
|
||||
size_t nhandles, void **handles) {
|
||||
bool ok = !utils::any_null(memory, memory_desc, engine, handles)
|
||||
&& nhandles > 0 && engine->runtime_kind() == runtime_kind::l0;
|
||||
if (!ok) return status::invalid_arguments;
|
||||
|
||||
const auto mdw = memory_desc_wrapper(memory_desc);
|
||||
if (mdw.format_any() || mdw.has_runtime_dims_or_strides())
|
||||
return status::invalid_arguments;
|
||||
|
||||
std::vector<unsigned> flags_vec(nhandles);
|
||||
std::vector<void *> handles_vec(nhandles);
|
||||
for (size_t i = 0; i < nhandles; i++) {
|
||||
unsigned f = (handles[i] == DNNL_MEMORY_ALLOCATE)
|
||||
? memory_flags_t::alloc
|
||||
: memory_flags_t::use_runtime_ptr;
|
||||
void *h = (handles[i] == DNNL_MEMORY_ALLOCATE) ? nullptr : handles[i];
|
||||
flags_vec[i] = f;
|
||||
handles_vec[i] = h;
|
||||
}
|
||||
|
||||
auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
|
||||
std::vector<std::unique_ptr<memory_storage_t>> mem_storages(nhandles);
|
||||
for (size_t i = 0; i < nhandles; i++) {
|
||||
auto kind = gpu::intel::l0::get_memory_storage_kind(
|
||||
gpu::intel::l0::get_pointer_type(
|
||||
l0_engine->context(), handles[i]));
|
||||
if (handles[i] != DNNL_MEMORY_NONE && handles[i] != DNNL_MEMORY_ALLOCATE
|
||||
&& kind == gpu::intel::l0::memory_storage_kind_t::unknown
|
||||
&& !engine->mayiuse_system_memory_allocators()) {
|
||||
return status::invalid_arguments;
|
||||
}
|
||||
size_t sz = dnnl_memory_desc_get_size_v2(
|
||||
memory_desc, static_cast<int>(i));
|
||||
mem_storages[i].reset(new gpu::intel::l0::memory_storage_t(
|
||||
engine, gpu::intel::l0::memory_storage_kind_t::device));
|
||||
if (!mem_storages[i]) return status::out_of_memory;
|
||||
CHECK(mem_storages[i]->init(flags_vec[i], sz, handles_vec[i]));
|
||||
}
|
||||
|
||||
return safe_ptr_assign(*memory,
|
||||
new memory_t(engine, memory_desc, std::move(mem_storages)));
|
||||
}
|
||||
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_memory_get_mem_object(
|
||||
const memory_t *memory, void **mem_object) {
|
||||
if (utils::any_null(mem_object)) return status::invalid_arguments;
|
||||
|
||||
if (!memory) {
|
||||
mem_object = nullptr;
|
||||
return status::success;
|
||||
}
|
||||
bool args_ok = (memory->engine()->runtime_kind() == runtime_kind::l0);
|
||||
if (!args_ok) return status::invalid_arguments;
|
||||
|
||||
void *handle;
|
||||
status_t status = memory->get_data_handle(&handle);
|
||||
if (status == status::success) mem_object = &handle;
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
dnnl_status_t DNNL_API dnnl_l0_interop_memory_set_mem_object(
|
||||
memory_t *memory, void *mem_object) {
|
||||
bool args_ok = (memory->engine()->runtime_kind() == runtime_kind::l0);
|
||||
if (!args_ok) return status::invalid_arguments;
|
||||
|
||||
return memory->set_data_handle(mem_object);
|
||||
}
|
67
src/gpu/intel/l0/capi/primitive.cpp
Normal file
67
src/gpu/intel/l0/capi/primitive.cpp
Normal file
@ -0,0 +1,67 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2023-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "oneapi/dnnl/dnnl_l0.h"
|
||||
|
||||
#include "common/primitive_desc_iface.hpp"
|
||||
#include "common/primitive_iface.hpp"
|
||||
#include "common/utils.hpp"
|
||||
#include "gpu/intel/l0/stream.hpp"
|
||||
|
||||
using namespace dnnl::impl;
|
||||
|
||||
dnnl_status_t dnnl_l0_interop_primitive_execute(
|
||||
const primitive_iface_t *primitive_iface, dnnl_stream_t stream,
|
||||
size_t nargs, const dnnl_exec_arg_t *args, size_t ndeps,
|
||||
const ze_event_handle_t *deps, ze_event_handle_t *return_event) {
|
||||
const bool ok = !utils::any_null(primitive_iface, stream)
|
||||
&& primitive_iface->engine() == stream->engine()
|
||||
&& primitive_iface->engine()->runtime_kind() == runtime_kind::l0
|
||||
&& IMPLICATION(nargs > 0, args != nullptr)
|
||||
&& IMPLICATION(ndeps > 0, deps != nullptr);
|
||||
if (!ok) return status::invalid_arguments;
|
||||
|
||||
auto *l0_stream = utils::downcast<gpu::intel::l0::stream_t *>(stream);
|
||||
stream->before_exec_hook();
|
||||
|
||||
if (deps != nullptr) {
|
||||
std::vector<ze_event_handle_t> events(ndeps);
|
||||
for (size_t i = 0; i < ndeps; i++)
|
||||
events[i] = deps[i];
|
||||
l0_stream->l0_ctx().set_deps(events);
|
||||
}
|
||||
|
||||
// run primitive
|
||||
exec_args_t exec_args;
|
||||
CHECK(cvt_primitive_args(primitive_iface->pd()->impl().get(),
|
||||
static_cast<int>(nargs), args, exec_args));
|
||||
|
||||
exec_ctx_t ctx(stream, std::move(exec_args));
|
||||
CHECK(primitive_execute(primitive_iface, ctx));
|
||||
|
||||
// return output event
|
||||
if (return_event != nullptr) {
|
||||
if (l0_stream->impl()->flags() & stream_flags::in_order) {
|
||||
*return_event = nullptr;
|
||||
} else {
|
||||
*return_event = l0_stream->get_output_event();
|
||||
}
|
||||
}
|
||||
|
||||
stream->after_exec_hook();
|
||||
|
||||
return status::success;
|
||||
}
|
51
src/gpu/intel/l0/capi/stream.cpp
Normal file
51
src/gpu/intel/l0/capi/stream.cpp
Normal file
@ -0,0 +1,51 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2019-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "oneapi/dnnl/dnnl_l0.h"
|
||||
|
||||
#include "common/utils.hpp"
|
||||
#include "gpu/intel/l0/stream.hpp"
|
||||
|
||||
using namespace dnnl::impl;
|
||||
|
||||
dnnl_status_t dnnl_l0_interop_stream_create(dnnl_stream_t *stream,
|
||||
dnnl_engine_t engine, ze_command_list_handle_t list) {
|
||||
bool args_ok = !utils::any_null(stream, engine, list)
|
||||
&& engine->runtime_kind() == runtime_kind::l0;
|
||||
if (!args_ok) return status::invalid_arguments;
|
||||
|
||||
std::unique_ptr<stream_impl_t> stream_impl(
|
||||
new gpu::intel::l0::stream_impl_t(
|
||||
stream_flags::default_flags, list));
|
||||
if (!stream_impl) return status::out_of_memory;
|
||||
|
||||
CHECK(engine->create_stream(stream, stream_impl.get()));
|
||||
stream_impl.release();
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
dnnl_status_t dnnl_l0_interop_stream_get_list(
|
||||
dnnl_stream_t stream, ze_command_list_handle_t list) {
|
||||
bool args_ok = !utils::any_null(list, stream)
|
||||
&& stream->engine()->runtime_kind() == runtime_kind::l0;
|
||||
if (!args_ok) return status::invalid_arguments;
|
||||
|
||||
auto *l0_stream = utils::downcast<const gpu::intel::l0::stream_t *>(stream);
|
||||
list = l0_stream->list();
|
||||
|
||||
return status::success;
|
||||
}
|
189
src/gpu/intel/l0/compiler.cpp
Normal file
189
src/gpu/intel/l0/compiler.cpp
Normal file
@ -0,0 +1,189 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "gpu/intel/l0/compiler.hpp"
|
||||
|
||||
#ifdef _WIN32
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include "windows.h"
|
||||
#else
|
||||
#include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
#include "ocloc_api.h"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
inline void *find_ocloc_symbol(const char *symbol) {
|
||||
#ifdef _WIN32
|
||||
// Use LOAD_LIBRARY_SEARCH_SYSTEM32 flag to avoid DLL hijacking issue.
|
||||
HMODULE handle = LoadLibraryExA(
|
||||
"ocloc64.dll", nullptr, LOAD_LIBRARY_SEARCH_SYSTEM32);
|
||||
if (!handle) return nullptr;
|
||||
return reinterpret_cast<void *>(GetProcAddress(handle, symbol));
|
||||
#else
|
||||
void *handle = dlopen("libocloc.so", RTLD_NOW | RTLD_LOCAL);
|
||||
if (!handle) return nullptr;
|
||||
return dlsym(handle, symbol);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
F find_ocloc_symbol(const char *symbol) {
|
||||
return (F)find_ocloc_symbol(symbol);
|
||||
}
|
||||
|
||||
status_t ocloc_invoke(uint32_t NumArgs, const char *Argv[], uint32_t NumSources,
|
||||
const uint8_t **DataSources, const uint64_t *LenSources,
|
||||
const char **NameSources, uint32_t NumInputHeaders,
|
||||
const uint8_t **DataInputHeaders, const uint64_t *LenInputHeaders,
|
||||
const char **NameInputHeaders, uint32_t *NumOutputs,
|
||||
uint8_t ***DataOutputs, uint64_t **LenOutputs, char ***NameOutputs) {
|
||||
static auto f = find_ocloc_symbol<decltype(&oclocInvoke)>("oclocInvoke");
|
||||
if (!f) return status::runtime_error;
|
||||
|
||||
if (f(NumArgs, Argv, NumSources, DataSources, LenSources, NameSources,
|
||||
NumInputHeaders, DataInputHeaders, LenInputHeaders,
|
||||
NameInputHeaders, NumOutputs, DataOutputs, LenOutputs,
|
||||
NameOutputs))
|
||||
return status::runtime_error;
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t ocloc_free(uint32_t *numOutputs, uint8_t ***dataOutputs,
|
||||
uint64_t **lenOutputs, char ***nameOutputs) {
|
||||
static auto f
|
||||
= find_ocloc_symbol<decltype(&oclocFreeOutput)>("oclocFreeOutput");
|
||||
if (!f) return status::runtime_error;
|
||||
|
||||
if (f(numOutputs, dataOutputs, lenOutputs, nameOutputs))
|
||||
return status::runtime_error;
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t ocloc_get_extensions(std::string &extensions) {
|
||||
std::vector<const char *> args = {"ocloc", "query", "CL_DEVICE_EXTENSIONS"};
|
||||
|
||||
uint32_t num_outputs = 0;
|
||||
uint8_t **data_outputs = nullptr;
|
||||
uint64_t *len_outputs = nullptr;
|
||||
char **name_outputs = nullptr;
|
||||
|
||||
CHECK(ocloc_invoke(static_cast<uint32_t>(args.size()), args.data(), 0,
|
||||
nullptr, 0, nullptr, 0, nullptr, nullptr, nullptr, &num_outputs,
|
||||
&data_outputs, &len_outputs, &name_outputs));
|
||||
|
||||
for (uint32_t i = 0; i < num_outputs; i++) {
|
||||
if (!strcmp(name_outputs[i], "stdout.log")) {
|
||||
if (len_outputs[i] > 0) {
|
||||
extensions = std::string(
|
||||
reinterpret_cast<const char *>(data_outputs[i]));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CHECK(ocloc_free(&num_outputs, &data_outputs, &len_outputs, &name_outputs));
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
bool ocloc_mayiuse_microkernels(const std::string &kernel_code) {
|
||||
std::vector<const char *> args
|
||||
= {"ocloc", "compile", "-q", "-file", "test.cl"};
|
||||
const uint8_t *data_sources[]
|
||||
= {reinterpret_cast<const uint8_t *>(kernel_code.c_str())};
|
||||
const uint64_t len_sources[] = {kernel_code.length() + 1};
|
||||
const char *name_sources[] = {"test.cl"};
|
||||
|
||||
uint32_t num_outputs = 0;
|
||||
uint8_t **data_outputs = nullptr;
|
||||
uint64_t *len_outputs = nullptr;
|
||||
char **name_outputs = nullptr;
|
||||
|
||||
bool compilation_successful = true;
|
||||
if (ocloc_invoke(static_cast<uint32_t>(args.size()), args.data(), 1,
|
||||
data_sources, len_sources, name_sources, 0, nullptr, nullptr,
|
||||
nullptr, &num_outputs, &data_outputs, &len_outputs,
|
||||
&name_outputs))
|
||||
compilation_successful = false;
|
||||
ocloc_free(&num_outputs, &data_outputs, &len_outputs, &name_outputs);
|
||||
|
||||
return compilation_successful;
|
||||
}
|
||||
|
||||
status_t ocloc_build_kernels(const std::string &kernel_code,
|
||||
const std::string &options, const std::string &ip_version,
|
||||
xpu::binary_t &binary) {
|
||||
std::vector<const char *> args = {"ocloc", "compile", "-q", "--format",
|
||||
"zebin", "-exclude_ir", "-output_no_suffix", "-file", "main.cl",
|
||||
"-device", ip_version.c_str(), "-options", options.c_str()};
|
||||
const uint8_t *data_sources[]
|
||||
= {reinterpret_cast<const uint8_t *>(kernel_code.c_str())};
|
||||
const uint64_t len_sources[] = {kernel_code.length() + 1};
|
||||
const char *name_sources[] = {"main.cl"};
|
||||
|
||||
uint32_t num_outputs = 0;
|
||||
uint8_t **data_outputs = nullptr;
|
||||
uint64_t *len_outputs = nullptr;
|
||||
char **name_outputs = nullptr;
|
||||
|
||||
status_t ret = ocloc_invoke(static_cast<uint32_t>(args.size()), args.data(),
|
||||
1, data_sources, len_sources, name_sources, 0, nullptr, nullptr,
|
||||
nullptr, &num_outputs, &data_outputs, &len_outputs, &name_outputs);
|
||||
if (ret != status::success) {
|
||||
std::string output_string;
|
||||
for (uint32_t i = 0; i < num_outputs; i++) {
|
||||
if (!strcmp(name_outputs[i], "stdout.log")) {
|
||||
if (len_outputs[i] > 0) {
|
||||
output_string = std::string(
|
||||
reinterpret_cast<const char *>(data_outputs[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
CHECK(ocloc_free(
|
||||
&num_outputs, &data_outputs, &len_outputs, &name_outputs));
|
||||
throw std::runtime_error(output_string);
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < num_outputs; i++) {
|
||||
if (!strcmp(name_outputs[i], "main.bin")) {
|
||||
if (len_outputs[i] > 0) {
|
||||
binary.resize(len_outputs[i]);
|
||||
std::memcpy(binary.data(), data_outputs[i], len_outputs[i]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CHECK(ocloc_free(&num_outputs, &data_outputs, &len_outputs, &name_outputs));
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
40
src/gpu/intel/l0/compiler.hpp
Normal file
40
src/gpu/intel/l0/compiler.hpp
Normal file
@ -0,0 +1,40 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef GPU_INTEL_L0_COMPILER_HPP
|
||||
#define GPU_INTEL_L0_COMPILER_HPP
|
||||
|
||||
#include "gpu/intel/l0/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
status_t ocloc_get_extensions(std::string &extensions);
|
||||
bool ocloc_mayiuse_microkernels(const std::string &kernel_code);
|
||||
status_t ocloc_build_kernels(const std::string &kernel_code,
|
||||
const std::string &options, const std::string &ip_version,
|
||||
xpu::binary_t &binary);
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
||||
|
||||
#endif // GPU_INTEL_L0_COMPILER_HPP
|
28
src/gpu/intel/l0/context.cpp
Normal file
28
src/gpu/intel/l0/context.cpp
Normal file
@ -0,0 +1,28 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2023-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "gpu/intel/l0/context.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
103
src/gpu/intel/l0/context.hpp
Normal file
103
src/gpu/intel/l0/context.hpp
Normal file
@ -0,0 +1,103 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2023-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef GPU_INTEL_L0_CONTEXT_HPP
|
||||
#define GPU_INTEL_L0_CONTEXT_HPP
|
||||
|
||||
#include "gpu/intel/l0/utils/utils.hpp"
|
||||
#include "xpu/context.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
struct event_t : public xpu::event_t {
|
||||
event_t() = default;
|
||||
event_t(const event_t &) = default;
|
||||
event_t(const std::vector<ze_event_handle_t> &event) : events_(event) {}
|
||||
event_t(std::vector<ze_event_handle_t> &&event)
|
||||
: events_(std::move(event)) {}
|
||||
event_t(ze_event_handle_t &&event) {
|
||||
events_.emplace_back(std::move(event));
|
||||
}
|
||||
~event_t() override = default;
|
||||
|
||||
event_t &operator=(event_t &&other) {
|
||||
std::swap(events_, other.events_);
|
||||
return *this;
|
||||
}
|
||||
event_t &operator=(const event_t &other) {
|
||||
events_ = other.events_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const ze_event_handle_t &operator[](size_t i) const { return events_[i]; }
|
||||
ze_event_handle_t &operator[](size_t i) { return events_[i]; }
|
||||
size_t size() const { return events_.size(); }
|
||||
|
||||
static event_t &from(xpu::event_t &event) {
|
||||
return *utils::downcast<event_t *>(&event);
|
||||
}
|
||||
static const event_t &from(const xpu::event_t &event) {
|
||||
return *utils::downcast<const event_t *>(&event);
|
||||
}
|
||||
std::unique_ptr<xpu::event_t> clone() const override {
|
||||
return std::unique_ptr<xpu::event_t>(new event_t(*this));
|
||||
}
|
||||
void append(const xpu::event_t &event) {
|
||||
auto &other = *utils::downcast<const event_t *>(&event);
|
||||
events_.insert(
|
||||
events_.end(), other.events_.begin(), other.events_.end());
|
||||
}
|
||||
|
||||
std::vector<ze_event_handle_t> events_;
|
||||
};
|
||||
|
||||
class context_t final : public xpu::context_t {
|
||||
public:
|
||||
context_t() = default;
|
||||
~context_t() override = default;
|
||||
|
||||
context_t &operator=(const context_t &other) {
|
||||
events_ = other.events_;
|
||||
return *this;
|
||||
}
|
||||
void set_deps(std::vector<ze_event_handle_t> &&event) {
|
||||
events_ = event_t(event);
|
||||
}
|
||||
void set_deps(event_t &&events) { events_ = std::move(events); }
|
||||
|
||||
xpu::event_t &get_deps() override { return events_; }
|
||||
const xpu::event_t &get_deps() const override { return events_; }
|
||||
void append_deps(const xpu::event_t &event) override {
|
||||
events_.append(event);
|
||||
}
|
||||
|
||||
status_t get_event(ze_event_handle_t *new_event);
|
||||
|
||||
private:
|
||||
event_t events_;
|
||||
};
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
||||
|
||||
#endif // GPU_INTEL_L0_CONTEXT_HPP
|
145
src/gpu/intel/l0/device_info.cpp
Normal file
145
src/gpu/intel/l0/device_info.cpp
Normal file
@ -0,0 +1,145 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "gpu/intel/l0/device_info.hpp"
|
||||
#include "gpu/intel/l0/compiler.hpp"
|
||||
#include "gpu/intel/l0/engine.hpp"
|
||||
#include "ngen_level_zero.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
status_t device_info_t::init_arch(impl::engine_t *engine) {
|
||||
auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
|
||||
auto context = l0_engine->context();
|
||||
auto device = l0_engine->device();
|
||||
|
||||
return init_gpu_hw_info(engine, device, context, ip_version_, gpu_arch_,
|
||||
gpu_product_, native_extensions_, mayiuse_systolic_,
|
||||
mayiuse_ngen_kernels_);
|
||||
}
|
||||
|
||||
status_t device_info_t::init_device_name(impl::engine_t *engine) {
|
||||
auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
|
||||
auto device = l0_engine->device();
|
||||
|
||||
ze_device_properties_t device_properties = {};
|
||||
device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
device_properties.pNext = nullptr;
|
||||
|
||||
CHECK(l0::zeDeviceGetProperties(device, &device_properties));
|
||||
name_ = std::string(device_properties.name);
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t device_info_t::init_runtime_version(impl::engine_t *engine) {
|
||||
auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
|
||||
auto driver = l0_engine->driver();
|
||||
|
||||
ze_driver_properties_t driver_properties = {};
|
||||
driver_properties.stype = ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES;
|
||||
driver_properties.pNext = nullptr;
|
||||
|
||||
l0::zeDriverGetProperties(driver, &driver_properties);
|
||||
|
||||
runtime_version_.major
|
||||
= (driver_properties.driverVersion & 0xFF000000) >> 24;
|
||||
runtime_version_.minor
|
||||
= (driver_properties.driverVersion & 0x00FF0000) >> 16;
|
||||
runtime_version_.build = driver_properties.driverVersion & 0x0000FFFF;
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t device_info_t::init_extensions(impl::engine_t *engine) {
|
||||
std::string extension_string;
|
||||
CHECK(ocloc_get_extensions(extension_string));
|
||||
|
||||
for (uint64_t i_ext = 1; i_ext < (uint64_t)compute::device_ext_t::last;
|
||||
i_ext <<= 1) {
|
||||
const char *s_ext = ext2cl_str((compute::device_ext_t)i_ext);
|
||||
|
||||
if (s_ext && extension_string.find(s_ext) != std::string::npos) {
|
||||
extensions_ |= i_ext;
|
||||
}
|
||||
}
|
||||
|
||||
extensions_
|
||||
|= (uint64_t)get_future_extensions(gpu_arch(), mayiuse_systolic());
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t device_info_t::init_attributes(impl::engine_t *engine) {
|
||||
auto *l0_engine = utils::downcast<const gpu::intel::l0::engine_t *>(engine);
|
||||
auto device = l0_engine->device();
|
||||
|
||||
ze_device_properties_t device_properties = {};
|
||||
device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
device_properties.pNext = nullptr;
|
||||
|
||||
CHECK(l0::zeDeviceGetProperties(device, &device_properties));
|
||||
|
||||
eu_count_ = device_properties.numSlices
|
||||
* device_properties.numSubslicesPerSlice
|
||||
* device_properties.numEUsPerSubslice;
|
||||
|
||||
ze_device_compute_properties_t device_compute_properties = {};
|
||||
device_compute_properties.stype
|
||||
= ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES;
|
||||
device_compute_properties.pNext = nullptr;
|
||||
|
||||
CHECK(l0::zeDeviceGetComputeProperties(device, &device_compute_properties));
|
||||
|
||||
max_wg_size_ = device_compute_properties.maxTotalGroupSize;
|
||||
|
||||
uint32_t device_cache_properties_count = 0;
|
||||
CHECK(l0::zeDeviceGetCacheProperties(
|
||||
device, &device_cache_properties_count, nullptr));
|
||||
|
||||
std::vector<ze_device_cache_properties_t> device_cache_properties(
|
||||
device_cache_properties_count);
|
||||
for (ze_device_cache_properties_t &p : device_cache_properties) {
|
||||
p.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES;
|
||||
p.pNext = nullptr;
|
||||
}
|
||||
|
||||
CHECK(l0::zeDeviceGetCacheProperties(device, &device_cache_properties_count,
|
||||
device_cache_properties.data()));
|
||||
l3_cache_size_ = device_cache_properties[0].cacheSize;
|
||||
|
||||
ze_device_memory_access_properties_t device_memory_access_properties = {};
|
||||
device_memory_access_properties.stype
|
||||
= ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES;
|
||||
device_memory_access_properties.pNext = nullptr;
|
||||
|
||||
l0::zeDeviceGetMemoryAccessProperties(
|
||||
device, &device_memory_access_properties);
|
||||
mayiuse_system_memory_allocators_
|
||||
= device_memory_access_properties.sharedSystemAllocCapabilities;
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
44
src/gpu/intel/l0/device_info.hpp
Normal file
44
src/gpu/intel/l0/device_info.hpp
Normal file
@ -0,0 +1,44 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef GPU_INTEL_L0_DEVICE_INFO_HPP
|
||||
#define GPU_INTEL_L0_DEVICE_INFO_HPP
|
||||
|
||||
#include "gpu/intel/compute/device_info.hpp"
|
||||
#include "gpu/intel/l0/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
class device_info_t : public compute::device_info_t {
|
||||
protected:
|
||||
status_t init_device_name(impl::engine_t *engine) override;
|
||||
status_t init_arch(impl::engine_t *engine) override;
|
||||
status_t init_runtime_version(impl::engine_t *engine) override;
|
||||
status_t init_extensions(impl::engine_t *engine) override;
|
||||
status_t init_attributes(impl::engine_t *engine) override;
|
||||
};
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
||||
|
||||
#endif // GPU_INTEL_L0_DEVICE_INFO_HPP
|
292
src/gpu/intel/l0/engine.cpp
Normal file
292
src/gpu/intel/l0/engine.cpp
Normal file
@ -0,0 +1,292 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "gpu/intel/l0/engine.hpp"
|
||||
#include "gpu/intel/l0/compiler.hpp"
|
||||
#include "gpu/intel/l0/device_info.hpp"
|
||||
#include "gpu/intel/l0/kernel.hpp"
|
||||
#include "gpu/intel/l0/memory_storage.hpp"
|
||||
#include "gpu/intel/l0/stream.hpp"
|
||||
|
||||
#include "gpu/intel/compute/ukernels.hpp"
|
||||
#include "gpu/intel/jit/dsl/runtime.hpp"
|
||||
#include "gpu/intel/jit/generator.hpp"
|
||||
#include "gpu/intel/microkernels/fuser.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
class engine_impl_t : public impl::engine_impl_t {
|
||||
public:
|
||||
engine_impl_t(engine_kind_t kind, const ze_driver_handle_t driver,
|
||||
const ze_device_handle_t device, const ze_context_handle_t context,
|
||||
size_t index)
|
||||
: impl::engine_impl_t(kind, runtime_kind::l0, index)
|
||||
, driver_(driver)
|
||||
, device_(device)
|
||||
, context_(context) {}
|
||||
~engine_impl_t() override { l0::zeContextDestroy(context_); }
|
||||
|
||||
const ze_driver_handle_t driver() const { return driver_; }
|
||||
const ze_device_handle_t device() const { return device_; }
|
||||
const ze_context_handle_t context() const { return context_; }
|
||||
|
||||
status_t create_stream_impl(
|
||||
impl::stream_impl_t **stream_impl, unsigned flags) const override {
|
||||
auto *si = new stream_impl_t(flags, context_, device_);
|
||||
if (!si) return status::out_of_memory;
|
||||
|
||||
*stream_impl = si;
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t create_memory_storage(impl::memory_storage_t **storage,
|
||||
impl::engine_t *engine, unsigned flags, size_t size,
|
||||
void *handle) const override {
|
||||
std::unique_ptr<memory_storage_t> _storage;
|
||||
_storage.reset(
|
||||
new memory_storage_t(engine, memory_storage_kind_t::device));
|
||||
if (!_storage) return status::out_of_memory;
|
||||
|
||||
status_t status = _storage->init(flags, size, handle);
|
||||
if (status != status::success) return status;
|
||||
|
||||
*storage = _storage.release();
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
engine_id_t engine_id() const override {
|
||||
return engine_id_t(new engine_id_impl_t(
|
||||
device(), context(), kind(), runtime_kind(), index()));
|
||||
}
|
||||
|
||||
int get_buffer_alignment() const override { return 128; }
|
||||
|
||||
private:
|
||||
ze_driver_handle_t driver_;
|
||||
ze_device_handle_t device_;
|
||||
ze_context_handle_t context_;
|
||||
|
||||
engine_impl_t() = delete;
|
||||
DNNL_DISALLOW_COPY_AND_ASSIGN(engine_impl_t);
|
||||
};
|
||||
|
||||
status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind,
|
||||
const ze_driver_handle_t dri, const ze_device_handle_t dev,
|
||||
const ze_context_handle_t ctx, size_t index) {
|
||||
std::unique_ptr<gpu::intel::l0::engine_t, engine_deleter_t> e(
|
||||
(new gpu::intel::l0::engine_t(dri, dev, ctx, index)));
|
||||
if (!e) return status::out_of_memory;
|
||||
|
||||
CHECK(e->init());
|
||||
*engine = e.release();
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
engine_t::engine_t(ze_driver_handle_t driver, ze_device_handle_t device,
|
||||
ze_context_handle_t context, size_t index)
|
||||
: gpu::intel::engine_t(new engine_impl_t(
|
||||
engine_kind::gpu, driver, device, context, index)) {}
|
||||
|
||||
status_t engine_t::init() {
|
||||
CHECK(init_impl());
|
||||
CHECK(gpu::intel::engine_t::init());
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t engine_t::create_stream(
|
||||
impl::stream_t **stream, impl::stream_impl_t *stream_impl) {
|
||||
return gpu::intel::l0::stream_t::create_stream(stream, this, stream_impl);
|
||||
}
|
||||
|
||||
status_t engine_t::create_kernel(
|
||||
compute::kernel_t *kernel, jit::generator_base_t *jitter) const {
|
||||
if (kind() != engine_kind::gpu) {
|
||||
assert(!"not expected");
|
||||
return status::invalid_arguments;
|
||||
}
|
||||
|
||||
return jitter->get_kernel(*kernel, this);
|
||||
}
|
||||
|
||||
status_t engine_t::create_kernel(
|
||||
compute::kernel_t &kernel, const jit::dsl::kernel_t &kernel_dsl) const {
|
||||
if (kind() != engine_kind::gpu) {
|
||||
assert(!"not expected");
|
||||
return status::invalid_arguments;
|
||||
}
|
||||
|
||||
auto module_and_kernel
|
||||
= jit::dsl::make_kernel(kernel_dsl, context(), device());
|
||||
auto l0_module_ptr
|
||||
= std::make_shared<module_wrapper_t>(module_and_kernel.first);
|
||||
return kernel_t::make(kernel, l0_module_ptr, module_and_kernel.second, {});
|
||||
}
|
||||
|
||||
status_t engine_t::convert_to_l0(
|
||||
std::vector<gpu::intel::compute::kernel_t> &kernels,
|
||||
const std::vector<const char *> &kernel_names,
|
||||
xpu::binary_t &binary) const {
|
||||
ze_module_handle_t l0_module = nullptr;
|
||||
std::vector<ze_kernel_handle_t> l0_kernels;
|
||||
CHECK(gpu::intel::l0::create_kernels_from_binary(
|
||||
device(), context(), kernel_names, binary, &l0_module, l0_kernels));
|
||||
auto l0_module_ptr = std::make_shared<module_wrapper_t>(l0_module);
|
||||
|
||||
kernels = std::vector<gpu::intel::compute::kernel_t>(kernel_names.size());
|
||||
for (size_t i = 0; i < kernel_names.size(); i++) {
|
||||
if (!l0_kernels[i]) continue;
|
||||
CHECK(kernel_t::make(
|
||||
kernels[i], l0_module_ptr, l0_kernels[i], kernel_names[i]));
|
||||
}
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t engine_t::create_kernels(std::vector<compute::kernel_t> *kernels,
|
||||
const std::vector<const char *> &kernel_names,
|
||||
const compute::kernel_ctx_t &kernel_ctx) const {
|
||||
if (kind() != engine_kind::gpu) {
|
||||
assert(!"not expected");
|
||||
return status::invalid_arguments;
|
||||
}
|
||||
|
||||
const char *source = nullptr;
|
||||
for (size_t i = 0; source == nullptr && i < kernel_names.size(); i++)
|
||||
source = intel::get_kernel_source(kernel_names[i]);
|
||||
|
||||
std::string options = kernel_ctx.options();
|
||||
auto *dev_info = utils::downcast<const device_info_t *>(device_info());
|
||||
options += " " + dev_info->get_cl_ext_options();
|
||||
|
||||
stringstream_t code_ss;
|
||||
CHECK(compute::preprocess_headers(code_ss, source, kernel_ctx));
|
||||
std::string code = code_ss.str();
|
||||
|
||||
gpu::intel::compute::program_src_t src(code);
|
||||
if (src) { options += " -g -s " + std::string(src.name()); }
|
||||
|
||||
compute::debugdump_processed_source(
|
||||
code, options, dev_info->get_cl_ext_options());
|
||||
|
||||
xpu::binary_t binary;
|
||||
#if 1
|
||||
CHECK(ocloc_build_kernels(
|
||||
code, options, std::to_string(dev_info->ip_version()), binary));
|
||||
#else
|
||||
CHECK(gpu::intel::l0::compile_ocl_module(
|
||||
device(), context(), code, options, binary));
|
||||
#endif
|
||||
|
||||
const char *code_c = code.c_str();
|
||||
if (kernel_ctx.has_custom_headers() && micro::hasMicrokernels(code_c)) {
|
||||
try {
|
||||
micro::fuseMicrokernels(binary, code_c);
|
||||
} catch (...) { return status::runtime_error; }
|
||||
}
|
||||
|
||||
CHECK(convert_to_l0(*kernels, kernel_names, binary));
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t engine_t::create_kernel_from_binary(compute::kernel_t &kernel,
|
||||
const xpu::binary_t &binary, const char *kernel_name,
|
||||
const compute::program_src_t &src) const {
|
||||
std::vector<const char *> kernel_names = {kernel_name};
|
||||
ze_module_handle_t l0_module = nullptr;
|
||||
std::vector<ze_kernel_handle_t> l0_kernels;
|
||||
CHECK(gpu::intel::l0::create_kernels_from_binary(
|
||||
device(), context(), kernel_names, binary, &l0_module, l0_kernels));
|
||||
auto l0_module_ptr = std::make_shared<module_wrapper_t>(l0_module);
|
||||
|
||||
CHECK(kernel_t::make(kernel, l0_module_ptr, l0_kernels[0], kernel_name));
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t engine_t::create_kernels_from_cache_blob(
|
||||
const cache_blob_t &cache_blob, std::vector<compute::kernel_t> &kernels,
|
||||
const std::vector<const char *> &kernel_names) const {
|
||||
if (kind() != engine_kind::gpu) {
|
||||
assert(!"not expected");
|
||||
return status::invalid_arguments;
|
||||
}
|
||||
|
||||
kernels = std::vector<compute::kernel_t>(kernel_names.size());
|
||||
for (size_t i = 0; i < kernel_names.size(); i++) {
|
||||
if (!kernel_names[i] && kernel_names.size() > 1) continue;
|
||||
std::string kernel_name(kernel_names[i] ? kernel_names[i] : "");
|
||||
|
||||
const uint8_t *binary_data = nullptr;
|
||||
size_t binary_size = 0;
|
||||
CHECK(cache_blob.get_binary(&binary_data, &binary_size));
|
||||
|
||||
xpu::binary_t binary(binary_data, binary_data + binary_size);
|
||||
CHECK(create_kernel_from_binary(kernels[i], binary, kernel_names[i],
|
||||
gpu::intel::compute::program_src_t()));
|
||||
}
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
gpu_utils::device_id_t engine_t::device_id() const {
|
||||
return std::tuple_cat(
|
||||
std::make_tuple(1), gpu::intel::l0::get_device_uuid(device()));
|
||||
}
|
||||
|
||||
const ze_driver_handle_t engine_t::driver() const {
|
||||
return static_cast<const engine_impl_t *>(impl())->driver();
|
||||
}
|
||||
|
||||
const ze_device_handle_t engine_t::device() const {
|
||||
return static_cast<const engine_impl_t *>(impl())->device();
|
||||
}
|
||||
|
||||
const ze_context_handle_t engine_t::context() const {
|
||||
return static_cast<const engine_impl_t *>(impl())->context();
|
||||
}
|
||||
|
||||
bool engine_t::mayiuse_microkernels() const {
|
||||
return ocloc_mayiuse_microkernels(
|
||||
std::string(compute::cl_microkernels_check_kernel_code));
|
||||
}
|
||||
|
||||
status_t engine_t::init_device_info() {
|
||||
device_info_ = std::make_shared<gpu::intel::l0::device_info_t>();
|
||||
CHECK(device_info_->init(this));
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t engine_t::init_device_info(const std::vector<uint8_t> &cache_blob) {
|
||||
gpu_assert(false) << "unimplemented function init_device_info() called";
|
||||
|
||||
return status::runtime_error;
|
||||
}
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
117
src/gpu/intel/l0/engine.hpp
Normal file
117
src/gpu/intel/l0/engine.hpp
Normal file
@ -0,0 +1,117 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef GPU_INTEL_L0_ENGINE_HPP
|
||||
#define GPU_INTEL_L0_ENGINE_HPP
|
||||
|
||||
// #include <list>
|
||||
|
||||
#include "gpu/intel/engine.hpp"
|
||||
#include "gpu/intel/l0/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
struct engine_id_impl_t : public impl::engine_id_impl_t {
|
||||
engine_id_impl_t(const ze_device_handle_t device,
|
||||
const ze_context_handle_t context, engine_kind_t kind,
|
||||
runtime_kind_t runtime_kind, size_t index)
|
||||
: impl::engine_id_impl_t(kind, runtime_kind, index)
|
||||
, device_(device)
|
||||
, context_(context) {}
|
||||
~engine_id_impl_t() override = default;
|
||||
|
||||
private:
|
||||
bool compare_resource(
|
||||
const impl::engine_id_impl_t *id_impl) const override {
|
||||
const auto *typed_id
|
||||
= utils::downcast<const engine_id_impl_t *>(id_impl);
|
||||
return device_ == typed_id->device_ && context_ == typed_id->context_;
|
||||
}
|
||||
|
||||
size_t hash_resource() const override {
|
||||
size_t seed = 0;
|
||||
seed = hash_combine(seed, device_);
|
||||
seed = hash_combine(seed, context_);
|
||||
return seed;
|
||||
}
|
||||
|
||||
ze_device_handle_t device_;
|
||||
ze_context_handle_t context_;
|
||||
|
||||
engine_id_impl_t() = delete;
|
||||
DNNL_DISALLOW_COPY_AND_ASSIGN(engine_id_impl_t);
|
||||
};
|
||||
|
||||
status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind,
|
||||
const ze_driver_handle_t dri, const ze_device_handle_t dev,
|
||||
const ze_context_handle_t ctx, size_t index);
|
||||
|
||||
class engine_t : public intel::engine_t {
|
||||
public:
|
||||
engine_t(ze_driver_handle_t driver, ze_device_handle_t device,
|
||||
ze_context_handle_t context, size_t index);
|
||||
~engine_t() override = default;
|
||||
|
||||
status_t init() override;
|
||||
|
||||
status_t create_stream(
|
||||
impl::stream_t **stream, impl::stream_impl_t *stream_impl) override;
|
||||
|
||||
status_t create_kernel(compute::kernel_t *kernel,
|
||||
jit::generator_base_t *jitter) const override;
|
||||
status_t create_kernel(compute::kernel_t &kernel,
|
||||
const jit::kernel_t &kernel_ir) const override;
|
||||
status_t create_kernels(std::vector<compute::kernel_t> *kernels,
|
||||
const std::vector<const char *> &kernel_names,
|
||||
const compute::kernel_ctx_t &kernel_ctx) const override;
|
||||
status_t create_kernel_from_binary(compute::kernel_t &kernel,
|
||||
const xpu::binary_t &binary, const char *kernel_name,
|
||||
const compute::program_src_t &src) const override;
|
||||
status_t create_kernels_from_cache_blob(const cache_blob_t &cache_blob,
|
||||
std::vector<compute::kernel_t> &kernels,
|
||||
const std::vector<const char *> &kernel_names) const override;
|
||||
|
||||
gpu::intel::gpu_utils::device_id_t device_id() const override;
|
||||
|
||||
const ze_driver_handle_t driver() const;
|
||||
const ze_device_handle_t device() const;
|
||||
const ze_context_handle_t context() const;
|
||||
|
||||
bool mayiuse_microkernels() const;
|
||||
|
||||
private:
|
||||
status_t init_device_info() override;
|
||||
status_t init_device_info(const std::vector<uint8_t> &cache_blob) override;
|
||||
|
||||
status_t convert_to_l0(std::vector<gpu::intel::compute::kernel_t> &kernels,
|
||||
const std::vector<const char *> &kernel_names,
|
||||
xpu::binary_t &binary) const;
|
||||
|
||||
engine_t() = delete;
|
||||
DNNL_DISALLOW_COPY_AND_ASSIGN(engine_t);
|
||||
};
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
||||
|
||||
#endif // GPU_INTEL_L0_ENGINE_HPP
|
88
src/gpu/intel/l0/engine_factory.cpp
Normal file
88
src/gpu/intel/l0/engine_factory.cpp
Normal file
@ -0,0 +1,88 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "gpu/intel/l0/engine_factory.hpp"
|
||||
#include "gpu/intel/l0/engine.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
engine_factory_t::engine_factory_t(engine_kind_t engine_kind)
|
||||
: engine_kind_(engine_kind) {
|
||||
assert(utils::one_of(engine_kind_, engine_kind::gpu));
|
||||
}
|
||||
|
||||
size_t engine_factory_t::count() const {
|
||||
uint32_t driver_count = 0;
|
||||
l0::zeDriverGet(&driver_count, nullptr);
|
||||
|
||||
std::vector<ze_driver_handle_t> drivers(driver_count);
|
||||
l0::zeDriverGet(&driver_count, drivers.data());
|
||||
|
||||
uint32_t device_count = 0;
|
||||
l0::zeDeviceGet(drivers[0], &device_count, nullptr);
|
||||
|
||||
return device_count;
|
||||
}
|
||||
|
||||
status_t engine_factory_t::engine_create(
|
||||
impl::engine_t **engine, size_t index) const {
|
||||
ze_driver_handle_t driver = nullptr;
|
||||
ze_device_handle_t device = nullptr;
|
||||
ze_context_handle_t context = nullptr;
|
||||
|
||||
uint32_t driver_count = 0;
|
||||
CHECK(l0::zeDriverGet(&driver_count, nullptr));
|
||||
|
||||
std::vector<ze_driver_handle_t> drivers(driver_count);
|
||||
CHECK(l0::zeDriverGet(&driver_count, drivers.data()));
|
||||
driver = drivers[0];
|
||||
|
||||
uint32_t device_count = 0;
|
||||
CHECK(l0::zeDeviceGet(driver, &device_count, nullptr));
|
||||
VERROR_ENGINE(index < device_count, status::invalid_arguments,
|
||||
"asked for device %zu but only %u devices are found", index,
|
||||
device_count);
|
||||
|
||||
std::vector<ze_device_handle_t> devices(device_count);
|
||||
CHECK(l0::zeDeviceGet(driver, &device_count, devices.data()));
|
||||
device = devices[index];
|
||||
|
||||
ze_context_desc_t context_desc = {};
|
||||
context_desc.stype = ZE_STRUCTURE_TYPE_CONTEXT_DESC;
|
||||
context_desc.pNext = nullptr;
|
||||
context_desc.flags = 0;
|
||||
|
||||
CHECK(l0::zeContextCreate(driver, &context_desc, &context));
|
||||
|
||||
return engine_create(engine, driver, device, context, index);
|
||||
}
|
||||
|
||||
status_t engine_factory_t::engine_create(impl::engine_t **engine,
|
||||
const ze_driver_handle_t driver, const ze_device_handle_t device,
|
||||
const ze_context_handle_t context, size_t index) const {
|
||||
return gpu::intel::l0::engine_create(
|
||||
engine, engine_kind_, driver, device, context, index);
|
||||
}
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
59
src/gpu/intel/l0/engine_factory.hpp
Normal file
59
src/gpu/intel/l0/engine_factory.hpp
Normal file
@ -0,0 +1,59 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2019-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef GPU_INTEL_L0_ENGINE_FACTORY_HPP
|
||||
#define GPU_INTEL_L0_ENGINE_FACTORY_HPP
|
||||
|
||||
#include "common/engine.hpp"
|
||||
#include "gpu/intel/l0/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
class engine_factory_t : public impl::engine_factory_t {
|
||||
public:
|
||||
engine_factory_t(engine_kind_t engine_kind);
|
||||
~engine_factory_t() override = default;
|
||||
|
||||
size_t count() const override;
|
||||
status_t engine_create(
|
||||
impl::engine_t **engine, size_t index) const override;
|
||||
status_t engine_create(impl::engine_t **engine,
|
||||
const ze_driver_handle_t adriver, const ze_device_handle_t adevice,
|
||||
const ze_context_handle_t acontext, size_t index) const;
|
||||
|
||||
private:
|
||||
engine_kind_t engine_kind_;
|
||||
|
||||
engine_factory_t() = delete;
|
||||
DNNL_DISALLOW_COPY_AND_ASSIGN(engine_factory_t);
|
||||
};
|
||||
|
||||
inline std::unique_ptr<engine_factory_t> get_engine_factory(
|
||||
engine_kind_t engine_kind) {
|
||||
return std::unique_ptr<engine_factory_t>(new engine_factory_t(engine_kind));
|
||||
};
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
||||
|
||||
#endif // GPU_INTEL_L0_ENGINE_FACTORY_HPP
|
215
src/gpu/intel/l0/kernel.cpp
Normal file
215
src/gpu/intel/l0/kernel.cpp
Normal file
@ -0,0 +1,215 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "gpu/intel/l0/kernel.hpp"
|
||||
#include "gpu/intel/l0/context.hpp"
|
||||
#include "gpu/intel/l0/engine.hpp"
|
||||
#include "gpu/intel/l0/memory_storage.hpp"
|
||||
#include "gpu/intel/l0/stream.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
// This class is to get around std::make_shared requirement to have a public
|
||||
// constructor. We keep the original constructor as private but expose it here
|
||||
// to use with std::make_shared.
|
||||
class kernel_compat_t : public kernel_t {
|
||||
public:
|
||||
template <typename... Args>
|
||||
kernel_compat_t(Args &&...args) : kernel_t(std::forward<Args>(args)...) {}
|
||||
};
|
||||
|
||||
status_t kernel_t::make(compute::kernel_t &compute_kernel,
|
||||
const std::shared_ptr<module_wrapper_t> module_ptr,
|
||||
const ze_kernel_handle_t kernel_ptr, const std::string &kernel_name) {
|
||||
compute_kernel = compute::kernel_t(std::make_shared<kernel_compat_t>(
|
||||
module_ptr, kernel_ptr, kernel_name));
|
||||
return status::success;
|
||||
}
|
||||
|
||||
kernel_t::kernel_t(const std::shared_ptr<module_wrapper_t> module_ptr,
|
||||
const ze_kernel_handle_t kernel_ptr, const std::string &kernel_name)
|
||||
: module_(module_ptr), kernel_(kernel_ptr), kernel_name_(kernel_name) {}
|
||||
|
||||
kernel_t::~kernel_t() {
|
||||
l0::zeKernelDestroy(kernel_);
|
||||
}
|
||||
|
||||
status_t kernel_t::check_alignment(
|
||||
const compute::kernel_arg_list_t &arg_list) const {
|
||||
for (int i = 0; i < arg_list.nargs(); ++i) {
|
||||
auto &arg = arg_list.get(i);
|
||||
if (!arg.is_global()) continue;
|
||||
|
||||
auto *mem_storage = static_cast<const memory_storage_t *>(arg.value());
|
||||
if (!*mem_storage) continue;
|
||||
|
||||
CHECK(compute::kernel_impl_t::check_alignment(
|
||||
mem_storage->data_handle(), i));
|
||||
}
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t kernel_t::set_arg(
|
||||
int arg_index, size_t arg_size, const void *arg_value) const {
|
||||
return l0::zeKernelSetArgumentValue(
|
||||
kernel_, arg_index, arg_size, arg_value);
|
||||
}
|
||||
|
||||
status_t kernel_t::parallel_for(impl::stream_t &stream,
|
||||
const compute::nd_range_t &range,
|
||||
const compute::kernel_arg_list_t &arg_list, const xpu::event_t &deps,
|
||||
xpu::event_t &out_dep) {
|
||||
CHECK(check_scalar_arguments(arg_list));
|
||||
CHECK(check_alignment(arg_list));
|
||||
|
||||
auto l0_stream = utils::downcast<stream_t *>(&stream);
|
||||
auto l0_engine = l0_stream->l0_engine();
|
||||
auto l0_device_info = l0_engine->device_info();
|
||||
|
||||
const size_t pointer_size = l0_device_info->device_address_bits() / 8;
|
||||
|
||||
size_t param_bytes = 0;
|
||||
for (int i = 0; i < arg_list.nargs(); ++i) {
|
||||
auto &arg = arg_list.get(i);
|
||||
if (arg.is_global()) {
|
||||
auto *mem_storage
|
||||
= static_cast<const memory_storage_t *>(arg.value());
|
||||
if (!mem_storage->is_null()) {
|
||||
auto memory_storage_ctx
|
||||
= utils::downcast<engine_t *>(mem_storage->engine())
|
||||
->context();
|
||||
if (l0_engine->context() != memory_storage_ctx) {
|
||||
VERROR(primitive, gpu,
|
||||
"mismatched Level Zero context for "
|
||||
"primitive/memory");
|
||||
return status::invalid_arguments;
|
||||
}
|
||||
|
||||
void *ptr = mem_storage->ptr();
|
||||
CHECK(set_arg(i, pointer_size, &ptr));
|
||||
param_bytes += pointer_size;
|
||||
} else {
|
||||
CHECK(set_arg(i, pointer_size, nullptr));
|
||||
param_bytes += pointer_size;
|
||||
}
|
||||
} else if (arg.is_local()) {
|
||||
CHECK(set_arg(i, arg.size(), arg.value()));
|
||||
param_bytes += pointer_size;
|
||||
} else {
|
||||
CHECK(set_arg(i, arg.size(), arg.value()));
|
||||
param_bytes += arg.size();
|
||||
}
|
||||
}
|
||||
if (param_bytes > l0_device_info->max_kernel_param_size()) {
|
||||
VERROR(primitive, gpu,
|
||||
"parameter bytes requirements greater than device supports");
|
||||
return status::invalid_arguments;
|
||||
}
|
||||
|
||||
if (range.is_zero()) { return status::success; }
|
||||
|
||||
std::vector<uint32_t> global_size(3, 1);
|
||||
switch (range.global_range().ndims()) {
|
||||
case 3: global_size[2] = static_cast<uint32_t>(range.global_range()[2]);
|
||||
case 2: global_size[1] = static_cast<uint32_t>(range.global_range()[1]);
|
||||
case 1:
|
||||
global_size[0] = static_cast<uint32_t>(range.global_range()[0]);
|
||||
break;
|
||||
default:
|
||||
VERROR(primitive, gpu,
|
||||
"incorrect number of global range dimensions");
|
||||
return status::invalid_arguments;
|
||||
}
|
||||
|
||||
std::vector<uint32_t> group_size(3, 1);
|
||||
if (range.local_range()) {
|
||||
switch (range.local_range().ndims()) {
|
||||
case 3:
|
||||
group_size[2] = static_cast<uint32_t>(range.local_range()[2]);
|
||||
case 2:
|
||||
group_size[1] = static_cast<uint32_t>(range.local_range()[1]);
|
||||
case 1:
|
||||
group_size[0] = static_cast<uint32_t>(range.local_range()[0]);
|
||||
break;
|
||||
default:
|
||||
VERROR(primitive, gpu,
|
||||
"incorrect number of local range dimensions");
|
||||
return status::invalid_arguments;
|
||||
}
|
||||
} else {
|
||||
CHECK(l0::zeKernelSuggestGroupSize(kernel_, global_size[0],
|
||||
global_size[1], global_size[2], &group_size[0], &group_size[1],
|
||||
&group_size[2]));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < global_size.size(); i++) {
|
||||
if (global_size[i] % group_size[i] != 0) {
|
||||
VERROR(primitive, gpu, "only uniform work-groups are supported");
|
||||
return status::invalid_arguments;
|
||||
}
|
||||
}
|
||||
|
||||
CHECK(l0::zeKernelSetGroupSize(
|
||||
kernel_, group_size[0], group_size[1], group_size[2]));
|
||||
ze_group_count_t group_count = {global_size[0] / group_size[0],
|
||||
global_size[1] / group_size[1], global_size[2] / group_size[2]};
|
||||
|
||||
std::vector<ze_event_handle_t> l0_deps
|
||||
= utils::downcast<const event_t *>(&deps)->events_;
|
||||
std::vector<ze_event_handle_t> l0_out_deps
|
||||
= utils::downcast<const event_t *>(&out_dep)->events_;
|
||||
|
||||
event_ = l0_stream->create_event();
|
||||
ze_event_handle_t out_event = *(event_.get());
|
||||
|
||||
CHECK(l0::zeCommandListAppendLaunchKernel(l0_stream->list(), kernel_,
|
||||
&group_count, out_event, static_cast<uint32_t>(l0_deps.size()),
|
||||
l0_deps.size() ? l0_deps.data() : nullptr));
|
||||
|
||||
if (out_event) l0_out_deps.push_back(out_event);
|
||||
if (stream.is_profiling_enabled()) {
|
||||
l0_stream->profiler().register_event(
|
||||
utils::make_unique<event_t>(std::move(out_event)));
|
||||
}
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t kernel_t::get_kernel_binary(xpu::binary_t &binary) const {
|
||||
return l0::get_kernel_binary(kernel_, binary);
|
||||
}
|
||||
|
||||
std::string kernel_t::name() const {
|
||||
return kernel_name_;
|
||||
}
|
||||
|
||||
status_t kernel_t::dump() const {
|
||||
xpu::binary_t binary;
|
||||
CHECK(get_kernel_binary(binary));
|
||||
|
||||
return gpu_utils::dump_kernel_binary(binary, kernel_name_);
|
||||
}
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
76
src/gpu/intel/l0/kernel.hpp
Normal file
76
src/gpu/intel/l0/kernel.hpp
Normal file
@ -0,0 +1,76 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef GPU_INTEL_L0_KERNEL_HPP
|
||||
#define GPU_INTEL_L0_KERNEL_HPP
|
||||
|
||||
#include <thread>
|
||||
|
||||
#include "common/rw_mutex.hpp"
|
||||
#include "gpu/intel/compute/kernel.hpp"
|
||||
#include "gpu/intel/l0/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
class kernel_t : public compute::kernel_impl_t {
|
||||
public:
|
||||
static status_t make(compute::kernel_t &compute_kernel,
|
||||
const std::shared_ptr<module_wrapper_t> module_ptr,
|
||||
const ze_kernel_handle_t kernel_ptr,
|
||||
const std::string &kernel_name);
|
||||
~kernel_t() override;
|
||||
|
||||
status_t check_alignment(
|
||||
const compute::kernel_arg_list_t &arg_list) const override;
|
||||
status_t set_arg(
|
||||
int arg_index, size_t arg_size, const void *arg_value) const;
|
||||
status_t parallel_for(impl::stream_t &stream,
|
||||
const compute::nd_range_t &range,
|
||||
const compute::kernel_arg_list_t &arg_list,
|
||||
const xpu::event_t &deps, xpu::event_t &out_dep) override;
|
||||
|
||||
status_t get_kernel_binary(xpu::binary_t &binary) const override;
|
||||
std::string name() const override;
|
||||
status_t dump() const override;
|
||||
|
||||
private:
|
||||
friend class kernel_compat_t;
|
||||
kernel_t(const std::shared_ptr<module_wrapper_t> module_ptr,
|
||||
const ze_kernel_handle_t kernel_ptr,
|
||||
const std::string &kernel_name);
|
||||
|
||||
std::shared_ptr<module_wrapper_t> module_;
|
||||
ze_kernel_handle_t kernel_;
|
||||
std::string kernel_name_;
|
||||
|
||||
std::shared_ptr<ze_event_pool_handle_t> event_pool_;
|
||||
std::shared_ptr<event_wrapper_t> event_;
|
||||
|
||||
kernel_t() = delete;
|
||||
DNNL_DISALLOW_COPY_AND_ASSIGN(kernel_t);
|
||||
};
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
||||
|
||||
#endif // GPU_INTEL_L0_KERNEL_HPP
|
215
src/gpu/intel/l0/memory_storage.cpp
Normal file
215
src/gpu/intel/l0/memory_storage.cpp
Normal file
@ -0,0 +1,215 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2021-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "gpu/intel/l0/memory_storage.hpp"
|
||||
#include "common/memory_map_manager.hpp"
|
||||
#include "gpu/intel/l0/engine.hpp"
|
||||
#include "gpu/intel/l0/stream.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
status_t memory_storage_t::get_data_handle(void **handle) const {
|
||||
*handle = ptr_.get();
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t memory_storage_t::set_data_handle(void *handle) {
|
||||
ptr_ = decltype(ptr_)(handle, [](void *) {});
|
||||
kind_ = get_memory_storage_kind(
|
||||
get_pointer_type(l0_engine()->context(), handle));
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
bool memory_storage_t::is_host_accessible() const {
|
||||
return utils::one_of(kind_, memory_storage_kind_t::host,
|
||||
memory_storage_kind_t::shared, memory_storage_kind_t::unknown);
|
||||
}
|
||||
|
||||
struct map_usm_tag;
|
||||
|
||||
status_t memory_storage_t::map_data(
|
||||
void **mapped_ptr, impl::stream_t *stream, size_t size) const {
|
||||
if (is_host_accessible()) {
|
||||
*mapped_ptr = ptr();
|
||||
return status::success;
|
||||
}
|
||||
|
||||
if (!ptr() || size == 0) {
|
||||
*mapped_ptr = nullptr;
|
||||
return status::success;
|
||||
}
|
||||
|
||||
if (!stream) CHECK(engine()->get_service_stream(stream));
|
||||
|
||||
void *host_ptr = malloc_host(size);
|
||||
if (!host_ptr) return status::out_of_memory;
|
||||
|
||||
auto leak_guard = decltype(ptr_)(host_ptr, [this](void *p) { free(p); });
|
||||
CHECK(memcpy(stream, host_ptr, ptr(), size));
|
||||
CHECK(stream->wait());
|
||||
leak_guard.release();
|
||||
|
||||
auto *usm_ptr_for_unmap = ptr();
|
||||
auto unmap_callback = [size, usm_ptr_for_unmap, this](
|
||||
impl::stream_t *stream, void *mapped_ptr) {
|
||||
CHECK(memcpy(stream, usm_ptr_for_unmap, mapped_ptr, size));
|
||||
CHECK(stream->wait());
|
||||
free(mapped_ptr);
|
||||
|
||||
return status::success;
|
||||
};
|
||||
|
||||
auto &map_manager = memory_map_manager_t<map_usm_tag>::instance();
|
||||
|
||||
*mapped_ptr = host_ptr;
|
||||
|
||||
return map_manager.map(this, stream, *mapped_ptr, unmap_callback);
|
||||
}
|
||||
|
||||
status_t memory_storage_t::unmap_data(
|
||||
void *mapped_ptr, impl::stream_t *stream) const {
|
||||
if (!mapped_ptr || is_host_accessible()) return status::success;
|
||||
|
||||
if (!stream) CHECK(engine()->get_service_stream(stream));
|
||||
|
||||
auto &map_manager = memory_map_manager_t<map_usm_tag>::instance();
|
||||
|
||||
return map_manager.unmap(this, stream, mapped_ptr);
|
||||
}
|
||||
|
||||
std::unique_ptr<impl::memory_storage_t> memory_storage_t::get_sub_storage(
|
||||
size_t offset, size_t size) const {
|
||||
void *sub_ptr
|
||||
= ptr_ ? reinterpret_cast<uint8_t *>(ptr_.get()) + offset : nullptr;
|
||||
|
||||
auto storage = utils::make_unique<memory_storage_t>(engine(), kind_);
|
||||
if (!storage) return nullptr;
|
||||
|
||||
auto status = storage->init(memory_flags_t::use_runtime_ptr, size, sub_ptr);
|
||||
if (status != status::success) return nullptr;
|
||||
|
||||
// XXX: Clang has a bug that prevents implicit conversion.
|
||||
return std::unique_ptr<memory_storage_t>(storage.release());
|
||||
}
|
||||
|
||||
std::unique_ptr<impl::memory_storage_t> memory_storage_t::clone() const {
|
||||
auto storage = utils::make_unique<memory_storage_t>(engine(), kind_);
|
||||
if (!storage) return nullptr;
|
||||
|
||||
auto status = storage->init(memory_flags_t::use_runtime_ptr, 0, nullptr);
|
||||
if (status != status::success) return nullptr;
|
||||
|
||||
storage->ptr_ = decltype(ptr_)(ptr_.get(), [](void *) {});
|
||||
storage->kind_ = kind_;
|
||||
|
||||
// XXX: Clang has a bug that prevents implicit conversion.
|
||||
return std::unique_ptr<memory_storage_t>(storage.release());
|
||||
}
|
||||
|
||||
status_t memory_storage_t::init_allocate(size_t size) {
|
||||
if (kind_ == memory_storage_kind_t::unknown)
|
||||
kind_ = memory_storage_kind_t::device;
|
||||
|
||||
void *ptr_alloc = nullptr;
|
||||
|
||||
switch (kind_) {
|
||||
case memory_storage_kind_t::host: ptr_alloc = malloc_host(size); break;
|
||||
case memory_storage_kind_t::device:
|
||||
ptr_alloc = malloc_device(size);
|
||||
break;
|
||||
case memory_storage_kind_t::shared:
|
||||
ptr_alloc = malloc_shared(size);
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
if (!ptr_alloc) return status::out_of_memory;
|
||||
|
||||
ptr_ = decltype(ptr_)(ptr_alloc, [&](void *ptr) { free(ptr); });
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
void *memory_storage_t::malloc_host(size_t size) const {
|
||||
void *pptr = nullptr;
|
||||
|
||||
ze_host_mem_alloc_desc_t host_mem_alloc_desc = {};
|
||||
host_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
|
||||
host_mem_alloc_desc.pNext = nullptr;
|
||||
host_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW;
|
||||
|
||||
l0::zeMemAllocHost(
|
||||
l0_engine()->context(), &host_mem_alloc_desc, size, 0, &pptr);
|
||||
|
||||
return pptr;
|
||||
}
|
||||
|
||||
void *memory_storage_t::malloc_device(size_t size) const {
|
||||
void *pptr = nullptr;
|
||||
|
||||
ze_device_mem_alloc_desc_t device_mem_alloc_desc = {};
|
||||
device_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
|
||||
device_mem_alloc_desc.pNext = nullptr;
|
||||
device_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW;
|
||||
device_mem_alloc_desc.ordinal = 0;
|
||||
|
||||
l0::zeMemAllocDevice(l0_engine()->context(), &device_mem_alloc_desc, size,
|
||||
0, l0_engine()->device(), &pptr);
|
||||
|
||||
return pptr;
|
||||
}
|
||||
|
||||
void *memory_storage_t::malloc_shared(size_t size) const {
|
||||
void *pptr = nullptr;
|
||||
|
||||
ze_device_mem_alloc_desc_t device_mem_alloc_desc = {};
|
||||
device_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
|
||||
device_mem_alloc_desc.pNext = nullptr;
|
||||
device_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW;
|
||||
device_mem_alloc_desc.ordinal = 0;
|
||||
|
||||
ze_host_mem_alloc_desc_t host_mem_alloc_desc = {};
|
||||
host_mem_alloc_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
|
||||
host_mem_alloc_desc.pNext = nullptr;
|
||||
host_mem_alloc_desc.flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW;
|
||||
|
||||
l0::zeMemAllocShared(l0_engine()->context(), &device_mem_alloc_desc,
|
||||
&host_mem_alloc_desc, size, 0, l0_engine()->device(), &pptr);
|
||||
|
||||
return pptr;
|
||||
}
|
||||
|
||||
void memory_storage_t::free(void *ptr) const {
|
||||
l0::zeMemFree(l0_engine()->context(), ptr);
|
||||
}
|
||||
|
||||
status_t memory_storage_t::memcpy(
|
||||
impl::stream_t *stream, void *dst, const void *src, size_t size) const {
|
||||
auto *l0_stream = utils::downcast<stream_t *>(stream);
|
||||
return l0::zeCommandListAppendMemoryCopy(
|
||||
l0_stream->list(), dst, src, size, nullptr, 0, nullptr);
|
||||
}
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
91
src/gpu/intel/l0/memory_storage.hpp
Normal file
91
src/gpu/intel/l0/memory_storage.hpp
Normal file
@ -0,0 +1,91 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2021-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef GPU_INTEL_L0_MEMORY_STORAGE_HPP
|
||||
#define GPU_INTEL_L0_MEMORY_STORAGE_HPP
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "common/c_types_map.hpp"
|
||||
#include "common/memory_storage.hpp"
|
||||
#include "common/utils.hpp"
|
||||
|
||||
#include "gpu/intel/l0/engine.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
enum class memory_storage_kind_t { unknown, host, device, shared };
|
||||
inline memory_storage_kind_t get_memory_storage_kind(ze_memory_type_t type) {
|
||||
switch (type) {
|
||||
case ZE_MEMORY_TYPE_HOST: return memory_storage_kind_t::host;
|
||||
case ZE_MEMORY_TYPE_DEVICE: return memory_storage_kind_t::device;
|
||||
case ZE_MEMORY_TYPE_SHARED: return memory_storage_kind_t::shared;
|
||||
default: return memory_storage_kind_t::unknown;
|
||||
}
|
||||
};
|
||||
|
||||
class memory_storage_t : public impl::memory_storage_t {
|
||||
public:
|
||||
memory_storage_t(impl::engine_t *engine, memory_storage_kind_t kind)
|
||||
: impl::memory_storage_t(engine), kind_(kind) {}
|
||||
|
||||
void *ptr() const { return ptr_.get(); }
|
||||
|
||||
status_t get_data_handle(void **handle) const override;
|
||||
status_t set_data_handle(void *handle) override;
|
||||
|
||||
bool is_host_accessible() const override;
|
||||
|
||||
status_t map_data(void **mapped_ptr, impl::stream_t *stream,
|
||||
size_t size) const override;
|
||||
status_t unmap_data(
|
||||
void *mapped_ptr, impl::stream_t *stream) const override;
|
||||
|
||||
std::unique_ptr<impl::memory_storage_t> get_sub_storage(
|
||||
size_t offset, size_t size) const override;
|
||||
std::unique_ptr<impl::memory_storage_t> clone() const override;
|
||||
|
||||
private:
|
||||
status_t init_allocate(size_t size) override;
|
||||
|
||||
gpu::intel::l0::engine_t *l0_engine() const {
|
||||
return utils::downcast<gpu::intel::l0::engine_t *>(engine());
|
||||
}
|
||||
|
||||
void *malloc_host(size_t size) const;
|
||||
void *malloc_device(size_t size) const;
|
||||
void *malloc_shared(size_t size) const;
|
||||
void free(void *ptr) const;
|
||||
status_t memcpy(impl::stream_t *stream, void *dst, const void *src,
|
||||
size_t size) const;
|
||||
|
||||
std::unique_ptr<void, std::function<void(void *)>> ptr_;
|
||||
memory_storage_kind_t kind_ = memory_storage_kind_t::unknown;
|
||||
|
||||
DNNL_DISALLOW_COPY_AND_ASSIGN(memory_storage_t);
|
||||
};
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
||||
|
||||
#endif // GPU_INTEL_L0_MEMORY_STORAGE_HPP
|
217
src/gpu/intel/l0/stream.cpp
Normal file
217
src/gpu/intel/l0/stream.cpp
Normal file
@ -0,0 +1,217 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "gpu/intel/l0/stream.hpp"
|
||||
#include "gpu/intel/l0/engine.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
status_t stream_t::create_stream(impl::stream_t **stream,
|
||||
impl::engine_t *engine, impl::stream_impl_t *stream_impl) {
|
||||
std::unique_ptr<intel::l0::stream_t> s(
|
||||
new intel::l0::stream_t(engine, stream_impl));
|
||||
if (!s) return status::out_of_memory;
|
||||
|
||||
*stream = s.release();
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
void stream_t::before_exec_hook() {
|
||||
if (is_profiling_enabled()) profiler_->start_profiling();
|
||||
}
|
||||
|
||||
void stream_t::after_exec_hook() {
|
||||
l0_ctx().set_deps(event_t());
|
||||
|
||||
if (is_profiling_enabled()) profiler_->stop_profiling();
|
||||
}
|
||||
|
||||
status_t stream_t::reset_profiling() {
|
||||
if (!is_profiling_enabled()) return status::invalid_arguments;
|
||||
|
||||
profiler_->reset();
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t stream_t::get_profiling_data(profiling_data_kind_t data_kind,
|
||||
int *num_entries, uint64_t *data) const {
|
||||
if (!is_profiling_enabled()) return status::invalid_arguments;
|
||||
|
||||
return profiler_->get_info(data_kind, num_entries, data);
|
||||
}
|
||||
|
||||
stream_impl_t::stream_impl_t(unsigned flags, ze_command_list_handle_t list)
|
||||
: impl::stream_impl_t(flags)
|
||||
, allocated_(false)
|
||||
, list_(list)
|
||||
, event_pool_(nullptr) {
|
||||
l0::zeCommandListGetContextHandle(list_, &context_);
|
||||
if (flags & stream_flags::out_of_order || is_profiling_enabled())
|
||||
create_event_pool();
|
||||
}
|
||||
|
||||
stream_impl_t::stream_impl_t(
|
||||
unsigned flags, ze_context_handle_t context, ze_device_handle_t device)
|
||||
: impl::stream_impl_t(flags)
|
||||
, context_(context)
|
||||
, allocated_(true)
|
||||
, event_pool_(nullptr) {
|
||||
ze_command_queue_desc_t command_queue_desc = {};
|
||||
command_queue_desc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
|
||||
command_queue_desc.pNext = nullptr;
|
||||
command_queue_desc.ordinal = 0;
|
||||
command_queue_desc.index = 0;
|
||||
command_queue_desc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER;
|
||||
command_queue_desc.mode = ZE_COMMAND_QUEUE_MODE_DEFAULT;
|
||||
command_queue_desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
|
||||
|
||||
l0::zeCommandListCreateImmediate(
|
||||
context_, device, &command_queue_desc, &list_);
|
||||
|
||||
if (flags & stream_flags::out_of_order || is_profiling_enabled())
|
||||
create_event_pool();
|
||||
}
|
||||
|
||||
void stream_impl_t::create_event_pool() {
|
||||
ze_event_pool_desc_t event_pool_desc = {};
|
||||
event_pool_desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
|
||||
event_pool_desc.pNext = nullptr;
|
||||
event_pool_desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
|
||||
if (is_profiling_enabled())
|
||||
event_pool_desc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
|
||||
event_pool_desc.count = 16384;
|
||||
|
||||
ze_event_pool_handle_t event_pool;
|
||||
l0::zeEventPoolCreate(context_, &event_pool_desc, 0, nullptr, &event_pool);
|
||||
event_pool_ = std::make_shared<event_pool_wrapper_t>(event_pool);
|
||||
}
|
||||
|
||||
stream_impl_t::~stream_impl_t() {
|
||||
wait();
|
||||
if (allocated_) l0::zeCommandListDestroy(list_);
|
||||
}
|
||||
|
||||
xpu::context_t &stream_impl_t::ctx() {
|
||||
return l0_ctx();
|
||||
}
|
||||
|
||||
const xpu::context_t &stream_impl_t::ctx() const {
|
||||
return l0_ctx();
|
||||
}
|
||||
|
||||
context_t &stream_impl_t::l0_ctx() {
|
||||
const context_t &ctx = const_cast<const stream_impl_t *>(this)->l0_ctx();
|
||||
return *const_cast<context_t *>(&ctx);
|
||||
}
|
||||
|
||||
const context_t &stream_impl_t::l0_ctx() const {
|
||||
static context_t empty_ctx;
|
||||
return ctx_.get(empty_ctx);
|
||||
}
|
||||
|
||||
ze_event_handle_t stream_impl_t::get_output_event() const {
|
||||
auto &deps = event_t::from(ctx().get_deps()).events_;
|
||||
if (deps.size()) return deps[0];
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::shared_ptr<event_wrapper_t> stream_impl_t::create_event() {
|
||||
if (!event_pool_.get()) return std::make_shared<event_wrapper_t>(nullptr);
|
||||
|
||||
ze_event_desc_t event_desc = {};
|
||||
event_desc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
|
||||
event_desc.pNext = nullptr;
|
||||
event_desc.index = static_cast<uint32_t>(events_.size());
|
||||
event_desc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
|
||||
event_desc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
|
||||
|
||||
ze_event_handle_t event;
|
||||
l0::zeEventCreate(*(event_pool_.get()), &event_desc, &event);
|
||||
|
||||
std::shared_ptr<event_wrapper_t> event_ptr
|
||||
= std::make_shared<event_wrapper_t>(event);
|
||||
events_.push_back(event_ptr);
|
||||
|
||||
return event_ptr;
|
||||
}
|
||||
|
||||
std::shared_ptr<event_pool_wrapper_t> stream_impl_t::get_event_pool() {
|
||||
return event_pool_;
|
||||
}
|
||||
|
||||
ze_command_list_handle_t stream_impl_t::list() {
|
||||
return list_;
|
||||
}
|
||||
|
||||
status_t stream_impl_t::wait() {
|
||||
CHECK(l0::zeCommandListHostSynchronize(list_, UINT64_MAX));
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t stream_impl_t::barrier() {
|
||||
CHECK(l0::zeCommandListAppendBarrier(list_, nullptr, 0, nullptr));
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t stream_impl_t::copy(const impl::memory_storage_t &src,
|
||||
const impl::memory_storage_t &dst, size_t size,
|
||||
const xpu::event_t &deps, xpu::event_t &out_dep) {
|
||||
if (size == 0) return status::success;
|
||||
std::vector<ze_event_handle_t> l0_deps
|
||||
= utils::downcast<const event_t *>(&deps)->events_;
|
||||
|
||||
ze_event_handle_t out_event = *(create_event().get());
|
||||
CHECK(l0::zeCommandListAppendMemoryCopy(list_, dst.data_handle(),
|
||||
src.data_handle(), size, out_event,
|
||||
static_cast<uint32_t>(l0_deps.size()),
|
||||
l0_deps.size() ? l0_deps.data() : nullptr));
|
||||
if (out_event)
|
||||
utils::downcast<event_t *>(&out_dep)->events_.push_back(out_event);
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t stream_impl_t::fill(const impl::memory_storage_t &dst, uint8_t pattern,
|
||||
size_t size, const xpu::event_t &deps, xpu::event_t &out_dep) {
|
||||
if (size == 0) return status::success;
|
||||
std::vector<ze_event_handle_t> l0_deps
|
||||
= utils::downcast<const event_t *>(&deps)->events_;
|
||||
|
||||
ze_event_handle_t out_event = *(create_event().get());
|
||||
CHECK(l0::zeCommandListAppendMemoryFill(list_, dst.data_handle(), &pattern,
|
||||
sizeof(pattern), size, out_event,
|
||||
static_cast<uint32_t>(l0_deps.size()),
|
||||
l0_deps.size() ? l0_deps.data() : nullptr));
|
||||
if (out_event)
|
||||
utils::downcast<event_t *>(&out_dep)->events_.push_back(out_event);
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
138
src/gpu/intel/l0/stream.hpp
Normal file
138
src/gpu/intel/l0/stream.hpp
Normal file
@ -0,0 +1,138 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef GPU_INTEL_L0_STREAM_HPP
|
||||
#define GPU_INTEL_L0_STREAM_HPP
|
||||
|
||||
#include <list>
|
||||
|
||||
#include "common/thread_local_storage.hpp"
|
||||
#include "gpu/intel/l0/context.hpp"
|
||||
#include "gpu/intel/l0/utils/utils.hpp"
|
||||
#include "gpu/intel/stream.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
class stream_impl_t : public impl::stream_impl_t {
|
||||
public:
|
||||
stream_impl_t(unsigned flags, ze_command_list_handle_t list);
|
||||
stream_impl_t(unsigned flags, ze_context_handle_t context,
|
||||
ze_device_handle_t device);
|
||||
~stream_impl_t();
|
||||
|
||||
context_t &l0_ctx();
|
||||
const context_t &l0_ctx() const;
|
||||
xpu::context_t &ctx();
|
||||
const xpu::context_t &ctx() const;
|
||||
ze_event_handle_t get_output_event() const;
|
||||
std::shared_ptr<event_wrapper_t> create_event();
|
||||
std::shared_ptr<event_pool_wrapper_t> get_event_pool();
|
||||
|
||||
ze_command_list_handle_t list();
|
||||
|
||||
status_t wait();
|
||||
status_t barrier();
|
||||
|
||||
status_t copy(const impl::memory_storage_t &src,
|
||||
const impl::memory_storage_t &dst, size_t size,
|
||||
const xpu::event_t &deps, xpu::event_t &out_dep);
|
||||
status_t fill(const impl::memory_storage_t &dst, uint8_t pattern,
|
||||
size_t size, const xpu::event_t &deps, xpu::event_t &out_dep);
|
||||
|
||||
private:
|
||||
void create_event_pool();
|
||||
|
||||
ze_context_handle_t context_;
|
||||
bool allocated_;
|
||||
ze_command_list_handle_t list_;
|
||||
|
||||
std::shared_ptr<event_pool_wrapper_t> event_pool_;
|
||||
std::list<std::shared_ptr<event_wrapper_t>> events_;
|
||||
|
||||
mutable utils::thread_local_storage_t<context_t> ctx_;
|
||||
|
||||
stream_impl_t() = delete;
|
||||
DNNL_DISALLOW_COPY_AND_ASSIGN(stream_impl_t);
|
||||
};
|
||||
|
||||
class stream_t : public intel::stream_t {
|
||||
public:
|
||||
static status_t create_stream(impl::stream_t **stream,
|
||||
impl::engine_t *engine, impl::stream_impl_t *stream_impl);
|
||||
|
||||
stream_impl_t *impl() const {
|
||||
return static_cast<stream_impl_t *>(impl::stream_t::impl_.get());
|
||||
}
|
||||
|
||||
engine_t *l0_engine() const {
|
||||
return utils::downcast<engine_t *>(engine());
|
||||
}
|
||||
|
||||
context_t &l0_ctx() { return impl()->l0_ctx(); }
|
||||
const context_t &l0_ctx() const { return impl()->l0_ctx(); }
|
||||
xpu::context_t &ctx() override { return impl()->ctx(); }
|
||||
const xpu::context_t &ctx() const override { return impl()->ctx(); }
|
||||
ze_event_handle_t get_output_event() const {
|
||||
return impl()->get_output_event();
|
||||
}
|
||||
std::shared_ptr<event_wrapper_t> create_event() {
|
||||
return impl()->create_event();
|
||||
}
|
||||
std::shared_ptr<event_pool_wrapper_t> get_event_pool() {
|
||||
return impl()->get_event_pool();
|
||||
}
|
||||
|
||||
const ze_command_list_handle_t list() const { return impl()->list(); }
|
||||
|
||||
status_t wait() override { return impl()->wait(); }
|
||||
status_t barrier() override { return impl()->barrier(); }
|
||||
|
||||
void before_exec_hook() override;
|
||||
void after_exec_hook() override;
|
||||
status_t reset_profiling() override;
|
||||
status_t get_profiling_data(profiling_data_kind_t data_kind,
|
||||
int *num_entries, uint64_t *data) const override;
|
||||
|
||||
status_t copy(const impl::memory_storage_t &src,
|
||||
const impl::memory_storage_t &dst, size_t size,
|
||||
const xpu::event_t &deps, xpu::event_t &out_dep) override {
|
||||
return impl()->copy(src, dst, size, deps, out_dep);
|
||||
}
|
||||
status_t fill(const impl::memory_storage_t &dst, uint8_t pattern,
|
||||
size_t size, const xpu::event_t &deps,
|
||||
xpu::event_t &out_dep) override {
|
||||
return impl()->fill(dst, pattern, size, deps, out_dep);
|
||||
}
|
||||
|
||||
private:
|
||||
stream_t(impl::engine_t *engine, impl::stream_impl_t *stream_impl)
|
||||
: gpu::intel::stream_t(engine, stream_impl) {}
|
||||
|
||||
stream_t() = delete;
|
||||
DNNL_DISALLOW_COPY_AND_ASSIGN(stream_t);
|
||||
};
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
||||
|
||||
#endif // GPU_INTEL_L0_STREAM_HPP
|
24
src/gpu/intel/l0/utils/CMakeLists.txt
Normal file
24
src/gpu/intel/l0/utils/CMakeLists.txt
Normal file
@ -0,0 +1,24 @@
|
||||
#===============================================================================
|
||||
# Copyright 2025 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#===============================================================================
|
||||
|
||||
file(GLOB_RECURSE SOURCES
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp
|
||||
)
|
||||
|
||||
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_l0_utils)
|
||||
add_library(${OBJ_LIB} OBJECT ${SOURCES})
|
||||
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
|
||||
$<TARGET_OBJECTS:${OBJ_LIB}>)
|
387
src/gpu/intel/l0/utils/utils.cpp
Normal file
387
src/gpu/intel/l0/utils/utils.cpp
Normal file
@ -0,0 +1,387 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "gpu/intel/l0/utils/utils.hpp"
|
||||
#include "gpu/intel/jit/binary_format.hpp"
|
||||
#include "gpu/intel/jit/utils/ngen_type_bridge.hpp"
|
||||
#include "ngen_level_zero.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
event_wrapper_t::event_wrapper_t(ze_event_handle_t event) : event_(event) {}
|
||||
|
||||
event_wrapper_t::~event_wrapper_t() {
|
||||
if (event_) {
|
||||
l0::zeEventHostSynchronize(event_, UINT64_MAX);
|
||||
l0::zeEventDestroy(event_);
|
||||
}
|
||||
}
|
||||
|
||||
event_wrapper_t::operator ze_event_handle_t() const {
|
||||
return event_;
|
||||
}
|
||||
|
||||
event_pool_wrapper_t::event_pool_wrapper_t(ze_event_pool_handle_t event_pool)
|
||||
: event_pool_(event_pool) {}
|
||||
|
||||
event_pool_wrapper_t::~event_pool_wrapper_t() {
|
||||
if (event_pool_) l0::zeEventPoolDestroy(event_pool_);
|
||||
}
|
||||
|
||||
event_pool_wrapper_t::operator ze_event_pool_handle_t() const {
|
||||
return event_pool_;
|
||||
}
|
||||
|
||||
module_wrapper_t::module_wrapper_t(ze_module_handle_t module)
|
||||
: module_(module) {};
|
||||
|
||||
module_wrapper_t::~module_wrapper_t() {
|
||||
if (module_) l0::zeModuleDestroy(module_);
|
||||
};
|
||||
|
||||
module_wrapper_t::operator ze_module_handle_t() const {
|
||||
return module_;
|
||||
}
|
||||
|
||||
status_t get_device_ip(ze_device_handle_t device, uint32_t &ip_version) {
|
||||
ze_device_ip_version_ext_t device_ip_version_ext = {};
|
||||
device_ip_version_ext.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT;
|
||||
device_ip_version_ext.pNext = nullptr;
|
||||
|
||||
ze_device_properties_t device_properties = {};
|
||||
device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
device_properties.pNext = &device_ip_version_ext;
|
||||
|
||||
CHECK(l0::zeDeviceGetProperties(device, &device_properties));
|
||||
|
||||
ip_version = device_ip_version_ext.ipVersion;
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t get_l0_device_enabled_systolic_intel(
|
||||
ze_device_handle_t device, bool &mayiuse_systolic) {
|
||||
ze_intel_device_module_dp_exp_properties_t
|
||||
intel_device_module_dp_exp_properties
|
||||
= {};
|
||||
intel_device_module_dp_exp_properties.stype
|
||||
= ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES;
|
||||
intel_device_module_dp_exp_properties.pNext = nullptr;
|
||||
|
||||
ze_device_module_properties_t device_module_properties = {};
|
||||
device_module_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
|
||||
device_module_properties.pNext = &intel_device_module_dp_exp_properties;
|
||||
|
||||
CHECK(l0::zeDeviceGetModuleProperties(device, &device_module_properties));
|
||||
|
||||
mayiuse_systolic = intel_device_module_dp_exp_properties.flags
|
||||
& ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS;
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t get_l0_device_enabled_native_float_atomics(
|
||||
ze_device_handle_t device, uint64_t &native_extensions) {
|
||||
using namespace gpu::intel::compute;
|
||||
|
||||
ze_float_atomic_ext_properties_t float_atomic_ext_properties = {};
|
||||
float_atomic_ext_properties.stype
|
||||
= ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES;
|
||||
float_atomic_ext_properties.pNext = nullptr;
|
||||
|
||||
ze_device_module_properties_t device_module_properties = {};
|
||||
device_module_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
|
||||
device_module_properties.pNext = &float_atomic_ext_properties;
|
||||
|
||||
CHECK(l0::zeDeviceGetModuleProperties(device, &device_module_properties));
|
||||
|
||||
ze_device_fp_atomic_ext_flags_t atomic_load_store
|
||||
= ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_LOAD_STORE
|
||||
| ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_LOAD_STORE;
|
||||
ze_device_fp_atomic_ext_flags_t atomic_add
|
||||
= ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD
|
||||
| ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_ADD;
|
||||
ze_device_fp_atomic_ext_flags_t atomic_min_max
|
||||
= ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_MIN_MAX
|
||||
| ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_MIN_MAX;
|
||||
|
||||
if ((float_atomic_ext_properties.fp16Flags & atomic_load_store)
|
||||
== atomic_load_store)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp16_atomic_load_store;
|
||||
if ((float_atomic_ext_properties.fp16Flags & atomic_add) == atomic_add)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp16_atomic_add;
|
||||
if ((float_atomic_ext_properties.fp16Flags & atomic_min_max)
|
||||
== atomic_min_max)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp16_atomic_min_max;
|
||||
|
||||
if ((float_atomic_ext_properties.fp32Flags & atomic_load_store)
|
||||
== atomic_load_store)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp32_atomic_load_store;
|
||||
if ((float_atomic_ext_properties.fp32Flags & atomic_add) == atomic_add)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp32_atomic_add;
|
||||
if ((float_atomic_ext_properties.fp32Flags & atomic_min_max)
|
||||
== atomic_min_max)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp32_atomic_min_max;
|
||||
|
||||
if ((float_atomic_ext_properties.fp64Flags & atomic_load_store)
|
||||
== atomic_load_store)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp64_atomic_load_store;
|
||||
if ((float_atomic_ext_properties.fp64Flags & atomic_add) == atomic_add)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp64_atomic_add;
|
||||
if ((float_atomic_ext_properties.fp64Flags & atomic_min_max)
|
||||
== atomic_min_max)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp64_atomic_min_max;
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t get_l0_device_eu_count(ze_device_handle_t device, int &eu_count) {
|
||||
ze_eu_count_ext_t eu_count_ext = {};
|
||||
eu_count_ext.stype = ZE_STRUCTURE_TYPE_EU_COUNT_EXT;
|
||||
eu_count_ext.pNext = nullptr;
|
||||
|
||||
ze_device_properties_t device_properties = {};
|
||||
device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
device_properties.pNext = &eu_count_ext;
|
||||
|
||||
CHECK(l0::zeDeviceGetProperties(device, &device_properties));
|
||||
|
||||
eu_count = eu_count_ext.numTotalEUs;
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device,
|
||||
ze_context_handle_t context, uint32_t &ip_version,
|
||||
compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product_,
|
||||
uint64_t &native_extensions, bool &mayiuse_systolic,
|
||||
bool &mayiuse_ngen_kernels) {
|
||||
using namespace ngen;
|
||||
ngen::Product product = LevelZeroCodeGenerator<HW::Unknown>::detectHWInfo(
|
||||
context, device);
|
||||
|
||||
gpu_arch = jit::convert_ngen_arch_to_dnnl(ngen::getCore(product.family));
|
||||
std::memcpy(&product_, &product, sizeof(ngen::Product));
|
||||
|
||||
mayiuse_systolic = false;
|
||||
if (get_l0_device_enabled_systolic_intel(device, mayiuse_systolic)
|
||||
!= status::success)
|
||||
mayiuse_systolic = false;
|
||||
|
||||
/* Some old drivers do not report systolic availability. Manually override
|
||||
systolic availability based on product family. */
|
||||
switch (product.family) {
|
||||
case ProductFamily::DG2:
|
||||
case ProductFamily::ARL:
|
||||
case ProductFamily::PVC: mayiuse_systolic = true;
|
||||
default: break;
|
||||
}
|
||||
|
||||
CHECK(get_l0_device_enabled_native_float_atomics(
|
||||
device, native_extensions));
|
||||
|
||||
auto status
|
||||
= jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine);
|
||||
if (status != status::success) mayiuse_ngen_kernels = false;
|
||||
|
||||
ip_version = 0;
|
||||
|
||||
return get_device_ip(device, ip_version);
|
||||
}
|
||||
|
||||
xpu::device_uuid_t get_device_uuid(const ze_device_handle_t device) {
|
||||
static_assert(ZE_MAX_DEVICE_UUID_SIZE == 16,
|
||||
"ZE_MAX_DEVICE_UUID_SIZE is expected to be 16");
|
||||
|
||||
ze_device_properties_t device_properties = {};
|
||||
device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
device_properties.pNext = nullptr;
|
||||
|
||||
auto status = l0::zeDeviceGetProperties(device, &device_properties);
|
||||
MAYBE_UNUSED(status);
|
||||
assert(status == status::success);
|
||||
|
||||
const auto &device_id = device_properties.uuid.id;
|
||||
|
||||
uint64_t uuid[ZE_MAX_DEVICE_UUID_SIZE / sizeof(uint64_t)] = {};
|
||||
for (size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) {
|
||||
size_t shift = i % sizeof(uint64_t) * CHAR_BIT;
|
||||
uuid[i / sizeof(uint64_t)] |= (((uint64_t)device_id[i]) << shift);
|
||||
}
|
||||
|
||||
return xpu::device_uuid_t(uuid[0], uuid[1]);
|
||||
}
|
||||
|
||||
status_t get_device_index(const ze_device_handle_t device, size_t *index) {
|
||||
uint32_t driver_count = 0;
|
||||
CHECK(l0::zeDriverGet(&driver_count, nullptr));
|
||||
|
||||
std::vector<ze_driver_handle_t> drivers(driver_count);
|
||||
CHECK(l0::zeDriverGet(&driver_count, drivers.data()));
|
||||
|
||||
uint32_t device_count = 0;
|
||||
CHECK(l0::zeDeviceGet(drivers[0], &device_count, nullptr));
|
||||
|
||||
std::vector<ze_device_handle_t> devices(device_count);
|
||||
CHECK(l0::zeDeviceGet(drivers[0], &device_count, devices.data()));
|
||||
|
||||
for (size_t i = 0; i < device_count; i++) {
|
||||
if (device == devices[i]) {
|
||||
*index = i;
|
||||
|
||||
return status::success;
|
||||
}
|
||||
}
|
||||
|
||||
return status::invalid_arguments;
|
||||
}
|
||||
|
||||
std::string get_kernel_name(const ze_kernel_handle_t kernel) {
|
||||
std::string kernel_name;
|
||||
|
||||
size_t kernel_name_size = 0;
|
||||
l0::zeKernelGetName(kernel, &kernel_name_size, nullptr);
|
||||
|
||||
kernel_name.resize(kernel_name_size, 0);
|
||||
l0::zeKernelGetName(kernel, &kernel_name_size, &kernel_name[0]);
|
||||
|
||||
// Remove the null terminator as std::string already includes it
|
||||
kernel_name.resize(kernel_name_size - 1);
|
||||
|
||||
return kernel_name;
|
||||
}
|
||||
|
||||
status_t get_kernel_binary(
|
||||
const ze_kernel_handle_t kernel, xpu::binary_t &binary) {
|
||||
size_t binary_size = 0;
|
||||
CHECK(l0::zeKernelGetBinaryExp(kernel, &binary_size, nullptr));
|
||||
|
||||
binary.resize(binary_size);
|
||||
CHECK(l0::zeKernelGetBinaryExp(kernel, &binary_size, binary.data()));
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t get_module_binary(
|
||||
const ze_module_handle_t module, xpu::binary_t &binary) {
|
||||
size_t module_binary_size;
|
||||
CHECK(l0::zeModuleGetNativeBinary(module, &module_binary_size, nullptr));
|
||||
|
||||
binary.resize(module_binary_size);
|
||||
CHECK(l0::zeModuleGetNativeBinary(
|
||||
module, &module_binary_size, binary.data()));
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
#define ZE_MODULE_FORMAT_OCLC (ze_module_format_t)3U
|
||||
status_t compile_ocl_module(const ze_device_handle_t device,
|
||||
const ze_context_handle_t context, std::string &code,
|
||||
std::string &options, xpu::binary_t &binary) {
|
||||
ze_module_desc_t module_desc;
|
||||
module_desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
|
||||
module_desc.pNext = nullptr;
|
||||
module_desc.format = ZE_MODULE_FORMAT_OCLC;
|
||||
module_desc.inputSize = code.size();
|
||||
module_desc.pInputModule = reinterpret_cast<const uint8_t *>(code.c_str());
|
||||
module_desc.pBuildFlags = options.c_str();
|
||||
module_desc.pConstants = nullptr;
|
||||
|
||||
ze_module_handle_t module;
|
||||
ze_module_build_log_handle_t module_build_log;
|
||||
if (l0::zeModuleCreate(
|
||||
context, device, &module_desc, &module, &module_build_log)
|
||||
!= status::success) {
|
||||
size_t build_log_size = 0;
|
||||
CHECK(l0::zeModuleBuildLogGetString(
|
||||
module_build_log, &build_log_size, nullptr));
|
||||
char *build_log = new char[build_log_size];
|
||||
CHECK(l0::zeModuleBuildLogGetString(
|
||||
module_build_log, &build_log_size, build_log));
|
||||
std::cout << std::endl << "Build log: " << build_log << std::endl;
|
||||
delete[] build_log;
|
||||
|
||||
return status::runtime_error;
|
||||
}
|
||||
CHECK(l0::zeModuleBuildLogDestroy(module_build_log));
|
||||
|
||||
CHECK(get_module_binary(module, binary));
|
||||
|
||||
CHECK(l0::zeModuleDestroy(module));
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t create_kernels_from_binary(const ze_device_handle_t device,
|
||||
const ze_context_handle_t context,
|
||||
const std::vector<const char *> &kernel_names,
|
||||
const xpu::binary_t &binary, ze_module_handle_t *module,
|
||||
std::vector<ze_kernel_handle_t> &kernels) {
|
||||
ze_module_desc_t module_desc;
|
||||
module_desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
|
||||
module_desc.pNext = nullptr;
|
||||
module_desc.format = ZE_MODULE_FORMAT_NATIVE;
|
||||
module_desc.inputSize = binary.size();
|
||||
module_desc.pInputModule = binary.data();
|
||||
module_desc.pBuildFlags = "";
|
||||
module_desc.pConstants = nullptr;
|
||||
|
||||
CHECK(l0::zeModuleCreate(context, device, &module_desc, module, nullptr));
|
||||
|
||||
kernels.resize(kernel_names.size(), nullptr);
|
||||
for (size_t i = 0; i < kernel_names.size(); i++) {
|
||||
if (kernel_names[i] == nullptr) {
|
||||
kernels[i] = nullptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
ze_kernel_desc_t kernel_desc = {};
|
||||
kernel_desc.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC;
|
||||
kernel_desc.pNext = nullptr;
|
||||
kernel_desc.flags = 0;
|
||||
kernel_desc.pKernelName = kernel_names[i];
|
||||
|
||||
ze_kernel_handle_t kernel;
|
||||
CHECK(l0::zeKernelCreate(*module, &kernel_desc, &kernel));
|
||||
|
||||
kernels[i] = kernel;
|
||||
}
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
ze_memory_type_t get_pointer_type(
|
||||
const ze_context_handle_t context, const void *ptr) {
|
||||
ze_memory_allocation_properties_t memory_allocation_properties;
|
||||
memory_allocation_properties.stype
|
||||
= ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES;
|
||||
memory_allocation_properties.pNext = nullptr;
|
||||
|
||||
l0::zeMemGetAllocProperties(
|
||||
context, ptr, &memory_allocation_properties, nullptr);
|
||||
|
||||
return memory_allocation_properties.type;
|
||||
}
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
238
src/gpu/intel/l0/utils/utils.hpp
Normal file
238
src/gpu/intel/l0/utils/utils.hpp
Normal file
@ -0,0 +1,238 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef GPU_INTEL_L0_UTILS_HPP
|
||||
#define GPU_INTEL_L0_UTILS_HPP
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <dlfcn.h>
|
||||
#elif defined(_WIN32)
|
||||
#include "windows.h"
|
||||
#else
|
||||
#error "Level Zero is supported on Linux and Windows only"
|
||||
#endif
|
||||
|
||||
#include "gpu/intel/compute/kernel.hpp"
|
||||
|
||||
#include "level_zero/ze_api.h"
|
||||
#include "level_zero/ze_intel_gpu.h"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace l0 {
|
||||
|
||||
inline std::string to_string(ze_result_t r) {
|
||||
#define ZE_STATUS_CASE(status) \
|
||||
case status: return #status
|
||||
switch (r) {
|
||||
ZE_STATUS_CASE(ZE_RESULT_SUCCESS);
|
||||
ZE_STATUS_CASE(ZE_RESULT_NOT_READY);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_LOST);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_LINK_FAILURE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_NOT_AVAILABLE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNINITIALIZED);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ARGUMENT);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SIZE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ENUMERATION);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNKNOWN);
|
||||
ZE_STATUS_CASE(ZE_RESULT_FORCE_UINT32);
|
||||
default: return std::to_string((int)r);
|
||||
}
|
||||
#undef ZE_STATUS_CASE
|
||||
};
|
||||
|
||||
#define ZE_CHECK(f) \
|
||||
do { \
|
||||
ze_result_t res_ = (f); \
|
||||
if (res_ != ZE_RESULT_SUCCESS) { \
|
||||
std::string err_str_ = to_string(res_); \
|
||||
VERROR(common, level_zero, "errcode %s", err_str_.c_str()); \
|
||||
return status::runtime_error; \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
inline void *find_symbol(const char *symbol) {
|
||||
#if defined(_WIN32)
|
||||
HMODULE handle = LoadLibraryExA(
|
||||
"ze_loader.dll", nullptr, LOAD_LIBRARY_SEARCH_SYSTEM32);
|
||||
if (!handle) return nullptr;
|
||||
return reinterpret_cast<void *>(GetProcAddress(handle, symbol));
|
||||
#elif defined(__linux__)
|
||||
void *handle = dlopen("libze_loader.so.1", RTLD_NOW | RTLD_LOCAL);
|
||||
if (!handle) return nullptr;
|
||||
return dlsym(handle, symbol);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
F find_ze_symbol(const char *symbol) {
|
||||
auto f = (F)find_symbol(symbol);
|
||||
if (!f) VERROR(common, level_zero, "cannot find symbol: %s", symbol);
|
||||
return f;
|
||||
}
|
||||
#undef L0_LIB_NAME
|
||||
|
||||
#define INDIRECT_L0_CALL(f) \
|
||||
template <typename... Args> \
|
||||
status_t f(Args &&...args) { \
|
||||
const ze_init_flags_t default_ze_flags = 0; \
|
||||
static auto init_ = find_ze_symbol<decltype(&::zeInit)>("zeInit"); \
|
||||
ZE_CHECK(init_(default_ze_flags)); \
|
||||
static auto f_ = find_ze_symbol<decltype(&::f)>(#f); \
|
||||
ZE_CHECK(f_(std::forward<Args>(args)...)); \
|
||||
return status::success; \
|
||||
}
|
||||
INDIRECT_L0_CALL(zeDriverGet)
|
||||
INDIRECT_L0_CALL(zeDriverGetProperties)
|
||||
INDIRECT_L0_CALL(zeDeviceGet)
|
||||
INDIRECT_L0_CALL(zeDeviceGetProperties)
|
||||
INDIRECT_L0_CALL(zeDeviceGetComputeProperties)
|
||||
INDIRECT_L0_CALL(zeDeviceGetModuleProperties)
|
||||
INDIRECT_L0_CALL(zeDeviceGetMemoryAccessProperties)
|
||||
INDIRECT_L0_CALL(zeDeviceGetCacheProperties)
|
||||
INDIRECT_L0_CALL(zeContextCreate)
|
||||
INDIRECT_L0_CALL(zeContextDestroy)
|
||||
INDIRECT_L0_CALL(zeCommandListCreateImmediate)
|
||||
INDIRECT_L0_CALL(zeCommandListDestroy)
|
||||
INDIRECT_L0_CALL(zeCommandListHostSynchronize)
|
||||
INDIRECT_L0_CALL(zeCommandListGetContextHandle)
|
||||
INDIRECT_L0_CALL(zeCommandListAppendBarrier)
|
||||
INDIRECT_L0_CALL(zeCommandListAppendMemoryCopy)
|
||||
INDIRECT_L0_CALL(zeCommandListAppendMemoryFill)
|
||||
INDIRECT_L0_CALL(zeEventPoolCreate)
|
||||
INDIRECT_L0_CALL(zeEventPoolDestroy)
|
||||
INDIRECT_L0_CALL(zeEventCreate)
|
||||
INDIRECT_L0_CALL(zeEventDestroy)
|
||||
INDIRECT_L0_CALL(zeEventHostSynchronize)
|
||||
INDIRECT_L0_CALL(zeMemAllocShared)
|
||||
INDIRECT_L0_CALL(zeMemAllocDevice)
|
||||
INDIRECT_L0_CALL(zeMemAllocHost)
|
||||
INDIRECT_L0_CALL(zeMemFree)
|
||||
INDIRECT_L0_CALL(zeMemGetAllocProperties)
|
||||
INDIRECT_L0_CALL(zeModuleCreate)
|
||||
INDIRECT_L0_CALL(zeModuleDestroy)
|
||||
INDIRECT_L0_CALL(zeModuleBuildLogDestroy)
|
||||
INDIRECT_L0_CALL(zeModuleBuildLogGetString)
|
||||
INDIRECT_L0_CALL(zeModuleGetNativeBinary)
|
||||
INDIRECT_L0_CALL(zeKernelCreate)
|
||||
INDIRECT_L0_CALL(zeKernelDestroy)
|
||||
INDIRECT_L0_CALL(zeKernelSetArgumentValue)
|
||||
INDIRECT_L0_CALL(zeKernelGetName)
|
||||
INDIRECT_L0_CALL(zeKernelGetBinaryExp)
|
||||
INDIRECT_L0_CALL(zeKernelSetGroupSize)
|
||||
INDIRECT_L0_CALL(zeKernelSuggestGroupSize)
|
||||
INDIRECT_L0_CALL(zeCommandListAppendLaunchKernel)
|
||||
#undef INDIRECT_L0_CALL
|
||||
|
||||
class event_wrapper_t {
|
||||
public:
|
||||
event_wrapper_t(ze_event_handle_t event);
|
||||
~event_wrapper_t();
|
||||
operator ze_event_handle_t() const;
|
||||
|
||||
private:
|
||||
ze_event_handle_t event_;
|
||||
|
||||
event_wrapper_t() = delete;
|
||||
DNNL_DISALLOW_COPY_AND_ASSIGN(event_wrapper_t);
|
||||
};
|
||||
|
||||
class event_pool_wrapper_t {
|
||||
public:
|
||||
event_pool_wrapper_t(ze_event_pool_handle_t event_pool);
|
||||
~event_pool_wrapper_t();
|
||||
operator ze_event_pool_handle_t() const;
|
||||
|
||||
private:
|
||||
ze_event_pool_handle_t event_pool_;
|
||||
|
||||
event_pool_wrapper_t() = delete;
|
||||
DNNL_DISALLOW_COPY_AND_ASSIGN(event_pool_wrapper_t);
|
||||
};
|
||||
|
||||
class module_wrapper_t {
|
||||
public:
|
||||
module_wrapper_t(ze_module_handle_t module);
|
||||
~module_wrapper_t();
|
||||
operator ze_module_handle_t() const;
|
||||
|
||||
private:
|
||||
ze_module_handle_t module_;
|
||||
|
||||
module_wrapper_t() = delete;
|
||||
DNNL_DISALLOW_COPY_AND_ASSIGN(module_wrapper_t);
|
||||
};
|
||||
|
||||
status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device,
|
||||
ze_context_handle_t context, uint32_t &ip_version,
|
||||
compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product,
|
||||
uint64_t &native_extensions, bool &mayiuse_systolic,
|
||||
bool &mayiuse_ngen_kernels);
|
||||
xpu::device_uuid_t get_device_uuid(const ze_device_handle_t device);
|
||||
status_t get_device_index(const ze_device_handle_t device, size_t *index);
|
||||
std::string get_kernel_name(const ze_kernel_handle_t kernel);
|
||||
status_t get_kernel_binary(
|
||||
const ze_kernel_handle_t kernel, xpu::binary_t &binary);
|
||||
status_t get_module_binary(
|
||||
const ze_module_handle_t module, xpu::binary_t &binary);
|
||||
status_t compile_ocl_module(const ze_device_handle_t device,
|
||||
const ze_context_handle_t context, std::string &code,
|
||||
std::string &options, xpu::binary_t &binary);
|
||||
status_t create_kernels_from_binary(const ze_device_handle_t device,
|
||||
const ze_context_handle_t context,
|
||||
const std::vector<const char *> &kernel_names,
|
||||
const xpu::binary_t &binary, ze_module_handle_t *module,
|
||||
std::vector<ze_kernel_handle_t> &kernels);
|
||||
ze_memory_type_t get_pointer_type(const ze_context_handle_t, const void *ptr);
|
||||
|
||||
} // namespace l0
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
||||
|
||||
#endif // GPU_INTEL_L0_UTILS_HPP
|
@ -15,9 +15,7 @@
|
||||
#===============================================================================
|
||||
|
||||
file(GLOB_RECURSE SOURCES
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/*.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/*.hpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/*.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
|
||||
)
|
||||
|
||||
|
@ -22,7 +22,7 @@
|
||||
#include <CL/cl.h>
|
||||
|
||||
#include "gpu/intel/compute/device_info.hpp"
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
|
@ -32,7 +32,7 @@
|
||||
#include "gpu/intel/ocl/engine.hpp"
|
||||
#include "gpu/intel/ocl/kernel.hpp"
|
||||
#include "gpu/intel/ocl/stream.hpp"
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
@ -40,33 +40,6 @@ namespace gpu {
|
||||
namespace intel {
|
||||
namespace ocl {
|
||||
|
||||
status_t preprocess_headers(stringstream_t &pp_code, const char *code,
|
||||
const compute::kernel_ctx_t &kernel_ctx) {
|
||||
stringstream_t code_stream(code);
|
||||
|
||||
for (std::string line; std::getline(code_stream, line);) {
|
||||
const size_t include_pos = line.find("#include");
|
||||
if (include_pos != std::string::npos) {
|
||||
static constexpr size_t include_len = 8;
|
||||
const size_t first_quote_pos
|
||||
= line.find("\"", include_pos + include_len);
|
||||
const size_t second_quote_pos
|
||||
= line.find("\"", first_quote_pos + 1);
|
||||
const size_t kernel_name_len
|
||||
= second_quote_pos - first_quote_pos - 1;
|
||||
const auto header_name
|
||||
= line.substr(first_quote_pos + 1, kernel_name_len);
|
||||
const char *header_source
|
||||
= kernel_ctx.get_custom_header(header_name);
|
||||
if (!header_source) header_source = get_kernel_header(header_name);
|
||||
CHECK(preprocess_headers(pp_code, header_source, kernel_ctx));
|
||||
} else {
|
||||
pp_code << line << std::endl;
|
||||
}
|
||||
}
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind,
|
||||
cl_device_id dev, cl_context ctx, size_t index,
|
||||
const std::vector<uint8_t> &cache_blob) {
|
||||
@ -249,14 +222,14 @@ status_t engine_t::build_program_from_source(
|
||||
// `clCompileProgram` `clBuildProgram` doesn't take headers. Because of
|
||||
// that, a manual preprocessing of `include` header directives in the
|
||||
// OpenCL kernels is required.
|
||||
CHECK(preprocess_headers(pp_code, code_string, kernel_ctx));
|
||||
CHECK(compute::preprocess_headers(pp_code, code_string, kernel_ctx));
|
||||
std::string pp_code_str = pp_code.str();
|
||||
const char *pp_code_str_ptr = pp_code_str.c_str();
|
||||
|
||||
src = {pp_code_str};
|
||||
if (src) { options += " -g -s " + std::string(src.name()); }
|
||||
|
||||
debugdump_processed_source(
|
||||
compute::debugdump_processed_source(
|
||||
pp_code_str, options, dev_info->get_cl_ext_options());
|
||||
|
||||
auto ctx = context();
|
||||
|
@ -21,6 +21,7 @@
|
||||
#include "common/utils.hpp"
|
||||
#include "gpu/gpu_impl_list.hpp"
|
||||
#include "gpu/intel/engine.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
#include "xpu/ocl/engine_impl.hpp"
|
||||
#include "xpu/utils.hpp"
|
||||
|
||||
@ -30,9 +31,6 @@ namespace gpu {
|
||||
namespace intel {
|
||||
namespace ocl {
|
||||
|
||||
status_t preprocess_headers(stringstream_t &pp_code, const char *code,
|
||||
const compute::kernel_ctx_t &kernel_ctx);
|
||||
|
||||
status_t engine_create(impl::engine_t **engine, engine_kind_t engine_kind,
|
||||
cl_device_id dev, cl_context ctx, size_t index,
|
||||
const std::vector<uint8_t> &cache_blob);
|
||||
|
@ -15,7 +15,7 @@
|
||||
*******************************************************************************/
|
||||
|
||||
#include "gpu/intel/ocl/hw_info.hpp"
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
|
||||
#include "gpu/intel/jit/binary_format.hpp"
|
||||
#include "gpu/intel/jit/generator.hpp"
|
||||
|
@ -31,7 +31,7 @@
|
||||
|
||||
#include "gpu/intel/ocl/engine.hpp"
|
||||
#include "gpu/intel/ocl/stream.hpp"
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
|
@ -28,7 +28,7 @@
|
||||
#include <dlfcn.h>
|
||||
#include <vector>
|
||||
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
#include "mdapi/metrics_discovery_api.h"
|
||||
|
||||
#ifndef CL_PROFILING_COMMAND_PERFCOUNTERS_INTEL
|
||||
|
@ -26,7 +26,7 @@
|
||||
|
||||
#include "gpu/intel/ocl/engine.hpp"
|
||||
#include "gpu/intel/ocl/stream.hpp"
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
|
@ -26,9 +26,11 @@
|
||||
#include "xpu/ocl/context.hpp"
|
||||
#include "xpu/ocl/stream_impl.hpp"
|
||||
|
||||
#include "gpu/intel/ocl/mdapi_utils.hpp"
|
||||
#include "gpu/intel/stream.hpp"
|
||||
|
||||
#include "gpu/intel/ocl/mdapi_utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
|
24
src/gpu/intel/ocl/utils/CMakeLists.txt
Normal file
24
src/gpu/intel/ocl/utils/CMakeLists.txt
Normal file
@ -0,0 +1,24 @@
|
||||
#===============================================================================
|
||||
# Copyright 2025 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#===============================================================================
|
||||
|
||||
file(GLOB_RECURSE SOURCES
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp
|
||||
)
|
||||
|
||||
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_intel_ocl_utils)
|
||||
add_library(${OBJ_LIB} OBJECT ${SOURCES})
|
||||
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
|
||||
$<TARGET_OBJECTS:${OBJ_LIB}>)
|
@ -23,7 +23,7 @@
|
||||
#include "gpu/intel/ocl/engine.hpp"
|
||||
#include "gpu/intel/ocl/hw_info.hpp"
|
||||
#include "gpu/intel/ocl/kernel.hpp"
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
#include "xpu/ocl/utils.hpp"
|
||||
|
||||
#ifndef CL_KERNEL_BINARY_PROGRAM_INTEL
|
||||
@ -217,69 +217,6 @@ status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, xpu::binary_t &binary) {
|
||||
return status::success;
|
||||
}
|
||||
|
||||
void debugdump_processed_source(const std::string &source,
|
||||
const std::string &options, const std::string &cl_options) {
|
||||
#if defined(__linux__) && defined(DNNL_DEV_MODE)
|
||||
if (get_verbose(verbose_t::debuginfo) >= 10) {
|
||||
auto get_defines = [](const std::string &from) {
|
||||
std::string ret;
|
||||
size_t pos = 0;
|
||||
while (pos < from.length()) {
|
||||
// Find next define argument
|
||||
pos = from.find("-D", pos);
|
||||
|
||||
// Generate argument, quotes are interpreted literally, but
|
||||
// other special shell characters need escaped. Does not
|
||||
// currently handle quotes with the ' character or nested quotes
|
||||
char quote_parity = true;
|
||||
while (pos < from.length()) {
|
||||
if (quote_parity
|
||||
&& utils::one_of(from[pos], '~', '#', '$', '&', '*',
|
||||
'(', ')', '\\', '|', '[', ']', '{', '}',
|
||||
';', '\'', '<', '>', '/', '?', '!')) {
|
||||
ret += '\\';
|
||||
}
|
||||
ret += from[pos];
|
||||
if (from[pos] == '"') quote_parity ^= true;
|
||||
if (from[pos] == ' ' && quote_parity) break;
|
||||
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
auto execute_command = [](const std::string &cmd,
|
||||
const std::string &stdin) {
|
||||
std::string result;
|
||||
std::array<char, 256> buffer;
|
||||
FILE *pipe = popen(cmd.c_str(), "w");
|
||||
fputs(stdin.c_str(), pipe);
|
||||
if (pipe) {
|
||||
while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
|
||||
result += buffer.data();
|
||||
}
|
||||
}
|
||||
pclose(pipe);
|
||||
return result;
|
||||
};
|
||||
|
||||
// Run utilities to evaluate preprocessor defines and format the file
|
||||
// Theoretically, we can accomplish this task with libclang, but it
|
||||
// seems more work than it is worth. Instead, wrapping this in OCL_DEBUG
|
||||
// so that calls to the system are not included in the default build.
|
||||
|
||||
// Due to the use of a different C preprocessor, warnings should not be
|
||||
// ignored, as they may correspond to a different behavior in the OpenCL
|
||||
// C preprocessor
|
||||
auto o = get_defines(options) + get_defines(cl_options);
|
||||
std::string preprocess_cmd
|
||||
= std::string() + "cpp -P " + o + " | clang-format";
|
||||
execute_command(preprocess_cmd, source);
|
||||
std::cout << "OCL_ARCH_OPTIONS: " << cl_options << std::endl;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
status_t get_kernel_arg_types(cl_kernel ocl_kernel,
|
||||
std::vector<gpu::intel::compute::scalar_type_t> *arg_types) {
|
||||
cl_uint nargs;
|
@ -55,9 +55,6 @@ status_t get_ocl_kernel_binary(cl_kernel ocl_kernel, xpu::binary_t &binary);
|
||||
status_t get_ocl_program_binary_size(
|
||||
cl_kernel kernel, cl_device_id device, size_t *size);
|
||||
|
||||
void debugdump_processed_source(const std::string &source,
|
||||
const std::string &options, const std::string &ocl_options);
|
||||
|
||||
status_t get_kernel_arg_types(cl_kernel ocl_kernel,
|
||||
std::vector<gpu::intel::compute::scalar_type_t> *arg_types);
|
||||
|
@ -35,7 +35,7 @@
|
||||
#include "gpu/intel/compute/device_info.hpp"
|
||||
#include "gpu/intel/sycl/compat.hpp"
|
||||
#include "gpu/intel/sycl/engine.hpp"
|
||||
#include "gpu/intel/sycl/l0/utils.hpp"
|
||||
#include "gpu/intel/sycl/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
|
@ -19,11 +19,11 @@
|
||||
#include "gpu/intel/sycl/compat.hpp"
|
||||
#include "gpu/intel/sycl/device_info.hpp"
|
||||
#include "gpu/intel/sycl/engine.hpp"
|
||||
#include "gpu/intel/sycl/l0/utils.hpp"
|
||||
#include "gpu/intel/sycl/utils.hpp"
|
||||
|
||||
#include "gpu/intel/ocl/hw_info.hpp"
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
|
||||
#include "gpu/intel/l0/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
@ -56,7 +56,7 @@ status_t device_info_t::init_arch(impl::engine_t *engine) {
|
||||
auto ze_dev = xpu::sycl::compat::get_native<ze_device_handle_t>(device);
|
||||
auto ze_ctx = xpu::sycl::compat::get_native<ze_context_handle_t>(ctx);
|
||||
|
||||
status = gpu::intel::sycl::init_gpu_hw_info(engine, ze_dev, ze_ctx,
|
||||
status = gpu::intel::l0::init_gpu_hw_info(engine, ze_dev, ze_ctx,
|
||||
ip_version_, gpu_arch_, gpu_product_, native_extensions_,
|
||||
mayiuse_systolic_, mayiuse_ngen_kernels_);
|
||||
} else {
|
||||
|
@ -114,18 +114,25 @@ status_t engine_t::create_kernels(
|
||||
|
||||
const char *source = nullptr;
|
||||
for (size_t i = 0; source == nullptr && i < kernel_names.size(); i++)
|
||||
source = ocl::get_kernel_source(kernel_names[i]);
|
||||
source = get_kernel_source(kernel_names[i]);
|
||||
VERROR_ENGINE(source, status::runtime_error,
|
||||
"No OpenCL source was found for kernel");
|
||||
|
||||
stringstream_t pp_code;
|
||||
CHECK(gpu::intel::ocl::preprocess_headers(pp_code, source, kernel_ctx));
|
||||
CHECK(compute::preprocess_headers(pp_code, source, kernel_ctx));
|
||||
std::string code_str = pp_code.str();
|
||||
|
||||
std::string build_options = kernel_ctx.options();
|
||||
build_options += " " + device_info()->get_cl_ext_options();
|
||||
|
||||
gpu::intel::compute::program_src_t src(code_str);
|
||||
if (src) { build_options += " -g -s " + std::string(src.name()); }
|
||||
|
||||
compute::debugdump_processed_source(
|
||||
code_str, build_options, device_info()->get_cl_ext_options());
|
||||
|
||||
auto kb_src = syclex::create_kernel_bundle_from_source(
|
||||
context(), syclex::source_language::opencl, pp_code.str());
|
||||
context(), syclex::source_language::opencl, code_str);
|
||||
auto kb_exe = syclex::build(
|
||||
kb_src, syclex::properties {syclex::build_options(build_options)});
|
||||
*kernels = std::vector<compute::kernel_t>(kernel_names.size());
|
||||
@ -133,8 +140,7 @@ status_t engine_t::create_kernels(
|
||||
if (!kernel_names[i]) continue;
|
||||
|
||||
CHECK(interop_kernel_t::make((*kernels)[i],
|
||||
kb_exe.ext_oneapi_get_kernel(kernel_names[i]),
|
||||
gpu::intel::compute::program_src_t(pp_code.str())));
|
||||
kb_exe.ext_oneapi_get_kernel(kernel_names[i]), src));
|
||||
}
|
||||
|
||||
return status::success;
|
||||
|
@ -31,7 +31,7 @@
|
||||
#include "gpu/intel/ocl/engine.hpp"
|
||||
#include "gpu/intel/ocl/kernel.hpp"
|
||||
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
#include "gpu/intel/sycl/compat.hpp"
|
||||
#include "gpu/intel/sycl/utils.hpp"
|
||||
|
||||
|
@ -19,8 +19,7 @@
|
||||
#include "common/verbose.hpp"
|
||||
#include "gpu/intel/compute/types_interop.hpp"
|
||||
#include "gpu/intel/compute/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/sycl/l0/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
#include "gpu/intel/sycl/stream.hpp"
|
||||
#include "gpu/intel/sycl/utils.hpp"
|
||||
#include "xpu/sycl/c_types_map.hpp"
|
||||
|
@ -1,436 +0,0 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2020-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "gpu/intel/sycl/l0/utils.hpp"
|
||||
#include "oneapi/dnnl/dnnl_config.h"
|
||||
|
||||
#include "gpu/intel/jit/binary_format.hpp"
|
||||
#include "gpu/intel/jit/utils/ngen_type_bridge.hpp"
|
||||
#include "ngen_level_zero.hpp"
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <dlfcn.h>
|
||||
#elif defined(_WIN32)
|
||||
#include "windows.h"
|
||||
#else
|
||||
#error "Level Zero is supported on Linux and Windows only"
|
||||
#endif
|
||||
|
||||
#include "level_zero/ze_api.h"
|
||||
#include "level_zero/ze_intel_gpu.h"
|
||||
|
||||
#if !defined(__SYCL_COMPILER_VERSION)
|
||||
#error "Unsupported compiler"
|
||||
#endif
|
||||
|
||||
#if (__SYCL_COMPILER_VERSION < 20200818)
|
||||
#error "Level Zero is not supported with this compiler version"
|
||||
#endif
|
||||
|
||||
#include "common/c_types_map.hpp"
|
||||
#include "common/verbose.hpp"
|
||||
|
||||
#include "gpu/intel/sycl/utils.hpp"
|
||||
#include <sycl/ext/oneapi/backend/level_zero.hpp>
|
||||
|
||||
#include "gpu/intel/sycl/engine.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace sycl {
|
||||
|
||||
namespace {
|
||||
|
||||
std::string to_string(ze_result_t r) {
|
||||
#define ZE_STATUS_CASE(status) \
|
||||
case status: return #status
|
||||
switch (r) {
|
||||
ZE_STATUS_CASE(ZE_RESULT_SUCCESS);
|
||||
ZE_STATUS_CASE(ZE_RESULT_NOT_READY);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_LOST);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_MODULE_LINK_FAILURE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_NOT_AVAILABLE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNINITIALIZED);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ARGUMENT);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SIZE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_ENUMERATION);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS);
|
||||
ZE_STATUS_CASE(ZE_RESULT_ERROR_UNKNOWN);
|
||||
ZE_STATUS_CASE(ZE_RESULT_FORCE_UINT32);
|
||||
default: return std::to_string((int)r);
|
||||
}
|
||||
#undef ZE_STATUS_CASE
|
||||
};
|
||||
|
||||
#define ZE_CHECK_COMMON(f, retval) \
|
||||
do { \
|
||||
ze_result_t res_ = (f); \
|
||||
if (res_ != ZE_RESULT_SUCCESS) { \
|
||||
std::string err_str_ = to_string(res_); \
|
||||
VERROR(common, level_zero, "errcode %s", err_str_.c_str()); \
|
||||
return retval; \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
#define ZE_CHECK(f) ZE_CHECK_COMMON(f, status::runtime_error)
|
||||
#define ZE_CHECK_VP(f) ZE_CHECK_COMMON(f, nullptr)
|
||||
|
||||
void *find_ze_symbol(const char *symbol) {
|
||||
#if defined(__linux__)
|
||||
void *handle = dlopen("libze_loader.so.1", RTLD_NOW | RTLD_LOCAL);
|
||||
#elif defined(_WIN32)
|
||||
// Use LOAD_LIBRARY_SEARCH_SYSTEM32 flag to avoid DLL hijacking issue.
|
||||
HMODULE handle = LoadLibraryExA(
|
||||
"ze_loader.dll", nullptr, LOAD_LIBRARY_SEARCH_SYSTEM32);
|
||||
#endif
|
||||
if (!handle) {
|
||||
VERROR(common, level_zero, "cannot find loader library");
|
||||
assert(!"not expected");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
using zeInit_decl_t = ze_result_t (*)(ze_init_flags_t flags);
|
||||
const ze_init_flags_t default_ze_flags = 0;
|
||||
#if defined(__linux__)
|
||||
static const ze_result_t ze_result = reinterpret_cast<zeInit_decl_t>(
|
||||
dlsym(handle, "zeInit"))(default_ze_flags);
|
||||
void *f = reinterpret_cast<void *>(dlsym(handle, symbol));
|
||||
#elif defined(_WIN32)
|
||||
static const ze_result_t ze_result = reinterpret_cast<zeInit_decl_t>(
|
||||
GetProcAddress(handle, "zeInit"))(default_ze_flags);
|
||||
void *f = reinterpret_cast<void *>(GetProcAddress(handle, symbol));
|
||||
#endif
|
||||
ZE_CHECK_VP(ze_result);
|
||||
|
||||
if (!f) {
|
||||
VERROR(common, level_zero, "cannot find symbol: %s", symbol);
|
||||
assert(!"not expected");
|
||||
}
|
||||
return f;
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
F find_ze_symbol(const char *symbol) {
|
||||
return (F)find_ze_symbol(symbol);
|
||||
}
|
||||
|
||||
status_t func_zeModuleCreate(ze_context_handle_t hContext,
|
||||
ze_device_handle_t hDevice, const ze_module_desc_t *desc,
|
||||
ze_module_handle_t *phModule,
|
||||
ze_module_build_log_handle_t *phBuildLog) {
|
||||
static auto f = find_ze_symbol<decltype(&zeModuleCreate)>("zeModuleCreate");
|
||||
|
||||
if (!f) return status::runtime_error;
|
||||
ZE_CHECK(f(hContext, hDevice, desc, phModule, phBuildLog));
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t func_zeDeviceGetProperties(
|
||||
ze_device_handle_t hDevice, ze_device_properties_t *pDeviceProperties) {
|
||||
static auto f = find_ze_symbol<decltype(&zeDeviceGetProperties)>(
|
||||
"zeDeviceGetProperties");
|
||||
|
||||
if (!f) return status::runtime_error;
|
||||
ZE_CHECK(f(hDevice, pDeviceProperties));
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t func_zeDeviceGetModuleProperties(ze_device_handle_t hDevice,
|
||||
ze_device_module_properties_t *pDeviceProperties) {
|
||||
static auto f = find_ze_symbol<decltype(&zeDeviceGetModuleProperties)>(
|
||||
"zeDeviceGetModuleProperties");
|
||||
|
||||
if (!f) {
|
||||
VERROR(common, level_zero,
|
||||
"failed to find systolic query extension (maybe update the "
|
||||
"driver?)");
|
||||
return status::runtime_error;
|
||||
}
|
||||
ZE_CHECK(f(hDevice, pDeviceProperties));
|
||||
return status::success;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// This function is called from compatibility layer that ensures compatibility
|
||||
// with SYCL 2017 API. Once the compatibility layer is removed this function
|
||||
// can be moved to the anonymous namespace above and a function with SYCL
|
||||
// data types in its interface can be created to call it.
|
||||
status_t func_zeKernelCreate(ze_module_handle_t hModule,
|
||||
const ze_kernel_desc_t *desc, ze_kernel_handle_t *phKernel) {
|
||||
static auto f = find_ze_symbol<decltype(&zeKernelCreate)>("zeKernelCreate");
|
||||
|
||||
if (!f) return status::runtime_error;
|
||||
ZE_CHECK(f(hModule, desc, phKernel));
|
||||
return status::success;
|
||||
}
|
||||
|
||||
#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
|
||||
status_t func_zeGetKernelBinary(
|
||||
ze_kernel_handle_t hKernel, size_t *pSize, uint8_t *pKernelBinary) {
|
||||
static auto f = find_ze_symbol<decltype(&zeKernelGetBinaryExp)>(
|
||||
"zeKernelGetBinaryExp");
|
||||
|
||||
if (!f) return status::runtime_error;
|
||||
ZE_CHECK(f(hKernel, pSize, pKernelBinary));
|
||||
return status::success;
|
||||
}
|
||||
#else
|
||||
status_t func_zeModuleGetNativeBinary(ze_module_handle_t hModule, size_t *pSize,
|
||||
uint8_t *pModuleNativeBinary) {
|
||||
static auto f = find_ze_symbol<decltype(&zeModuleGetNativeBinary)>(
|
||||
"zeModuleGetNativeBinary");
|
||||
|
||||
if (!f) return status::runtime_error;
|
||||
ZE_CHECK(f(hModule, pSize, pModuleNativeBinary));
|
||||
return status::success;
|
||||
}
|
||||
#endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
|
||||
|
||||
// FIXME: Currently SYCL doesn't provide any API to get device UUID so
|
||||
// we query it directly from Level0 with the zeDeviceGetProperties function.
|
||||
// The `get_device_uuid` function packs 128 bits of the device UUID, which are
|
||||
// represented as an uint8_t array of size 16, to 2 uint64_t values.
|
||||
xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev) {
|
||||
static_assert(ZE_MAX_DEVICE_UUID_SIZE == 16,
|
||||
"ZE_MAX_DEVICE_UUID_SIZE is expected to be 16");
|
||||
|
||||
auto ze_device_properties = ze_device_properties_t();
|
||||
ze_device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
|
||||
auto ze_device = xpu::sycl::compat::get_native<ze_device_handle_t>(dev);
|
||||
auto status = func_zeDeviceGetProperties(ze_device, &ze_device_properties);
|
||||
MAYBE_UNUSED(status);
|
||||
assert(status == status::success);
|
||||
|
||||
const auto &ze_device_id = ze_device_properties.uuid.id;
|
||||
|
||||
uint64_t uuid[ZE_MAX_DEVICE_UUID_SIZE / sizeof(uint64_t)] = {};
|
||||
for (size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) {
|
||||
size_t shift = i % sizeof(uint64_t) * CHAR_BIT;
|
||||
uuid[i / sizeof(uint64_t)] |= (((uint64_t)ze_device_id[i]) << shift);
|
||||
}
|
||||
return xpu::device_uuid_t(uuid[0], uuid[1]);
|
||||
}
|
||||
|
||||
status_t sycl_create_kernels_with_level_zero(
|
||||
std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
|
||||
const std::vector<const char *> &kernel_names,
|
||||
const gpu::intel::sycl::engine_t *sycl_engine,
|
||||
const xpu::binary_t &binary) {
|
||||
auto desc = ze_module_desc_t();
|
||||
desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
|
||||
desc.format = ZE_MODULE_FORMAT_NATIVE;
|
||||
desc.inputSize = binary.size();
|
||||
desc.pInputModule = binary.data();
|
||||
desc.pBuildFlags = "";
|
||||
desc.pConstants = nullptr;
|
||||
|
||||
ze_module_handle_t ze_module;
|
||||
|
||||
auto ze_device = xpu::sycl::compat::get_native<ze_device_handle_t>(
|
||||
sycl_engine->device());
|
||||
auto ze_ctx = xpu::sycl::compat::get_native<ze_context_handle_t>(
|
||||
sycl_engine->context());
|
||||
|
||||
CHECK(func_zeModuleCreate(ze_ctx, ze_device, &desc, &ze_module, nullptr));
|
||||
::sycl::kernel_bundle<::sycl::bundle_state::executable> kernel_bundle
|
||||
= ::sycl::make_kernel_bundle<::sycl::backend::ext_oneapi_level_zero,
|
||||
::sycl::bundle_state::executable>(
|
||||
{ze_module}, sycl_engine->context());
|
||||
|
||||
sycl_kernels.resize(kernel_names.size());
|
||||
for (size_t i = 0; i < kernel_names.size(); i++) {
|
||||
if (kernel_names[i] == nullptr) continue;
|
||||
ze_kernel_handle_t ze_kernel;
|
||||
ze_kernel_desc_t ze_kernel_desc {
|
||||
ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernel_names[i]};
|
||||
CHECK(func_zeKernelCreate(ze_module, &ze_kernel_desc, &ze_kernel));
|
||||
auto k = ::sycl::make_kernel<::sycl::backend::ext_oneapi_level_zero>(
|
||||
{kernel_bundle, ze_kernel}, sycl_engine->context());
|
||||
sycl_kernels[i] = utils::make_unique<::sycl::kernel>(k);
|
||||
}
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) {
|
||||
auto lhs_ze_handle = xpu::sycl::compat::get_native<ze_device_handle_t>(lhs);
|
||||
auto rhs_ze_handle = xpu::sycl::compat::get_native<ze_device_handle_t>(rhs);
|
||||
|
||||
return lhs_ze_handle == rhs_ze_handle;
|
||||
}
|
||||
|
||||
status_t get_device_ip(ze_device_handle_t device, uint32_t &ip_version) {
|
||||
auto devicePropsIP = ze_device_ip_version_ext_t();
|
||||
devicePropsIP.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT;
|
||||
|
||||
auto deviceProps = ze_device_properties_t();
|
||||
deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
deviceProps.pNext = &devicePropsIP;
|
||||
|
||||
CHECK(func_zeDeviceGetProperties(device, &deviceProps));
|
||||
ip_version = devicePropsIP.ipVersion;
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t get_l0_device_enabled_systolic_intel(
|
||||
ze_device_handle_t device, bool &mayiuse_systolic) {
|
||||
// Note: supported by Intel Driver 24.05 and onwards
|
||||
auto deviceModPropsExt = ze_intel_device_module_dp_exp_properties_t();
|
||||
deviceModPropsExt.stype
|
||||
= ZE_STRUCTURE_INTEL_DEVICE_MODULE_DP_EXP_PROPERTIES;
|
||||
|
||||
auto deviceModProps = ze_device_module_properties_t();
|
||||
deviceModProps.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
|
||||
deviceModProps.pNext = &deviceModPropsExt;
|
||||
|
||||
CHECK(func_zeDeviceGetModuleProperties(device, &deviceModProps));
|
||||
mayiuse_systolic
|
||||
= deviceModPropsExt.flags & ZE_INTEL_DEVICE_MODULE_EXP_FLAG_DPAS;
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t get_l0_device_enabled_native_float_atomics(
|
||||
ze_device_handle_t device, uint64_t &native_extensions) {
|
||||
using namespace gpu::intel::compute;
|
||||
|
||||
auto fltAtom = ze_float_atomic_ext_properties_t();
|
||||
fltAtom.stype = ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES;
|
||||
|
||||
auto deviceProps = ze_device_module_properties_t();
|
||||
deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES;
|
||||
deviceProps.pNext = &fltAtom;
|
||||
|
||||
CHECK(func_zeDeviceGetModuleProperties(device, &deviceProps));
|
||||
|
||||
ze_device_fp_atomic_ext_flags_t atomic_load_store
|
||||
= ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_LOAD_STORE
|
||||
| ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_LOAD_STORE;
|
||||
ze_device_fp_atomic_ext_flags_t atomic_add
|
||||
= ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD
|
||||
| ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_ADD;
|
||||
ze_device_fp_atomic_ext_flags_t atomic_min_max
|
||||
= ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_MIN_MAX
|
||||
| ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_MIN_MAX;
|
||||
|
||||
if ((fltAtom.fp16Flags & atomic_load_store) == atomic_load_store)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp16_atomic_load_store;
|
||||
if ((fltAtom.fp16Flags & atomic_add) == atomic_add)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp16_atomic_add;
|
||||
if ((fltAtom.fp16Flags & atomic_min_max) == atomic_min_max)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp16_atomic_min_max;
|
||||
|
||||
if ((fltAtom.fp32Flags & atomic_load_store) == atomic_load_store)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp32_atomic_load_store;
|
||||
if ((fltAtom.fp32Flags & atomic_add) == atomic_add)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp32_atomic_add;
|
||||
if ((fltAtom.fp32Flags & atomic_min_max) == atomic_min_max)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp32_atomic_min_max;
|
||||
|
||||
if ((fltAtom.fp64Flags & atomic_load_store) == atomic_load_store)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp64_atomic_load_store;
|
||||
if ((fltAtom.fp64Flags & atomic_add) == atomic_add)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp64_atomic_add;
|
||||
if ((fltAtom.fp64Flags & atomic_min_max) == atomic_min_max)
|
||||
native_extensions |= (uint64_t)native_ext_t::fp64_atomic_min_max;
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t get_l0_device_eu_count(ze_device_handle_t device, int &eu_count) {
|
||||
auto eucnt = ze_eu_count_ext_t();
|
||||
auto deviceProps = ze_device_properties_t();
|
||||
deviceProps.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
deviceProps.pNext = &eucnt;
|
||||
|
||||
CHECK(func_zeDeviceGetProperties(device, &deviceProps));
|
||||
eu_count = eucnt.numTotalEUs;
|
||||
return status::success;
|
||||
}
|
||||
|
||||
status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device,
|
||||
ze_context_handle_t context, uint32_t &ip_version,
|
||||
compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product_,
|
||||
uint64_t &native_extensions, bool &mayiuse_systolic,
|
||||
bool &mayiuse_ngen_kernels) {
|
||||
using namespace ngen;
|
||||
ngen::Product product = LevelZeroCodeGenerator<HW::Unknown>::detectHWInfo(
|
||||
context, device);
|
||||
|
||||
gpu_arch = jit::convert_ngen_arch_to_dnnl(ngen::getCore(product.family));
|
||||
std::memcpy(&product_, &product, sizeof(ngen::Product));
|
||||
|
||||
mayiuse_systolic = false;
|
||||
if (get_l0_device_enabled_systolic_intel(device, mayiuse_systolic)
|
||||
!= status::success)
|
||||
mayiuse_systolic = false;
|
||||
|
||||
/* Some old drivers do not report systolic availability. Manually override
|
||||
systolic availability based on product family. */
|
||||
switch (product.family) {
|
||||
case ProductFamily::DG2:
|
||||
case ProductFamily::ARL:
|
||||
case ProductFamily::PVC: mayiuse_systolic = true;
|
||||
default: break;
|
||||
}
|
||||
|
||||
CHECK(get_l0_device_enabled_native_float_atomics(
|
||||
device, native_extensions));
|
||||
|
||||
auto status
|
||||
= jit::gpu_supports_binary_format(&mayiuse_ngen_kernels, engine);
|
||||
if (status != status::success) mayiuse_ngen_kernels = false;
|
||||
|
||||
ip_version = 0;
|
||||
return get_device_ip(device, ip_version);
|
||||
}
|
||||
|
||||
} // namespace sycl
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
@ -1,65 +0,0 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 2020-2025 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef GPU_INTEL_SYCL_L0_UTILS_HPP
|
||||
#define GPU_INTEL_SYCL_L0_UTILS_HPP
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "gpu/intel/compute/kernel.hpp"
|
||||
#include "gpu/intel/sycl/compat.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
namespace intel {
|
||||
namespace sycl {
|
||||
|
||||
class engine_t;
|
||||
|
||||
xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev);
|
||||
|
||||
status_t sycl_create_kernels_with_level_zero(
|
||||
std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
|
||||
const std::vector<const char *> &kernel_names,
|
||||
const gpu::intel::sycl::engine_t *sycl_engine,
|
||||
const xpu::binary_t &binary);
|
||||
|
||||
bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs);
|
||||
|
||||
#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
|
||||
status_t func_zeGetKernelBinary(
|
||||
ze_kernel_handle_t hKernel, size_t *pSize, uint8_t *pKernelBinary);
|
||||
#else
|
||||
status_t func_zeModuleGetNativeBinary(ze_module_handle_t hModule, size_t *pSize,
|
||||
uint8_t *pModuleNativeBinary);
|
||||
#endif // DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
|
||||
|
||||
status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device,
|
||||
ze_context_handle_t context, uint32_t &ip_version,
|
||||
compute::gpu_arch_t &gpu_arch, compute::gpu_product_t &product,
|
||||
uint64_t &native_extensions, bool &mayiuse_systolic,
|
||||
bool &mayiuse_ngen_kernels);
|
||||
|
||||
} // namespace sycl
|
||||
} // namespace intel
|
||||
} // namespace gpu
|
||||
} // namespace impl
|
||||
} // namespace dnnl
|
||||
|
||||
#endif // GPU_INTEL_SYCL_L0_UTILS_HPP
|
@ -24,7 +24,7 @@
|
||||
|
||||
#include "gpu/intel/sycl/stream.hpp"
|
||||
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
|
@ -35,6 +35,8 @@
|
||||
#include "gpu/intel/engine.hpp"
|
||||
#include "gpu/intel/stream.hpp"
|
||||
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
namespace gpu {
|
||||
|
@ -17,9 +17,9 @@
|
||||
#include "gpu/intel/sycl/utils.hpp"
|
||||
|
||||
#include "gpu/intel/compute/ukernels.hpp"
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/l0/utils/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
#include "gpu/intel/sycl/engine.hpp"
|
||||
#include "gpu/intel/sycl/l0/utils.hpp"
|
||||
#include "xpu/ocl/engine_factory.hpp"
|
||||
#include "xpu/ocl/utils.hpp"
|
||||
#include "xpu/sycl/compat.hpp"
|
||||
@ -32,6 +32,53 @@ namespace gpu {
|
||||
namespace intel {
|
||||
namespace sycl {
|
||||
|
||||
// FIXME: Currently SYCL doesn't provide any API to get device UUID so
|
||||
// we query it directly from Level0 with the zeDeviceGetProperties function.
|
||||
// The `get_device_uuid` function packs 128 bits of the device UUID, which are
|
||||
// represented as an uint8_t array of size 16, to 2 uint64_t values.
|
||||
xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev) {
|
||||
return gpu::intel::l0::get_device_uuid(
|
||||
xpu::sycl::compat::get_native<ze_device_handle_t>(dev));
|
||||
}
|
||||
|
||||
bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs) {
|
||||
auto lhs_ze_handle = xpu::sycl::compat::get_native<ze_device_handle_t>(lhs);
|
||||
auto rhs_ze_handle = xpu::sycl::compat::get_native<ze_device_handle_t>(rhs);
|
||||
|
||||
return lhs_ze_handle == rhs_ze_handle;
|
||||
}
|
||||
|
||||
status_t sycl_create_kernels_with_level_zero(
|
||||
std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
|
||||
const std::vector<const char *> &kernel_names,
|
||||
const gpu::intel::sycl::engine_t *sycl_engine,
|
||||
const xpu::binary_t &binary) {
|
||||
auto ze_device = xpu::sycl::compat::get_native<ze_device_handle_t>(
|
||||
sycl_engine->device());
|
||||
auto ze_ctx = xpu::sycl::compat::get_native<ze_context_handle_t>(
|
||||
sycl_engine->context());
|
||||
ze_module_handle_t ze_module = nullptr;
|
||||
std::vector<ze_kernel_handle_t> ze_kernels;
|
||||
|
||||
gpu::intel::l0::create_kernels_from_binary(
|
||||
ze_device, ze_ctx, kernel_names, binary, &ze_module, ze_kernels);
|
||||
|
||||
::sycl::kernel_bundle<::sycl::bundle_state::executable> kernel_bundle
|
||||
= ::sycl::make_kernel_bundle<::sycl::backend::ext_oneapi_level_zero,
|
||||
::sycl::bundle_state::executable>(
|
||||
{ze_module}, sycl_engine->context());
|
||||
|
||||
sycl_kernels.resize(kernel_names.size());
|
||||
for (size_t i = 0; i < kernel_names.size(); i++) {
|
||||
if (kernel_names[i] == nullptr) continue;
|
||||
auto k = ::sycl::make_kernel<::sycl::backend::ext_oneapi_level_zero>(
|
||||
{kernel_bundle, ze_kernels[i]}, sycl_engine->context());
|
||||
sycl_kernels[i] = utils::make_unique<::sycl::kernel>(k);
|
||||
}
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
::sycl::nd_range<3> to_sycl_nd_range(
|
||||
const gpu::intel::compute::nd_range_t &range) {
|
||||
const auto &local_range = range.local_range();
|
||||
@ -150,7 +197,6 @@ status_t sycl_dev2ocl_dev(cl_device_id *ocl_dev, const ::sycl::device &dev) {
|
||||
}
|
||||
|
||||
*ocl_dev = d;
|
||||
|
||||
return status::success;
|
||||
}
|
||||
|
||||
@ -216,12 +262,7 @@ status_t get_kernel_binary(
|
||||
#ifdef DNNL_EXPERIMENTAL_SYCL_KERNEL_COMPILER
|
||||
auto l0_kernel = ::sycl::get_native<
|
||||
::sycl::backend::ext_oneapi_level_zero>(kernel);
|
||||
size_t binary_size = 0;
|
||||
CHECK(gpu::intel::sycl::func_zeGetKernelBinary(
|
||||
l0_kernel, &binary_size, nullptr));
|
||||
binary.resize(binary_size);
|
||||
CHECK(gpu::intel::sycl::func_zeGetKernelBinary(
|
||||
l0_kernel, &binary_size, binary.data()));
|
||||
CHECK(gpu::intel::l0::get_kernel_binary(l0_kernel, binary));
|
||||
#else
|
||||
auto bundle = kernel.get_kernel_bundle();
|
||||
auto module_vec = ::sycl::get_native<
|
||||
@ -229,11 +270,7 @@ status_t get_kernel_binary(
|
||||
auto module = module_vec[0];
|
||||
size_t module_binary_size;
|
||||
xpu::binary_t module_binary;
|
||||
CHECK(gpu::intel::sycl::func_zeModuleGetNativeBinary(
|
||||
module, &module_binary_size, nullptr));
|
||||
module_binary.resize(module_binary_size);
|
||||
CHECK(gpu::intel::sycl::func_zeModuleGetNativeBinary(
|
||||
module, &module_binary_size, module_binary.data()));
|
||||
CHECK(gpu::intel::l0::get_mobule_binary(module, binary));
|
||||
{
|
||||
std::unique_ptr<gpu::intel::ocl::engine_t, engine_deleter_t>
|
||||
ocl_engine;
|
||||
|
@ -29,6 +29,16 @@ namespace sycl {
|
||||
|
||||
class engine_t;
|
||||
|
||||
xpu::device_uuid_t get_device_uuid(const ::sycl::device &dev);
|
||||
|
||||
status_t sycl_create_kernels_with_level_zero(
|
||||
std::vector<std::unique_ptr<::sycl::kernel>> &sycl_kernels,
|
||||
const std::vector<const char *> &kernel_names,
|
||||
const gpu::intel::sycl::engine_t *sycl_engine,
|
||||
const xpu::binary_t &binary);
|
||||
|
||||
bool compare_ze_devices(const ::sycl::device &lhs, const ::sycl::device &rhs);
|
||||
|
||||
::sycl::nd_range<3> to_sycl_nd_range(
|
||||
const gpu::intel::compute::nd_range_t &range);
|
||||
|
||||
|
@ -93,7 +93,6 @@ public:
|
||||
#ifdef DNNL_WITH_SYCL
|
||||
void set_deps(::sycl::event event) { e_ = std::move(event); }
|
||||
#endif
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
void set_deps(cl_event event) { ocl_e_ = event; }
|
||||
#endif
|
||||
@ -106,7 +105,6 @@ private:
|
||||
#ifdef DNNL_WITH_SYCL
|
||||
::sycl::event e_;
|
||||
#endif
|
||||
|
||||
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
|
||||
cl_event ocl_e_;
|
||||
#endif
|
||||
|
@ -31,6 +31,6 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
|
||||
#endif
|
||||
|
@ -24,7 +24,7 @@
|
||||
|
||||
#include "xpu/ocl/memory_storage_base.hpp"
|
||||
|
||||
#include "gpu/intel/ocl/utils.hpp"
|
||||
#include "gpu/intel/ocl/utils/utils.hpp"
|
||||
|
||||
namespace dnnl {
|
||||
namespace impl {
|
||||
|
@ -32,131 +32,6 @@ namespace impl {
|
||||
namespace xpu {
|
||||
namespace ocl {
|
||||
|
||||
status_t convert_to_dnnl(cl_int cl_status) {
|
||||
switch (cl_status) {
|
||||
case CL_SUCCESS: return status::success;
|
||||
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||
case CL_OUT_OF_RESOURCES:
|
||||
case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory;
|
||||
case CL_DEVICE_NOT_FOUND:
|
||||
case CL_DEVICE_NOT_AVAILABLE:
|
||||
case CL_COMPILER_NOT_AVAILABLE:
|
||||
case CL_PROFILING_INFO_NOT_AVAILABLE:
|
||||
case CL_MEM_COPY_OVERLAP:
|
||||
case CL_IMAGE_FORMAT_MISMATCH:
|
||||
case CL_IMAGE_FORMAT_NOT_SUPPORTED:
|
||||
case CL_BUILD_PROGRAM_FAILURE:
|
||||
case CL_MAP_FAILURE:
|
||||
case CL_MISALIGNED_SUB_BUFFER_OFFSET:
|
||||
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
|
||||
case CL_COMPILE_PROGRAM_FAILURE:
|
||||
case CL_LINKER_NOT_AVAILABLE:
|
||||
case CL_LINK_PROGRAM_FAILURE:
|
||||
case CL_DEVICE_PARTITION_FAILED:
|
||||
case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
|
||||
case CL_INVALID_PLATFORM:
|
||||
case CL_INVALID_DEVICE: return status::runtime_error;
|
||||
case CL_INVALID_VALUE:
|
||||
case CL_INVALID_DEVICE_TYPE:
|
||||
case CL_INVALID_CONTEXT:
|
||||
case CL_INVALID_QUEUE_PROPERTIES:
|
||||
case CL_INVALID_COMMAND_QUEUE:
|
||||
case CL_INVALID_HOST_PTR:
|
||||
case CL_INVALID_MEM_OBJECT:
|
||||
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
|
||||
case CL_INVALID_IMAGE_SIZE:
|
||||
case CL_INVALID_SAMPLER:
|
||||
case CL_INVALID_BINARY:
|
||||
case CL_INVALID_BUILD_OPTIONS:
|
||||
case CL_INVALID_PROGRAM:
|
||||
case CL_INVALID_PROGRAM_EXECUTABLE:
|
||||
case CL_INVALID_KERNEL_NAME:
|
||||
case CL_INVALID_KERNEL_DEFINITION:
|
||||
case CL_INVALID_KERNEL:
|
||||
case CL_INVALID_ARG_INDEX:
|
||||
case CL_INVALID_ARG_VALUE:
|
||||
case CL_INVALID_ARG_SIZE:
|
||||
case CL_INVALID_KERNEL_ARGS:
|
||||
case CL_INVALID_WORK_DIMENSION:
|
||||
case CL_INVALID_WORK_GROUP_SIZE:
|
||||
case CL_INVALID_WORK_ITEM_SIZE:
|
||||
case CL_INVALID_GLOBAL_OFFSET:
|
||||
case CL_INVALID_EVENT_WAIT_LIST:
|
||||
case CL_INVALID_EVENT:
|
||||
case CL_INVALID_OPERATION:
|
||||
case CL_INVALID_GL_OBJECT:
|
||||
case CL_INVALID_BUFFER_SIZE:
|
||||
case CL_INVALID_MIP_LEVEL:
|
||||
case CL_INVALID_GLOBAL_WORK_SIZE: return status::invalid_arguments;
|
||||
|
||||
default: return status::runtime_error;
|
||||
}
|
||||
}
|
||||
|
||||
// Ordered by value as defined by opencl
|
||||
const char *convert_cl_int_to_str(cl_int cl_status) {
|
||||
#define CL_STATUS_CASE(status) \
|
||||
case status: return #status
|
||||
switch (cl_status) {
|
||||
CL_STATUS_CASE(CL_SUCCESS);
|
||||
CL_STATUS_CASE(CL_DEVICE_NOT_FOUND);
|
||||
CL_STATUS_CASE(CL_DEVICE_NOT_AVAILABLE);
|
||||
CL_STATUS_CASE(CL_COMPILER_NOT_AVAILABLE);
|
||||
CL_STATUS_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
|
||||
CL_STATUS_CASE(CL_OUT_OF_RESOURCES);
|
||||
CL_STATUS_CASE(CL_OUT_OF_HOST_MEMORY);
|
||||
CL_STATUS_CASE(CL_PROFILING_INFO_NOT_AVAILABLE);
|
||||
CL_STATUS_CASE(CL_MEM_COPY_OVERLAP);
|
||||
CL_STATUS_CASE(CL_IMAGE_FORMAT_MISMATCH);
|
||||
CL_STATUS_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
|
||||
CL_STATUS_CASE(CL_BUILD_PROGRAM_FAILURE);
|
||||
CL_STATUS_CASE(CL_MAP_FAILURE);
|
||||
CL_STATUS_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
|
||||
CL_STATUS_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
|
||||
CL_STATUS_CASE(CL_COMPILE_PROGRAM_FAILURE);
|
||||
CL_STATUS_CASE(CL_LINKER_NOT_AVAILABLE);
|
||||
CL_STATUS_CASE(CL_LINK_PROGRAM_FAILURE);
|
||||
CL_STATUS_CASE(CL_DEVICE_PARTITION_FAILED);
|
||||
CL_STATUS_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
|
||||
CL_STATUS_CASE(CL_INVALID_VALUE);
|
||||
CL_STATUS_CASE(CL_INVALID_DEVICE_TYPE);
|
||||
CL_STATUS_CASE(CL_INVALID_PLATFORM);
|
||||
CL_STATUS_CASE(CL_INVALID_DEVICE);
|
||||
CL_STATUS_CASE(CL_INVALID_CONTEXT);
|
||||
CL_STATUS_CASE(CL_INVALID_QUEUE_PROPERTIES);
|
||||
CL_STATUS_CASE(CL_INVALID_COMMAND_QUEUE);
|
||||
CL_STATUS_CASE(CL_INVALID_HOST_PTR);
|
||||
CL_STATUS_CASE(CL_INVALID_MEM_OBJECT);
|
||||
CL_STATUS_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
|
||||
CL_STATUS_CASE(CL_INVALID_IMAGE_SIZE);
|
||||
CL_STATUS_CASE(CL_INVALID_SAMPLER);
|
||||
CL_STATUS_CASE(CL_INVALID_BINARY);
|
||||
CL_STATUS_CASE(CL_INVALID_BUILD_OPTIONS);
|
||||
CL_STATUS_CASE(CL_INVALID_PROGRAM);
|
||||
CL_STATUS_CASE(CL_INVALID_PROGRAM_EXECUTABLE);
|
||||
CL_STATUS_CASE(CL_INVALID_KERNEL_NAME);
|
||||
CL_STATUS_CASE(CL_INVALID_KERNEL_DEFINITION);
|
||||
CL_STATUS_CASE(CL_INVALID_KERNEL);
|
||||
CL_STATUS_CASE(CL_INVALID_ARG_INDEX);
|
||||
CL_STATUS_CASE(CL_INVALID_ARG_VALUE);
|
||||
CL_STATUS_CASE(CL_INVALID_ARG_SIZE);
|
||||
CL_STATUS_CASE(CL_INVALID_KERNEL_ARGS);
|
||||
CL_STATUS_CASE(CL_INVALID_WORK_DIMENSION);
|
||||
CL_STATUS_CASE(CL_INVALID_WORK_GROUP_SIZE);
|
||||
CL_STATUS_CASE(CL_INVALID_WORK_ITEM_SIZE);
|
||||
CL_STATUS_CASE(CL_INVALID_GLOBAL_OFFSET);
|
||||
CL_STATUS_CASE(CL_INVALID_EVENT_WAIT_LIST);
|
||||
CL_STATUS_CASE(CL_INVALID_EVENT);
|
||||
CL_STATUS_CASE(CL_INVALID_OPERATION);
|
||||
CL_STATUS_CASE(CL_INVALID_GL_OBJECT);
|
||||
CL_STATUS_CASE(CL_INVALID_BUFFER_SIZE);
|
||||
CL_STATUS_CASE(CL_INVALID_MIP_LEVEL);
|
||||
CL_STATUS_CASE(CL_INVALID_GLOBAL_WORK_SIZE);
|
||||
#undef CL_STATUS_CASE
|
||||
default: return "unknown macro name";
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename F>
|
||||
static std::string get_ocl_name(T obj, F get_func, cl_uint name_query) {
|
||||
size_t name_size;
|
||||
|
@ -33,8 +33,130 @@ namespace impl {
|
||||
namespace xpu {
|
||||
namespace ocl {
|
||||
|
||||
status_t convert_to_dnnl(cl_int cl_status);
|
||||
const char *convert_cl_int_to_str(cl_int cl_status);
|
||||
inline status_t convert_to_dnnl(cl_int cl_status) {
|
||||
switch (cl_status) {
|
||||
case CL_SUCCESS: return status::success;
|
||||
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||
case CL_OUT_OF_RESOURCES:
|
||||
case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory;
|
||||
case CL_DEVICE_NOT_FOUND:
|
||||
case CL_DEVICE_NOT_AVAILABLE:
|
||||
case CL_COMPILER_NOT_AVAILABLE:
|
||||
case CL_PROFILING_INFO_NOT_AVAILABLE:
|
||||
case CL_MEM_COPY_OVERLAP:
|
||||
case CL_IMAGE_FORMAT_MISMATCH:
|
||||
case CL_IMAGE_FORMAT_NOT_SUPPORTED:
|
||||
case CL_BUILD_PROGRAM_FAILURE:
|
||||
case CL_MAP_FAILURE:
|
||||
case CL_MISALIGNED_SUB_BUFFER_OFFSET:
|
||||
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
|
||||
case CL_COMPILE_PROGRAM_FAILURE:
|
||||
case CL_LINKER_NOT_AVAILABLE:
|
||||
case CL_LINK_PROGRAM_FAILURE:
|
||||
case CL_DEVICE_PARTITION_FAILED:
|
||||
case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
|
||||
case CL_INVALID_PLATFORM:
|
||||
case CL_INVALID_DEVICE: return status::runtime_error;
|
||||
case CL_INVALID_VALUE:
|
||||
case CL_INVALID_DEVICE_TYPE:
|
||||
case CL_INVALID_CONTEXT:
|
||||
case CL_INVALID_QUEUE_PROPERTIES:
|
||||
case CL_INVALID_COMMAND_QUEUE:
|
||||
case CL_INVALID_HOST_PTR:
|
||||
case CL_INVALID_MEM_OBJECT:
|
||||
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
|
||||
case CL_INVALID_IMAGE_SIZE:
|
||||
case CL_INVALID_SAMPLER:
|
||||
case CL_INVALID_BINARY:
|
||||
case CL_INVALID_BUILD_OPTIONS:
|
||||
case CL_INVALID_PROGRAM:
|
||||
case CL_INVALID_PROGRAM_EXECUTABLE:
|
||||
case CL_INVALID_KERNEL_NAME:
|
||||
case CL_INVALID_KERNEL_DEFINITION:
|
||||
case CL_INVALID_KERNEL:
|
||||
case CL_INVALID_ARG_INDEX:
|
||||
case CL_INVALID_ARG_VALUE:
|
||||
case CL_INVALID_ARG_SIZE:
|
||||
case CL_INVALID_KERNEL_ARGS:
|
||||
case CL_INVALID_WORK_DIMENSION:
|
||||
case CL_INVALID_WORK_GROUP_SIZE:
|
||||
case CL_INVALID_WORK_ITEM_SIZE:
|
||||
case CL_INVALID_GLOBAL_OFFSET:
|
||||
case CL_INVALID_EVENT_WAIT_LIST:
|
||||
case CL_INVALID_EVENT:
|
||||
case CL_INVALID_OPERATION:
|
||||
case CL_INVALID_GL_OBJECT:
|
||||
case CL_INVALID_BUFFER_SIZE:
|
||||
case CL_INVALID_MIP_LEVEL:
|
||||
case CL_INVALID_GLOBAL_WORK_SIZE: return status::invalid_arguments;
|
||||
|
||||
default: return status::runtime_error;
|
||||
}
|
||||
}
|
||||
|
||||
// Ordered by value as defined by opencl
|
||||
inline const char *convert_cl_int_to_str(cl_int cl_status) {
|
||||
#define CL_STATUS_CASE(status) \
|
||||
case status: return #status
|
||||
switch (cl_status) {
|
||||
CL_STATUS_CASE(CL_SUCCESS);
|
||||
CL_STATUS_CASE(CL_DEVICE_NOT_FOUND);
|
||||
CL_STATUS_CASE(CL_DEVICE_NOT_AVAILABLE);
|
||||
CL_STATUS_CASE(CL_COMPILER_NOT_AVAILABLE);
|
||||
CL_STATUS_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
|
||||
CL_STATUS_CASE(CL_OUT_OF_RESOURCES);
|
||||
CL_STATUS_CASE(CL_OUT_OF_HOST_MEMORY);
|
||||
CL_STATUS_CASE(CL_PROFILING_INFO_NOT_AVAILABLE);
|
||||
CL_STATUS_CASE(CL_MEM_COPY_OVERLAP);
|
||||
CL_STATUS_CASE(CL_IMAGE_FORMAT_MISMATCH);
|
||||
CL_STATUS_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
|
||||
CL_STATUS_CASE(CL_BUILD_PROGRAM_FAILURE);
|
||||
CL_STATUS_CASE(CL_MAP_FAILURE);
|
||||
CL_STATUS_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
|
||||
CL_STATUS_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
|
||||
CL_STATUS_CASE(CL_COMPILE_PROGRAM_FAILURE);
|
||||
CL_STATUS_CASE(CL_LINKER_NOT_AVAILABLE);
|
||||
CL_STATUS_CASE(CL_LINK_PROGRAM_FAILURE);
|
||||
CL_STATUS_CASE(CL_DEVICE_PARTITION_FAILED);
|
||||
CL_STATUS_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
|
||||
CL_STATUS_CASE(CL_INVALID_VALUE);
|
||||
CL_STATUS_CASE(CL_INVALID_DEVICE_TYPE);
|
||||
CL_STATUS_CASE(CL_INVALID_PLATFORM);
|
||||
CL_STATUS_CASE(CL_INVALID_DEVICE);
|
||||
CL_STATUS_CASE(CL_INVALID_CONTEXT);
|
||||
CL_STATUS_CASE(CL_INVALID_QUEUE_PROPERTIES);
|
||||
CL_STATUS_CASE(CL_INVALID_COMMAND_QUEUE);
|
||||
CL_STATUS_CASE(CL_INVALID_HOST_PTR);
|
||||
CL_STATUS_CASE(CL_INVALID_MEM_OBJECT);
|
||||
CL_STATUS_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
|
||||
CL_STATUS_CASE(CL_INVALID_IMAGE_SIZE);
|
||||
CL_STATUS_CASE(CL_INVALID_SAMPLER);
|
||||
CL_STATUS_CASE(CL_INVALID_BINARY);
|
||||
CL_STATUS_CASE(CL_INVALID_BUILD_OPTIONS);
|
||||
CL_STATUS_CASE(CL_INVALID_PROGRAM);
|
||||
CL_STATUS_CASE(CL_INVALID_PROGRAM_EXECUTABLE);
|
||||
CL_STATUS_CASE(CL_INVALID_KERNEL_NAME);
|
||||
CL_STATUS_CASE(CL_INVALID_KERNEL_DEFINITION);
|
||||
CL_STATUS_CASE(CL_INVALID_KERNEL);
|
||||
CL_STATUS_CASE(CL_INVALID_ARG_INDEX);
|
||||
CL_STATUS_CASE(CL_INVALID_ARG_VALUE);
|
||||
CL_STATUS_CASE(CL_INVALID_ARG_SIZE);
|
||||
CL_STATUS_CASE(CL_INVALID_KERNEL_ARGS);
|
||||
CL_STATUS_CASE(CL_INVALID_WORK_DIMENSION);
|
||||
CL_STATUS_CASE(CL_INVALID_WORK_GROUP_SIZE);
|
||||
CL_STATUS_CASE(CL_INVALID_WORK_ITEM_SIZE);
|
||||
CL_STATUS_CASE(CL_INVALID_GLOBAL_OFFSET);
|
||||
CL_STATUS_CASE(CL_INVALID_EVENT_WAIT_LIST);
|
||||
CL_STATUS_CASE(CL_INVALID_EVENT);
|
||||
CL_STATUS_CASE(CL_INVALID_OPERATION);
|
||||
CL_STATUS_CASE(CL_INVALID_GL_OBJECT);
|
||||
CL_STATUS_CASE(CL_INVALID_BUFFER_SIZE);
|
||||
CL_STATUS_CASE(CL_INVALID_MIP_LEVEL);
|
||||
CL_STATUS_CASE(CL_INVALID_GLOBAL_WORK_SIZE);
|
||||
#undef CL_STATUS_CASE
|
||||
default: return "unknown macro name";
|
||||
}
|
||||
}
|
||||
|
||||
#define MAYBE_REPORT_ERROR(msg) \
|
||||
do { \
|
||||
|
@ -24,7 +24,7 @@
|
||||
#include "common/engine.hpp"
|
||||
|
||||
#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
|
||||
#include "gpu/intel/sycl/l0/utils.hpp"
|
||||
#include "gpu/intel/sycl/utils.hpp"
|
||||
#endif
|
||||
|
||||
#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
|
||||
|
12
third_party/ngen/ngen_level_zero.hpp
vendored
12
third_party/ngen/ngen_level_zero.hpp
vendored
@ -91,6 +91,7 @@ public:
|
||||
|
||||
explicit LevelZeroCodeGenerator(DebugConfig debugConfig) : LevelZeroCodeGenerator({genericProductFamily(hw), 0}, debugConfig) {}
|
||||
|
||||
inline ze_kernel_handle_t getKernel(ze_module_handle_t module);
|
||||
inline ze_module_handle_t getModule(ze_context_handle_t context, ze_device_handle_t device, const std::string &options = "");
|
||||
static inline HW detectHW(ze_context_handle_t context, ze_device_handle_t device);
|
||||
static inline Product detectHWInfo(ze_context_handle_t context, ze_device_handle_t device);
|
||||
@ -138,6 +139,17 @@ static inline std::vector<uint8_t> getDummyModuleBinary(ze_context_handle_t cont
|
||||
|
||||
}; /* namespace detail */
|
||||
|
||||
template <HW hw>
|
||||
ze_kernel_handle_t LevelZeroCodeGenerator<hw>::getKernel(ze_module_handle_t module)
|
||||
{
|
||||
auto kernelName = ELFCodeGenerator<hw>::interface_.getExternalName().c_str();
|
||||
ze_kernel_handle_t kernelL0;
|
||||
ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernelName};
|
||||
detail::handleL0(dynamic::zeKernelCreate(module, &kernelDesc, &kernelL0));
|
||||
|
||||
return kernelL0;
|
||||
}
|
||||
|
||||
template <HW hw>
|
||||
ze_module_handle_t LevelZeroCodeGenerator<hw>::getModule(ze_context_handle_t context, ze_device_handle_t device, const std::string &options)
|
||||
{
|
||||
|
Reference in New Issue
Block a user