Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
Build SYCL kernels for ATen XPU ops on Native Windows (take 2) (#127390)
Original PR https://github.com/pytorch/pytorch/pull/126725 was closed due to a bad rebase.

As proposed in https://github.com/pytorch/pytorch/issues/126719, we are enabling PyTorch XPU on native Windows on Intel GPU. This PR enables the XPU build on Windows as the first step of #126719:

- Enable the `USE_XPU` build on Windows, using MSVC as the host compiler. Using MSVC as the host compiler aligns seamlessly with the existing PyTorch build on Windows.
- Build the oneDNN GPU library on Windows.

Co-authored-by: Yu, Guangye <guangye.yu@intel.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127390
Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/ezyang
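For context, the SYCL runtime this Windows build links against is the same one any standalone DPC++ program uses. A minimal sketch (not part of the PR; assumes a working oneAPI toolchain) of enumerating the Intel GPUs the XPU backend would target:

    // Standalone SYCL 2020 program: list the GPU devices visible to the runtime.
    #include <sycl/sycl.hpp>
    #include <iostream>

    int main() {
      for (const auto& dev :
           sycl::device::get_devices(sycl::info::device_type::gpu)) {
        std::cout << dev.get_info<sycl::info::device::name>() << '\n';
      }
      return 0;
    }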
@@ -242,8 +242,7 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
 option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
 option(USE_TSAN "Use Thread Sanitizer" OFF)
 option(USE_CUDA "Use CUDA" ON)
-cmake_dependent_option(USE_XPU "Use XPU. Only available on Linux." ON "LINUX"
-                       OFF)
+option(USE_XPU "Use XPU" ON)
 cmake_dependent_option(
     BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON
     "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
@@ -189,7 +189,7 @@ Other potentially useful environment variables may be found in `setup.py`.
 ##### Intel GPU Support
 If you want to compile with Intel GPU support, follow these
 - [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) instructions.
-- Intel GPU is currently supported only for Linux systems.
+- Intel GPU is supported for Linux and Windows.
 
 If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
 Other potentially useful environment variables may be found in `setup.py`.
@@ -12,7 +12,7 @@
 
 namespace at::native::onednn {
 
-TORCH_API dnnl::memory make_onednn_memory(
+TORCH_XPU_API dnnl::memory make_onednn_memory(
     dnnl::memory::desc md,
     dnnl::engine& engine,
     void* ptr);
@@ -21,7 +21,7 @@ TORCH_API dnnl::memory make_onednn_memory(
 bool set_onednn_verbose(int level);
 
 // GpuEngineManager singleton
-struct TORCH_API GpuEngineManager {
+struct TORCH_XPU_API GpuEngineManager {
   static GpuEngineManager& Instance(); // Singleton
 
   dnnl::engine& get_engine(const Device& device) {
@@ -51,7 +51,7 @@ struct TORCH_API GpuEngineManager {
 };
 
 // GpuStreamManager singleton
-struct TORCH_API GpuStreamManager {
+struct TORCH_XPU_API GpuStreamManager {
   static GpuStreamManager& Instance(); // Singleton
 
   dnnl::stream get_stream() {
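A note on the TORCH_API to TORCH_XPU_API changes above and below, as I read them: on Windows, an export macro owned by another DLL expands to __declspec(dllimport) whenever that DLL's BUILD_MAIN_LIB define is absent, so symbols defined in the torch_xpu library need a macro of their own. A hypothetical minimal sketch of the per-library pattern (names invented for illustration):

    // While building mylib.dll, MYLIB_BUILD_MAIN_LIB is defined, so MYLIB_API
    // expands to dllexport; every consumer of the header sees dllimport.
    #if defined(_WIN32)
    #if defined(MYLIB_BUILD_MAIN_LIB)
    #define MYLIB_API __declspec(dllexport)
    #else
    #define MYLIB_API __declspec(dllimport)
    #endif
    #else
    #define MYLIB_API __attribute__((__visibility__("default")))
    #endif

    // Declared in mylib's public headers, defined inside mylib.dll:
    struct MYLIB_API Widget {
      int value() const;
    };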
@@ -4,7 +4,7 @@
 
 namespace at {
 
-struct TORCH_API XPUGeneratorImpl : public GeneratorImpl {
+struct TORCH_XPU_API XPUGeneratorImpl : public GeneratorImpl {
   // Constructors
   XPUGeneratorImpl(DeviceIndex device_index = -1);
   ~XPUGeneratorImpl() override = default;
@@ -25,7 +25,13 @@ std::string XPUHooks::showConfig() const {
 
 int32_t XPUHooks::getGlobalIdxFromDevice(const at::Device& device) const {
   TORCH_CHECK(device.is_xpu(), "Only the XPU device type is expected.");
+#ifdef _WIN32
+  TORCH_CHECK(
+      false,
+      "Default context is not supported on XPU on Windows. So we can NOT find its global index of the ATen device.");
+#else
   return at::xpu::getGlobalIdxFromDevice(device.index());
+#endif
 }
 
 Generator XPUHooks::getXPUGenerator(DeviceIndex device_index) const {
@@ -38,7 +44,13 @@ const Generator& XPUHooks::getDefaultXPUGenerator(
 }
 
 Device XPUHooks::getDeviceFromPtr(void* data) const {
+#ifdef _WIN32
+  TORCH_CHECK(
+      false,
+      "Default context is not supported on XPU on Windows. So we can NOT find the ATen device of a pointer.");
+#else
   return at::xpu::getDeviceFromPtr(data);
+#endif
 }
 
 c10::DeviceIndex XPUHooks::getNumGPUs() const {
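Behavior note on the new Windows branches: TORCH_CHECK(false, ...) unconditionally throws c10::Error, so these hooks fail loudly on Windows rather than returning a bogus device index. A hedged caller-side sketch (hypothetical helper, not PR code):

    #include <c10/util/Exception.h>
    #include <iostream>

    int global_idx_or_fallback() {
      try {
        TORCH_CHECK(false, "Default context is not supported on XPU on Windows.");
      } catch (const c10::Error& e) {
        std::cerr << e.what() << '\n';
        return -1; // hypothetical fallback for callers that can tolerate it
      }
      return 0; // unreachable in this sketch
    }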
@@ -4,6 +4,10 @@
 
 #include <cstdint>
 
+#if defined(SYCL_LANGUAGE_VERSION)
+#include <sycl/sycl.hpp>
+#endif
+
 namespace c10::detail {
 
 /*
@@ -33,6 +37,8 @@ inline C10_HOST_DEVICE float fp8_fnuz_to_fp32_value(uint8_t x) {
     // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     uint32_t renorm_shift = __clz(mantissa);
+#elif defined(__SYCL_DEVICE_ONLY__)
+    uint32_t renorm_shift = sycl::clz(mantissa);
 #elif defined(_MSC_VER)
     unsigned long nonsign_bsr;
     _BitScanReverse(&nonsign_bsr, (unsigned long)mantissa);
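The hunk above extends a count-leading-zeros dispatch: a denormal fp8 fnuz mantissa is renormalized by shifting until its leading 1 reaches the implicit-bit position, and the shift count comes from whatever clz primitive the compiler provides (__clz on CUDA/HIP, sycl::clz in SYCL device code, _BitScanReverse on MSVC). A hedged host-side sketch of the same computation, assuming C++20:

    #include <bit>
    #include <cstdint>

    // std::countl_zero is the portable C++20 equivalent of the intrinsics above.
    inline uint32_t renorm_shift_host(uint32_t mantissa) {
      return static_cast<uint32_t>(std::countl_zero(mantissa));
    }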
@@ -8,6 +8,12 @@ if(NOT BUILD_LIBTORCHLESS)
   find_library(C10_XPU_LIB c10_xpu PATHS $ENV{LIBTORCH_LIB_PATH} NO_DEFAULT_PATH)
 endif()
 
+# ---[ Configure macro file.
+set(C10_XPU_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # used in xpu_cmake_macros.h.in
+configure_file(
+    ${CMAKE_CURRENT_LIST_DIR}/impl/xpu_cmake_macros.h.in
+    ${CMAKE_BINARY_DIR}/c10/xpu/impl/xpu_cmake_macros.h)
+
 set(C10_XPU_SRCS
     XPUCachingAllocator.cpp
     XPUFunctions.cpp
@@ -50,3 +56,7 @@ foreach(file ${C10_XPU_HEADERS})
   get_filename_component(dir ${file} DIRECTORY)
   install(FILES ${file} DESTINATION include/c10/xpu/${dir})
 endforeach()
+
+if(MSVC AND C10_XPU_BUILD_SHARED_LIBS)
+  install(FILES $<TARGET_PDB_FILE:c10_xpu> DESTINATION lib OPTIONAL)
+endif()
@@ -2,8 +2,6 @@
 #include <c10/util/Exception.h>
 #include <c10/xpu/XPUFunctions.h>
 
-#include <sys/wait.h>
-#include <unistd.h>
 #include <vector>
 
 namespace c10::xpu {
@@ -53,10 +51,20 @@ inline void initGlobalDevicePoolState() {
     return;
   }
 
+#ifdef _WIN32
+  // default context feature is disabled by default on Windows.
+  std::vector<sycl::device> deviceList;
+  for (auto it = gDevicePool.devices.begin(); it != gDevicePool.devices.end();
+       ++it) {
+    deviceList.push_back(*(*it));
+  }
+  gDevicePool.context = std::make_unique<sycl::context>(deviceList);
+#else
   // The default context is utilized for each Intel GPU device, allowing the
   // retrieval of the context from any GPU device.
   gDevicePool.context = std::make_unique<sycl::context>(
       gDevicePool.devices[0]->get_platform().ext_oneapi_get_default_context());
+#endif
 }
 
 inline void initDevicePoolCallOnce() {
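The two branches above pick different SYCL context strategies. A condensed sketch of the distinction, assuming a SYCL 2020 runtime with the oneAPI default-context extension (gDevicePool is paraphrased away; this is not PyTorch code):

    #include <sycl/sycl.hpp>
    #include <vector>

    sycl::context make_pool_context(const std::vector<sycl::device>& devices,
                                    bool default_context_available) {
      if (default_context_available) {
        // Linux: reuse the platform-wide default context (oneAPI extension).
        return devices[0].get_platform().ext_oneapi_get_default_context();
      }
      // Windows: the extension is disabled, so build one explicit context
      // spanning every discovered device.
      return sycl::context(devices);
    }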
@@ -1,15 +1,29 @@
 #pragma once
 
+#ifndef C10_USING_CUSTOM_GENERATED_MACROS
+#include <c10/xpu/impl/xpu_cmake_macros.h>
+#endif
+
 // See c10/macros/Export.h for a detailed explanation of what the function
 // of these macros are. We need one set of macros for every separate library
 // we build.
 
+#ifdef _WIN32
+#if defined(C10_XPU_BUILD_SHARED_LIBS)
+#define C10_XPU_EXPORT __declspec(dllexport)
+#define C10_XPU_IMPORT __declspec(dllimport)
+#else
+#define C10_XPU_EXPORT
+#define C10_XPU_IMPORT
+#endif
+#else // _WIN32
 #if defined(__GNUC__)
 #define C10_XPU_EXPORT __attribute__((__visibility__("default")))
 #else // defined(__GNUC__)
 #define C10_XPU_EXPORT
 #endif // defined(__GNUC__)
 #define C10_XPU_IMPORT C10_XPU_EXPORT
+#endif // _WIN32
 
 // This one is being used by libc10_xpu.so
 #ifdef C10_XPU_BUILD_MAIN_LIB
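The hunk is truncated at #ifdef C10_XPU_BUILD_MAIN_LIB, so the tail presumably selects the public macro the way c10/macros/Export.h does. A hedged sketch of that convention:

    // The library being built defines C10_XPU_BUILD_MAIN_LIB on its own
    // compile line, so it exports its symbols; everyone else imports them.
    #ifdef C10_XPU_BUILD_MAIN_LIB
    #define C10_XPU_API C10_XPU_EXPORT
    #else
    #define C10_XPU_API C10_XPU_IMPORT
    #endif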
c10/xpu/impl/xpu_cmake_macros.h.in (new file, 6 lines)
@@ -0,0 +1,6 @@
+#pragma once
+
+// Automatically generated header file for the C10 XPU library. Do not
+// include this file directly. Instead, include c10/xpu/XPUMacros.h
+
+#cmakedefine C10_XPU_BUILD_SHARED_LIBS
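For reference, configure_file rewrites the #cmakedefine line depending on whether C10_XPU_BUILD_SHARED_LIBS is set at configure time, so the generated c10/xpu/impl/xpu_cmake_macros.h would contain one of the following (standard CMake behavior, shown as a sketch):

    #define C10_XPU_BUILD_SHARED_LIBS      // shared build: variable was set
    /* #undef C10_XPU_BUILD_SHARED_LIBS */ // static build: variable was unset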
@@ -1062,9 +1062,16 @@ if(USE_XPU)
       message(WARNING "Failed to include ATen XPU implementation target")
     else()
       target_link_libraries(torch_xpu PRIVATE torch_xpu_ops)
+      if(MSVC)
+        # Windows
+        target_link_libraries(torch_xpu PRIVATE
+            "-WHOLEARCHIVE:\"$<TARGET_FILE:torch_xpu_ops>\"")
+      else()
+        # Linux
       target_link_libraries(torch_xpu PRIVATE
           "-Wl,--whole-archive,\"$<TARGET_FILE:torch_xpu_ops>\" -Wl,--no-whole-archive")
+      endif()
     endif()
   endif()
 
 if(NOT MSVC AND USE_XNNPACK)
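Why whole-archive linking (-WHOLEARCHIVE on MSVC, --whole-archive with GNU ld): operator libraries typically register kernels through static initializers that nothing references directly, so a plain static-library link would discard those objects. A minimal sketch of the general pattern (invented names, not the PR's code):

    #include <cstdio>

    struct Registrar {
      // Only the constructor's side effect matters; it runs at load time.
      explicit Registrar(const char* name) { std::printf("registered %s\n", name); }
    };

    // Lives in the static library. Without whole-archive linking, the linker
    // may drop this object file and the registration never happens.
    static Registrar xpu_add_kernel{"xpu::add"};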
@@ -89,8 +89,8 @@ endif()
 if(USE_XPU)
   include(${CMAKE_CURRENT_LIST_DIR}/public/xpu.cmake)
   if(NOT PYTORCH_FOUND_XPU)
-    # message(WARNING "Not compiling with XPU. Could NOT find SYCL."
-    # "Suppress this warning with -DUSE_XPU=OFF.")
+    message(WARNING "Not compiling with XPU. Could NOT find SYCL."
+    "Suppress this warning with -DUSE_XPU=OFF.")
     caffe2_update_option(USE_XPU OFF)
   endif()
 endif()
@@ -21,10 +21,16 @@ IF(NOT MKLDNN_FOUND)
 
   if(USE_XPU) # Build oneDNN GPU library
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-      set(DNNL_HOST_COMPILER "g++")
+      # Linux
+      # g++ is soft linked to /usr/bin/cxx, oneDNN would not treat it as an absolute path
+      set(DNNL_HOST_COMPILER "g++")
+      set(SYCL_CXX_DRIVER "icpx")
+      set(DNNL_LIB_NAME "libdnnl.a")
     else()
-      message(FATAL_ERROR "oneDNN library currently only supports GUN g++ compiler for XPU backend")
+      # Windows
+      set(DNNL_HOST_COMPILER "DEFAULT")
+      set(SYCL_CXX_DRIVER "icx")
+      set(DNNL_LIB_NAME "dnnl.lib")
     endif()
 
     set(DNNL_MAKE_COMMAND "cmake" "--build" ".")
@@ -41,8 +47,7 @@ IF(NOT MKLDNN_FOUND)
       PREFIX ${XPU_MKLDNN_DIR_PREFIX}
       BUILD_IN_SOURCE 0
       CMAKE_ARGS -DCMAKE_C_COMPILER=icx
-      -DCMAKE_CXX_COMPILER=icpx
-      -DCMAKE_CXX_COMPILER_ID=IntelLLVM
+      -DCMAKE_CXX_COMPILER=${SYCL_CXX_DRIVER}
       -DDNNL_GPU_RUNTIME=SYCL
       -DDNNL_CPU_RUNTIME=THREADPOOL
       -DDNNL_BUILD_TESTS=OFF
@@ -52,20 +57,20 @@ IF(NOT MKLDNN_FOUND)
       -DDNNL_DPCPP_HOST_COMPILER=${DNNL_HOST_COMPILER} # Use global cxx compiler as host compiler
       -G ${CMAKE_GENERATOR} # Align Generator to Torch
       BUILD_COMMAND ${DNNL_MAKE_COMMAND}
-      BUILD_BYPRODUCTS "xpu_mkldnn_proj-prefix/src/xpu_mkldnn_proj-build/src/libdnnl.a"
+      BUILD_BYPRODUCTS "xpu_mkldnn_proj-prefix/src/xpu_mkldnn_proj-build/src/${DNNL_LIB_NAME}"
      INSTALL_COMMAND ""
    )
 
    ExternalProject_Get_Property(xpu_mkldnn_proj BINARY_DIR)
    set(__XPU_MKLDNN_BUILD_DIR ${BINARY_DIR})
-   set(XPU_MKLDNN_LIBRARIES ${__XPU_MKLDNN_BUILD_DIR}/src/libdnnl.a)
+   set(XPU_MKLDNN_LIBRARIES ${__XPU_MKLDNN_BUILD_DIR}/src/${DNNL_LIB_NAME})
    set(XPU_MKLDNN_INCLUDE ${__XPU_MKLDNN_BUILD_DIR}/include)
    # This target would be further linked to libtorch_xpu.so.
    # The libtorch_xpu.so would contain Conv&GEMM operators that depend on
    # oneDNN primitive implementations inside libdnnl.a.
    add_library(xpu_mkldnn INTERFACE)
    add_dependencies(xpu_mkldnn xpu_mkldnn_proj)
-   target_link_libraries(xpu_mkldnn INTERFACE ${__XPU_MKLDNN_BUILD_DIR}/src/libdnnl.a)
+   target_link_libraries(xpu_mkldnn INTERFACE ${__XPU_MKLDNN_BUILD_DIR}/src/${DNNL_LIB_NAME})
    target_include_directories(xpu_mkldnn INTERFACE ${XPU_MKLDNN_INCLUDE})
  endif()
@@ -55,6 +55,23 @@ find_library(
   HINTS ${SYCL_LIBRARY_DIR}
   NO_DEFAULT_PATH
 )
+# On Windows, currently there's no sycl.lib. Only sycl7.lib with version suffix,
+# where the current version of the SYCL runtime is 7.
+# Until oneAPI adds support to sycl.lib without the version suffix,
+# sycl_runtime_version needs to be hardcoded and uplifted when SYCL runtime version uplifts.
+# TODO: remove this when sycl.lib is supported on Windows
+if(WIN32)
+  set(sycl_runtime_version 7)
+  find_library(
+    SYCL_LIBRARY
+    NAMES "sycl${sycl_runtime_version}"
+    HINTS ${SYCL_LIBRARY_DIR}
+    NO_DEFAULT_PATH
+  )
+  if(SYCL_LIBRARY STREQUAL "SYCL_LIBRARY-NOTFOUND")
+    message(FATAL_ERROR "Cannot find a SYCL library on Windows")
+  endif()
+endif()
 
 find_library(
   OCL_LIBRARY
@@ -11,24 +11,30 @@
 #include <torch/csrc/utils/python_numbers.h>
 #include <torch/csrc/utils/python_strings.h>
 
+#ifndef WIN32
 #include <pthread.h>
+#endif
 
 using namespace torch;
 
 static bool in_bad_fork = false; // True for children forked after xpu init
 
+#ifndef WIN32
 // Called in the forked child if xpu has already been initialized
 static void forked_child() {
   in_bad_fork = true;
   torch::utils::set_requires_device_init(at::kXPU, true);
 }
+#endif
 
 // Should be called before the first xpu call. It is mainly called in lazy_init.
 // Note: This is distinct from initExtension because a stub xpu implementation
 // has some working functions (e.g. device_count) but cannot fully initialize.
 static void poison_fork() {
+#ifndef WIN32
   static c10::once_flag flag;
   c10::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); });
+#endif
 }
 
 // XPU management methods
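The guards above compile out the fork poisoning on Windows, where neither fork() nor <pthread.h> exists. A minimal POSIX-only sketch of the pthread_atfork pattern being protected (illustrative, not PR code):

    #include <pthread.h>
    #include <unistd.h>
    #include <cstdio>

    static bool in_bad_fork = false;

    // Runs in the child process after every fork().
    static void forked_child() { in_bad_fork = true; }

    int main() {
      pthread_atfork(/*prepare=*/nullptr, /*parent=*/nullptr, forked_child);
      if (fork() == 0) {
        std::printf("child: in_bad_fork=%d\n", in_bad_fork); // prints 1
        _exit(0);
      }
      return 0;
    }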