diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd11ffdf7333..1264540c6875 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -242,8 +242,7 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
 option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
 option(USE_TSAN "Use Thread Sanitizer" OFF)
 option(USE_CUDA "Use CUDA" ON)
-cmake_dependent_option(USE_XPU "Use XPU. Only available on Linux." ON "LINUX"
-                       OFF)
+option(USE_XPU "Use XPU" ON)
 cmake_dependent_option(
     BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON
     "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
diff --git a/README.md b/README.md
index 9a4ba683d769..9123dea20107 100644
--- a/README.md
+++ b/README.md
@@ -189,7 +189,7 @@ Other potentially useful environment variables may be found in `setup.py`.
 ##### Intel GPU Support
 If you want to compile with Intel GPU support, follow these
 - [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) instructions.
-- Intel GPU is currently supported only for Linux systems.
+- Intel GPU is supported for Linux and Windows.
 
 If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
 Other potentially useful environment variables may be found in `setup.py`.
diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h
index c7e7a5e94b40..afef4552c153 100644
--- a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h
@@ -12,7 +12,7 @@
 
 namespace at::native::onednn {
 
-TORCH_API dnnl::memory make_onednn_memory(
+TORCH_XPU_API dnnl::memory make_onednn_memory(
     dnnl::memory::desc md,
     dnnl::engine& engine,
     void* ptr);
@@ -21,7 +21,7 @@ TORCH_API dnnl::memory make_onednn_memory(
 bool set_onednn_verbose(int level);
 
 // GpuEngineManager singleton
-struct TORCH_API GpuEngineManager {
+struct TORCH_XPU_API GpuEngineManager {
   static GpuEngineManager& Instance(); // Singleton
 
   dnnl::engine& get_engine(const Device& device) {
@@ -51,7 +51,7 @@ struct TORCH_API GpuEngineManager {
 };
 
 // GpuStreamManager singleton
-struct TORCH_API GpuStreamManager {
+struct TORCH_XPU_API GpuStreamManager {
   static GpuStreamManager& Instance(); // Singleton
 
   dnnl::stream get_stream() {
diff --git a/aten/src/ATen/xpu/XPUGeneratorImpl.h b/aten/src/ATen/xpu/XPUGeneratorImpl.h
index ce77d2e444e6..a1f264382a36 100644
--- a/aten/src/ATen/xpu/XPUGeneratorImpl.h
+++ b/aten/src/ATen/xpu/XPUGeneratorImpl.h
@@ -4,7 +4,7 @@
 
 namespace at {
 
-struct TORCH_API XPUGeneratorImpl : public GeneratorImpl {
+struct TORCH_XPU_API XPUGeneratorImpl : public GeneratorImpl {
   // Constructors
   XPUGeneratorImpl(DeviceIndex device_index = -1);
   ~XPUGeneratorImpl() override = default;
diff --git a/aten/src/ATen/xpu/detail/XPUHooks.cpp b/aten/src/ATen/xpu/detail/XPUHooks.cpp
index 22f4ff22b4bb..61bc19faa95e 100644
--- a/aten/src/ATen/xpu/detail/XPUHooks.cpp
+++ b/aten/src/ATen/xpu/detail/XPUHooks.cpp
@@ -25,7 +25,13 @@ std::string XPUHooks::showConfig() const {
 
 int32_t XPUHooks::getGlobalIdxFromDevice(const at::Device& device) const {
   TORCH_CHECK(device.is_xpu(), "Only the XPU device type is expected.");
+#ifdef _WIN32
+  TORCH_CHECK(
+      false,
+      "Default context is not supported on XPU on Windows. So we can NOT find its global index of the ATen device.");
+#else
   return at::xpu::getGlobalIdxFromDevice(device.index());
+#endif
 }
 
 Generator XPUHooks::getXPUGenerator(DeviceIndex device_index) const {
@@ -38,7 +44,13 @@ const Generator& XPUHooks::getDefaultXPUGenerator(
 }
 
 Device XPUHooks::getDeviceFromPtr(void* data) const {
+#ifdef _WIN32
+  TORCH_CHECK(
+      false,
+      "Default context is not supported on XPU on Windows. So we can NOT find the ATen device of a pointer.");
+#else
   return at::xpu::getDeviceFromPtr(data);
+#endif
 }
 
 c10::DeviceIndex XPUHooks::getNumGPUs() const {
diff --git a/c10/util/Float8_fnuz_cvt.h b/c10/util/Float8_fnuz_cvt.h
index 983063a0230f..327f90d11a71 100644
--- a/c10/util/Float8_fnuz_cvt.h
+++ b/c10/util/Float8_fnuz_cvt.h
@@ -4,6 +4,10 @@
 #include <cstdint>
 
+#if defined(SYCL_LANGUAGE_VERSION)
+#include <sycl/sycl.hpp>
+#endif
+
 namespace c10::detail {
 
 /*
@@ -33,6 +37,8 @@ inline C10_HOST_DEVICE float fp8_fnuz_to_fp32_value(uint8_t x) {
     // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     uint32_t renorm_shift = __clz(mantissa);
+#elif defined(__SYCL_DEVICE_ONLY__)
+    uint32_t renorm_shift = sycl::clz(mantissa);
 #elif defined(_MSC_VER)
     unsigned long nonsign_bsr;
     _BitScanReverse(&nonsign_bsr, (unsigned long)mantissa);
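The hunk above selects a count-leading-zeros intrinsic per backend: `__clz` in CUDA/HIP device code, `sycl::clz` in SYCL device code, and `_BitScanReverse` on MSVC hosts; the result (`renorm_shift`) renormalizes the subnormal fp8 mantissa. A minimal host-side sketch of the same computation, not part of the patch; the helper name `count_leading_zeros` is illustrative only:

```cpp
#include <cstdint>

// Illustrative only: portable count-leading-zeros for a 32-bit value,
// mirroring what __clz / sycl::clz / _BitScanReverse provide above.
inline uint32_t count_leading_zeros(uint32_t x) {
#if defined(__GNUC__) || defined(__clang__)
  return static_cast<uint32_t>(__builtin_clz(x)); // precondition: x != 0
#else
  uint32_t n = 0;
  for (uint32_t bit = 1u << 31; bit != 0 && (x & bit) == 0; bit >>= 1) {
    ++n;
  }
  return n;
#endif
}
```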
diff --git a/c10/xpu/CMakeLists.txt b/c10/xpu/CMakeLists.txt
index d06d0f0aa92a..b5c63d4f7cca 100644
--- a/c10/xpu/CMakeLists.txt
+++ b/c10/xpu/CMakeLists.txt
@@ -8,6 +8,12 @@ if(NOT BUILD_LIBTORCHLESS)
   find_library(C10_XPU_LIB c10_xpu PATHS $ENV{LIBTORCH_LIB_PATH} NO_DEFAULT_PATH)
 endif()
 
+# ---[ Configure macro file.
+set(C10_XPU_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # used in xpu_cmake_macros.h.in
+configure_file(
+    ${CMAKE_CURRENT_LIST_DIR}/impl/xpu_cmake_macros.h.in
+    ${CMAKE_BINARY_DIR}/c10/xpu/impl/xpu_cmake_macros.h)
+
 set(C10_XPU_SRCS
     XPUCachingAllocator.cpp
     XPUFunctions.cpp
@@ -50,3 +56,7 @@ foreach(file ${C10_XPU_HEADERS})
   get_filename_component(dir ${file} DIRECTORY)
   install(FILES ${file} DESTINATION include/c10/xpu/${dir})
 endforeach()
+
+if(MSVC AND C10_XPU_BUILD_SHARED_LIBS)
+  install(FILES $<TARGET_PDB_FILE:c10_xpu> DESTINATION lib OPTIONAL)
+endif()
diff --git a/c10/xpu/XPUFunctions.cpp b/c10/xpu/XPUFunctions.cpp
index 15e24d94f5dc..cc885776a916 100644
--- a/c10/xpu/XPUFunctions.cpp
+++ b/c10/xpu/XPUFunctions.cpp
@@ -2,8 +2,6 @@
 #include
 #include
-#include
-#include
 #include
 
 namespace c10::xpu {
@@ -53,10 +51,20 @@ inline void initGlobalDevicePoolState() {
     return;
   }
 
+#ifdef _WIN32
+  // default context feature is disabled by default on Windows.
+  std::vector<sycl::device> deviceList;
+  for (auto it = gDevicePool.devices.begin(); it != gDevicePool.devices.end();
+       ++it) {
+    deviceList.push_back(*(*it));
+  }
+  gDevicePool.context = std::make_unique<sycl::context>(deviceList);
+#else
   // The default context is utilized for each Intel GPU device, allowing the
   // retrieval of the context from any GPU device.
   gDevicePool.context = std::make_unique<sycl::context>(
       gDevicePool.devices[0]->get_platform().ext_oneapi_get_default_context());
+#endif
 }
 
 inline void initDevicePoolCallOnce() {
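The Windows branch above exists because the oneAPI default-context feature is disabled by default on Windows, so instead of reusing the platform's default `sycl::context` the device pool constructs one explicit context covering every discovered GPU. A minimal standalone sketch of that difference, not part of the patch; it assumes a SYCL 2020 compiler, at least one GPU device, and the DPC++ `ext_oneapi_get_default_context()` extension on the non-Windows path, and the function name `make_shared_gpu_context` is illustrative:

```cpp
#include <sycl/sycl.hpp>
#include <vector>

// Illustrative only: return one context shared by all GPU devices.
sycl::context make_shared_gpu_context() {
  std::vector<sycl::device> gpus =
      sycl::device::get_devices(sycl::info::device_type::gpu);
#ifdef _WIN32
  // No default context available: construct a context owning every device.
  return sycl::context(gpus);
#else
  // Reuse the platform-wide default context, as the Linux path above does.
  return gpus.front().get_platform().ext_oneapi_get_default_context();
#endif
}
```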
diff --git a/c10/xpu/XPUMacros.h b/c10/xpu/XPUMacros.h
index fc6aad92229c..d51eab989d25 100644
--- a/c10/xpu/XPUMacros.h
+++ b/c10/xpu/XPUMacros.h
@@ -1,15 +1,29 @@
 #pragma once
 
+#ifndef C10_USING_CUSTOM_GENERATED_MACROS
+#include <c10/xpu/impl/xpu_cmake_macros.h>
+#endif
+
 // See c10/macros/Export.h for a detailed explanation of what the function
 // of these macros are. We need one set of macros for every separate library
 // we build.
 
+#ifdef _WIN32
+#if defined(C10_XPU_BUILD_SHARED_LIBS)
+#define C10_XPU_EXPORT __declspec(dllexport)
+#define C10_XPU_IMPORT __declspec(dllimport)
+#else
+#define C10_XPU_EXPORT
+#define C10_XPU_IMPORT
+#endif
+#else // _WIN32
 #if defined(__GNUC__)
 #define C10_XPU_EXPORT __attribute__((__visibility__("default")))
 #else // defined(__GNUC__)
 #define C10_XPU_EXPORT
 #endif // defined(__GNUC__)
 #define C10_XPU_IMPORT C10_XPU_EXPORT
+#endif // _WIN32
 
 // This one is being used by libc10_xpu.so
 #ifdef C10_XPU_BUILD_MAIN_LIB
diff --git a/c10/xpu/impl/xpu_cmake_macros.h.in b/c10/xpu/impl/xpu_cmake_macros.h.in
new file mode 100644
index 000000000000..48ed78c07e1d
--- /dev/null
+++ b/c10/xpu/impl/xpu_cmake_macros.h.in
@@ -0,0 +1,6 @@
+#pragma once
+
+// Automatically generated header file for the C10 XPU library. Do not
+// include this file directly. Instead, include c10/xpu/XPUMacros.h
+
+#cmakedefine C10_XPU_BUILD_SHARED_LIBS
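On Windows a symbol is not visible outside a DLL unless it is marked `__declspec(dllexport)` while the library is built and `__declspec(dllimport)` while it is consumed, which is why the header above gains a dedicated Windows branch and why the ATen XPU headers switch from `TORCH_API` to `TORCH_XPU_API`. A generic sketch of the pattern, not part of the patch; the `MYLIB_*` macros and `Widget` type are illustrative, not the PyTorch names:

```cpp
// mylib_macros.h -- illustrative only; the real macros live in
// c10/xpu/XPUMacros.h and c10/macros/Export.h.
#pragma once

#if defined(_WIN32) && defined(MYLIB_BUILD_SHARED_LIBS)
#define MYLIB_EXPORT __declspec(dllexport)
#define MYLIB_IMPORT __declspec(dllimport)
#elif defined(__GNUC__)
#define MYLIB_EXPORT __attribute__((__visibility__("default")))
#define MYLIB_IMPORT MYLIB_EXPORT
#else
#define MYLIB_EXPORT
#define MYLIB_IMPORT MYLIB_EXPORT
#endif

// Translation units that build the DLL define MYLIB_BUILD_MAIN_LIB, so the
// same header exports while building the library and imports while using it.
#ifdef MYLIB_BUILD_MAIN_LIB
#define MYLIB_API MYLIB_EXPORT
#else
#define MYLIB_API MYLIB_IMPORT
#endif

struct MYLIB_API Widget {
  int value() const; // defined inside the DLL
};
```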
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 2a58b15e8d5e..0d64fe75be41 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1062,8 +1062,15 @@ if(USE_XPU)
     message(WARNING "Failed to include ATen XPU implementation target")
   else()
     target_link_libraries(torch_xpu PRIVATE torch_xpu_ops)
-    target_link_libraries(torch_xpu PRIVATE
-      "-Wl,--whole-archive,\"$<TARGET_FILE:torch_xpu_ops>\" -Wl,--no-whole-archive")
+    if(MSVC)
+      # Windows
+      target_link_libraries(torch_xpu PRIVATE
+        "-WHOLEARCHIVE:\"$<TARGET_FILE:torch_xpu_ops>\"")
+    else()
+      # Linux
+      target_link_libraries(torch_xpu PRIVATE
+        "-Wl,--whole-archive,\"$<TARGET_FILE:torch_xpu_ops>\" -Wl,--no-whole-archive")
+    endif()
   endif()
 endif()
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 3a57dd64c6af..f1f2eb7cec31 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -89,8 +89,8 @@ endif()
 if(USE_XPU)
   include(${CMAKE_CURRENT_LIST_DIR}/public/xpu.cmake)
   if(NOT PYTORCH_FOUND_XPU)
-    # message(WARNING "Not compiling with XPU. Could NOT find SYCL."
-    # "Suppress this warning with -DUSE_XPU=OFF.")
+    message(WARNING "Not compiling with XPU. Could NOT find SYCL."
+      "Suppress this warning with -DUSE_XPU=OFF.")
     caffe2_update_option(USE_XPU OFF)
   endif()
 endif()
diff --git a/cmake/Modules/FindMKLDNN.cmake b/cmake/Modules/FindMKLDNN.cmake
index b93f9229fc23..382e71b1049b 100644
--- a/cmake/Modules/FindMKLDNN.cmake
+++ b/cmake/Modules/FindMKLDNN.cmake
@@ -21,10 +21,16 @@ IF(NOT MKLDNN_FOUND)
   if(USE_XPU) # Build oneDNN GPU library
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-      set(DNNL_HOST_COMPILER "g++")
+      # Linux
       # g++ is soft linked to /usr/bin/cxx, oneDNN would not treat it as an absolute path
+      set(DNNL_HOST_COMPILER "g++")
+      set(SYCL_CXX_DRIVER "icpx")
+      set(DNNL_LIB_NAME "libdnnl.a")
     else()
-      message(FATAL_ERROR "oneDNN library currently only supports GUN g++ compiler for XPU backend")
+      # Windows
+      set(DNNL_HOST_COMPILER "DEFAULT")
+      set(SYCL_CXX_DRIVER "icx")
+      set(DNNL_LIB_NAME "dnnl.lib")
     endif()
     set(DNNL_MAKE_COMMAND "cmake" "--build" ".")
@@ -41,8 +47,7 @@
       PREFIX ${XPU_MKLDNN_DIR_PREFIX}
       BUILD_IN_SOURCE 0
       CMAKE_ARGS -DCMAKE_C_COMPILER=icx
-      -DCMAKE_CXX_COMPILER=icpx
-      -DCMAKE_CXX_COMPILER_ID=IntelLLVM
+      -DCMAKE_CXX_COMPILER=${SYCL_CXX_DRIVER}
      -DDNNL_GPU_RUNTIME=SYCL
      -DDNNL_CPU_RUNTIME=THREADPOOL
      -DDNNL_BUILD_TESTS=OFF
@@ -52,20 +57,20 @@
      -DDNNL_DPCPP_HOST_COMPILER=${DNNL_HOST_COMPILER} # Use global cxx compiler as host compiler
      -G ${CMAKE_GENERATOR} # Align Generator to Torch
      BUILD_COMMAND ${DNNL_MAKE_COMMAND}
-     BUILD_BYPRODUCTS "xpu_mkldnn_proj-prefix/src/xpu_mkldnn_proj-build/src/libdnnl.a"
+     BUILD_BYPRODUCTS "xpu_mkldnn_proj-prefix/src/xpu_mkldnn_proj-build/src/${DNNL_LIB_NAME}"
      INSTALL_COMMAND ""
     )
 
     ExternalProject_Get_Property(xpu_mkldnn_proj BINARY_DIR)
     set(__XPU_MKLDNN_BUILD_DIR ${BINARY_DIR})
-    set(XPU_MKLDNN_LIBRARIES ${__XPU_MKLDNN_BUILD_DIR}/src/libdnnl.a)
+    set(XPU_MKLDNN_LIBRARIES ${__XPU_MKLDNN_BUILD_DIR}/src/${DNNL_LIB_NAME})
     set(XPU_MKLDNN_INCLUDE ${__XPU_MKLDNN_BUILD_DIR}/include)
     # This target would be further linked to libtorch_xpu.so.
     # The libtorch_xpu.so would contain Conv&GEMM operators that depend on
     # oneDNN primitive implementations inside libdnnl.a.
     add_library(xpu_mkldnn INTERFACE)
     add_dependencies(xpu_mkldnn xpu_mkldnn_proj)
-    target_link_libraries(xpu_mkldnn INTERFACE ${__XPU_MKLDNN_BUILD_DIR}/src/libdnnl.a)
+    target_link_libraries(xpu_mkldnn INTERFACE ${__XPU_MKLDNN_BUILD_DIR}/src/${DNNL_LIB_NAME})
     target_include_directories(xpu_mkldnn INTERFACE ${XPU_MKLDNN_INCLUDE})
   endif()
diff --git a/cmake/Modules/FindSYCLToolkit.cmake b/cmake/Modules/FindSYCLToolkit.cmake
index d9345bb2fe0d..4a4a6dfaa789 100644
--- a/cmake/Modules/FindSYCLToolkit.cmake
+++ b/cmake/Modules/FindSYCLToolkit.cmake
@@ -55,6 +55,23 @@ find_library(
   HINTS ${SYCL_LIBRARY_DIR}
   NO_DEFAULT_PATH
 )
+# On Windows, currently there's no sycl.lib. Only sycl7.lib with version suffix,
+# where the current version of the SYCL runtime is 7.
+# Until oneAPI adds support to sycl.lib without the version suffix,
+# sycl_runtime_version needs to be hardcoded and uplifted when SYCL runtime version uplifts.
+# TODO: remove this when sycl.lib is supported on Windows
+if(WIN32)
+  set(sycl_runtime_version 7)
+  find_library(
+    SYCL_LIBRARY
+    NAMES "sycl${sycl_runtime_version}"
+    HINTS ${SYCL_LIBRARY_DIR}
+    NO_DEFAULT_PATH
+  )
+  if(SYCL_LIBRARY STREQUAL "SYCL_LIBRARY-NOTFOUND")
+    message(FATAL_ERROR "Cannot find a SYCL library on Windows")
+  endif()
+endif()
 
 find_library(
   OCL_LIBRARY
diff --git a/torch/csrc/xpu/Module.cpp b/torch/csrc/xpu/Module.cpp
index 7bf8abdef204..cfe7b43d19a9 100644
--- a/torch/csrc/xpu/Module.cpp
+++ b/torch/csrc/xpu/Module.cpp
@@ -11,24 +11,30 @@
 #include
 #include
 
+#ifndef WIN32
 #include <pthread.h>
+#endif
 
 using namespace torch;
 
 static bool in_bad_fork = false; // True for children forked after xpu init
 
+#ifndef WIN32
 // Called in the forked child if xpu has already been initialized
 static void forked_child() {
   in_bad_fork = true;
   torch::utils::set_requires_device_init(at::kXPU, true);
 }
+#endif
 
 // Should be called before the first xpu call. It is mainly called in lazy_init.
 // Note: This is distinct from initExtension because a stub xpu implementation
 // has some working functions (e.g. device_count) but cannot fully initialize.
static void poison_fork() {
+#ifndef WIN32
   static c10::once_flag flag;
   c10::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); });
+#endif
 }
 
 // XPU management methods
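`fork()` and `pthread_atfork()` are POSIX facilities that do not exist on Windows, which is why the fork-poisoning hook above is compiled out there and `poison_fork()` becomes a no-op. A POSIX-only standalone sketch of the pattern, not part of the patch; `in_bad_fork` and `forked_child` mirror the names used above, while the `main` driver is illustrative:

```cpp
// POSIX-only sketch of the fork-poisoning pattern guarded by #ifndef WIN32 above.
#include <pthread.h>
#include <sys/wait.h>
#include <unistd.h>

#include <cstdio>

static bool in_bad_fork = false; // set only in children forked after init
static void forked_child() { in_bad_fork = true; }

int main() {
  // Register once, before the device runtime is initialized.
  pthread_atfork(/*prepare=*/nullptr, /*parent=*/nullptr, forked_child);

  if (fork() == 0) {
    // Child process: the atfork child handler has already run.
    std::printf("child:  in_bad_fork=%d\n", in_bad_fork); // prints 1
    return 0;
  }
  wait(nullptr);
  std::printf("parent: in_bad_fork=%d\n", in_bad_fork); // prints 0
  return 0;
}
```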