Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
Build SYCL kernels for ATen XPU ops on Native Windows (take 2) (#127390)
Original PR https://github.com/pytorch/pytorch/pull/126725 was closed due to a bad rebase.

As proposed in https://github.com/pytorch/pytorch/issues/126719, we are enabling PyTorch XPU on native Windows on Intel GPU. This PR enables the XPU build on Windows as the first step of #126719:

- Enable the `USE_XPU` build on Windows, using MSVC as the host compiler. Using MSVC as the host compiler aligns seamlessly with the existing PyTorch build on Windows.
- Build the oneDNN GPU library on Windows.

Co-authored-by: Yu, Guangye <guangye.yu@intel.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127390
Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/ezyang
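For context, the SYCL runtime this Windows build links against is the same one any standalone DPC++ program uses. A minimal sketch (not part of the PR; assumes a working oneAPI toolchain) of enumerating the Intel GPUs the XPU backend would target:

    // Standalone SYCL 2020 program: list the GPU devices visible to the runtime.
    #include <sycl/sycl.hpp>
    #include <iostream>

    int main() {
      for (const auto& dev :
           sycl::device::get_devices(sycl::info::device_type::gpu)) {
        std::cout << dev.get_info<sycl::info::device::name>() << '\n';
      }
      return 0;
    }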
@@ -242,8 +242,7 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
 option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
 option(USE_TSAN "Use Thread Sanitizer" OFF)
 option(USE_CUDA "Use CUDA" ON)
-cmake_dependent_option(USE_XPU "Use XPU. Only available on Linux." ON "LINUX"
-                       OFF)
+option(USE_XPU "Use XPU" ON)
 cmake_dependent_option(
     BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON
     "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
@@ -189,7 +189,7 @@ Other potentially useful environment variables may be found in `setup.py`.
 ##### Intel GPU Support
 If you want to compile with Intel GPU support, follow these
 - [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) instructions.
-- Intel GPU is currently supported only for Linux systems.
+- Intel GPU is supported for Linux and Windows.
 
 If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
 Other potentially useful environment variables may be found in `setup.py`.
@@ -12,7 +12,7 @@
 
 namespace at::native::onednn {
 
-TORCH_API dnnl::memory make_onednn_memory(
+TORCH_XPU_API dnnl::memory make_onednn_memory(
     dnnl::memory::desc md,
     dnnl::engine& engine,
     void* ptr);
@@ -21,7 +21,7 @@ TORCH_API dnnl::memory make_onednn_memory(
 bool set_onednn_verbose(int level);
 
 // GpuEngineManager singleton
-struct TORCH_API GpuEngineManager {
+struct TORCH_XPU_API GpuEngineManager {
   static GpuEngineManager& Instance(); // Singleton
 
   dnnl::engine& get_engine(const Device& device) {
@@ -51,7 +51,7 @@ struct TORCH_API GpuEngineManager {
 };
 
 // GpuStreamManager singleton
-struct TORCH_API GpuStreamManager {
+struct TORCH_XPU_API GpuStreamManager {
   static GpuStreamManager& Instance(); // Singleton
 
   dnnl::stream get_stream() {
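A note on the TORCH_API to TORCH_XPU_API changes above and below, as I read them: on Windows, an export macro owned by another DLL expands to __declspec(dllimport) whenever that DLL's BUILD_MAIN_LIB define is absent, so symbols defined in the torch_xpu library need a macro of their own. A hypothetical minimal sketch of the per-library pattern (names invented for illustration):

    // While building mylib.dll, MYLIB_BUILD_MAIN_LIB is defined, so MYLIB_API
    // expands to dllexport; every consumer of the header sees dllimport.
    #if defined(_WIN32)
    #if defined(MYLIB_BUILD_MAIN_LIB)
    #define MYLIB_API __declspec(dllexport)
    #else
    #define MYLIB_API __declspec(dllimport)
    #endif
    #else
    #define MYLIB_API __attribute__((__visibility__("default")))
    #endif

    // Declared in mylib's public headers, defined inside mylib.dll:
    struct MYLIB_API Widget {
      int value() const;
    };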
@@ -4,7 +4,7 @@
 
 namespace at {
 
-struct TORCH_API XPUGeneratorImpl : public GeneratorImpl {
+struct TORCH_XPU_API XPUGeneratorImpl : public GeneratorImpl {
   // Constructors
   XPUGeneratorImpl(DeviceIndex device_index = -1);
   ~XPUGeneratorImpl() override = default;
@@ -25,7 +25,13 @@ std::string XPUHooks::showConfig() const {
 
 int32_t XPUHooks::getGlobalIdxFromDevice(const at::Device& device) const {
   TORCH_CHECK(device.is_xpu(), "Only the XPU device type is expected.");
+#ifdef _WIN32
+  TORCH_CHECK(
+      false,
+      "Default context is not supported on XPU on Windows. So we can NOT find its global index of the ATen device.");
+#else
   return at::xpu::getGlobalIdxFromDevice(device.index());
+#endif
 }
 
 Generator XPUHooks::getXPUGenerator(DeviceIndex device_index) const {
@@ -38,7 +44,13 @@ const Generator& XPUHooks::getDefaultXPUGenerator(
 }
 
 Device XPUHooks::getDeviceFromPtr(void* data) const {
+#ifdef _WIN32
+  TORCH_CHECK(
+      false,
+      "Default context is not supported on XPU on Windows. So we can NOT find the ATen device of a pointer.");
+#else
   return at::xpu::getDeviceFromPtr(data);
+#endif
 }
 
 c10::DeviceIndex XPUHooks::getNumGPUs() const {
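Behavior note on the new Windows branches: TORCH_CHECK(false, ...) unconditionally throws c10::Error, so these hooks fail loudly on Windows rather than returning a bogus device index. A hedged caller-side sketch (hypothetical helper, not PR code):

    #include <c10/util/Exception.h>
    #include <iostream>

    int global_idx_or_fallback() {
      try {
        TORCH_CHECK(false, "Default context is not supported on XPU on Windows.");
      } catch (const c10::Error& e) {
        std::cerr << e.what() << '\n';
        return -1; // hypothetical fallback for callers that can tolerate it
      }
      return 0; // unreachable in this sketch
    }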
@@ -4,6 +4,10 @@
 
 #include <cstdint>
 
+#if defined(SYCL_LANGUAGE_VERSION)
+#include <sycl/sycl.hpp>
+#endif
+
 namespace c10::detail {
 
 /*
@@ -33,6 +37,8 @@ inline C10_HOST_DEVICE float fp8_fnuz_to_fp32_value(uint8_t x) {
     // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     uint32_t renorm_shift = __clz(mantissa);
+#elif defined(__SYCL_DEVICE_ONLY__)
+    uint32_t renorm_shift = sycl::clz(mantissa);
 #elif defined(_MSC_VER)
     unsigned long nonsign_bsr;
     _BitScanReverse(&nonsign_bsr, (unsigned long)mantissa);
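The hunk above extends a count-leading-zeros dispatch: a denormal fp8 fnuz mantissa is renormalized by shifting until its leading 1 reaches the implicit-bit position, and the shift count comes from whatever clz primitive the compiler provides (__clz on CUDA/HIP, sycl::clz in SYCL device code, _BitScanReverse on MSVC). A hedged host-side sketch of the same computation, assuming C++20:

    #include <bit>
    #include <cstdint>

    // std::countl_zero is the portable C++20 equivalent of the intrinsics above.
    inline uint32_t renorm_shift_host(uint32_t mantissa) {
      return static_cast<uint32_t>(std::countl_zero(mantissa));
    }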
@@ -8,6 +8,12 @@ if(NOT BUILD_LIBTORCHLESS)
   find_library(C10_XPU_LIB c10_xpu PATHS $ENV{LIBTORCH_LIB_PATH} NO_DEFAULT_PATH)
 endif()
 
+# ---[ Configure macro file.
+set(C10_XPU_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # used in xpu_cmake_macros.h.in
+configure_file(
+    ${CMAKE_CURRENT_LIST_DIR}/impl/xpu_cmake_macros.h.in
+    ${CMAKE_BINARY_DIR}/c10/xpu/impl/xpu_cmake_macros.h)
+
 set(C10_XPU_SRCS
     XPUCachingAllocator.cpp
     XPUFunctions.cpp
@@ -50,3 +56,7 @@ foreach(file ${C10_XPU_HEADERS})
   get_filename_component(dir ${file} DIRECTORY)
   install(FILES ${file} DESTINATION include/c10/xpu/${dir})
 endforeach()
+
+if(MSVC AND C10_XPU_BUILD_SHARED_LIBS)
+  install(FILES $<TARGET_PDB_FILE:c10_xpu> DESTINATION lib OPTIONAL)
+endif()
@@ -2,8 +2,6 @@
 #include <c10/util/Exception.h>
 #include <c10/xpu/XPUFunctions.h>
 
-#include <sys/wait.h>
-#include <unistd.h>
 #include <vector>
 
 namespace c10::xpu {
@@ -53,10 +51,20 @@ inline void initGlobalDevicePoolState() {
     return;
   }
 
+#ifdef _WIN32
+  // default context feature is disabled by default on Windows.
+  std::vector<sycl::device> deviceList;
+  for (auto it = gDevicePool.devices.begin(); it != gDevicePool.devices.end();
+       ++it) {
+    deviceList.push_back(*(*it));
+  }
+  gDevicePool.context = std::make_unique<sycl::context>(deviceList);
+#else
   // The default context is utilized for each Intel GPU device, allowing the
   // retrieval of the context from any GPU device.
   gDevicePool.context = std::make_unique<sycl::context>(
       gDevicePool.devices[0]->get_platform().ext_oneapi_get_default_context());
+#endif
 }
 
 inline void initDevicePoolCallOnce() {
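The two branches above pick different SYCL context strategies. A condensed sketch of the distinction, assuming a SYCL 2020 runtime with the oneAPI default-context extension (gDevicePool is paraphrased away; this is not PyTorch code):

    #include <sycl/sycl.hpp>
    #include <vector>

    sycl::context make_pool_context(const std::vector<sycl::device>& devices,
                                    bool default_context_available) {
      if (default_context_available) {
        // Linux: reuse the platform-wide default context (oneAPI extension).
        return devices[0].get_platform().ext_oneapi_get_default_context();
      }
      // Windows: the extension is disabled, so build one explicit context
      // spanning every discovered device.
      return sycl::context(devices);
    }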
@@ -1,15 +1,29 @@
 #pragma once
 
+#ifndef C10_USING_CUSTOM_GENERATED_MACROS
+#include <c10/xpu/impl/xpu_cmake_macros.h>
+#endif
+
 // See c10/macros/Export.h for a detailed explanation of what the function
 // of these macros are. We need one set of macros for every separate library
 // we build.
 
+#ifdef _WIN32
+#if defined(C10_XPU_BUILD_SHARED_LIBS)
+#define C10_XPU_EXPORT __declspec(dllexport)
+#define C10_XPU_IMPORT __declspec(dllimport)
+#else
+#define C10_XPU_EXPORT
+#define C10_XPU_IMPORT
+#endif
+#else // _WIN32
 #if defined(__GNUC__)
 #define C10_XPU_EXPORT __attribute__((__visibility__("default")))
 #else // defined(__GNUC__)
 #define C10_XPU_EXPORT
 #endif // defined(__GNUC__)
 #define C10_XPU_IMPORT C10_XPU_EXPORT
+#endif // _WIN32
 
 // This one is being used by libc10_xpu.so
 #ifdef C10_XPU_BUILD_MAIN_LIB
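The hunk is truncated at #ifdef C10_XPU_BUILD_MAIN_LIB, so the tail presumably selects the public macro the way c10/macros/Export.h does. A hedged sketch of that convention:

    // The library being built defines C10_XPU_BUILD_MAIN_LIB on its own
    // compile line, so it exports its symbols; everyone else imports them.
    #ifdef C10_XPU_BUILD_MAIN_LIB
    #define C10_XPU_API C10_XPU_EXPORT
    #else
    #define C10_XPU_API C10_XPU_IMPORT
    #endif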
c10/xpu/impl/xpu_cmake_macros.h.in (new file, 6 lines)
@@ -0,0 +1,6 @@
+#pragma once
+
+// Automatically generated header file for the C10 XPU library. Do not
+// include this file directly. Instead, include c10/xpu/XPUMacros.h
+
+#cmakedefine C10_XPU_BUILD_SHARED_LIBS
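For reference, configure_file rewrites the #cmakedefine line depending on whether C10_XPU_BUILD_SHARED_LIBS is set at configure time, so the generated c10/xpu/impl/xpu_cmake_macros.h would contain one of the following (standard CMake behavior, shown as a sketch):

    #define C10_XPU_BUILD_SHARED_LIBS      // shared build: variable was set
    /* #undef C10_XPU_BUILD_SHARED_LIBS */ // static build: variable was unset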
@@ -1062,9 +1062,16 @@ if(USE_XPU)
       message(WARNING "Failed to include ATen XPU implementation target")
     else()
       target_link_libraries(torch_xpu PRIVATE torch_xpu_ops)
+      if(MSVC)
+        # Windows
+        target_link_libraries(torch_xpu PRIVATE
+            "-WHOLEARCHIVE:\"$<TARGET_FILE:torch_xpu_ops>\"")
+      else()
+        # Linux
       target_link_libraries(torch_xpu PRIVATE
           "-Wl,--whole-archive,\"$<TARGET_FILE:torch_xpu_ops>\" -Wl,--no-whole-archive")
+      endif()
     endif()
   endif()
 
 if(NOT MSVC AND USE_XNNPACK)
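Why whole-archive linking (-WHOLEARCHIVE on MSVC, --whole-archive with GNU ld): operator libraries typically register kernels through static initializers that nothing references directly, so a plain static-library link would discard those objects. A minimal sketch of the general pattern (invented names, not the PR's code):

    #include <cstdio>

    struct Registrar {
      // Only the constructor's side effect matters; it runs at load time.
      explicit Registrar(const char* name) { std::printf("registered %s\n", name); }
    };

    // Lives in the static library. Without whole-archive linking, the linker
    // may drop this object file and the registration never happens.
    static Registrar xpu_add_kernel{"xpu::add"};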
@@ -89,8 +89,8 @@ endif()
 if(USE_XPU)
   include(${CMAKE_CURRENT_LIST_DIR}/public/xpu.cmake)
   if(NOT PYTORCH_FOUND_XPU)
-    # message(WARNING "Not compiling with XPU. Could NOT find SYCL."
-    # "Suppress this warning with -DUSE_XPU=OFF.")
+    message(WARNING "Not compiling with XPU. Could NOT find SYCL."
+    "Suppress this warning with -DUSE_XPU=OFF.")
     caffe2_update_option(USE_XPU OFF)
   endif()
 endif()
@@ -21,10 +21,16 @@ IF(NOT MKLDNN_FOUND)
 
   if(USE_XPU) # Build oneDNN GPU library
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-      set(DNNL_HOST_COMPILER "g++")
+      # Linux
+      # g++ is soft linked to /usr/bin/cxx, oneDNN would not treat it as an absolute path
+      set(DNNL_HOST_COMPILER "g++")
+      set(SYCL_CXX_DRIVER "icpx")
+      set(DNNL_LIB_NAME "libdnnl.a")
     else()
-      message(FATAL_ERROR "oneDNN library currently only supports GUN g++ compiler for XPU backend")
+      # Windows
+      set(DNNL_HOST_COMPILER "DEFAULT")
+      set(SYCL_CXX_DRIVER "icx")
+      set(DNNL_LIB_NAME "dnnl.lib")
     endif()
 
     set(DNNL_MAKE_COMMAND "cmake" "--build" ".")
@@ -41,8 +47,7 @@ IF(NOT MKLDNN_FOUND)
       PREFIX ${XPU_MKLDNN_DIR_PREFIX}
       BUILD_IN_SOURCE 0
       CMAKE_ARGS -DCMAKE_C_COMPILER=icx
-      -DCMAKE_CXX_COMPILER=icpx
-      -DCMAKE_CXX_COMPILER_ID=IntelLLVM
+      -DCMAKE_CXX_COMPILER=${SYCL_CXX_DRIVER}
       -DDNNL_GPU_RUNTIME=SYCL
       -DDNNL_CPU_RUNTIME=THREADPOOL
       -DDNNL_BUILD_TESTS=OFF
@@ -52,20 +57,20 @@ IF(NOT MKLDNN_FOUND)
       -DDNNL_DPCPP_HOST_COMPILER=${DNNL_HOST_COMPILER} # Use global cxx compiler as host compiler
       -G ${CMAKE_GENERATOR} # Align Generator to Torch
       BUILD_COMMAND ${DNNL_MAKE_COMMAND}
-      BUILD_BYPRODUCTS "xpu_mkldnn_proj-prefix/src/xpu_mkldnn_proj-build/src/libdnnl.a"
+      BUILD_BYPRODUCTS "xpu_mkldnn_proj-prefix/src/xpu_mkldnn_proj-build/src/${DNNL_LIB_NAME}"
      INSTALL_COMMAND ""
    )
 
    ExternalProject_Get_Property(xpu_mkldnn_proj BINARY_DIR)
    set(__XPU_MKLDNN_BUILD_DIR ${BINARY_DIR})
-   set(XPU_MKLDNN_LIBRARIES ${__XPU_MKLDNN_BUILD_DIR}/src/libdnnl.a)
+   set(XPU_MKLDNN_LIBRARIES ${__XPU_MKLDNN_BUILD_DIR}/src/${DNNL_LIB_NAME})
    set(XPU_MKLDNN_INCLUDE ${__XPU_MKLDNN_BUILD_DIR}/include)
    # This target would be further linked to libtorch_xpu.so.
    # The libtorch_xpu.so would contain Conv&GEMM operators that depend on
    # oneDNN primitive implementations inside libdnnl.a.
    add_library(xpu_mkldnn INTERFACE)
    add_dependencies(xpu_mkldnn xpu_mkldnn_proj)
-   target_link_libraries(xpu_mkldnn INTERFACE ${__XPU_MKLDNN_BUILD_DIR}/src/libdnnl.a)
+   target_link_libraries(xpu_mkldnn INTERFACE ${__XPU_MKLDNN_BUILD_DIR}/src/${DNNL_LIB_NAME})
    target_include_directories(xpu_mkldnn INTERFACE ${XPU_MKLDNN_INCLUDE})
  endif()
@@ -55,6 +55,23 @@ find_library(
   HINTS ${SYCL_LIBRARY_DIR}
   NO_DEFAULT_PATH
 )
+# On Windows, currently there's no sycl.lib. Only sycl7.lib with version suffix,
+# where the current version of the SYCL runtime is 7.
+# Until oneAPI adds support to sycl.lib without the version suffix,
+# sycl_runtime_version needs to be hardcoded and uplifted when SYCL runtime version uplifts.
+# TODO: remove this when sycl.lib is supported on Windows
+if(WIN32)
+  set(sycl_runtime_version 7)
+  find_library(
+    SYCL_LIBRARY
+    NAMES "sycl${sycl_runtime_version}"
+    HINTS ${SYCL_LIBRARY_DIR}
+    NO_DEFAULT_PATH
+  )
+  if(SYCL_LIBRARY STREQUAL "SYCL_LIBRARY-NOTFOUND")
+    message(FATAL_ERROR "Cannot find a SYCL library on Windows")
+  endif()
+endif()
 
 find_library(
   OCL_LIBRARY
@@ -11,24 +11,30 @@
 #include <torch/csrc/utils/python_numbers.h>
 #include <torch/csrc/utils/python_strings.h>
 
+#ifndef WIN32
 #include <pthread.h>
+#endif
 
 using namespace torch;
 
 static bool in_bad_fork = false; // True for children forked after xpu init
 
+#ifndef WIN32
 // Called in the forked child if xpu has already been initialized
 static void forked_child() {
   in_bad_fork = true;
   torch::utils::set_requires_device_init(at::kXPU, true);
 }
+#endif
 
 // Should be called before the first xpu call. It is mainly called in lazy_init.
 // Note: This is distinct from initExtension because a stub xpu implementation
 // has some working functions (e.g. device_count) but cannot fully initialize.
 static void poison_fork() {
+#ifndef WIN32
   static c10::once_flag flag;
   c10::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); });
+#endif
 }
 
 // XPU management methods
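The guards above compile out the fork poisoning on Windows, where neither fork() nor <pthread.h> exists. A minimal POSIX-only sketch of the pthread_atfork pattern being protected (illustrative, not PR code):

    #include <pthread.h>
    #include <unistd.h>
    #include <cstdio>

    static bool in_bad_fork = false;

    // Runs in the child process after every fork().
    static void forked_child() { in_bad_fork = true; }

    int main() {
      pthread_atfork(/*prepare=*/nullptr, /*parent=*/nullptr, forked_child);
      if (fork() == 0) {
        std::printf("child: in_bad_fork=%d\n", in_bad_fork); // prints 1
        _exit(0);
      }
      return 0;
    }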