Enable win-arm64

This patch enables building PyTorch from source with Ninja and the 'Visual Studio 16 2019' CMake generator on Windows on Arm.

Tests:
- Build from source: 'python setup.py develop'.
- Run simple PyTorch example: passed
- python test\test_torch.py:
  -- same results as on x64
  -- Ran 1344 tests, failures=2

Pull Request resolved: https://github.com/pytorch/pytorch/pull/72424
2025-10-20 21:14:14 +08:00 · 2022-02-28 17:17:56 +00:00
parent a1d5b5d2b3
commit c4ff49f4c7
5 changed files with 28 additions and 4 deletions
--- a/c10/util/Bitset.h
+++ b/c10/util/Bitset.h
@ -76,7 +76,7 @@ struct bitset final {
  // (i.e. if the very first bit is set, this function returns '1'), and a
  // return of '0' means that there was no bit set.
  size_t find_first_set() const {
-#if defined(_MSC_VER) && defined(_M_X64)
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
    unsigned long result;
    bool has_bits_set = (0 != _BitScanForward64(&result, bitset_));
    if (!has_bits_set) {
--- a/caffe2/serialize/crc_alt.h
+++ b/caffe2/serialize/crc_alt.h
@ -101,8 +101,11 @@ uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previo
  // Windows always little endian
  #define __BYTE_ORDER __LITTLE_ENDIAN

+  #if !defined(_M_ARM64)
  // intrinsics / prefetching
  #include <xmmintrin.h>
+  #endif
+
  #ifdef __MINGW32__
    #define PREFETCH(location) __builtin_prefetch(location)
  #else
--- a/tools/build_pytorch_libs.py
+++ b/tools/build_pytorch_libs.py
@ -1,4 +1,5 @@
 import os
+import platform
 from glob import glob
 import shutil
 from typing import Dict, Optional
@ -10,6 +11,22 @@ from setuptools import distutils  # type: ignore[import]

 def _overlay_windows_vcvars(env: Dict[str, str]) -> Dict[str, str]:
    vc_arch = 'x64' if IS_64BIT else 'x86'
+
+    if platform.machine() == 'ARM64':
+        vc_arch = 'x64_arm64'
+
+        # First Win11 Windows on Arm build version that supports x64 emulation
+        # is 10.0.22000.
+        win11_1st_version = (10, 0, 22000)
+        current_win_version = tuple(int(version_part) for version_part in
+                                    platform.version().split('.'))
+        if current_win_version < win11_1st_version:
+            vc_arch = 'x86_arm64'
+            print("Warning: 32-bit toolchain will be used, but 64-bit linker "
+                  "is recommended to avoid out-of-memory linker error!")
+            print("Warning: Please consider upgrading to Win11, where x64 "
+                  "emulation is enabled!")
+
    vc_env: Dict[str, str] = distutils._msvccompiler._get_vc_env(vc_arch)
    # Keys in `_get_vc_env` are always lowercase.
    # We turn them into uppercase before overlaying vcvars
--- a/tools/setup_helpers/cmake.py
+++ b/tools/setup_helpers/cmake.py
@ -4,6 +4,7 @@

 import multiprocessing
 import os
+import platform
 import re
 from subprocess import check_call, check_output, CalledProcessError
 import sys
@ -220,8 +221,11 @@ class CMake:
                          'in the build steps carefully.')
                    sys.exit(1)
            if IS_64BIT:
-                args.append('-Ax64')
-                toolset_dict['host'] = 'x64'
+                if platform.machine() == 'ARM64':
+                    args.append('-A ARM64')
+                else:
+                    args.append('-Ax64')
+                    toolset_dict['host'] = 'x64'
            if toolset_dict:
                toolset_expr = ','.join(["{}={}".format(k, v) for k, v in toolset_dict.items()])
                args.append('-T' + toolset_expr)
--- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp
+++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp
@ -230,7 +230,7 @@ static CompilerConfig& getConfig() {
 // understand for AVX512. When we need better CPU performance this
 // optimization can be re-enabled by tracking down the platforms where
 // this error occurs and only selectively disabling it.
-#ifdef _MSC_VER
+#if (defined(_MSC_VER) && !defined(_M_ARM64))
 // According to https://stackoverflow.com/a/29178079, we are able to
 // detect which arch level is supported by the vectorizer using
 // the macro __isa_available. It is added during runtime.