[XROS][ML] System specific adjustments for UTs to work. (#65245)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/65245

Building and running c10 and qnnpack tests on XROS.

Notable changes:
- Adding #if defined(__XROS__) in a few places not supported by XROS
- Changing Threadpool to abstract class
ghstack-source-id: 139513579

Test Plan: Run c10 and qnnpack tests on XROS.

Reviewed By: veselinp, iseeyuan

Differential Revision: D30137333

fbshipit-source-id: bb6239b935187fac712834341fe5a8d3377762b1
This commit is contained in:
Karol Kosik
2021-10-01 18:13:39 -07:00
committed by Facebook GitHub Bot
parent 363ccb257d
commit eb3b9fe719
9 changed files with 62 additions and 38 deletions

View File

@ -43,7 +43,7 @@ namespace detail {
* Note this is a legacy method (from THRandom.cpp)
* FIXME: use std::random_device with entropy information
*/
#ifndef _WIN32
#if !defined(_WIN32) && !defined(__XROS__)
static uint64_t readURandomLong() {
int randDev = open("/dev/urandom", O_RDONLY);
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@ -56,7 +56,7 @@ static uint64_t readURandomLong() {
close(randDev);
return randValue;
}
#endif // _WIN32
#endif // _WIN32 && __XROS__
/**
* Gets a non deterministic random number number from either the
@ -82,6 +82,9 @@ uint64_t getNonDeterministicRandom(bool is_cuda) {
s = (uint64_t)std::chrono::high_resolution_clock::now()
.time_since_epoch()
.count();
#elif defined(__XROS__)
std::random_device rd;
s = ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
#elif defined(__SGX_ENABLED__)
TORCH_CHECK(
sgx_read_rand(reinterpret_cast<uint8_t*>(&s), sizeof(s)) == SGX_SUCCESS,

View File

@ -314,7 +314,7 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
// CUDA_KERNEL_ASSERT checks the assertion
// even when NDEBUG is defined. This is useful for important assertions in CUDA
// code that would otherwise be suppressed when building Release.
#if defined(__ANDROID__) || defined(__APPLE__) || \
#if defined(__ANDROID__) || defined(__APPLE__) || defined(__XROS__) || \
(defined(USE_ROCM) && ROCM_VERSION < 40100)
// Those platforms do not support assert()
#define CUDA_KERNEL_ASSERT(cond)

View File

@ -209,7 +209,9 @@ bool InitCaffeLogging(int* argc, char** argv) {
::google::InitGoogleLogging(argv[0]);
#if !defined(_MSC_VER)
// This is never defined on Windows
#if !defined(__XROS__)
::google::InstallFailureSignalHandler();
#endif
#endif
}
UpdateLoggingLevelsFromFlags();

View File

@ -4,7 +4,8 @@
#include <functional>
#include <memory>
#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__)
#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) || \
defined(__XROS__)
#define HAS_DEMANGLE 0
#elif defined(__APPLE__) && \
(TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)

View File

@ -20,6 +20,26 @@ C10_DEFINE_int(pthreadpool_size, 0, "Override the default thread pool size.");
namespace caffe2 {
namespace {
class ThreadPoolImpl : public ThreadPool {
public:
explicit ThreadPoolImpl(int numThreads);
~ThreadPoolImpl() override;
// Returns the number of threads currently in use
int getNumThreads() const override;
void setNumThreads(size_t numThreads) override;
void run(const std::function<void(int, size_t)>& fn, size_t range) override;
void withPool(const std::function<void(WorkersPool*)>& f) override;
private:
std::atomic_size_t numThreads_;
std::shared_ptr<WorkersPool> workersPool_;
std::vector<std::shared_ptr<Task>> tasks_;
};
}
size_t getDefaultNumThreads() {
CAFFE_ENFORCE(cpuinfo_initialize(), "cpuinfo initialization failed");
int numThreads = cpuinfo_get_processors_count();
@ -89,43 +109,40 @@ constexpr size_t kDefaultMinWorkSize = 1;
size_t ThreadPool::defaultNumThreads_ = 0;
ThreadPool* ThreadPool::createThreadPool(int numThreads) {
return new ThreadPoolImpl(numThreads);
}
std::unique_ptr<ThreadPool> ThreadPool::defaultThreadPool() {
defaultNumThreads_ = getDefaultNumThreads();
LOG(INFO) << "Constructing thread pool with " << defaultNumThreads_
<< " threads";
return std::make_unique<ThreadPool>(defaultNumThreads_);
return std::make_unique<ThreadPoolImpl>(defaultNumThreads_);
}
ThreadPool::ThreadPool(int numThreads)
: minWorkSize_(kDefaultMinWorkSize),
numThreads_(numThreads),
workersPool_(std::make_shared<WorkersPool>()) {}
ThreadPoolImpl::ThreadPoolImpl(int numThreads)
: numThreads_(numThreads),
workersPool_(std::make_shared<WorkersPool>()) {
minWorkSize_ = kDefaultMinWorkSize;
}
// NOLINTNEXTLINE(modernize-use-equals-default)
ThreadPool::~ThreadPool() {}
ThreadPoolImpl::~ThreadPoolImpl() {}
int ThreadPool::getNumThreads() const {
int ThreadPoolImpl::getNumThreads() const {
return numThreads_;
}
// Sets the number of threads
// # of threads should not be bigger than the number of big cores
void ThreadPool::setNumThreads(size_t numThreads) {
void ThreadPoolImpl::setNumThreads(size_t numThreads) {
if (defaultNumThreads_ == 0) {
defaultNumThreads_ = getDefaultNumThreads();
}
numThreads_ = std::min(numThreads, defaultNumThreads_);
}
// Sets the minimum work size (range) for which to invoke the
// threadpool; work sizes smaller than this will just be run on the
// main (calling) thread
void ThreadPool::setMinWorkSize(size_t size) {
std::lock_guard<std::mutex> guard(executionMutex_);
minWorkSize_ = size;
}
void ThreadPool::run(const std::function<void(int, size_t)>& fn, size_t range) {
void ThreadPoolImpl::run(const std::function<void(int, size_t)>& fn, size_t range) {
const auto numThreads = numThreads_.load(std::memory_order_relaxed);
std::lock_guard<std::mutex> guard(executionMutex_);
@ -183,7 +200,7 @@ void ThreadPool::run(const std::function<void(int, size_t)>& fn, size_t range) {
workersPool_->Execute(tasks_);
}
void ThreadPool::withPool(const std::function<void(WorkersPool*)>& f) {
void ThreadPoolImpl::withPool(const std::function<void(WorkersPool*)>& f) {
std::lock_guard<std::mutex> guard(executionMutex_);
f(workersPool_.get());
}

View File

@ -32,33 +32,34 @@ constexpr size_t kCacheLineSize = 64;
// TORCH_API and alignas annotations at the same time.
class TORCH_API /*alignas(kCacheLineSize)*/ ThreadPool {
public:
static ThreadPool* createThreadPool(int numThreads);
static std::unique_ptr<ThreadPool> defaultThreadPool();
ThreadPool(int numThreads);
~ThreadPool();
virtual ~ThreadPool() = default;
// Returns the number of threads currently in use
int getNumThreads() const;
void setNumThreads(size_t numThreads);
virtual int getNumThreads() const = 0;
virtual void setNumThreads(size_t numThreads) = 0;
// Sets the minimum work size (range) for which to invoke the
// threadpool; work sizes smaller than this will just be run on the
// main (calling) thread
void setMinWorkSize(size_t size);
void setMinWorkSize(size_t size) {
std::lock_guard<std::mutex> guard(executionMutex_);
minWorkSize_ = size;
}
size_t getMinWorkSize() const {
return minWorkSize_;
}
void run(const std::function<void(int, size_t)>& fn, size_t range);
virtual void run(const std::function<void(int, size_t)>& fn, size_t range) = 0;
// Run an arbitrary function in a thread-safe manner accessing the Workers
// Pool
void withPool(const std::function<void(WorkersPool*)>& fn);
virtual void withPool(const std::function<void(WorkersPool*)>& fn) = 0;
private:
protected:
static size_t defaultNumThreads_;
mutable std::mutex executionMutex_;
size_t minWorkSize_;
std::atomic_size_t numThreads_;
std::shared_ptr<WorkersPool> workersPool_;
std::vector<std::shared_ptr<Task>> tasks_;
};
} // namespace caffe2

View File

@ -83,7 +83,7 @@ size_t getDefaultNumThreads();
PThreadPool* pthreadpool() {
static auto threadpool =
std::make_unique<PThreadPool>(getDefaultNumThreads());
#ifndef WIN32
#if !(defined(WIN32)) && !(defined(__XROS__))
static std::once_flag flag;
std::call_once(flag, []() {
pthread_atfork(nullptr, nullptr, child_atfork);

View File

@ -8,7 +8,7 @@
#include <stddef.h> // for size_t
#include <stdint.h> // for uint32_t
#ifdef USE_PTHREADPOOL
#if defined(USE_PTHREADPOOL) && !(defined(__XROS__))
// This is a hack.
// Mainly introduced here because
// 1. NNPACK can be compiled to use internal legacy threadpool implementation because much of C2 depends on that.

View File

@ -2,7 +2,7 @@
#include "caffe2/utils/threadpool/pthreadpool-cpp.h"
#include "caffe2/utils/threadpool/ThreadPool.h"
#ifdef USE_PTHREADPOOL
#if defined(USE_PTHREADPOOL) && !(defined(__XROS__))
namespace caffe2 {
namespace {
static thread_local bool using_new_threadpool{false};
@ -34,7 +34,7 @@ void legacy_pthreadpool_compute_1d(
}
return;
}
#ifdef USE_PTHREADPOOL
#if defined(USE_PTHREADPOOL) && !(defined(__XROS__))
if (caffe2::using_new_threadpool) {
pthreadpool_parallelize_1d(threadpool, function, argument, range, 0u);
} else {
@ -76,7 +76,7 @@ legacy_pthreadpool_t legacy_pthreadpool_create(size_t threads_count) {
std::mutex thread_pool_creation_mutex_;
std::lock_guard<std::mutex> guard(thread_pool_creation_mutex_);
return reinterpret_cast<legacy_pthreadpool_t>(new caffe2::ThreadPool(threads_count));
return reinterpret_cast<legacy_pthreadpool_t>(caffe2::ThreadPool::createThreadPool(threads_count));
}
void legacy_pthreadpool_destroy(legacy_pthreadpool_t pthreadpool) {