mirror of
				https://github.com/pytorch/pytorch.git
				synced 2025-10-26 16:44:54 +08:00 
			
		
		
		
	Compare commits
	
		
			70 Commits
		
	
	
		
			findhao/ba
			...
			v1.6.0-rc3
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| cefb9e0cd6 | |||
| d9e9e0087a | |||
| 43d746305c | |||
| 9409e03903 | |||
| c9a1853d2f | |||
| 7fa9b2923b | |||
| 40bf15a8ac | |||
| c164fc4d7f | |||
| e0b7480f34 | |||
| 89d7f194d8 | |||
| 59bb44a8e8 | |||
| 8f4d01d9f1 | |||
| 77ffb25925 | |||
| af9600b1f5 | |||
| 83262b1ba1 | |||
| f862a6ba4d | |||
| f3c1ea7455 | |||
| 2ed3ad2891 | |||
| a857af50a4 | |||
| d0045e5520 | |||
| 0406b69b79 | |||
| 6220cc4380 | |||
| eaf3f2fd34 | |||
| c35b4c770b | |||
| 11baccf1b5 | |||
| f0f0cbdd4a | |||
| 11b70b0041 | |||
| 01e9562313 | |||
| 3f13c9a2c8 | |||
| 63a94c021a | |||
| 2b175ba909 | |||
| 8c3f662224 | |||
| 0ffdd5aa1d | |||
| d53427c541 | |||
| b44b1d868e | |||
| 9184c9832e | |||
| e89c4f0dec | |||
| ea273c68f9 | |||
| 4dd37bfbf7 | |||
| 2533b9da83 | |||
| c5c8a85a82 | |||
| b4b8f5b9d4 | |||
| 41816dc97f | |||
| 31d9776c04 | |||
| ddea6c552f | |||
| 091537a764 | |||
| bf4d905ea1 | |||
| 415e499330 | |||
| eaf7dad5d6 | |||
| 75a074abdc | |||
| dede34eab7 | |||
| 0c90b6da5c | |||
| 4316199832 | |||
| f993e5ac88 | |||
| c5bd737f0c | |||
| fe45c2c986 | |||
| a9996bb482 | |||
| bdfcbfa18c | |||
| ea1b0dba18 | |||
| 6d85b2c989 | |||
| 44f79651a7 | |||
| 8682ac147b | |||
| 4cc605e80a | |||
| b0cce716f7 | |||
| 0dc93ac119 | |||
| bb848df10b | |||
| 2dc0b84aca | |||
| 168cddf5f1 | |||
| bc8760b3db | |||
| 4269b9a8fc | 
| @ -958,6 +958,11 @@ jobs: | ||||
|         no_output_timeout: "1h" | ||||
|         command: | | ||||
|             source "/pytorch/.circleci/scripts/binary_linux_build.sh" | ||||
|     - run: | ||||
|         name: Output binary sizes | ||||
|         no_output_timeout: "1m" | ||||
|         command: | | ||||
|             ls -lah /final_pkgs | ||||
|     - run: | ||||
|         name: save binary size | ||||
|         no_output_timeout: "5m" | ||||
| @ -972,6 +977,9 @@ jobs: | ||||
|         root: / | ||||
|         paths: final_pkgs | ||||
|  | ||||
|     - store_artifacts: | ||||
|         path: /final_pkgs | ||||
|  | ||||
|     # This should really just be another step of the binary_linux_build job above. | ||||
|     # This isn't possible right now b/c the build job uses the docker executor | ||||
|     # (otherwise they'd be really really slow) but this one uses the macine | ||||
|  | ||||
| @ -14,7 +14,7 @@ mkdir -p ${ZIP_DIR}/src | ||||
| cp -R ${ARTIFACTS_DIR}/arm64/include ${ZIP_DIR}/install/ | ||||
| # build a FAT bianry | ||||
| cd ${ZIP_DIR}/install/lib | ||||
| target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a) | ||||
| target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a) | ||||
| for lib in ${target_libs[*]} | ||||
| do | ||||
|     if [ -f "${ARTIFACTS_DIR}/x86_64/lib/${lib}" ] && [ -f "${ARTIFACTS_DIR}/arm64/lib/${lib}" ]; then | ||||
|  | ||||
| @ -20,6 +20,7 @@ PIP_UPLOAD_FOLDER=${PIP_UPLOAD_FOLDER:-nightly} | ||||
| CONDA_UPLOAD_CHANNEL=$(echo "${PIP_UPLOAD_FOLDER}" | sed 's:/*$::') | ||||
| BACKUP_BUCKET="s3://pytorch-backup" | ||||
|  | ||||
| retry pip install -q awscli | ||||
| # Upload the package to the final location | ||||
| pushd /home/circleci/project/final_pkgs | ||||
| if [[ "$PACKAGE_TYPE" == conda ]]; then | ||||
| @ -30,14 +31,12 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then | ||||
|   subdir=$(tar -xOf ./*.bz2 info/index.json | grep subdir  | cut -d ':' -f2 | sed -e 's/[[:space:]]//' -e 's/"//g' -e 's/,//') | ||||
|   BACKUP_DIR="conda/${subdir}" | ||||
| elif [[ "$PACKAGE_TYPE" == libtorch ]]; then | ||||
|   retry pip install -q awscli | ||||
|   s3_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/" | ||||
|   for pkg in $(ls); do | ||||
|     retry aws s3 cp "$pkg" "$s3_dir" --acl public-read | ||||
|   done | ||||
|   BACKUP_DIR="libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/" | ||||
| else | ||||
|   retry pip install -q awscli | ||||
|   s3_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/" | ||||
|   retry aws s3 cp "$(ls)" "$s3_dir" --acl public-read | ||||
|   BACKUP_DIR="whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/" | ||||
| @ -45,5 +44,5 @@ fi | ||||
|  | ||||
| if [[ -n "${CIRCLE_TAG:-}" ]]; then | ||||
|   s3_dir="${BACKUP_BUCKET}/${CIRCLE_TAG}/${BACKUP_DIR}" | ||||
|   retry aws s3 cp . "$s3_dir" | ||||
|   retry aws s3 cp --recursive . "$s3_dir" | ||||
| fi | ||||
|  | ||||
| @ -21,6 +21,7 @@ PIP_UPLOAD_FOLDER=${PIP_UPLOAD_FOLDER:-nightly} | ||||
| CONDA_UPLOAD_CHANNEL=$(echo "${PIP_UPLOAD_FOLDER}" | sed 's:/*$::') | ||||
| BACKUP_BUCKET="s3://pytorch-backup" | ||||
|  | ||||
| retry pip install -q awscli | ||||
| pushd "$workdir/final_pkgs" | ||||
| if [[ "$PACKAGE_TYPE" == conda ]]; then | ||||
|   retry conda install -yq anaconda-client | ||||
| @ -30,14 +31,12 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then | ||||
|   subdir=$(tar -xOf ./*.bz2 info/index.json | grep subdir  | cut -d ':' -f2 | sed -e 's/[[:space:]]//' -e 's/"//g' -e 's/,//') | ||||
|   BACKUP_DIR="conda/${subdir}" | ||||
| elif [[ "$PACKAGE_TYPE" == libtorch ]]; then | ||||
|   retry pip install -q awscli | ||||
|   s3_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/" | ||||
|   for pkg in $(ls); do | ||||
|     retry aws s3 cp "$pkg" "$s3_dir" --acl public-read | ||||
|   done | ||||
|   BACKUP_DIR="libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/" | ||||
| else | ||||
|   retry pip install -q awscli | ||||
|   s3_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/" | ||||
|   retry aws s3 cp "$(ls)" "$s3_dir" --acl public-read | ||||
|   BACKUP_DIR="whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/" | ||||
| @ -45,5 +44,5 @@ fi | ||||
|  | ||||
| if [[ -n "${CIRCLE_TAG:-}" ]]; then | ||||
|   s3_dir="${BACKUP_BUCKET}/${CIRCLE_TAG}/${BACKUP_DIR}" | ||||
|   retry aws s3 cp . "$s3_dir" | ||||
|   retry aws s3 cp --recursive . "$s3_dir" | ||||
| fi | ||||
|  | ||||
| @ -19,6 +19,7 @@ PIP_UPLOAD_FOLDER=${PIP_UPLOAD_FOLDER:-nightly/} | ||||
| CONDA_UPLOAD_CHANNEL=$(echo "${PIP_UPLOAD_FOLDER}" | sed 's:/*$::') | ||||
| BACKUP_BUCKET="s3://pytorch-backup" | ||||
|  | ||||
| retry pip install -q awscli | ||||
| pushd /root/workspace/final_pkgs | ||||
| # Upload the package to the final location | ||||
| if [[ "$PACKAGE_TYPE" == conda ]]; then | ||||
| @ -29,14 +30,12 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then | ||||
|   subdir=$(tar -xOf ./*.bz2 info/index.json | grep subdir  | cut -d ':' -f2 | sed -e 's/[[:space:]]//' -e 's/"//g' -e 's/,//') | ||||
|   BACKUP_DIR="conda/${subdir}" | ||||
| elif [[ "$PACKAGE_TYPE" == libtorch ]]; then | ||||
|   retry conda install -c conda-forge -yq awscli | ||||
|   s3_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/" | ||||
|   for pkg in $(ls); do | ||||
|     retry aws s3 cp "$pkg" "$s3_dir" --acl public-read | ||||
|   done | ||||
|   BACKUP_DIR="libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/" | ||||
| else | ||||
|   retry conda install -c conda-forge -yq awscli | ||||
|   s3_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/" | ||||
|   retry aws s3 cp "$(ls)" "$s3_dir" --acl public-read | ||||
|   BACKUP_DIR="whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/" | ||||
| @ -44,5 +43,5 @@ fi | ||||
|  | ||||
| if [[ -n "${CIRCLE_TAG:-}" ]]; then | ||||
|   s3_dir="${BACKUP_BUCKET}/${CIRCLE_TAG}/${BACKUP_DIR}" | ||||
|   retry aws s3 cp . "$s3_dir" | ||||
|   retry aws s3 cp --recursive . "$s3_dir" | ||||
| fi | ||||
|  | ||||
| @ -41,6 +41,11 @@ | ||||
|         no_output_timeout: "1h" | ||||
|         command: | | ||||
|             source "/pytorch/.circleci/scripts/binary_linux_build.sh" | ||||
|     - run: | ||||
|         name: Output binary sizes | ||||
|         no_output_timeout: "1m" | ||||
|         command: | | ||||
|             ls -lah /final_pkgs | ||||
|     - run: | ||||
|         name: save binary size | ||||
|         no_output_timeout: "5m" | ||||
| @ -55,6 +60,9 @@ | ||||
|         root: / | ||||
|         paths: final_pkgs | ||||
|  | ||||
|     - store_artifacts: | ||||
|         path: /final_pkgs | ||||
|  | ||||
|     # This should really just be another step of the binary_linux_build job above. | ||||
|     # This isn't possible right now b/c the build job uses the docker executor | ||||
|     # (otherwise they'd be really really slow) but this one uses the macine | ||||
|  | ||||
| @ -181,7 +181,7 @@ fi | ||||
|  | ||||
| # Patch required to build xla | ||||
| if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then | ||||
|   git clone --recursive https://github.com/pytorch/xla.git | ||||
|   git clone --recursive -b r1.6 https://github.com/pytorch/xla.git | ||||
|   ./xla/scripts/apply_patches.sh | ||||
| fi | ||||
|  | ||||
|  | ||||
| @ -185,9 +185,9 @@ function get_exit_code() { | ||||
| function file_diff_from_base() { | ||||
|   # The fetch may fail on Docker hosts, but it's not always necessary. | ||||
|   set +e | ||||
|   git fetch origin master --quiet | ||||
|   git fetch origin release/1.6 --quiet | ||||
|   set -e | ||||
|   git diff --name-only "$(git merge-base origin/master HEAD)" > "$1" | ||||
|   git diff --name-only "$(git merge-base origin/release/1.6 HEAD)" > "$1" | ||||
| } | ||||
|  | ||||
| function get_bazel() { | ||||
|  | ||||
| @ -289,7 +289,7 @@ test_backward_compatibility() { | ||||
|   pushd test/backward_compatibility | ||||
|   python dump_all_function_schemas.py --filename new_schemas.txt | ||||
|   pip_uninstall torch | ||||
|   pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html | ||||
|   pip_install --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_test.html | ||||
|   python check_backward_compatibility.py --new-schemas new_schemas.txt | ||||
|   popd | ||||
|   set +x | ||||
| @ -341,8 +341,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 ]]; t | ||||
| elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then | ||||
|   test_bazel | ||||
| elif [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4* ]]; then | ||||
|   # test cpp extension for xenial + cuda 9.2 + gcc 5.4 to make sure  | ||||
|   # cpp extension can be built correctly under this old env  | ||||
|   # test cpp extension for xenial + cuda 9.2 + gcc 5.4 to make sure | ||||
|   # cpp extension can be built correctly under this old env | ||||
|   test_cpp_extensions | ||||
| else | ||||
|   test_torchvision | ||||
|  | ||||
| @ -1350,7 +1350,6 @@ filegroup( | ||||
|         "caffe2/utils/smart_tensor_printer.cc", | ||||
|         "caffe2/utils/string_utils.cc", | ||||
|         "caffe2/utils/threadpool/ThreadPool.cc", | ||||
|         "caffe2/utils/threadpool/ThreadPoolMobile.cc", | ||||
|         "caffe2/utils/threadpool/pthreadpool.cc", | ||||
|         "caffe2/utils/threadpool/pthreadpool_impl.cc", | ||||
|     ], | ||||
|  | ||||
| @ -481,7 +481,7 @@ if(USE_PYTORCH_QNNPACK) | ||||
| endif() | ||||
|  | ||||
| if(USE_XNNPACK) | ||||
|   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_XNNPACK -DUSE_INTERNAL_THREADPOOL_IMPL") | ||||
|   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_XNNPACK") | ||||
| endif() | ||||
|  | ||||
| if(USE_VULKAN) | ||||
|  | ||||
| @ -99,6 +99,7 @@ if(ANDROID_ABI) | ||||
|   import_static_lib(libnnpack) | ||||
|   import_static_lib(libXNNPACK) | ||||
|   import_static_lib(libpytorch_qnnpack) | ||||
|   import_static_lib(libpthreadpool) | ||||
|   import_static_lib(libeigen_blas) | ||||
|   import_static_lib(libcpuinfo) | ||||
|   import_static_lib(libclog) | ||||
| @ -115,6 +116,7 @@ if(ANDROID_ABI) | ||||
|       libnnpack | ||||
|       libXNNPACK | ||||
|       libpytorch_qnnpack | ||||
|       libpthreadpool | ||||
|       libeigen_blas | ||||
|       libcpuinfo | ||||
|       libclog | ||||
| @ -129,6 +131,7 @@ else() | ||||
|       nnpack | ||||
|       XNNPACK | ||||
|       pytorch_qnnpack | ||||
|       pthreadpool | ||||
|       cpuinfo | ||||
|       clog | ||||
|   ) | ||||
|  | ||||
| @ -8,8 +8,10 @@ | ||||
|  | ||||
| #include "pytorch_jni_common.h" | ||||
| #if defined(__ANDROID__) | ||||
| #include <caffe2/utils/threadpool/ThreadPool.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #ifndef USE_PTHREADPOOL | ||||
| #define USE_PTHREADPOOL | ||||
| #endif /* USE_PTHREADPOOL */ | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
| #endif | ||||
|  | ||||
| namespace pytorch_jni { | ||||
| @ -605,7 +607,7 @@ class PyTorchAndroidJni : public facebook::jni::JavaClass<PyTorchAndroidJni> { | ||||
|   } | ||||
|  | ||||
|   static void setNumThreads(facebook::jni::alias_ref<jclass>, jint numThreads) { | ||||
|     caffe2::mobile_threadpool()->setNumThreads(numThreads); | ||||
|     caffe2::pthreadpool()->set_thread_count(numThreads); | ||||
|   } | ||||
| }; | ||||
| #endif | ||||
|  | ||||
| @ -6,8 +6,7 @@ | ||||
| #ifndef C10_MOBILE | ||||
| #include <c10/core/thread_pool.h> | ||||
| #else | ||||
| #include <caffe2/utils/threadpool/ThreadPool.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
| #endif // C10_MOBILE | ||||
|  | ||||
| #include <atomic> | ||||
| @ -88,15 +87,15 @@ void _run_with_pool(const std::function<void(int, size_t)>& fn, size_t range) { | ||||
|   // Run the first task on the current thread directly. | ||||
|   fn(0, 0); | ||||
| #else | ||||
|   caffe2::ThreadPool* pool = caffe2::mobile_threadpool(); | ||||
|   if (pool) { | ||||
|     // caffe2::ThreadPool can utilize the current thread. | ||||
|     pool->run(fn, range); | ||||
|   } else { | ||||
|     for (size_t i = 0; i < range; ++i) { | ||||
|       fn(0, i); | ||||
|     } | ||||
|   } | ||||
|   caffe2::PThreadPool* const pool = caffe2::pthreadpool(); | ||||
|   TORCH_INTERNAL_ASSERT(pool, "Invalid thread pool!"); | ||||
|  | ||||
|   pool->run( | ||||
|     // PThreadPool::run() is blocking.  A std::function [const] reference to | ||||
|     // this lambda cannot go out of scope before PThreadPool::run() returns. | ||||
|     [&fn](const size_t task_id) { | ||||
|       fn(0 /* unused */, task_id); | ||||
|     }, range); | ||||
| #endif // C10_MOBILE | ||||
| } | ||||
|  | ||||
| @ -184,7 +183,7 @@ void init_num_threads() { | ||||
| #endif | ||||
|  | ||||
| #ifdef C10_MOBILE | ||||
|   caffe2::mobile_threadpool(); | ||||
|   caffe2::pthreadpool(); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| @ -208,7 +207,9 @@ void set_num_threads(int nthreads) { | ||||
|     } | ||||
|   } | ||||
| #else | ||||
|   TORCH_CHECK(false, "set_num_threads is not supported for mobile."); | ||||
|   caffe2::PThreadPool* const pool = caffe2::pthreadpool(); | ||||
|   TORCH_INTERNAL_ASSERT(pool, "Invalid thread pool!"); | ||||
|   pool->set_thread_count(nthreads); | ||||
| #endif // C10_MOBILE | ||||
| } | ||||
|  | ||||
| @ -226,9 +227,9 @@ int get_num_threads() { | ||||
|     return _get_intraop_pool().size() + 1; | ||||
|   } | ||||
| #else | ||||
|   caffe2::ThreadPool* pool = caffe2::mobile_threadpool(); | ||||
|   // caffe2::ThreadPool::getNumThreads() counts the current thread. | ||||
|   return !pool || in_parallel_region() ? 1 /* current thread */ : pool->getNumThreads(); | ||||
|   caffe2::PThreadPool* const pool = caffe2::pthreadpool(); | ||||
|   TORCH_INTERNAL_ASSERT(pool, "Invalid thread pool!") | ||||
|   return in_parallel_region() ? 1 /* current thread */ : pool->get_thread_count(); | ||||
| #endif // C10_MOBILE | ||||
| } | ||||
|  | ||||
| @ -257,8 +258,8 @@ void intraop_launch(std::function<void()> func) { | ||||
|     func(); | ||||
|   } | ||||
| #else | ||||
|   // TODO: caffe2::ThreadPool doesn't support submitting tasks separately and | ||||
|   // running in parallel. Should fix it when this API becomes popular. | ||||
|   // TODO: caffe2::PThreadPool only provides a data-parallel API. | ||||
|   // Task parallelism is not currently supported. | ||||
|   func(); | ||||
| #endif // C10_MOBILE | ||||
| } | ||||
| @ -280,8 +281,8 @@ std::shared_ptr<c10::ivalue::Future> intraop_launch_future( | ||||
|   } | ||||
|   return future; | ||||
| #else | ||||
|   // TODO: caffe2::ThreadPool doesn't support submitting tasks separately and | ||||
|   // running in parallel. Should fix it when this API becomes popular. | ||||
|   // TODO: caffe2::PThreadPool only provides a data-parallel API. | ||||
|   // Task parallelism is not currently supported. | ||||
|   auto future = std::make_shared<c10::ivalue::Future>(NoneType::get()); | ||||
|   func(); | ||||
|   future->markCompleted(); | ||||
|  | ||||
| @ -135,6 +135,7 @@ UPTOb( bool  , equal    , (const Tensor &A, const Tensor &B) ) | ||||
| UPTOb( Tensor, cat      , (TensorList    A, int64_t       B) ) | ||||
| UPTOb( Tensor, cat      , (TensorList    A, Dimname       B) ) | ||||
| UPTOb( Tensor, _cat     , (TensorList    A, int64_t       B) ) | ||||
| UPTOd( Tensor, index_put, (const Tensor &A, TensorList    B, const Tensor &         C, bool          D) ) | ||||
| UPTOb( Tensor, stack    , (TensorList    A, int64_t       B) ) | ||||
|  | ||||
| #undef UPTOa | ||||
|  | ||||
| @ -482,15 +482,16 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { | ||||
|   KERNEL(ADD_NS(addcdiv), "addcdiv", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote) | ||||
|   KERNEL(ADD_NS(addcmul), "addcmul", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote) | ||||
|   KERNEL(ADD_NS(atan2), "atan2", Tensor (const Tensor &, const Tensor &), promote) | ||||
|   KERNEL(ADD_NS(cross), "cross", Tensor (const Tensor &, const Tensor &, c10::optional<int64_t>), promote) | ||||
|   KERNEL_UNBOXED_ONLY(ADD_NS(bilinear), "bilinear", Tensor (const Tensor &, const Tensor &, const Tensor &, const Tensor &), promote) | ||||
|   KERNEL_UNBOXED_ONLY(ADD_NS(tensordot), "tensordot", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef), promote) | ||||
|   KERNEL_UNBOXED_ONLY(ADD_NS(dot), "dot", Tensor (const Tensor &, const Tensor &), promote) | ||||
|   KERNEL(ADD_NS(equal), "equal", bool (const Tensor &, const Tensor &), promote) | ||||
|   KERNEL(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote) | ||||
|   KERNEL_UNBOXED_ONLY(ADD_NS(cat), "cat.names", Tensor (TensorList, Dimname), promote) | ||||
|   KERNEL(ADD_NS(_cat), "_cat", Tensor (TensorList, int64_t), promote) | ||||
|   KERNEL(ADD_NS(cross), "cross", Tensor (const Tensor &, const Tensor &, c10::optional<int64_t>), promote) | ||||
|   KERNEL_UNBOXED_ONLY(ADD_NS(dot), "dot", Tensor (const Tensor &, const Tensor &), promote) | ||||
|   KERNEL(ADD_NS(equal), "equal", bool (const Tensor &, const Tensor &), promote) | ||||
|   KERNEL_UNBOXED_ONLY(ADD_NS(index_put), "index_put", Tensor (const Tensor &, TensorList, const Tensor &, bool), promote) | ||||
|   KERNEL(ADD_NS(stack), "stack", Tensor (TensorList, int64_t), promote) | ||||
|   KERNEL_UNBOXED_ONLY(ADD_NS(tensordot), "tensordot", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef), promote) | ||||
|  | ||||
|   m.impl_UNBOXED("binary_cross_entropy", &at::autocast::binary_cross_entropy_banned); | ||||
| } | ||||
|  | ||||
| @ -188,6 +188,7 @@ namespace c10 { | ||||
|   _(prim, unchecked_unwrap_optional) \ | ||||
|   _(aten, __contains__)              \ | ||||
|   _(prim, BailoutTemplate)           \ | ||||
|   _(prim, grad)                      \ | ||||
|   _(aten, zero_)                     \ | ||||
|   _(aten, fill_)                     \ | ||||
|   FORALL_ATEN_BASE_SYMBOLS(_)        \ | ||||
|  | ||||
| @ -1481,7 +1481,7 @@ inline TypePtr TensorType::fromBoolType() { | ||||
|  | ||||
| inline c10::optional<c10::ScalarType> tryScalarTypeFromJitType(const c10::TypePtr & type) { | ||||
|   if (type == FloatType::get()) { | ||||
|     return at::ScalarType::Double; | ||||
|     return at::typeMetaToScalarType(c10::get_default_dtype()); | ||||
|   } else if (type == IntType::get()) { | ||||
|     return at::ScalarType::Long; | ||||
|   } else if (type == BoolType::get()) { | ||||
|  | ||||
| @ -181,6 +181,10 @@ Allocator* CUDAHooks::getPinnedMemoryAllocator() const { | ||||
|   return at::cuda::getPinnedMemoryAllocator(); | ||||
| } | ||||
|  | ||||
| Allocator* CUDAHooks::getCUDADeviceAllocator() const { | ||||
|   return at::cuda::getCUDADeviceAllocator(); | ||||
| } | ||||
|  | ||||
| bool CUDAHooks::compiledWithCuDNN() const { | ||||
|   return AT_CUDNN_ENABLED(); | ||||
| } | ||||
|  | ||||
| @ -22,6 +22,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { | ||||
|   int64_t current_device() const override; | ||||
|   bool hasPrimaryContext(int64_t device_index) const override; | ||||
|   c10::optional<int64_t> getDevceIndexWithPrimaryContext() const override; | ||||
|   Allocator* getCUDADeviceAllocator() const override; | ||||
|   Allocator* getPinnedMemoryAllocator() const override; | ||||
|   bool compiledWithCuDNN() const override; | ||||
|   bool compiledWithMIOpen() const override; | ||||
|  | ||||
| @ -121,6 +121,10 @@ struct CAFFE2_API CUDAHooksInterface { | ||||
|     TORCH_CHECK(false, "Pinned memory requires CUDA. ", CUDA_HELP); | ||||
|   } | ||||
|  | ||||
|   virtual Allocator* getCUDADeviceAllocator() const { | ||||
|     TORCH_CHECK(false, "CUDADeviceAllocator requires CUDA. ", CUDA_HELP); | ||||
|   } | ||||
|  | ||||
|   virtual bool compiledWithCuDNN() const { | ||||
|     return false; | ||||
|   } | ||||
|  | ||||
| @ -262,9 +262,7 @@ auto ConvParams::use_xnnpack( | ||||
|     const at::Tensor& input, | ||||
|     const at::Tensor& weight, | ||||
|     const at::Tensor& bias) const -> bool { | ||||
| // Disable the xnnpack operators for both iOS and macOS temporarily due to the crash in pthreadpool | ||||
| // TODO:T66297472 remove `!defined(__APPLE__)` once we figure out the root cause of the crash. | ||||
| #if defined(C10_MOBILE) && !defined(__APPLE__) | ||||
| #if defined(C10_MOBILE) | ||||
|   if (!transposed) { | ||||
|     return (input.size(1) == groups) && | ||||
|             xnnpack::use_convolution2d( | ||||
|  | ||||
| @ -17,9 +17,7 @@ Tensor linear(const Tensor& input, const Tensor& weight, const Tensor& bias) { | ||||
|   if (input.is_mkldnn()) { | ||||
|     return at::mkldnn_linear(input, weight, bias); | ||||
|   } | ||||
| // Disable the xnnpack operators for both iOS and macOS temporarily due to the crash in pthreadpool | ||||
| // TODO:T66297472 remove `!defined(__APPLE__)` once we figure out the root cause of the crash. | ||||
| #if defined(C10_MOBILE) && !defined(__APPLE__) | ||||
| #if defined(C10_MOBILE) | ||||
|   if (xnnpack::use_linear(input, weight, bias)) { | ||||
|     return xnnpack::linear(input, weight, bias); | ||||
|   } | ||||
|  | ||||
| @ -58,8 +58,9 @@ bool _nnpack_available() { | ||||
|  | ||||
| #include <nnpack.h> | ||||
|  | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
| #include <ATen/native/ConvUtils.h> | ||||
| #include <ATen/Parallel.h> | ||||
|  | ||||
| namespace at { | ||||
| namespace native { | ||||
| @ -87,15 +88,9 @@ static bool init_nnpack() { | ||||
| } | ||||
|  | ||||
| static pthreadpool_t nnpack_threadpool() { | ||||
|   // Try initializing a threadpool for NNPACK's use.  If we fail to | ||||
|   // successfully initialize an implementation, return nullptr which will | ||||
|   // instruct NNPACK to run single threaded. | ||||
|  | ||||
| #ifdef C10_MOBILE | ||||
|   // If building for mobile, use Caffe 2's mobile-friendly threadpool. | ||||
|   return caffe2::mobile_pthreadpool(); | ||||
|   return caffe2::pthreadpool_(); | ||||
| #else | ||||
|   // Otherwise, try using pthreadpool if we manage to initialize it successfully. | ||||
|   static pthreadpool_t nnpack_threadpool_ = nullptr; | ||||
|   static bool called_nnpack_threadpool_ = false; | ||||
|  | ||||
|  | ||||
| @ -135,9 +135,7 @@ Tensor max_pool2d( | ||||
|         self, kernel_size, stride, padding, dilation, ceil_mode); | ||||
|   } | ||||
|  | ||||
| // Disable the xnnpack operators for both iOS and macOS temporarily due to the crash in pthreadpool | ||||
| // TODO:T66297472 remove `!defined(__APPLE__)` once we figure out the root cause of the crash. | ||||
| #if defined(C10_MOBILE) && !defined(__APPLE__) | ||||
| #if defined(C10_MOBILE) | ||||
|   if(xnnpack::use_max_pool2d(self, kernel_size, padding, stride, | ||||
|                              dilation, ceil_mode)) { | ||||
|     return xnnpack::max_pool2d( | ||||
|  | ||||
| @ -355,13 +355,12 @@ TensorOptions infer_full_options( | ||||
|  | ||||
|   if (!options.has_dtype()) { | ||||
|     if (fill_value.isIntegral(true)) { | ||||
|       TORCH_WARN_ONCE( | ||||
|         "Deprecation warning: In a future PyTorch release torch.full ", | ||||
|         "will no longer return tensors of floating dtype by default. ", | ||||
|         "Instead, a bool fill_value will return a tensor of torch.bool dtype, ", | ||||
|         "and an integral fill_value will return a tensor of torch.long dtype. ", | ||||
|         "Set the optional `dtype` or `out` arguments to suppress this warning." | ||||
|       ); | ||||
|       TORCH_CHECK(false, | ||||
|         "Providing a bool or integral fill value without setting the optional ", | ||||
|         "`dtype` or `out` arguments is currently unsupported. In PyTorch 1.7, ", | ||||
|         "when `dtype` and `out` are not set a bool fill value will ", | ||||
|         "return a tensor of torch.bool dtype, and an integral fill value ", | ||||
|         "will return a tensor of torch.long dtype."); | ||||
|     } else if (fill_value.isComplex()) { | ||||
|       auto scalar_type = (get_default_dtype() == ScalarType::Double) ? | ||||
|                             ScalarType::ComplexDouble : | ||||
|  | ||||
| @ -706,8 +706,9 @@ TensorIterator TensorIterator::unary_op(Tensor& out, const Tensor& a, | ||||
|     .set_check_mem_overlap(check_mem_overlap) | ||||
|     .add_output(out) | ||||
|     .add_input(a) | ||||
|     .cast_common_dtype_to_outputs(true) | ||||
|     .enforce_safe_casting_to_output(true) | ||||
|     .cast_common_dtype_to_outputs(false) | ||||
|     .enforce_safe_casting_to_output(false) | ||||
|     .check_all_same_dtype(true) | ||||
|     .build(); | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -762,7 +762,12 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { | ||||
|  | ||||
|   Tensor xtensor = self.expand(padded_size); | ||||
|  | ||||
|   Tensor result = at::empty(target_size, self.options()); | ||||
|   Tensor result; | ||||
|   if (self.is_quantized()) { | ||||
|     result = at::empty_quantized(target_size, self); | ||||
|   } else { | ||||
|     result = at::empty(target_size, self.options()); | ||||
|   } | ||||
|  | ||||
|   // return an empty tensor if one of the repeat dimensions is zero | ||||
|   if (zero_tensor) { | ||||
|  | ||||
| @ -67,7 +67,7 @@ static inline Tensor& unary_op_impl_with_complex_to_float_out(Tensor& result, co | ||||
|  | ||||
|       // Copies the complex result to the actual result and returns it | ||||
|       result.resize_(complex_result.sizes()); | ||||
|       result.copy_(complex_result); | ||||
|       result.copy_(at::real(complex_result)); | ||||
|       return result; | ||||
|     } | ||||
|  | ||||
|  | ||||
| @ -1127,6 +1127,12 @@ | ||||
|   variants: method | ||||
|   device_guard: False | ||||
|  | ||||
| - func: empty_quantized(int[] size, Tensor qtensor) -> Tensor | ||||
|   variants: function | ||||
|   dispatch: | ||||
|     QuantizedCPU: empty_quantized | ||||
|     QuantizedCUDA: empty_quantized | ||||
|  | ||||
| - func: empty.out(int[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_guard: False | ||||
|  | ||||
| @ -5108,6 +5114,8 @@ | ||||
|   dispatch: | ||||
|     CPU: unfold | ||||
|     CUDA: unfold | ||||
|     QuantizedCPU: unfold | ||||
|     QuantizedCUDA: unfold | ||||
|  | ||||
| - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor | ||||
|   variants: function | ||||
|  | ||||
| @ -76,5 +76,28 @@ Tensor empty_per_channel_affine_quantized_other_backends_stub( | ||||
|   TORCH_CHECK(false, "Creation of quantized tensor requires quantized dtype like torch.quint8"); | ||||
| } | ||||
|  | ||||
| // Create an empty quantized Tensor with size, based on the options | ||||
| // and quantization parameters of the input quantized Tensor | ||||
| Tensor empty_quantized(IntArrayRef size, const Tensor& qtensor) { | ||||
|   Tensor output; | ||||
|   if (qtensor.qscheme() == kPerTensorAffine) { | ||||
|     output = at::_empty_affine_quantized(size, qtensor.options(), | ||||
|                                          qtensor.q_scale(), | ||||
|                                          qtensor.q_zero_point()); | ||||
|   } else if (qtensor.qscheme() == kPerChannelAffine) { | ||||
|     output = at::_empty_per_channel_affine_quantized( | ||||
|         size, | ||||
|         qtensor.q_per_channel_scales(), | ||||
|         qtensor.q_per_channel_zero_points(), | ||||
|         qtensor.q_per_channel_axis(), | ||||
|         qtensor.options()); | ||||
|   } else { | ||||
|     TORCH_CHECK(false, | ||||
|                 "QScheme not supported by empty_quantized:", | ||||
|                 toString(qtensor.qscheme())); | ||||
|   } | ||||
|   return output; | ||||
| } | ||||
|  | ||||
| } // namespace native | ||||
| } // namespace at | ||||
|  | ||||
| @ -5,7 +5,7 @@ | ||||
| #include <ATen/native/quantized/cpu/init_qnnpack.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <ATen/native/quantized/cpu/quantized_ops.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
| #include <c10/util/math_compat.h> | ||||
|  | ||||
| #include <algorithm> | ||||
| @ -375,7 +375,7 @@ Tensor qnnpack_avg_pool2d( | ||||
|   CAFFE_ENFORCE( | ||||
|       setupStatus == pytorch_qnnp_status_success, | ||||
|       "failed to setup QNNPACK Average Pooling operator"); | ||||
|   pthreadpool_t threadpool = caffe2::mobile_pthreadpool(); | ||||
|   pthreadpool_t threadpool = caffe2::pthreadpool_(); | ||||
|   const pytorch_qnnp_status runStatus = | ||||
|       pytorch_qnnp_run_operator(qnnpack_operator, threadpool); | ||||
|   TORCH_INTERNAL_ASSERT( | ||||
|  | ||||
| @ -5,7 +5,6 @@ | ||||
| #include <ATen/native/quantized/cpu/init_qnnpack.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <ATen/native/quantized/cpu/quantized_ops.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <c10/util/math_compat.h> | ||||
|  | ||||
| #include <algorithm> | ||||
|  | ||||
| @ -7,7 +7,7 @@ | ||||
| #include <ATen/native/quantized/cpu/quantized_ops.h> | ||||
| #include <ATen/native/quantized/cpu/init_qnnpack.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
|  | ||||
| #include <algorithm> | ||||
|  | ||||
| @ -194,7 +194,7 @@ Tensor qnnpack_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) { | ||||
|       setupStatus == pytorch_qnnp_status_success, | ||||
|       "failed to setup QNNPACK Add operator"); | ||||
|  | ||||
|   pthreadpool_t threadpool = caffe2::mobile_pthreadpool(); | ||||
|   pthreadpool_t threadpool = caffe2::pthreadpool_(); | ||||
|   const pytorch_qnnp_status runStatus = | ||||
|       pytorch_qnnp_run_operator(qnnpack_operator, threadpool); | ||||
|  | ||||
|  | ||||
| @ -8,7 +8,7 @@ | ||||
| #include <ATen/native/quantized/cpu/init_qnnpack.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <c10/core/TensorOptions.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
|  | ||||
| #include <algorithm> | ||||
|  | ||||
| @ -82,7 +82,7 @@ Tensor quantized_channel_shuffle_impl( | ||||
|       setupStatus == pytorch_qnnp_status_success, | ||||
|       "failed to setup QNNPACK ChannelShuffle operator"); | ||||
|  | ||||
|   pthreadpool_t threadpool = caffe2::mobile_pthreadpool(); | ||||
|   pthreadpool_t threadpool = caffe2::pthreadpool_(); | ||||
|   const pytorch_qnnp_status runStatus = | ||||
|       pytorch_qnnp_run_operator(qnnpack_operator, threadpool); | ||||
|   TORCH_INTERNAL_ASSERT( | ||||
|  | ||||
| @ -7,7 +7,7 @@ | ||||
| #include <ATen/native/quantized/cpu/init_qnnpack.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <ATen/quantized/Quantizer.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
|  | ||||
| #include <algorithm> | ||||
|  | ||||
| @ -64,7 +64,7 @@ Tensor qnnpack_clamp(Tensor input, Scalar min, Scalar max) { | ||||
|   TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success, | ||||
|                         "failed to setup QNNPACK Clamp operator"); | ||||
|  | ||||
|   pthreadpool_t threadpool = caffe2::mobile_pthreadpool(); | ||||
|   pthreadpool_t threadpool = caffe2::pthreadpool_(); | ||||
|  | ||||
|   const pytorch_qnnp_status runStatus = | ||||
|     pytorch_qnnp_run_operator(clamp_op, threadpool); | ||||
|  | ||||
| @ -10,7 +10,7 @@ | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <ATen/native/quantized/cpu/quant_utils.h> | ||||
| #include <ATen/native/quantized/cpu/conv_packed_params.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
|  | ||||
| template <int kSpatialDim = 2> | ||||
| bool ConvDimChecks( | ||||
| @ -603,7 +603,7 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl( | ||||
|       output_min, | ||||
|       output_max, | ||||
|       reinterpret_cast<uint8_t*>(output.template data_ptr<c10::quint8>()), | ||||
|       caffe2::mobile_pthreadpool()); | ||||
|       caffe2::pthreadpool_()); | ||||
|  | ||||
|   TORCH_INTERNAL_ASSERT( | ||||
|       run_status == pytorch_qnnp_status_success, | ||||
|  | ||||
| @ -5,7 +5,7 @@ | ||||
| #include <ATen/native/quantized/cpu/quantized_ops.h> | ||||
| #include <ATen/native/quantized/cpu/init_qnnpack.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
|  | ||||
| #include <algorithm> | ||||
|  | ||||
| @ -57,7 +57,7 @@ Tensor qnnpack_hardsigmoid(Tensor input) { | ||||
|   TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success, | ||||
|                         "failed to setup QNNPACK Hardsigmoid operator"); | ||||
|  | ||||
|   pthreadpool_t threadpool = caffe2::mobile_pthreadpool(); | ||||
|   pthreadpool_t threadpool = caffe2::pthreadpool_(); | ||||
|  | ||||
|   const pytorch_qnnp_status runStatus = | ||||
|     pytorch_qnnp_run_operator(hardsigmoid_op, threadpool); | ||||
|  | ||||
| @ -5,7 +5,7 @@ | ||||
| #include <ATen/native/quantized/cpu/quantized_ops.h> | ||||
| #include <ATen/native/quantized/cpu/init_qnnpack.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
|  | ||||
| #include <algorithm> | ||||
|  | ||||
| @ -51,7 +51,7 @@ Tensor qnnpack_hardswish(const Tensor& qx, Tensor& qy) { | ||||
|   TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success, | ||||
|                         "failed to setup QNNPACK Hardswish operator"); | ||||
|  | ||||
|   pthreadpool_t threadpool = caffe2::mobile_pthreadpool(); | ||||
|   pthreadpool_t threadpool = caffe2::pthreadpool_(); | ||||
|  | ||||
|   const pytorch_qnnp_status runStatus = | ||||
|     pytorch_qnnp_run_operator(hardswish_op, threadpool); | ||||
|  | ||||
| @ -4,7 +4,7 @@ | ||||
| #include <ATen/native/quantized/cpu/fbgemm_utils.h> | ||||
| #include <ATen/native/quantized/cpu/packed_params.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
| #include <torch/custom_class.h> | ||||
| #include <torch/library.h> | ||||
|  | ||||
| @ -341,7 +341,9 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl( | ||||
|       packB->getPackedWeights(), | ||||
|       (uint8_t*)output.data_ptr<c10::quint8>(), | ||||
|       rows_w /* output_stride */, | ||||
|       caffe2::mobile_pthreadpool() /* threadpool */); | ||||
|       // TODO (Ashkan): Disabling temporarily. | ||||
|       // Throws a floating point exception with OSS pthreadpool. | ||||
|       caffe2::pthreadpool_() /* threadpool */); | ||||
|  | ||||
|   TORCH_INTERNAL_ASSERT( | ||||
|       runStatus == pytorch_qnnp_status_success, | ||||
|  | ||||
| @ -5,7 +5,7 @@ | ||||
| #include <ATen/native/quantized/cpu/packed_params.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <ATen/native/quantized/cpu/quant_utils.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
| #include <torch/library.h> | ||||
|  | ||||
| #include <torch/custom_class.h> | ||||
| @ -241,8 +241,17 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) { | ||||
|  | ||||
|   // Calculate statistics for quantization of input Tensor | ||||
|   // TODO: optimized kernel | ||||
|   float x_min = input_contig.min().item<float>(); | ||||
|   float x_max = input_contig.max().item<float>(); | ||||
|   float x_min; | ||||
|   float x_max; | ||||
|   if (input.numel() > 0) { | ||||
|     x_min = input_contig.min().item<float>(); | ||||
|     x_max = input_contig.max().item<float>(); | ||||
|   } else { | ||||
|     // On empty input, no output data will be generated, | ||||
|     // so use arbitrary qparams. | ||||
|     x_min = 0; | ||||
|     x_max = 0; | ||||
|   } | ||||
|  | ||||
|   auto q_params = quant_utils::ChooseQuantizationParams( | ||||
|       /*min=*/x_min, | ||||
| @ -327,7 +336,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) { | ||||
|       bias_ptr, | ||||
|       output.data_ptr<float>(), | ||||
|       rows_w /* output_stride */, | ||||
|       caffe2::mobile_pthreadpool() /* threadpool */); | ||||
|       caffe2::pthreadpool_() /* threadpool */); | ||||
|  | ||||
|   TORCH_INTERNAL_ASSERT( | ||||
|       runStatus == pytorch_qnnp_status_success, | ||||
|  | ||||
| @ -100,6 +100,12 @@ enum pytorch_qnnp_status qnnpackLinearDynamic( | ||||
|       .ukernel = pytorch_qnnp_params.q8conv.gemm_dq, | ||||
|   }; | ||||
|  | ||||
|   if (output_size == 0) { | ||||
|       // pthreadpool can tolerate a range of 0, but not a tile of 0. | ||||
|       // We use output_size as a tile size, so bail here if it's 0. | ||||
|       return pytorch_qnnp_status_success; | ||||
|   } | ||||
|  | ||||
|   pthreadpool_compute_4d_tiled( | ||||
|       threadpool, | ||||
|       (pthreadpool_function_4d_tiled_t)compute_q8gemm_dq, | ||||
|  | ||||
| @ -98,6 +98,12 @@ enum pytorch_qnnp_status qnnpackLinear( | ||||
|       .ukernel = pytorch_qnnp_params.q8conv.gemm, | ||||
|   }; | ||||
|  | ||||
|   if (output_size == 0) { | ||||
|       // pthreadpool can tolerate a range of 0, but not a tile of 0. | ||||
|       // We use output_size as a tile size, so bail here if it's 0. | ||||
|       return pytorch_qnnp_status_success; | ||||
|   } | ||||
|  | ||||
|   pthreadpool_compute_4d_tiled( | ||||
|       threadpool, | ||||
|       (pthreadpool_function_4d_tiled_t) compute_q8gemm, | ||||
|  | ||||
| @ -9,7 +9,7 @@ | ||||
| #include <ATen/native/quantized/cpu/quantized_ops.h> | ||||
| #include <ATen/native/quantized/cpu/init_qnnpack.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
|  | ||||
| #include <algorithm> | ||||
| #include <vector> | ||||
| @ -346,7 +346,7 @@ void check_maxpool2d_params( | ||||
|        setupStatus == pytorch_qnnp_status_success, | ||||
|        "failed to setup QNNPACK MaxPool operator"); | ||||
|  | ||||
|    pthreadpool_t threadpool = caffe2::mobile_pthreadpool(); | ||||
|    pthreadpool_t threadpool = caffe2::pthreadpool_(); | ||||
|    const pytorch_qnnp_status runStatus = | ||||
|        pytorch_qnnp_run_operator(qnnpack_operator, threadpool); | ||||
|    TORCH_INTERNAL_ASSERT( | ||||
|  | ||||
| @ -3,7 +3,7 @@ | ||||
| #include <ATen/NativeFunctions.h> | ||||
| #include <ATen/native/quantized/cpu/init_qnnpack.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
|  | ||||
| namespace at { | ||||
| namespace native { | ||||
| @ -66,7 +66,7 @@ Tensor qnnpack_mean(const Tensor& input, IntArrayRef dim) { | ||||
|   CAFFE_ENFORCE( | ||||
|       setupStatus == pytorch_qnnp_status_success, | ||||
|       "failed to setup QNNPACK Global Average Pooling operator"); | ||||
|   pthreadpool_t threadpool = caffe2::mobile_pthreadpool(); | ||||
|   pthreadpool_t threadpool = caffe2::pthreadpool_(); | ||||
|   const pytorch_qnnp_status runStatus = | ||||
|       pytorch_qnnp_run_operator(qnnpack_operator, threadpool); | ||||
|   TORCH_INTERNAL_ASSERT( | ||||
|  | ||||
| @ -6,7 +6,7 @@ | ||||
| #include <ATen/native/quantized/cpu/init_qnnpack.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <ATen/native/quantized/cpu/quantized_ops.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
| #include <torch/library.h> | ||||
|  | ||||
| #include <algorithm> | ||||
| @ -69,7 +69,7 @@ Tensor qnnpack_relu(Tensor input) { | ||||
|       setupStatus == pytorch_qnnp_status_success, | ||||
|       "failed to setup QNNPACK Relu operator"); | ||||
|  | ||||
|   pthreadpool_t threadpool = caffe2::mobile_pthreadpool(); | ||||
|   pthreadpool_t threadpool = caffe2::pthreadpool_(); | ||||
|  | ||||
|   const pytorch_qnnp_status runStatus = | ||||
|       pytorch_qnnp_run_operator(qnnpack_operator, threadpool); | ||||
|  | ||||
| @ -7,7 +7,7 @@ | ||||
| #include <ATen/native/quantized/cpu/quantized_ops.h> | ||||
| #include <ATen/native/quantized/cpu/init_qnnpack.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
|  | ||||
| #include <algorithm> | ||||
|  | ||||
| @ -66,7 +66,7 @@ Tensor qnnpack_sigmoid(Tensor input) { | ||||
|   TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success, | ||||
|                         "failed to setup QNNPACK sigmoid operator"); | ||||
|  | ||||
|   pthreadpool_t threadpool = caffe2::mobile_pthreadpool(); | ||||
|   pthreadpool_t threadpool = caffe2::pthreadpool_(); | ||||
|  | ||||
|   const pytorch_qnnp_status runStatus = | ||||
|     pytorch_qnnp_run_operator(sigmoid_op, threadpool); | ||||
|  | ||||
| @ -7,7 +7,7 @@ | ||||
| #include <ATen/native/quantized/cpu/quantized_ops.h> | ||||
| #include <ATen/native/quantized/cpu/init_qnnpack.h> | ||||
| #include <ATen/native/quantized/cpu/qnnpack_utils.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
|  | ||||
| #include <algorithm> | ||||
|  | ||||
| @ -64,7 +64,7 @@ Tensor qnnpack_tanh(Tensor input) { | ||||
|   TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success, | ||||
|                         "failed to setup QNNPACK TanH operator"); | ||||
|  | ||||
|   pthreadpool_t threadpool = caffe2::mobile_pthreadpool(); | ||||
|   pthreadpool_t threadpool = caffe2::pthreadpool_(); | ||||
|  | ||||
|   const pytorch_qnnp_status runStatus = | ||||
|     pytorch_qnnp_run_operator(tanh_op, threadpool); | ||||
|  | ||||
| @ -5,7 +5,7 @@ | ||||
| #ifdef USE_XNNPACK | ||||
|  | ||||
| #include <xnnpack.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolXNNPACK.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
|  | ||||
| namespace at { | ||||
| namespace native { | ||||
|  | ||||
| @ -208,15 +208,15 @@ Tensor run( | ||||
|       padded_input_nhwc.size(Layout::Activation4D::width),   // input_width | ||||
|       padded_input_nhwc.data_ptr<float>(),                   // input | ||||
|       output.data_ptr<float>(),                              // output | ||||
|       caffe2::xnnpack_threadpool());                         // threadpool | ||||
|       caffe2::pthreadpool_());                               // threadpool | ||||
|  | ||||
|   TORCH_CHECK( | ||||
|       xnn_status_success == setup_status, | ||||
|       "xnn_setup_convolution2d_nhwc_f32 failed!"); | ||||
|  | ||||
|   const xnn_status run_status = xnn_run_operator( | ||||
|       context.op.get(),               // operator | ||||
|       caffe2::xnnpack_threadpool());  // threadpool | ||||
|       context.op.get(),         // operator | ||||
|       caffe2::pthreadpool_());  // threadpool | ||||
|  | ||||
|   TORCH_INTERNAL_ASSERT( | ||||
|       xnn_status_success == run_status, | ||||
|  | ||||
| @ -137,15 +137,15 @@ Tensor run( | ||||
|       Layout::ActivationND::batch(padded_input.sizes()),  // Batch, | ||||
|       padded_input.data_ptr<float>(),                     // input | ||||
|       output.data_ptr<float>(),                           // output | ||||
|       caffe2::xnnpack_threadpool());                      // threadpool | ||||
|       caffe2::pthreadpool_());                            // threadpool | ||||
|  | ||||
|   TORCH_CHECK( | ||||
|       xnn_status_success == setup_status, | ||||
|       "xnn_setup_fully_connected_nc_f32 failed!"); | ||||
|  | ||||
|   const xnn_status run_status = xnn_run_operator( | ||||
|       context.op.get(),               // operator | ||||
|       caffe2::xnnpack_threadpool());  // threadpool | ||||
|       context.op.get(),         // operator | ||||
|       caffe2::pthreadpool_());  // threadpool | ||||
|  | ||||
|   TORCH_INTERNAL_ASSERT( | ||||
|       xnn_status_success == run_status, | ||||
|  | ||||
| @ -219,15 +219,15 @@ Tensor max_pool2d( | ||||
|       input_padded_contig_nhwc.size(Layout::Activation4D::width),   // input_width | ||||
|       input_padded_contig_nhwc.data_ptr<float>(),                   // input | ||||
|       output_padded_contig_nhwc.data_ptr<float>(),                  // output | ||||
|       caffe2::xnnpack_threadpool());                                // threadpool | ||||
|       caffe2::pthreadpool_());                                      // threadpool | ||||
|  | ||||
|   TORCH_CHECK( | ||||
|       xnn_status_success == setup_status, | ||||
|       "xnn_setup_max_pooling2d_nhwc_f32 failed!"); | ||||
|  | ||||
|   const xnn_status run_status = xnn_run_operator( | ||||
|       max_pool_op,                    // operator | ||||
|       caffe2::xnnpack_threadpool());  // threadpool | ||||
|       max_pool_op,              // operator | ||||
|       caffe2::pthreadpool_());  // threadpool | ||||
|  | ||||
|   TORCH_INTERNAL_ASSERT( | ||||
|       xnn_status_success == run_status, | ||||
|  | ||||
| @ -4,10 +4,10 @@ | ||||
| #include <ATen/NativeFunctions.h> | ||||
| #include <ATen/Parallel.h> | ||||
| #include <ATen/core/Tensor.h> | ||||
| #include <ATen/detail/CUDAHooksInterface.h> | ||||
| #include <ATen/native/TensorFactories.h> | ||||
| #include <ATen/native/quantized/affine_quantizer.h> | ||||
| #include <ATen/quantized/QTensorImpl.h> | ||||
| #include <c10/core/Allocator.h> | ||||
| #include <c10/core/CPUAllocator.h> | ||||
| #include <cmath> | ||||
| #include <typeinfo> | ||||
| @ -66,7 +66,9 @@ inline Tensor new_qtensor( | ||||
|     const TensorOptions& options, | ||||
|     QuantizerPtr quantizer) { | ||||
|   auto memory_format = options.memory_format_opt().value_or(MemoryFormat::Contiguous); | ||||
|   at::Allocator* allocator = GetAllocator(options.device().type()); | ||||
|   at::Allocator* allocator = options.device().type() == DeviceType::CUDA | ||||
|     ? at::detail::getCUDAHooks().getCUDADeviceAllocator() | ||||
|     : at::getCPUAllocator(); | ||||
|  | ||||
| #ifdef USE_PYTORCH_QNNPACK | ||||
|   if (at::globalContext().qEngine() == at::QEngine::QNNPACK) { | ||||
|  | ||||
| @ -87,7 +87,6 @@ endif() | ||||
| # Note: the folders that are being commented out have not been properly | ||||
| # addressed yet. | ||||
|  | ||||
| # For pthreadpool_new_if_impl. TODO: Remove when threadpools are unitied. | ||||
| if(NOT MSVC AND USE_XNNPACK) | ||||
|   if(NOT TARGET fxdiv) | ||||
|     set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") | ||||
| @ -96,10 +95,6 @@ if(NOT MSVC AND USE_XNNPACK) | ||||
|       "${FXDIV_SOURCE_DIR}" | ||||
|       "${CMAKE_BINARY_DIR}/FXdiv") | ||||
|   endif() | ||||
|   if(NOT (INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE)) | ||||
|     set_source_files_properties( | ||||
|         utils/threadpool/pthreadpool_new_if_impl.c PROPERTIES COMPILE_FLAGS -fno-openmp) | ||||
|   endif() | ||||
| endif() | ||||
|  | ||||
| add_subdirectory(core) | ||||
|  | ||||
| @ -818,6 +818,67 @@ c10::optional<int> OperatorBase::argumentIndexWithName( | ||||
| #endif | ||||
| } | ||||
|  | ||||
| bool OperatorBase::RunAsync(int stream_id) { | ||||
|   try { | ||||
|     auto result = Run(stream_id); | ||||
|     if (result) { | ||||
|       if (HasAsyncPart()) { | ||||
|         RecordEvent(); | ||||
|       } else { | ||||
|         SetEventFinished(); | ||||
|       } | ||||
|     } else { | ||||
|       SetEventFinished(getErrorMsg().c_str()); | ||||
|     } | ||||
|     return result; | ||||
|   } catch (EnforceNotMet& err) { | ||||
|     SetEventFinishedWithException(err.what()); | ||||
|     throw; | ||||
|   } catch (const std::exception& err) { | ||||
|     SetEventFinishedWithException(err.what()); | ||||
|     throw; | ||||
|   } catch (...) { | ||||
|     SetEventFinishedWithException(getErrorMsg().c_str()); | ||||
|     throw; | ||||
|   } | ||||
| } | ||||
|  | ||||
| void OperatorBase::AddRelatedBlobInfo(EnforceNotMet* err) { | ||||
|   CAFFE_ENFORCE( | ||||
|       isLegacyOperator(), | ||||
|       "AddRelatedBlobInfo(err) not supported for operators exported to c10."); | ||||
|  | ||||
|   if (!has_debug_def()) { | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   bool found_input = false; | ||||
|   bool found_output = false; | ||||
|   if (err->caller() != nullptr) { | ||||
|     std::ostringstream oss; | ||||
|     for (size_t i = 0; i < inputs_.size(); i++) { | ||||
|       if (inputs_[i]->GetRaw() == err->caller()) { | ||||
|         found_input = true; | ||||
|         oss << "while accessing input: " << debug_def().input(i); | ||||
|         break; | ||||
|       } | ||||
|     } | ||||
|     for (size_t i = 0; i < outputs_.size(); i++) { | ||||
|       if (outputs_[i]->GetRaw() == err->caller()) { | ||||
|         found_output = true; | ||||
|         if (found_input) { | ||||
|           oss << " OR "; | ||||
|         } | ||||
|         oss << "while accessing output: " << debug_def().output(i); | ||||
|         break; | ||||
|       } | ||||
|     } | ||||
|     if (found_input || found_output) { | ||||
|       err->add_context(oss.str()); | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| OperatorBase::~OperatorBase() noexcept = default; | ||||
|  | ||||
| #ifndef C10_MOBILE | ||||
|  | ||||
| @ -480,70 +480,13 @@ class CAFFE2_API OperatorBase : public Observable<OperatorBase> { | ||||
|  | ||||
|   virtual void CancelAsyncCallback() {} | ||||
|  | ||||
|   // RunAsync, if implemenented by the specific operators, will schedule the | ||||
|   // RunAsync, if implemented by the specific operators, will schedule the | ||||
|   // computation on the corresponding context and record the event in its | ||||
|   // event_ member object. If the specific operator does not support RunAsync, | ||||
|   // it will simply be synchronous as a fallback. | ||||
|   virtual bool RunAsync(int stream_id = 0) { | ||||
|     try { | ||||
|       auto result = Run(stream_id); | ||||
|       if (result) { | ||||
|         if (HasAsyncPart()) { | ||||
|           RecordEvent(); | ||||
|         } else { | ||||
|           SetEventFinished(); | ||||
|         } | ||||
|       } else { | ||||
|         SetEventFinished(getErrorMsg().c_str()); | ||||
|       } | ||||
|       return result; | ||||
|     } catch (EnforceNotMet& err) { | ||||
|       SetEventFinishedWithException(err.what()); | ||||
|       throw; | ||||
|     } catch (const std::exception& err) { | ||||
|       SetEventFinishedWithException(err.what()); | ||||
|       throw; | ||||
|     } catch (...) { | ||||
|       SetEventFinishedWithException(getErrorMsg().c_str()); | ||||
|       throw; | ||||
|     } | ||||
|   } | ||||
|   virtual bool RunAsync(int stream_id = 0); | ||||
|  | ||||
|   virtual void AddRelatedBlobInfo(EnforceNotMet* err) { | ||||
|     CAFFE_ENFORCE( | ||||
|         isLegacyOperator(), | ||||
|         "AddRelatedBlobInfo(err) not supported for operators exported to c10."); | ||||
|  | ||||
|     if (!has_debug_def()) { | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     bool found_input = false; | ||||
|     bool found_output = false; | ||||
|     if (err->caller() != nullptr) { | ||||
|       std::ostringstream oss; | ||||
|       for (size_t i = 0; i < inputs_.size(); i++) { | ||||
|         if (inputs_[i]->GetRaw() == err->caller()) { | ||||
|           found_input = true; | ||||
|           oss << "while accessing input: " << debug_def().input(i); | ||||
|           break; | ||||
|         } | ||||
|       } | ||||
|       for (size_t i = 0; i < outputs_.size(); i++) { | ||||
|         if (outputs_[i]->GetRaw() == err->caller()) { | ||||
|           found_output = true; | ||||
|           if (found_input) { | ||||
|             oss << " OR "; | ||||
|           } | ||||
|           oss << "while accessing output: " << debug_def().output(i); | ||||
|           break; | ||||
|         } | ||||
|       } | ||||
|       if (found_input || found_output) { | ||||
|         err->add_context(oss.str()); | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   virtual void AddRelatedBlobInfo(EnforceNotMet* err); | ||||
|  | ||||
|   virtual std::string debug_info_string() const { | ||||
|     return ""; | ||||
|  | ||||
| @ -3,6 +3,25 @@ | ||||
|  | ||||
| namespace caffe2 { | ||||
|  | ||||
| OpSchema::OpSchema(const string& type, const string& file, const int line) | ||||
|    : type_(type), file_(file), line_(line), tensor_inference_function_( | ||||
|       [](const OperatorDef& def, const vector<TensorShape>&) { | ||||
|         vector<TensorShape> out; | ||||
|         for (int i = 0; i < def.output_size(); i++) { | ||||
|           TensorShape ts; | ||||
|           ts.set_unknown_shape(true); | ||||
|           out.push_back(ts); | ||||
|         } | ||||
|         return out; | ||||
|       }), device_inference_function_( | ||||
|       [](const OperatorDef& def) { | ||||
|         auto op_device = | ||||
|             def.has_device_option() ? def.device_option() : DeviceOption(); | ||||
|         vector<DeviceOption> in_dev(def.input_size(), op_device); | ||||
|         vector<DeviceOption> out_dev(def.output_size(), op_device); | ||||
|         return std::make_pair(in_dev, out_dev); | ||||
|       }) {} | ||||
|  | ||||
| bool OpSchema::Verify(const OperatorDef& def) const { | ||||
|   // Check the number of inputs. | ||||
|   if (def.input_size() < min_input_ || def.input_size() > max_input_) { | ||||
|  | ||||
| @ -39,9 +39,8 @@ constexpr int kCannotComputeNumOutputs = -1; | ||||
|  */ | ||||
| class CAFFE2_API OpSchema { | ||||
|  public: | ||||
|   OpSchema() : type_("unknown"), file_("unknown"), line_(0) {} | ||||
|   OpSchema(const string& type, const string& file, const int line) | ||||
|       : type_(type), file_(file), line_(line) {} | ||||
|   OpSchema() : OpSchema("unknown", "unknown", 0) {} | ||||
|   OpSchema(const string& type, const string& file, const int line); | ||||
|  | ||||
|   /** | ||||
|    * @brief Returns the file that the op schema is registered from. | ||||
| @ -443,25 +442,9 @@ class CAFFE2_API OpSchema { | ||||
|   std::function<bool(int, int)> inplace_enforced_ = [](int, int) { | ||||
|     return false; | ||||
|   }; | ||||
|   TensorInferenceFunctionType tensor_inference_function_ = | ||||
|       [](const OperatorDef& def, const vector<TensorShape>&) { | ||||
|         vector<TensorShape> out; | ||||
|         for (int i = 0; i < def.output_size(); i++) { | ||||
|           TensorShape ts; | ||||
|           ts.set_unknown_shape(true); | ||||
|           out.push_back(ts); | ||||
|         } | ||||
|         return out; | ||||
|       }; | ||||
|   TensorInferenceFunctionType tensor_inference_function_; | ||||
|   std::unique_ptr<CostInferenceFunctionType> cost_inference_function_ = nullptr; | ||||
|   DeviceInferenceFunctionType device_inference_function_ = | ||||
|       [](const OperatorDef& def) { | ||||
|         auto op_device = | ||||
|             def.has_device_option() ? def.device_option() : DeviceOption(); | ||||
|         vector<DeviceOption> in_dev(def.input_size(), op_device); | ||||
|         vector<DeviceOption> out_dev(def.output_size(), op_device); | ||||
|         return std::make_pair(in_dev, out_dev); | ||||
|       }; | ||||
|   DeviceInferenceFunctionType device_inference_function_; | ||||
|  | ||||
|   std::function<std::vector<TensorFiller>( | ||||
|       const std::vector<std::vector<int64_t>>&)> | ||||
|  | ||||
| @ -88,7 +88,7 @@ class Int8AddOp final : public Operator<CPUContext> { | ||||
|         setupStatus == qnnp_status_success, | ||||
|         "failed to setup QNNPACK add operator"); | ||||
|  | ||||
| #ifdef FBCODE_CAFFE2 | ||||
| #if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|     const qnnp_status runStatus = | ||||
|         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); | ||||
| #else | ||||
|  | ||||
| @ -80,7 +80,7 @@ class Int8AveragePoolOp final : public ConvPoolOpBase<CPUContext> { | ||||
|           setupStatus == qnnp_status_success, | ||||
|           "failed to setup QNNPACK Global Average Pooling operator"); | ||||
|  | ||||
| #ifdef FBCODE_CAFFE2 | ||||
| #if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|       const qnnp_status runStatus = | ||||
|           qnnp_run_operator(this->qnnpackGlobalOperator_, | ||||
|             nullptr /* thread pool */); | ||||
| @ -122,7 +122,7 @@ class Int8AveragePoolOp final : public ConvPoolOpBase<CPUContext> { | ||||
|           setupStatus == qnnp_status_success, | ||||
|           "failed to setup QNNPACK Average Pooling operator"); | ||||
|  | ||||
| #ifdef FBCODE_CAFFE2 | ||||
| #if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|       const qnnp_status runStatus = | ||||
|           qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); | ||||
| #else | ||||
|  | ||||
| @ -72,7 +72,7 @@ class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> { | ||||
|         setupStatus == qnnp_status_success, | ||||
|         "failed to setup QNNPACK channel shuffle operator"); | ||||
|  | ||||
| #ifdef FBCODE_CAFFE2 | ||||
| #if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|     const qnnp_status runStatus = | ||||
|         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); | ||||
| #else | ||||
|  | ||||
| @ -141,7 +141,7 @@ class Int8ConvOp final : public ConvPoolOpBase<CPUContext> { | ||||
|         lastOutputPointer_ = Y->t.template mutable_data<uint8_t>(); | ||||
|       } | ||||
|  | ||||
| #ifdef FBCODE_CAFFE2 | ||||
| #if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|       const qnnp_status runStatus = | ||||
|           qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */); | ||||
| #else | ||||
|  | ||||
| @ -140,7 +140,7 @@ class Int8ConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> { | ||||
|         lastOutputPointer_ = Y->t.template mutable_data<uint8_t>(); | ||||
|       } | ||||
|  | ||||
| #ifdef FBCODE_CAFFE2 | ||||
| #if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|       const qnnp_status runStatus = | ||||
|           qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */); | ||||
| #else | ||||
|  | ||||
| @ -104,7 +104,7 @@ class Int8FCOp final : public Operator<CPUContext> { | ||||
|         lastOutputPointer_ = Y->t.template mutable_data<uint8_t>(); | ||||
|       } | ||||
|  | ||||
| #ifdef FBCODE_CAFFE2 | ||||
| #if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|       const qnnp_status runStatus = | ||||
|           qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */); | ||||
| #else | ||||
|  | ||||
| @ -80,7 +80,7 @@ class Int8LeakyReluOp final : public Operator<CPUContext> { | ||||
|         setupStatus == qnnp_status_success, | ||||
|         "failed to setup QNNPACK Leaky ReLU operator"); | ||||
|  | ||||
| #ifdef FBCODE_CAFFE2 | ||||
| #if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|     const qnnp_status runStatus = | ||||
|         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); | ||||
| #else | ||||
|  | ||||
| @ -74,7 +74,7 @@ class Int8MaxPoolOp final : public ConvPoolOpBase<CPUContext> { | ||||
|         setupStatus == qnnp_status_success, | ||||
|         "failed to setup QNNPACK Max Pooling operator"); | ||||
|  | ||||
| #ifdef FBCODE_CAFFE2 | ||||
| #if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|     const qnnp_status runStatus = | ||||
|         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); | ||||
| #else | ||||
|  | ||||
| @ -65,7 +65,7 @@ class Int8ReluOp final : public Operator<CPUContext> { | ||||
|         setupStatus == qnnp_status_success, | ||||
|         "failed to setup QNNPACK Clamp operator"); | ||||
|  | ||||
| #ifdef FBCODE_CAFFE2 | ||||
| #if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|     const qnnp_status runStatus = | ||||
|         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); | ||||
| #else | ||||
|  | ||||
| @ -73,7 +73,7 @@ class Int8SigmoidOp final : public Operator<CPUContext> { | ||||
|         setupStatus == qnnp_status_success, | ||||
|         "failed to setup QNNPACK Sigmoid operator"); | ||||
|  | ||||
| #ifdef FBCODE_CAFFE2 | ||||
| #if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|     const qnnp_status runStatus = | ||||
|         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); | ||||
| #else | ||||
|  | ||||
| @ -73,7 +73,7 @@ class Int8SoftmaxOp final : public Operator<CPUContext> { | ||||
|         setupStatus == qnnp_status_success, | ||||
|         "failed to setup QNNPACK SoftArgMax operator"); | ||||
|  | ||||
| #ifdef FBCODE_CAFFE2 | ||||
| #if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|     const qnnp_status runStatus = | ||||
|         qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); | ||||
| #else | ||||
|  | ||||
| @ -42,13 +42,48 @@ if platform.system() == 'Windows': | ||||
|     else: | ||||
|         cuda_path = '' | ||||
|  | ||||
|     if not is_conda and sys.version_info >= (3, 8): | ||||
|         dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path])) | ||||
|     import ctypes | ||||
|     kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True) | ||||
|     dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path])) | ||||
|     with_load_library_flags = hasattr(kernel32, 'AddDllDirectory') | ||||
|     prev_error_mode = kernel32.SetErrorMode(0x0001) | ||||
|  | ||||
|         for dll_path in dll_paths: | ||||
|     kernel32.LoadLibraryW.restype = ctypes.c_void_p | ||||
|     if with_load_library_flags: | ||||
|         kernel32.AddDllDirectory.restype = ctypes.c_void_p | ||||
|         kernel32.LoadLibraryExW.restype = ctypes.c_void_p | ||||
|  | ||||
|     for dll_path in dll_paths: | ||||
|         if sys.version_info >= (3, 8): | ||||
|             os.add_dll_directory(dll_path) | ||||
|     else: | ||||
|         dll_paths = [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path] | ||||
|         dll_paths = list(filter(os.path.exists, dll_paths)) + [os.environ['PATH']] | ||||
|         elif with_load_library_flags: | ||||
|             res = kernel32.AddDllDirectory(dll_path) | ||||
|             if res is None: | ||||
|                 err = ctypes.WinError(ctypes.get_last_error()) | ||||
|                 err.strerror += ' Error adding "{}" to the DLL directories.'.format(dll_path) | ||||
|                 raise err | ||||
|  | ||||
|         os.environ['PATH'] = ';'.join(dll_paths) | ||||
|     dlls = glob.glob(os.path.join(th_dll_path, '*.dll')) | ||||
|     path_patched = False | ||||
|     for dll in dlls: | ||||
|         is_loaded = False | ||||
|         if with_load_library_flags: | ||||
|             res = kernel32.LoadLibraryExW(dll, None, 0x00001100) | ||||
|             last_error = ctypes.get_last_error() | ||||
|             if res is None and last_error != 126: | ||||
|                 err = ctypes.WinError(last_error) | ||||
|                 err.strerror += ' Error loading "{}" or one of its dependencies.'.format(dll) | ||||
|                 raise err | ||||
|             elif res is not None: | ||||
|                 is_loaded = True | ||||
|         if not is_loaded: | ||||
|             if not path_patched: | ||||
|                 os.environ['PATH'] = ';'.join(dll_paths + [os.environ['PATH']]) | ||||
|                 path_patched = True | ||||
|             res = kernel32.LoadLibraryW(dll) | ||||
|             if res is None: | ||||
|                 err = ctypes.WinError(ctypes.get_last_error()) | ||||
|                 err.strerror += ' Error loading "{}" or one of its dependencies.'.format(dll) | ||||
|                 raise err | ||||
|  | ||||
|     kernel32.SetErrorMode(prev_error_mode) | ||||
|  | ||||
| @ -4,6 +4,7 @@ | ||||
| #include <istream> | ||||
| #include <ostream> | ||||
| #include <fstream> | ||||
| #include <algorithm> | ||||
|  | ||||
| #include <c10/core/Allocator.h> | ||||
| #include <c10/core/Backend.h> | ||||
| @ -303,10 +304,10 @@ void PyTorchStreamWriter::setup(const string& file_name) { | ||||
|  | ||||
|   mz_zip_writer_init_v2(ar_.get(), 0, MZ_ZIP_FLAG_WRITE_ZIP64); | ||||
|   valid("initializing archive ", file_name.c_str()); | ||||
| } | ||||
|  | ||||
|   std::string version = c10::to_string(kProducedFileFormatVersion); | ||||
|   version.push_back('\n'); | ||||
|   writeRecord("version", version.c_str(), version.size()); | ||||
| void PyTorchStreamWriter::setMinVersion(const uint64_t version) { | ||||
|   version_ = std::max(version, version_); | ||||
| } | ||||
|  | ||||
| void PyTorchStreamWriter::writeRecord( | ||||
| @ -339,6 +340,11 @@ void PyTorchStreamWriter::writeRecord( | ||||
| } | ||||
|  | ||||
| void PyTorchStreamWriter::writeEndOfFile() { | ||||
|   // Writes version info | ||||
|   std::string version = c10::to_string(version_); | ||||
|   version.push_back('\n'); | ||||
|   writeRecord("version", version.c_str(), version.size()); | ||||
|  | ||||
|   AT_ASSERT(!finalized_); | ||||
|   finalized_ = true; | ||||
|   mz_zip_writer_finalize_archive(ar_.get()); | ||||
|  | ||||
| @ -94,14 +94,45 @@ constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L; | ||||
| constexpr uint64_t kMaxSupportedFileFormatVersion = 0x5L; | ||||
|  | ||||
| // Versions (i.e. why was the version number bumped?) | ||||
|  | ||||
| // Note [Dynamic Versions and torch.jit.save vs. torch.save] | ||||
| // | ||||
| // Our versioning scheme has a "produced file format version" which | ||||
| // describes how an archive is to be read. The version written in an archive | ||||
| // is at least this current produced file format version, but may be greater | ||||
| // if it includes certain symbols. We refer to these conditional versions | ||||
| // as "dynamic," since they are identified at runtime. | ||||
| // | ||||
| // Dynamic versioning is useful when an operator's semantics are updated. | ||||
| // When using torch.jit.save we want those semantics to be preserved. If | ||||
| // we bumped the produced file format version on every change, however, | ||||
| // then older versions of PyTorch couldn't read even simple archives, like | ||||
| // a single tensor, from newer versions of PyTorch. Instead, we | ||||
| // assign dynamic versions to these changes that override the | ||||
| // produced file format version as needed. That is, when the semantics | ||||
| // of torch.div changed it was assigned dynamic version 4, and when | ||||
| // torch.jit.saving modules that use torch.div those archives also have | ||||
| // (at least) version 4. This prevents earlier versions of PyTorch | ||||
| // from accidentally performing the wrong kind of division. Modules | ||||
| // that don't use torch.div or other operators with dynamic versions | ||||
| // can write the produced file format version, and these programs will | ||||
| // run as expected on earlier versions of PyTorch. | ||||
| // | ||||
| // While torch.jit.save attempts to preserve operator semantics, | ||||
| // torch.save does not. torch.save is analogous to pickling Python, so | ||||
| // a function that uses torch.div will have different behavior if torch.saved | ||||
| // and torch.loaded across PyTorch versions. From a technical perspective, | ||||
| // torch.save ignores dynamic versioning. | ||||
|  | ||||
| // 1. Initial version | ||||
| // 2. Removed op_version_set version numbers | ||||
| // 3. Added type tags to pickle serialization of container types | ||||
| // 4. Stopped integer division using torch.div | ||||
| // 4. (Dynamic) Stopped integer division using torch.div | ||||
| //      (a versioned symbol preserves the historic behavior of versions 1--3) | ||||
| // 5. (Read-only) Stops torch.full inferring a floating point dtype | ||||
| //      when given integer fill values. | ||||
| constexpr uint64_t kProducedFileFormatVersion = 0x4L; | ||||
| // 5. (Dynamic) Stops torch.full inferring a floating point dtype | ||||
| //      when given bool or integer fill values. | ||||
| //      (a versioned symbol preserves the historic behavior of versions 1--4) | ||||
| constexpr uint64_t kProducedFileFormatVersion = 0x3L; | ||||
|  | ||||
| // Writer-specific constants | ||||
| constexpr uint64_t kFieldAlignment = 64; | ||||
| @ -144,6 +175,8 @@ class CAFFE2_API PyTorchStreamWriter final { | ||||
|   explicit PyTorchStreamWriter( | ||||
|       const std::function<size_t(const void*, size_t)>& writer_func); | ||||
|  | ||||
|    void setMinVersion(const uint64_t version); | ||||
|  | ||||
|   void writeRecord( | ||||
|       const std::string& name, | ||||
|       const void* data, | ||||
| @ -171,6 +204,7 @@ class CAFFE2_API PyTorchStreamWriter final { | ||||
|   std::string padding_; | ||||
|   std::ofstream file_stream_; | ||||
|   std::function<size_t(const void*, size_t)> writer_func_; | ||||
|   uint64_t version_ = kProducedFileFormatVersion; | ||||
|   bool finalized_ = false; | ||||
|   bool err_seen_ = false; | ||||
|   friend size_t ostream_write_func( | ||||
|  | ||||
| @ -195,7 +195,12 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() { | ||||
|   const nnp_size output_subsample = {.width = static_cast<size_t>(stride_w()), | ||||
|                                      .height = static_cast<size_t>(stride_h())}; | ||||
|   initNNPACK(); | ||||
|  | ||||
| #if !defined(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|   pthreadpool_t pool = nullptr; | ||||
| #else | ||||
|   pthreadpool_t pool = reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool()); | ||||
| #endif | ||||
|  | ||||
|   runWithSharedBuffer<CPUContext>(ws_, [&](Tensor* buffer) { | ||||
|     if (transformStrategy_ == nnp_convolution_transform_strategy_precompute) { | ||||
|  | ||||
| @ -1,15 +1,8 @@ | ||||
| # TODO: Add ThreadPoolXNNPACK.cc when XNNPACK integration is updated | ||||
| # to pass the actual threadpool ptr instead of nullptr. | ||||
| if(INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE) | ||||
|   add_definitions(-DUSE_INTERNAL_THREADPOOL_IMPL) | ||||
|   list(APPEND Caffe2_CPU_SRCS | ||||
|     utils/string_utils.cc | ||||
|     utils/threadpool/pthreadpool.cc | ||||
|     utils/threadpool/pthreadpool_impl.cc | ||||
|     utils/threadpool/pthreadpool_new_if_impl.c | ||||
|     utils/threadpool/pthreadpool-cpp.cc | ||||
|     utils/threadpool/ThreadPool.cc | ||||
|     utils/threadpool/ThreadPoolMobile.cc | ||||
|     utils/threadpool/ThreadPoolXNNPACK.cc | ||||
|   ) | ||||
|   set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) | ||||
|   return() | ||||
| @ -28,23 +21,19 @@ list(APPEND Caffe2_CPU_SRCS | ||||
|   utils/proto_convert.cc | ||||
|   utils/proto_utils.cc | ||||
|   utils/proto_wrap.cc | ||||
|   utils/threadpool/ThreadPool.cc | ||||
|   utils/signal_handler.cc | ||||
|   utils/smart_tensor_printer.cc | ||||
|   utils/string_utils.cc | ||||
|   utils/threadpool/ThreadPool.cc) | ||||
|   utils/string_utils.cc) | ||||
|  | ||||
| # ---[ threadpool/pthreadpool* is a local modification of the NNPACK | ||||
| # pthreadpool with a very similar interface. Neither NNPACK, nor this | ||||
| # thread pool supports Windows. | ||||
| if(NOT MSVC AND USE_XNNPACK) | ||||
|   add_definitions(-DUSE_INTERNAL_THREADPOOL_IMPL) | ||||
|   set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} | ||||
|           utils/threadpool/pthreadpool.cc | ||||
|           utils/threadpool/pthreadpool_impl.cc | ||||
|           utils/threadpool/pthreadpool_new_if_impl.c | ||||
|           utils/threadpool/ThreadPoolMobile.cc | ||||
|           utils/threadpool/ThreadPoolXNNPACK.cc | ||||
|           ) | ||||
| if(USE_PTHREADPOOL) | ||||
|   list(APPEND Caffe2_CPU_SRCS | ||||
|     utils/threadpool/pthreadpool-cpp.cc) | ||||
|   if(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|     list(APPEND Caffe2_CPU_SRCS | ||||
|       utils/threadpool/pthreadpool.cc | ||||
|       utils/threadpool/pthreadpool_impl.cc) | ||||
|   endif() | ||||
| endif() | ||||
|  | ||||
| set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} | ||||
|  | ||||
| @ -1,21 +0,0 @@ | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPool.h> | ||||
| #include <caffe2/utils/threadpool/pthreadpool.h> | ||||
|  | ||||
| namespace caffe2 { | ||||
|  | ||||
| caffe2::ThreadPool* mobile_threadpool() { | ||||
| #ifdef C10_MOBILE | ||||
|   static std::unique_ptr<caffe2::ThreadPool> thread_pool = | ||||
|       caffe2::ThreadPool::defaultThreadPool(); | ||||
|   return thread_pool.get(); | ||||
| #else | ||||
|   return nullptr; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| pthreadpool_t mobile_pthreadpool() { | ||||
|   return reinterpret_cast<pthreadpool_t>(mobile_threadpool()); | ||||
| } | ||||
|  | ||||
| } // namespace caffe2 | ||||
| @ -1,24 +0,0 @@ | ||||
| #pragma once | ||||
| #include <caffe2/utils/threadpool/pthreadpool.h> | ||||
|  | ||||
| // TODO Implement a parallel_for version for Mobile here, add to Aten/Parallel.h | ||||
|  | ||||
| namespace caffe2 { | ||||
|  | ||||
| class ThreadPool; | ||||
|  | ||||
| // Return a singleton instance of caffe2::ThreadPool for ATen/TH multithreading. | ||||
| ThreadPool* mobile_threadpool(); | ||||
|  | ||||
| // NOTE: This interface is temporary and should not be used. | ||||
| // Please use Aten/Parallel.h for parallel primitives in pytorch. | ||||
| // This implementation will be used by pytorch mobile, specifically | ||||
| // NNPACK/QNNPACK. For mobile we need to use caffe2::ThreadPool instead of the | ||||
| // 3rd party pthreadpool. Future work (TODO) Implement a mobile version of | ||||
| // "at::parallel_for" using caffe2::ThreadPool so all ATen/TH multithreading | ||||
| // usage is mobile friendly; Refactor QNNPACK or pthreadpool to explicitly using | ||||
| // "at::parallel_for" primitive to replace pthreadpool_compute_1d for Pytorch; | ||||
| pthreadpool_t mobile_pthreadpool(); | ||||
|  | ||||
| size_t getDefaultNumThreads(); | ||||
| } // namespace caffe2 | ||||
| @ -1,22 +0,0 @@ | ||||
| #include <caffe2/utils/threadpool/pthreadpool.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolMobile.h> | ||||
| #include <caffe2/utils/threadpool/ThreadPoolXNNPACK.h> | ||||
| #include <memory> | ||||
|  | ||||
| namespace caffe2 { | ||||
|  | ||||
| // Will be unified. | ||||
| pthreadpool_t xnnpack_threadpool() { | ||||
| // Depending on internal implemenation vs. OSS we will link against pthreadpool_create_xnnpack | ||||
| // or pthreadpool_create. This is only temporary. It will be unified soon. | ||||
| #ifdef USE_INTERNAL_THREADPOOL_IMPL | ||||
|   static std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy_xnnpack)> | ||||
|       threadpool(pthreadpool_create_xnnpack(getDefaultNumThreads()), pthreadpool_destroy_xnnpack); | ||||
| #else | ||||
|   static std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> | ||||
|       threadpool(pthreadpool_create(getDefaultNumThreads()), pthreadpool_destroy); | ||||
| #endif | ||||
|   return threadpool.get(); | ||||
| } | ||||
|  | ||||
| } // namespace caffe2 | ||||
| @ -1,7 +0,0 @@ | ||||
| #pragma once | ||||
| // Creating a separate .h/.cc file for creating threadpool for XNNPACK | ||||
| // to avoid touching existing internal builds. | ||||
| // When we unify threadpools this should all go away. | ||||
| namespace caffe2 { | ||||
| pthreadpool_t xnnpack_threadpool(); | ||||
| } // namespace caffe2 | ||||
							
								
								
									
										71
									
								
								caffe2/utils/threadpool/pthreadpool-cpp.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										71
									
								
								caffe2/utils/threadpool/pthreadpool-cpp.cc
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,71 @@ | ||||
| #include <caffe2/utils/threadpool/pthreadpool-cpp.h> | ||||
| #include <c10/util/Exception.h> | ||||
|  | ||||
| namespace caffe2 { | ||||
|  | ||||
| PThreadPool::PThreadPool(const size_t thread_count) | ||||
|     : threadpool_(pthreadpool_create(thread_count), pthreadpool_destroy) {} | ||||
|  | ||||
| size_t PThreadPool::get_thread_count() const { | ||||
|   std::lock_guard<std::mutex> lock{mutex_}; | ||||
|  | ||||
|   TORCH_INTERNAL_ASSERT(threadpool_.get(), "Invalid threadpool!"); | ||||
|   return pthreadpool_get_threads_count(threadpool_.get()); | ||||
| } | ||||
|  | ||||
| void PThreadPool::set_thread_count(const size_t thread_count) { | ||||
|   std::lock_guard<std::mutex> lock{mutex_}; | ||||
|  | ||||
|   // As it stands, pthreadpool is an entirely data parallel framework with no | ||||
|   // support for task parallelism.  Hence, all functions are blocking, and no | ||||
|   // user-provided tasks can be in flight when the control is returned to the | ||||
|   // user of the API, which means re-initializing the library, without the | ||||
|   // need to wait on any pending tasks, is all one needs to do to re-adjust | ||||
|   // the thread count. | ||||
|   threadpool_.reset(pthreadpool_create(thread_count)); | ||||
| } | ||||
|  | ||||
| void PThreadPool::run( | ||||
|     const std::function<void(size_t)>& fn, | ||||
|     const size_t range) { | ||||
|   std::lock_guard<std::mutex> lock{mutex_}; | ||||
|  | ||||
|   TORCH_INTERNAL_ASSERT(threadpool_.get(), "Invalid threadpool!"); | ||||
|  | ||||
|   struct Context final { | ||||
|     const std::function<void(size_t)>& fn; | ||||
|   } context{ | ||||
|       fn, | ||||
|   }; | ||||
|  | ||||
|   pthreadpool_parallelize_1d( | ||||
|       threadpool_.get(), | ||||
|       // Note: pthreadpool_parallelize_1d() is a blocking function.  The | ||||
|       // function pointer to this lambda passed on to | ||||
|       // pthreadpool_parallelize_1d() cannot go out of scope until | ||||
|       // pthreadpool_parallelize_1d() returns. | ||||
|       [](void* const context, const size_t item) { | ||||
|         reinterpret_cast<Context*>(context)->fn(item); | ||||
|       }, | ||||
|       &context, | ||||
|       range, | ||||
|       0u); | ||||
| } | ||||
|  | ||||
| // Forward declaration | ||||
| size_t getDefaultNumThreads(); | ||||
|  | ||||
| PThreadPool* pthreadpool() { | ||||
|   static std::unique_ptr<PThreadPool> threadpool = | ||||
|       std::make_unique<PThreadPool>(getDefaultNumThreads()); | ||||
|   return threadpool.get(); | ||||
| } | ||||
|  | ||||
| pthreadpool_t pthreadpool_() { | ||||
|   PThreadPool* const threadpool = pthreadpool(); | ||||
|   TORCH_INTERNAL_ASSERT( | ||||
|       threadpool, "Failed to acquire an instance of PThreadPool!"); | ||||
|   return threadpool->threadpool_.get(); | ||||
| } | ||||
|  | ||||
| } // namespace caffe2 | ||||
							
								
								
									
										54
									
								
								caffe2/utils/threadpool/pthreadpool-cpp.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								caffe2/utils/threadpool/pthreadpool-cpp.h
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,54 @@ | ||||
| #pragma once | ||||
|  | ||||
| #ifdef USE_PTHREADPOOL | ||||
|  | ||||
| #ifdef USE_INTERNAL_PTHREADPOOL_IMPL | ||||
| #include <caffe2/utils/threadpool/pthreadpool.h> | ||||
| #else | ||||
| #include <pthreadpool.h> | ||||
| #endif | ||||
|  | ||||
| #include <functional> | ||||
| #include <memory> | ||||
| #include <mutex> | ||||
|  | ||||
| namespace caffe2 { | ||||
|  | ||||
| class PThreadPool final { | ||||
|  public: | ||||
|   explicit PThreadPool(size_t thread_count); | ||||
|   ~PThreadPool() = default; | ||||
|  | ||||
|   PThreadPool(const PThreadPool&) = delete; | ||||
|   PThreadPool& operator=(const PThreadPool&) = delete; | ||||
|  | ||||
|   PThreadPool(PThreadPool&&) = delete; | ||||
|   PThreadPool& operator=(PThreadPool&&) = delete; | ||||
|  | ||||
|   size_t get_thread_count() const; | ||||
|   void set_thread_count(size_t thread_count); | ||||
|  | ||||
|   // Run, in parallel, function fn(task_id) over task_id in range [0, range). | ||||
|   // This function is blocking.  All input is processed by the time it returns. | ||||
|   void run(const std::function<void(size_t)>& fn, size_t range); | ||||
|  | ||||
|  private: | ||||
|   friend pthreadpool_t pthreadpool_(); | ||||
|  | ||||
|  private: | ||||
|   mutable std::mutex mutex_; | ||||
|   std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> threadpool_; | ||||
| }; | ||||
|  | ||||
| // Return a singleton instance of PThreadPool for ATen/TH multithreading. | ||||
| PThreadPool* pthreadpool(); | ||||
|  | ||||
| // Exposes the underlying implementation of PThreadPool. | ||||
| // Only for use in external libraries so as to unify threading across | ||||
| // internal (i.e. ATen, etc.) and external (e.g. NNPACK, QNNPACK, XNNPACK) | ||||
| // use cases. | ||||
| pthreadpool_t pthreadpool_(); | ||||
|  | ||||
| } // namespace caffe2 | ||||
|  | ||||
| #endif /* USE_PTHREADPOOL */ | ||||
| @ -32,7 +32,7 @@ static inline size_t min(size_t a, size_t b) { | ||||
| } | ||||
|  | ||||
| struct compute_1d_tiled_context { | ||||
|   pthreadpool_function_1d_tiled_t function; | ||||
|   legacy_pthreadpool_function_1d_tiled_t function; | ||||
|   void* argument; | ||||
|   size_t range; | ||||
|   size_t tile; | ||||
| @ -46,9 +46,9 @@ static void compute_1d_tiled(void* context_, size_t linear_index) { | ||||
|   context->function(context->argument, index, tile); | ||||
| } | ||||
|  | ||||
| void pthreadpool_compute_1d_tiled( | ||||
|   pthreadpool_t threadpool, | ||||
|   pthreadpool_function_1d_tiled_t function, | ||||
| void legacy_pthreadpool_compute_1d_tiled( | ||||
|   legacy_pthreadpool_t threadpool, | ||||
|   legacy_pthreadpool_function_1d_tiled_t function, | ||||
|   void* argument, | ||||
|   size_t range, | ||||
|   size_t tile) | ||||
| @ -65,12 +65,12 @@ void pthreadpool_compute_1d_tiled( | ||||
|                                                /*.argument = */ argument, | ||||
|                                                /*.range = */ range, | ||||
|                                                /*.tile = */ tile}; | ||||
|     pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range); | ||||
|     legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range); | ||||
|   } | ||||
| } | ||||
|  | ||||
| struct compute_2d_context { | ||||
|   pthreadpool_function_2d_t function; | ||||
|   legacy_pthreadpool_function_2d_t function; | ||||
|   void* argument; | ||||
|   caffe2::FixedDivisor<int32_t> range_j; | ||||
| }; | ||||
| @ -85,9 +85,9 @@ static void compute_2d(void* context_, size_t linear_index) { | ||||
|   context->function(context->argument, q, r); | ||||
| } | ||||
|  | ||||
| void pthreadpool_compute_2d( | ||||
|   struct pthreadpool* threadpool, | ||||
|   pthreadpool_function_2d_t function, | ||||
| void legacy_pthreadpool_compute_2d( | ||||
|   legacy_pthreadpool_t threadpool, | ||||
|   legacy_pthreadpool_function_2d_t function, | ||||
|   void* argument, | ||||
|   size_t range_i, | ||||
|   size_t range_j) | ||||
| @ -106,12 +106,12 @@ void pthreadpool_compute_2d( | ||||
|         /*.function = */ function, | ||||
|         /*.argument = */ argument, | ||||
|         /*.range_j = */ caffe2::FixedDivisor<int32_t>(range_j)}; | ||||
|     pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j); | ||||
|     legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j); | ||||
|   } | ||||
| } | ||||
|  | ||||
| struct compute_2d_tiled_context { | ||||
|   pthreadpool_function_2d_tiled_t function; | ||||
|   legacy_pthreadpool_function_2d_tiled_t function; | ||||
|   void* argument; | ||||
|   caffe2::FixedDivisor<int32_t> tile_range_j; | ||||
|   size_t range_i; | ||||
| @ -135,9 +135,9 @@ static void compute_2d_tiled(void* context_, size_t linear_index) { | ||||
|   context->function(context->argument, index_i, index_j, tile_i, tile_j); | ||||
| } | ||||
|  | ||||
| void pthreadpool_compute_2d_tiled( | ||||
|   pthreadpool_t threadpool, | ||||
|   pthreadpool_function_2d_tiled_t function, | ||||
| void legacy_pthreadpool_compute_2d_tiled( | ||||
|   legacy_pthreadpool_t threadpool, | ||||
|   legacy_pthreadpool_function_2d_tiled_t function, | ||||
|   void* argument, | ||||
|   size_t range_i, | ||||
|   size_t range_j, | ||||
| @ -166,12 +166,12 @@ void pthreadpool_compute_2d_tiled( | ||||
|         /*.range_j = */ range_j, | ||||
|         /*.tile_i = */ tile_i, | ||||
|         /*.tile_j = */ tile_j}; | ||||
|     pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j); | ||||
|     legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j); | ||||
|   } | ||||
| } | ||||
|  | ||||
| struct compute_3d_tiled_context { | ||||
|   pthreadpool_function_3d_tiled_t function; | ||||
|   legacy_pthreadpool_function_3d_tiled_t function; | ||||
|   void* argument; | ||||
|   caffe2::FixedDivisor<int32_t> tile_range_j; | ||||
|   caffe2::FixedDivisor<int32_t> tile_range_k; | ||||
| @ -205,9 +205,9 @@ static void compute_3d_tiled( | ||||
|       context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k); | ||||
| } | ||||
|  | ||||
| void pthreadpool_compute_3d_tiled( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_function_3d_tiled_t function, | ||||
| void legacy_pthreadpool_compute_3d_tiled( | ||||
|     legacy_pthreadpool_t threadpool, | ||||
|     legacy_pthreadpool_function_3d_tiled_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j, | ||||
| @ -251,16 +251,16 @@ void pthreadpool_compute_3d_tiled( | ||||
|         /*.tile_i = */ tile_i, | ||||
|         /*.tile_j = */ tile_j, | ||||
|         /*.tile_k = */ tile_k}; | ||||
|     pthreadpool_compute_1d( | ||||
|     legacy_pthreadpool_compute_1d( | ||||
|         threadpool, | ||||
|         (pthreadpool_function_1d_t)compute_3d_tiled, | ||||
|         (legacy_pthreadpool_function_1d_t)compute_3d_tiled, | ||||
|         &context, | ||||
|         tile_range_i * tile_range_j * tile_range_k); | ||||
|   } | ||||
| } | ||||
|  | ||||
| struct compute_4d_tiled_context { | ||||
|   pthreadpool_function_4d_tiled_t function; | ||||
|   legacy_pthreadpool_function_4d_tiled_t function; | ||||
|   void* argument; | ||||
|   caffe2::FixedDivisor<int32_t> tile_range_kl; | ||||
|   caffe2::FixedDivisor<int32_t> tile_range_j; | ||||
| @ -310,9 +310,9 @@ static void compute_4d_tiled( | ||||
|       tile_l); | ||||
| } | ||||
|  | ||||
| void pthreadpool_compute_4d_tiled( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_function_4d_tiled_t function, | ||||
| void legacy_pthreadpool_compute_4d_tiled( | ||||
|     legacy_pthreadpool_t threadpool, | ||||
|     legacy_pthreadpool_function_4d_tiled_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j, | ||||
| @ -367,9 +367,9 @@ void pthreadpool_compute_4d_tiled( | ||||
|         /*.tile_j = */ tile_j, | ||||
|         /*.tile_k = */ tile_k, | ||||
|         /*.tile_l = */ tile_l}; | ||||
|     pthreadpool_compute_1d( | ||||
|     legacy_pthreadpool_compute_1d( | ||||
|         threadpool, | ||||
|         (pthreadpool_function_1d_t)compute_4d_tiled, | ||||
|         (legacy_pthreadpool_function_1d_t)compute_4d_tiled, | ||||
|         &context, | ||||
|         tile_range_i * tile_range_j * tile_range_k * tile_range_l); | ||||
|   } | ||||
|  | ||||
| @ -5,49 +5,16 @@ | ||||
|  | ||||
| #include "ThreadPoolCommon.h" | ||||
|  | ||||
|  | ||||
| #include <stddef.h> // for size_t | ||||
|  | ||||
| typedef struct pthreadpool* pthreadpool_t; | ||||
|  | ||||
| typedef void (*pthreadpool_function_1d_t)(void*, size_t); | ||||
| typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t); | ||||
| typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t); | ||||
| typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t); | ||||
| typedef void (*pthreadpool_function_3d_tiled_t)( | ||||
|     void*, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t); | ||||
| typedef void (*pthreadpool_function_4d_tiled_t)( | ||||
|     void*, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t); | ||||
|  | ||||
| #include <stdint.h> // for uint32_t | ||||
|  | ||||
| typedef void (*pthreadpool_task_1d_t)(void*, size_t); | ||||
| typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t); | ||||
| typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t); | ||||
| typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t); | ||||
| typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t); | ||||
| typedef void (*pthreadpool_task_3d_tile_2d_t)( | ||||
|     void*, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t); | ||||
| typedef void (*pthreadpool_task_4d_tile_2d_t)( | ||||
| typedef struct pthreadpool* legacy_pthreadpool_t; | ||||
|  | ||||
| typedef void (*legacy_pthreadpool_function_1d_t)(void*, size_t); | ||||
| typedef void (*legacy_pthreadpool_function_1d_tiled_t)(void*, size_t, size_t); | ||||
| typedef void (*legacy_pthreadpool_function_2d_t)(void*, size_t, size_t); | ||||
| typedef void (*legacy_pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t); | ||||
| typedef void (*legacy_pthreadpool_function_3d_tiled_t)( | ||||
|     void*, | ||||
|     size_t, | ||||
|     size_t, | ||||
| @ -55,16 +22,7 @@ typedef void (*pthreadpool_task_4d_tile_2d_t)( | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t); | ||||
| typedef void (*pthreadpool_task_5d_tile_2d_t)( | ||||
|     void*, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t, | ||||
|     size_t); | ||||
| typedef void (*pthreadpool_task_6d_tile_2d_t)( | ||||
| typedef void (*legacy_pthreadpool_function_4d_tiled_t)( | ||||
|     void*, | ||||
|     size_t, | ||||
|     size_t, | ||||
| @ -90,8 +48,8 @@ extern "C" { | ||||
|  *    On error the function returns NULL and sets errno accordingly. | ||||
|  */ | ||||
|  | ||||
| //Returns internal threadpool impl. | ||||
| pthreadpool_t pthreadpool_create(size_t threads_count); | ||||
| // Returns internal threadpool impl. | ||||
| legacy_pthreadpool_t legacy_pthreadpool_create(size_t threads_count); | ||||
|  | ||||
| /** | ||||
|  * Queries the number of threads in a thread pool. | ||||
| @ -100,7 +58,7 @@ pthreadpool_t pthreadpool_create(size_t threads_count); | ||||
|  * | ||||
|  * @returns  The number of threads in the thread pool. | ||||
|  */ | ||||
| size_t pthreadpool_get_threads_count(pthreadpool_t threadpool); | ||||
| size_t legacy_pthreadpool_get_threads_count(legacy_pthreadpool_t threadpool); | ||||
|  | ||||
| /** | ||||
|  * Processes items in parallel using threads from a thread pool. | ||||
| @ -117,38 +75,45 @@ size_t pthreadpool_get_threads_count(pthreadpool_t threadpool); | ||||
|  * @param[in]  items       The number of items to process. The @a function | ||||
|  *    will be called once for each item. | ||||
|  */ | ||||
| void pthreadpool_compute_1d( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_function_1d_t function, | ||||
| void legacy_pthreadpool_compute_1d( | ||||
|     legacy_pthreadpool_t threadpool, | ||||
|     legacy_pthreadpool_function_1d_t function, | ||||
|     void* argument, | ||||
|     size_t range); | ||||
|  | ||||
| void pthreadpool_compute_1d_tiled( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_function_1d_tiled_t function, | ||||
| void legacy_pthreadpool_parallelize_1d( | ||||
|     legacy_pthreadpool_t threadpool, | ||||
|     legacy_pthreadpool_function_1d_t function, | ||||
|     void* argument, | ||||
|     size_t range, | ||||
|     uint32_t flags); | ||||
|  | ||||
| void legacy_pthreadpool_compute_1d_tiled( | ||||
|     legacy_pthreadpool_t threadpool, | ||||
|     legacy_pthreadpool_function_1d_tiled_t function, | ||||
|     void* argument, | ||||
|     size_t range, | ||||
|     size_t tile); | ||||
|  | ||||
| void pthreadpool_compute_2d( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_function_2d_t function, | ||||
| void legacy_pthreadpool_compute_2d( | ||||
|     legacy_pthreadpool_t threadpool, | ||||
|     legacy_pthreadpool_function_2d_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j); | ||||
|  | ||||
| void pthreadpool_compute_2d_tiled( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_function_2d_tiled_t function, | ||||
| void legacy_pthreadpool_compute_2d_tiled( | ||||
|     legacy_pthreadpool_t threadpool, | ||||
|     legacy_pthreadpool_function_2d_tiled_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j, | ||||
|     size_t tile_i, | ||||
|     size_t tile_j); | ||||
|  | ||||
| void pthreadpool_compute_3d_tiled( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_function_3d_tiled_t function, | ||||
| void legacy_pthreadpool_compute_3d_tiled( | ||||
|     legacy_pthreadpool_t threadpool, | ||||
|     legacy_pthreadpool_function_3d_tiled_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j, | ||||
| @ -157,9 +122,9 @@ void pthreadpool_compute_3d_tiled( | ||||
|     size_t tile_j, | ||||
|     size_t tile_k); | ||||
|  | ||||
| void pthreadpool_compute_4d_tiled( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_function_4d_tiled_t function, | ||||
| void legacy_pthreadpool_compute_4d_tiled( | ||||
|     legacy_pthreadpool_t threadpool, | ||||
|     legacy_pthreadpool_function_4d_tiled_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j, | ||||
| @ -178,129 +143,29 @@ void pthreadpool_compute_4d_tiled( | ||||
|  * | ||||
|  * @param[in,out]  threadpool  The thread pool to destroy. | ||||
|  */ | ||||
| void pthreadpool_destroy(pthreadpool_t threadpool); | ||||
| void legacy_pthreadpool_destroy(legacy_pthreadpool_t threadpool); | ||||
|  | ||||
| // New interface copy/pasted from pthreadpool. | ||||
| // We will merge the internal and third-party/pthreadpool eventually. | ||||
| // For now copy-paste to get past build issues. | ||||
| #ifdef USE_INTERNAL_PTHREADPOOL_IMPL | ||||
|  | ||||
| #define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001 | ||||
| #define pthreadpool_t legacy_pthreadpool_t | ||||
| #define pthreadpool_function_1d_t legacy_pthreadpool_function_1d_t | ||||
| #define pthreadpool_function_1d_tiled_t legacy_pthreadpool_function_1d_tiled_t | ||||
| #define pthreadpool_function_2d_t legacy_pthreadpool_function_2d_t | ||||
| #define pthreadpool_function_2d_tiled_t legacy_pthreadpool_function_2d_tiled_t | ||||
| #define pthreadpool_function_3d_tiled_t legacy_pthreadpool_function_3d_tiled_t | ||||
| #define pthreadpool_function_4d_tiled_t legacy_pthreadpool_function_4d_tiled_t | ||||
| #define pthreadpool_create legacy_pthreadpool_create | ||||
| #define pthreadpool_destroy legacy_pthreadpool_destroy | ||||
| #define pthreadpool_get_threads_count legacy_pthreadpool_get_threads_count | ||||
| #define pthreadpool_compute_1d legacy_pthreadpool_compute_1d | ||||
| #define pthreadpool_parallelize_1d legacy_pthreadpool_parallelize_1d | ||||
| #define pthreadpool_compute_1d_tiled legacy_pthreadpool_compute_1d_tiled | ||||
| #define pthreadpool_compute_2d legacy_pthreadpool_compute_2d | ||||
| #define pthreadpool_compute_2d_tiled legacy_pthreadpool_compute_2d_tiled | ||||
| #define pthreadpool_compute_3d_tiled legacy_pthreadpool_compute_3d_tiled | ||||
| #define pthreadpool_compute_4d_tiled legacy_pthreadpool_compute_4d_tiled | ||||
|  | ||||
| // Returns the copied threadpool impl of third-party/pthreadpool | ||||
| pthreadpool_t pthreadpool_create_xnnpack(size_t threads_count); | ||||
|  | ||||
| // Copied third-party impl. | ||||
| size_t pthreadpool_get_threads_count_xnnpack(pthreadpool_t threadpool); | ||||
|  | ||||
| // Copied third-party impl. | ||||
| void pthreadpool_destroy_xnnpack(pthreadpool_t threadpool); | ||||
|  | ||||
| /** | ||||
|  * Processes items in parallel using threads from a thread pool. | ||||
|  * | ||||
|  * When the call returns, all items have been processed and the thread pool is | ||||
|  * ready for a new task. | ||||
|  * | ||||
|  * @note If multiple threads call this function with the same thread pool, the | ||||
|  *    calls are serialized. | ||||
|  * | ||||
|  * @param[in]  threadpool  The thread pool to use for parallelisation. | ||||
|  * @param[in]  function    The function to call for each item. | ||||
|  * @param[in]  argument    The first argument passed to the @a function. | ||||
|  * @param[in]  items       The number of items to process. The @a function | ||||
|  *    will be called once for each item. | ||||
|  */ | ||||
| void pthreadpool_parallelize_1d( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_task_1d_t function, | ||||
|     void* argument, | ||||
|     size_t range, | ||||
|     uint32_t flags); | ||||
|  | ||||
| void pthreadpool_parallelize_1d_tile_1d( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_task_1d_tile_1d_t function, | ||||
|     void* argument, | ||||
|     size_t range, | ||||
|     size_t tile, | ||||
|     uint32_t flags); | ||||
|  | ||||
| void pthreadpool_parallelize_2d( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_task_2d_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j, | ||||
|     uint32_t flags); | ||||
|  | ||||
| void pthreadpool_parallelize_2d_tile_1d( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_task_2d_tile_1d_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j, | ||||
|     size_t tile_j, | ||||
|     uint32_t flags); | ||||
|  | ||||
| void pthreadpool_parallelize_2d_tile_2d( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_task_2d_tile_2d_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j, | ||||
|     size_t tile_i, | ||||
|     size_t tile_j, | ||||
|     uint32_t flags); | ||||
|  | ||||
| void pthreadpool_parallelize_3d_tile_2d( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_task_3d_tile_2d_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j, | ||||
|     size_t range_k, | ||||
|     size_t tile_j, | ||||
|     size_t tile_k, | ||||
|     uint32_t flags); | ||||
|  | ||||
| void pthreadpool_parallelize_4d_tile_2d( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_task_4d_tile_2d_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j, | ||||
|     size_t range_k, | ||||
|     size_t range_l, | ||||
|     size_t tile_k, | ||||
|     size_t tile_l, | ||||
|     uint32_t flags); | ||||
|  | ||||
| void pthreadpool_parallelize_5d_tile_2d( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_task_5d_tile_2d_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j, | ||||
|     size_t range_k, | ||||
|     size_t range_l, | ||||
|     size_t range_m, | ||||
|     size_t tile_l, | ||||
|     size_t tile_m, | ||||
|     uint32_t flags); | ||||
|  | ||||
| void pthreadpool_parallelize_6d_tile_2d( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_task_6d_tile_2d_t function, | ||||
|     void* argument, | ||||
|     size_t range_i, | ||||
|     size_t range_j, | ||||
|     size_t range_k, | ||||
|     size_t range_l, | ||||
|     size_t range_m, | ||||
|     size_t range_n, | ||||
|     size_t tile_m, | ||||
|     size_t tile_n, | ||||
|     uint32_t flags); | ||||
| #endif /* USE_INTERNAL_PTHREADPOOL_IMPL */ | ||||
|  | ||||
| #ifdef __cplusplus | ||||
| } /* extern "C" */ | ||||
|  | ||||
| @ -6,9 +6,9 @@ | ||||
| // External API | ||||
| // | ||||
|  | ||||
| void pthreadpool_compute_1d( | ||||
|     pthreadpool_t threadpool, | ||||
|     pthreadpool_function_1d_t function, | ||||
| void legacy_pthreadpool_compute_1d( | ||||
|     legacy_pthreadpool_t threadpool, | ||||
|     legacy_pthreadpool_function_1d_t function, | ||||
|     void* argument, | ||||
|     size_t range) { | ||||
|   if (threadpool == nullptr) { | ||||
| @ -27,30 +27,31 @@ void pthreadpool_compute_1d( | ||||
|           range); | ||||
| } | ||||
|  | ||||
| size_t pthreadpool_get_threads_count(pthreadpool_t threadpool) { | ||||
|   // The current fix only useful when XNNPACK calls pthreadpool_get_threads_count with nullptr. | ||||
| void legacy_pthreadpool_parallelize_1d( | ||||
|     const legacy_pthreadpool_t threadpool, | ||||
|     const legacy_pthreadpool_function_1d_t function, | ||||
|     void* const argument, | ||||
|     const size_t range, | ||||
|     uint32_t) { | ||||
|   legacy_pthreadpool_compute_1d(threadpool, function, argument, range); | ||||
| } | ||||
|  | ||||
| size_t legacy_pthreadpool_get_threads_count(legacy_pthreadpool_t threadpool) { | ||||
|   // The current fix only useful when XNNPACK calls legacy_pthreadpool_get_threads_count with nullptr. | ||||
|   if (threadpool == nullptr) { | ||||
|     return 1; | ||||
|   } | ||||
|   return reinterpret_cast<caffe2::ThreadPool*>(threadpool)->getNumThreads(); | ||||
|   // TODO: Future fix: If we keep maintaining two different threadpools. | ||||
|   // Old C2 and new one for XNNPACK, then the we have two different pthreadpool pointer | ||||
|   // types. One is caffe2::Thredpool*, the other is pthreadpool* (pthreadpool_new_if_impl.c) | ||||
|   // XNNPACK calls pthreadpool_get_threads_count during op setup using pthreadpool*, and | ||||
|   // uses _parallelize_ interface for for actual work. | ||||
|   // While NNPACK uses caffe2::Threadpool*. | ||||
|   // Thus if pthreadpool_get_threads_count is getting called from XNNPACK we cannot | ||||
|   // reinterpret_cast it to ThreadPool. It will seg fault or worse will have unedfined behavior. | ||||
| } | ||||
|  | ||||
| pthreadpool_t pthreadpool_create(size_t threads_count) { | ||||
| legacy_pthreadpool_t legacy_pthreadpool_create(size_t threads_count) { | ||||
|   std::mutex thread_pool_creation_mutex_; | ||||
|   std::lock_guard<std::mutex> guard(thread_pool_creation_mutex_); | ||||
|  | ||||
|   return reinterpret_cast<pthreadpool_t>(new caffe2::ThreadPool(threads_count)); | ||||
|   return reinterpret_cast<legacy_pthreadpool_t>(new caffe2::ThreadPool(threads_count)); | ||||
| } | ||||
|  | ||||
| void pthreadpool_destroy(pthreadpool_t pthreadpool) { | ||||
| void legacy_pthreadpool_destroy(legacy_pthreadpool_t pthreadpool) { | ||||
|   if (pthreadpool) { | ||||
|     caffe2::ThreadPool* threadpool = | ||||
|         reinterpret_cast<caffe2::ThreadPool*>(pthreadpool); | ||||
|  | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -1,62 +0,0 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include <stdint.h> | ||||
|  | ||||
| #if defined(__SSE__) || defined(__x86_64__) | ||||
| #include <xmmintrin.h> | ||||
| #endif | ||||
|  | ||||
| struct fpu_state { | ||||
| #if defined(__SSE__) || defined(__x86_64__) | ||||
|   uint32_t mxcsr; | ||||
| #elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) | ||||
|   uint32_t fpscr; | ||||
| #elif defined(__aarch64__) | ||||
|   uint64_t fpcr; | ||||
| #else | ||||
|   char unused; | ||||
| #endif | ||||
| }; | ||||
|  | ||||
| static inline struct fpu_state get_fpu_state() { | ||||
|   struct fpu_state state = { 0 }; | ||||
| #if defined(__SSE__) || defined(__x86_64__) | ||||
|   state.mxcsr = (uint32_t) _mm_getcsr(); | ||||
| #elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) | ||||
|   __asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r" (state.fpscr)); | ||||
| #elif defined(__aarch64__) | ||||
|   __asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (state.fpcr)); | ||||
| #endif | ||||
|   return state; | ||||
| } | ||||
|  | ||||
| static inline void set_fpu_state(const struct fpu_state state) { | ||||
| #if defined(__SSE__) || defined(__x86_64__) | ||||
|   _mm_setcsr((unsigned int) state.mxcsr); | ||||
| #elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) | ||||
|   __asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r" (state.fpscr)); | ||||
| #elif defined(__aarch64__) | ||||
|   __asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r" (state.fpcr)); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static inline void disable_fpu_denormals() { | ||||
| #if defined(__SSE__) || defined(__x86_64__) | ||||
|   _mm_setcsr(_mm_getcsr() | 0x8040); | ||||
| #elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) | ||||
|   uint32_t fpscr; | ||||
|   __asm__ __volatile__( | ||||
|       "VMRS %[fpscr], fpscr\n" | ||||
|       "ORR %[fpscr], #0x1000000\n" | ||||
|       "VMSR fpscr, %[fpscr]\n" | ||||
|     : [fpscr] "=r" (fpscr)); | ||||
| #elif defined(__aarch64__) | ||||
|   uint64_t fpcr; | ||||
|   __asm__ __volatile__( | ||||
|       "MRS %[fpcr], fpcr\n" | ||||
|       "ORR %w[fpcr], %w[fpcr], 0x1000000\n" | ||||
|       "ORR %w[fpcr], %w[fpcr], 0x80000\n" | ||||
|       "MSR fpcr, %[fpcr]\n" | ||||
|     : [fpcr] "=r" (fpcr)); | ||||
| #endif | ||||
| } | ||||
| @ -239,10 +239,10 @@ if(USE_NNPACK OR USE_QNNPACK OR USE_PYTORCH_QNNPACK OR USE_XNNPACK) | ||||
|   endif() | ||||
|  | ||||
|   if(DISABLE_NNPACK_AND_FAMILY) | ||||
|     set(USE_NNPACK OFF) | ||||
|     set(USE_QNNPACK OFF) | ||||
|     set(USE_PYTORCH_QNNPACK OFF) | ||||
|     set(USE_XNNPACK OFF) | ||||
|     caffe2_update_option(USE_NNPACK OFF) | ||||
|     caffe2_update_option(USE_QNNPACK OFF) | ||||
|     caffe2_update_option(USE_PYTORCH_QNNPACK OFF) | ||||
|     caffe2_update_option(USE_XNNPACK OFF) | ||||
|   else() | ||||
|     set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") | ||||
|  | ||||
| @ -261,11 +261,9 @@ if(USE_NNPACK OR USE_QNNPACK OR USE_PYTORCH_QNNPACK OR USE_XNNPACK) | ||||
|     if(NOT DEFINED PTHREADPOOL_SOURCE_DIR) | ||||
|       set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory") | ||||
|     endif() | ||||
|  | ||||
|     set(CPUINFO_LIBRARY_TYPE "static" CACHE STRING "") | ||||
|     set(CPUINFO_LOG_LEVEL "error" CACHE STRING "") | ||||
|     set(PTHREADPOOL_LIBRARY_TYPE "static" CACHE STRING "") | ||||
|   endif() | ||||
| else() | ||||
|   set(DISABLE_NNPACK_AND_FAMILY ON) | ||||
| endif() | ||||
|  | ||||
| set(CONFU_DEPENDENCIES_SOURCE_DIR ${PROJECT_BINARY_DIR}/confu-srcs | ||||
| @ -281,45 +279,48 @@ if(INTERN_BUILD_MOBILE AND INTERN_USE_EIGEN_BLAS) | ||||
| endif() | ||||
|  | ||||
| # ---[ pthreadpool | ||||
| # QNNPACK and NNPACK both depend on pthreadpool, but when building with libtorch | ||||
| # they should use the pthreadpool implementation under caffe2/utils/threadpool | ||||
| # instead of the default implementation. To avoid confusion, add pthreadpool | ||||
| # subdirectory explicitly with EXCLUDE_FROM_ALL property prior to QNNPACK/NNPACK | ||||
| # does so, which will prevent it from installing the default pthreadpool library. | ||||
| if(INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE AND (USE_QNNPACK OR USE_NNPACK OR USE_XNNPACK)) | ||||
|   if(NOT DEFINED PTHREADPOOL_SOURCE_DIR) | ||||
|     set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") | ||||
|     set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory") | ||||
|   endif() | ||||
| # Only add a dependency on pthreadpool if we are on a mobile build | ||||
| # or are building any of the libraries in the {Q/X}NNPACK family. | ||||
| if(INTERN_BUILD_MOBILE OR NOT DISABLE_NNPACK_AND_FAMILY) | ||||
|   set(USE_PTHREADPOOL ON CACHE BOOL "" FORCE) | ||||
|   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_PTHREADPOOL") | ||||
|  | ||||
|   # Always use third_party/pthreadpool. | ||||
|   set(USE_INTERNAL_PTHREADPOOL_IMPL OFF CACHE BOOL "" FORCE) | ||||
|  | ||||
|   if(NOT TARGET pthreadpool) | ||||
|     set(PTHREADPOOL_BUILD_TESTS OFF CACHE BOOL "") | ||||
|     set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE BOOL "") | ||||
|     add_subdirectory( | ||||
|       "${PTHREADPOOL_SOURCE_DIR}" | ||||
|       "${CONFU_DEPENDENCIES_BINARY_DIR}/pthreadpool" | ||||
|       EXCLUDE_FROM_ALL) | ||||
|   endif() | ||||
| endif() | ||||
|     if(USE_SYSTEM_PTHREADPOOL) | ||||
|       add_library(pthreadpool SHARED IMPORTED) | ||||
|       find_library(PTHREADPOOL_LIBRARY pthreadpool) | ||||
|       set_property(TARGET pthreadpool PROPERTY IMPORTED_LOCATION "${PTHREADPOOL_LIBRARY}") | ||||
|       if(NOT PTHREADPOOL_LIBRARY) | ||||
|         message(FATAL_ERROR "Cannot find pthreadpool") | ||||
|       endif() | ||||
|       message("-- Found pthreadpool: ${PTHREADPOOL_LIBRARY}") | ||||
|     elseif(NOT USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|       if(NOT DEFINED PTHREADPOOL_SOURCE_DIR) | ||||
|         set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") | ||||
|         set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory") | ||||
|       endif() | ||||
|  | ||||
| # XNNPACK has not option of like QNNPACK_CUSTOM_THREADPOOL | ||||
| # that allows us to hijack pthreadpool interface. | ||||
| # Thus not doing this ends up building pthreadpool as well as | ||||
| # the internal implemenation of pthreadpool which results in symbol conflicts. | ||||
| if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK) | ||||
|   if(NOT DEFINED PTHREADPOOL_SOURCE_DIR) | ||||
|     set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") | ||||
|     set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory") | ||||
|   endif() | ||||
|       set(PTHREADPOOL_BUILD_TESTS OFF CACHE BOOL "") | ||||
|       set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE BOOL "") | ||||
|       set(PTHREADPOOL_LIBRARY_TYPE "static" CACHE STRING "") | ||||
|       set(PTHREADPOOL_ALLOW_DEPRECATED_API ON CACHE BOOL "") | ||||
|       add_subdirectory( | ||||
|         "${PTHREADPOOL_SOURCE_DIR}" | ||||
|         "${CONFU_DEPENDENCIES_BINARY_DIR}/pthreadpool") | ||||
|       set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
|     endif() | ||||
|  | ||||
|   if(NOT TARGET pthreadpool) | ||||
|     set(PTHREADPOOL_BUILD_TESTS OFF CACHE BOOL "") | ||||
|     set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE BOOL "") | ||||
|     add_subdirectory( | ||||
|       "${PTHREADPOOL_SOURCE_DIR}" | ||||
|       "${CONFU_DEPENDENCIES_BINARY_DIR}/pthreadpool" | ||||
|       EXCLUDE_FROM_ALL) | ||||
|     if(USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_INTERNAL_PTHREADPOOL_IMPL") | ||||
|     else() | ||||
|       list(APPEND Caffe2_DEPENDENCY_LIBS pthreadpool) | ||||
|     endif() | ||||
|   endif() | ||||
| else() | ||||
|   set(USE_PTHREADPOOL OFF CACHE BOOL "" FORCE) | ||||
| endif() | ||||
|  | ||||
| # ---[ Caffe2 uses cpuinfo library in the thread pool | ||||
| @ -369,9 +370,12 @@ if(USE_QNNPACK) | ||||
|   endif() | ||||
|  | ||||
|   if(NOT TARGET qnnpack) | ||||
|     if(NOT USE_SYSTEM_PTHREADPOOL AND USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|       set(QNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "") | ||||
|     endif() | ||||
|  | ||||
|     set(QNNPACK_BUILD_TESTS OFF CACHE BOOL "") | ||||
|     set(QNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "") | ||||
|     set(QNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "") | ||||
|     set(QNNPACK_LIBRARY_TYPE "static" CACHE STRING "") | ||||
|     add_subdirectory( | ||||
|       "${QNNPACK_SOURCE_DIR}" | ||||
| @ -379,8 +383,29 @@ if(USE_QNNPACK) | ||||
|     # We build static versions of QNNPACK and pthreadpool but link | ||||
|     # them into a shared library for Caffe2, so they need PIC. | ||||
|     set_property(TARGET qnnpack PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
|     set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
|     set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
|  | ||||
|     if(QNNPACK_CUSTOM_THREADPOOL) | ||||
|       target_compile_definitions( | ||||
|         qnnpack PRIVATE | ||||
|         pthreadpool_t=legacy_pthreadpool_t | ||||
|         pthreadpool_function_1d_t=legacy_pthreadpool_function_1d_t | ||||
|         pthreadpool_function_1d_tiled_t=legacy_pthreadpool_function_1d_tiled_t | ||||
|         pthreadpool_function_2d_t=legacy_pthreadpool_function_2d_t | ||||
|         pthreadpool_function_2d_tiled_t=legacy_pthreadpool_function_2d_tiled_t | ||||
|         pthreadpool_function_3d_tiled_t=legacy_pthreadpool_function_3d_tiled_t | ||||
|         pthreadpool_function_4d_tiled_t=legacy_pthreadpool_function_4d_tiled_t | ||||
|         pthreadpool_create=legacy_pthreadpool_create | ||||
|         pthreadpool_destroy=legacy_pthreadpool_destroy | ||||
|         pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count | ||||
|         pthreadpool_compute_1d=legacy_pthreadpool_compute_1d | ||||
|         pthreadpool_parallelize_1d=legacy_pthreadpool_parallelize_1d | ||||
|         pthreadpool_compute_1d_tiled=legacy_pthreadpool_compute_1d_tiled | ||||
|         pthreadpool_compute_2d=legacy_pthreadpool_compute_2d | ||||
|         pthreadpool_compute_2d_tiled=legacy_pthreadpool_compute_2d_tiled | ||||
|         pthreadpool_compute_3d_tiled=legacy_pthreadpool_compute_3d_tiled | ||||
|         pthreadpool_compute_4d_tiled=legacy_pthreadpool_compute_4d_tiled) | ||||
|     endif() | ||||
|   endif() | ||||
|  | ||||
|   list(APPEND Caffe2_DEPENDENCY_LIBS qnnpack) | ||||
| @ -400,9 +425,12 @@ if(USE_PYTORCH_QNNPACK) | ||||
|     endif() | ||||
|  | ||||
|     if(NOT TARGET pytorch_qnnpack) | ||||
|       if(NOT USE_SYSTEM_PTHREADPOOL AND USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|         set(PYTORCH_QNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "") | ||||
|       endif() | ||||
|  | ||||
|       set(PYTORCH_QNNPACK_BUILD_TESTS OFF CACHE BOOL "") | ||||
|       set(PYTORCH_QNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "") | ||||
|       set(PYTORCH_QNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "") | ||||
|       set(PYTORCH_QNNPACK_LIBRARY_TYPE "static" CACHE STRING "") | ||||
|       add_subdirectory( | ||||
|         "${PYTORCH_QNNPACK_SOURCE_DIR}" | ||||
| @ -410,10 +438,29 @@ if(USE_PYTORCH_QNNPACK) | ||||
|       # We build static versions of QNNPACK and pthreadpool but link | ||||
|       # them into a shared library for Caffe2, so they need PIC. | ||||
|       set_property(TARGET pytorch_qnnpack PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
|       if(NOT USE_SYSTEM_PTHREADPOOL) | ||||
|         set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
|       endif() | ||||
|       set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
|  | ||||
|       if(PYTORCH_QNNPACK_CUSTOM_THREADPOOL) | ||||
|         target_compile_definitions( | ||||
|           pytorch_qnnpack PRIVATE | ||||
|           pthreadpool_t=legacy_pthreadpool_t | ||||
|           pthreadpool_function_1d_t=legacy_pthreadpool_function_1d_t | ||||
|           pthreadpool_function_1d_tiled_t=legacy_pthreadpool_function_1d_tiled_t | ||||
|           pthreadpool_function_2d_t=legacy_pthreadpool_function_2d_t | ||||
|           pthreadpool_function_2d_tiled_t=legacy_pthreadpool_function_2d_tiled_t | ||||
|           pthreadpool_function_3d_tiled_t=legacy_pthreadpool_function_3d_tiled_t | ||||
|           pthreadpool_function_4d_tiled_t=legacy_pthreadpool_function_4d_tiled_t | ||||
|           pthreadpool_create=legacy_pthreadpool_create | ||||
|           pthreadpool_destroy=legacy_pthreadpool_destroy | ||||
|           pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count | ||||
|           pthreadpool_compute_1d=legacy_pthreadpool_compute_1d | ||||
|           pthreadpool_parallelize_1d=legacy_pthreadpool_parallelize_1d | ||||
|           pthreadpool_compute_1d_tiled=legacy_pthreadpool_compute_1d_tiled | ||||
|           pthreadpool_compute_2d=legacy_pthreadpool_compute_2d | ||||
|           pthreadpool_compute_2d_tiled=legacy_pthreadpool_compute_2d_tiled | ||||
|           pthreadpool_compute_3d_tiled=legacy_pthreadpool_compute_3d_tiled | ||||
|           pthreadpool_compute_4d_tiled=legacy_pthreadpool_compute_4d_tiled) | ||||
|       endif() | ||||
|     endif() | ||||
|  | ||||
|     list(APPEND Caffe2_DEPENDENCY_LIBS pytorch_qnnpack) | ||||
| @ -447,7 +494,6 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK) | ||||
|   endif() | ||||
|  | ||||
|   if(NOT TARGET XNNPACK) | ||||
|     set(XNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "") | ||||
|     set(XNNPACK_LIBRARY_TYPE "static" CACHE STRING "") | ||||
|     set(XNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "") | ||||
|     set(XNNPACK_BUILD_TESTS OFF CACHE BOOL "") | ||||
| @ -457,15 +503,6 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK) | ||||
|       "${CONFU_DEPENDENCIES_BINARY_DIR}/XNNPACK") | ||||
|  | ||||
|     set_property(TARGET XNNPACK PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
|     # Context: pthreadpool_get_threads_count implementation that is built in pytorch, uses | ||||
|     # implementation defined in caffe2/utils/threadpool/pthreadpool_impl.cc. This implementation | ||||
|     # assumes the the pthreadpool* passed is of type caffe2::ThradPool and thus does reinterpret cast. | ||||
|     # This is not valid when we create pthreadpool via caffe2::xnnpack_threadpool, which is of type | ||||
|     # compatible with new pthreadpool interface and is used in PT's XNNPACK integration. | ||||
|     # Thus all the calls for pthreadpool_get_threads_count originating from XNNPACK must be routed | ||||
|     # appropriately to pthreadpool_get_threads_count_xnnpack, which does not do the aforementioned | ||||
|     # casting to caffe2::ThradPool. Once the threadpools are unified, we will not need this. | ||||
|     target_compile_definitions(XNNPACK PRIVATE -Dpthreadpool_get_threads_count=pthreadpool_get_threads_count_xnnpack) | ||||
|   endif() | ||||
|  | ||||
|   include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR}) | ||||
|  | ||||
							
								
								
									
										27
									
								
								cmake/External/nnpack.cmake
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										27
									
								
								cmake/External/nnpack.cmake
									
									
									
									
										vendored
									
									
								
							| @ -59,9 +59,12 @@ if(ANDROID OR IOS OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NAM | ||||
|   set(GOOGLETEST_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/googletest" CACHE STRING "Google Test source directory") | ||||
|  | ||||
|   if(NOT TARGET nnpack) | ||||
|     if(NOT USE_SYSTEM_PTHREADPOOL AND USE_INTERNAL_PTHREADPOOL_IMPL) | ||||
|       set(NNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "") | ||||
|     endif() | ||||
|  | ||||
|     set(NNPACK_BUILD_TESTS OFF CACHE BOOL "") | ||||
|     set(NNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "") | ||||
|     set(NNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "") | ||||
|     set(NNPACK_LIBRARY_TYPE "static" CACHE STRING "") | ||||
|     set(PTHREADPOOL_LIBRARY_TYPE "static" CACHE STRING "") | ||||
|     set(CPUINFO_LIBRARY_TYPE "static" CACHE STRING "") | ||||
| @ -73,6 +76,28 @@ if(ANDROID OR IOS OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NAM | ||||
|     set_property(TARGET nnpack PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
|     set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
|     set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
|  | ||||
|     if(NNPACK_CUSTOM_THREADPOOL) | ||||
|       target_compile_definitions( | ||||
|         nnpack PRIVATE | ||||
|         pthreadpool_t=legacy_pthreadpool_t | ||||
|         pthreadpool_function_1d_t=legacy_pthreadpool_function_1d_t | ||||
|         pthreadpool_function_1d_tiled_t=legacy_pthreadpool_function_1d_tiled_t | ||||
|         pthreadpool_function_2d_t=legacy_pthreadpool_function_2d_t | ||||
|         pthreadpool_function_2d_tiled_t=legacy_pthreadpool_function_2d_tiled_t | ||||
|         pthreadpool_function_3d_tiled_t=legacy_pthreadpool_function_3d_tiled_t | ||||
|         pthreadpool_function_4d_tiled_t=legacy_pthreadpool_function_4d_tiled_t | ||||
|         pthreadpool_create=legacy_pthreadpool_create | ||||
|         pthreadpool_destroy=legacy_pthreadpool_destroy | ||||
|         pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count | ||||
|         pthreadpool_compute_1d=legacy_pthreadpool_compute_1d | ||||
|         pthreadpool_parallelize_1d=legacy_pthreadpool_parallelize_1d | ||||
|         pthreadpool_compute_1d_tiled=legacy_pthreadpool_compute_1d_tiled | ||||
|         pthreadpool_compute_2d=legacy_pthreadpool_compute_2d | ||||
|         pthreadpool_compute_2d_tiled=legacy_pthreadpool_compute_2d_tiled | ||||
|         pthreadpool_compute_3d_tiled=legacy_pthreadpool_compute_3d_tiled | ||||
|         pthreadpool_compute_4d_tiled=legacy_pthreadpool_compute_4d_tiled) | ||||
|     endif() | ||||
|   endif() | ||||
|  | ||||
|   set(NNPACK_FOUND TRUE) | ||||
|  | ||||
| @ -69,6 +69,11 @@ if(NOT @BUILD_SHARED_LIBS@) | ||||
|     list(APPEND TORCH_LIBRARIES ${XNNPACK_LIBRARY}) | ||||
|   endif() | ||||
|  | ||||
|   if(NOT @USE_INTERNAL_PTHREADPOOL_IMPL@) | ||||
|     find_library(PTHREADPOOL_LIBRARY pthreadpool PATHS "${TORCH_INSTALL_PREFIX}/lib") | ||||
|     list(APPEND TORCH_LIBRARIES ${PTHREADPOOL_LIBRARY}) | ||||
|   endif() | ||||
|  | ||||
|   if(@INTERN_USE_EIGEN_BLAS@) | ||||
|     find_library(EIGEN_BLAS_LIBRARY eigen_blas PATHS "${TORCH_INSTALL_PREFIX}/lib") | ||||
|     list(APPEND TORCH_LIBRARIES ${EIGEN_BLAS_LIBRARY}) | ||||
|  | ||||
| @ -404,15 +404,15 @@ if((CUDA_VERSION VERSION_EQUAL   9.0) OR | ||||
|   endif() | ||||
| elseif(CUDA_VERSION VERSION_EQUAL   9.2) | ||||
|   if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND | ||||
|       NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.13 AND | ||||
|       NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.14 AND | ||||
|       NOT DEFINED ENV{CUDAHOSTCXX}) | ||||
|     message(FATAL_ERROR | ||||
|       "CUDA ${CUDA_VERSION} is not compatible with MSVC toolchain version " | ||||
|       ">= 19.13. (a.k.a Visual Studio 2017 Update 7, VS 15.7) " | ||||
|       ">= 19.14. (a.k.a Visual Studio 2017 Update 7, VS 15.7) " | ||||
|       "Please upgrade to CUDA >= 10.0 or set the following environment " | ||||
|       "variable to use another version (for example): \n" | ||||
|       "  set \"CUDAHOSTCXX=C:\\Program Files (x86)\\Microsoft Visual Studio" | ||||
|       "\\2017\\Enterprise\\VC\\Tools\\MSVC\\14.12.25827\\bin\\HostX64\\x64\\cl.exe\"\n") | ||||
|       "\\2017\\Enterprise\\VC\\Tools\\MSVC\\14.13.26132\\bin\\HostX64\\x64\\cl.exe\"\n") | ||||
|   endif() | ||||
| elseif(CUDA_VERSION VERSION_EQUAL   10.0) | ||||
|   if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND | ||||
|  | ||||
| @ -186,6 +186,7 @@ autocast casts all inputs to ``float32`` and runs the op in ``float32``. | ||||
| ``cross``, | ||||
| ``dot``, | ||||
| ``equal``, | ||||
| ``index_put``, | ||||
| ``stack``, | ||||
| ``tensordot`` | ||||
|  | ||||
|  | ||||
| @ -17,7 +17,7 @@ Functional higher level API | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| .. warning:: | ||||
|     This API is experimental. Even though the function signatures are very unlikely to change, major | ||||
|     This API is in beta. Even though the function signatures are very unlikely to change, major | ||||
|     improvements to performances are planned before we consider this stable. | ||||
|  | ||||
| This section contains the higher level API for the autograd that builds on the basic API above | ||||
|  | ||||
| @ -4,6 +4,10 @@ | ||||
| Distributed communication package - torch.distributed | ||||
| ===================================================== | ||||
|  | ||||
| .. note :: | ||||
|     Please refer to `PyTorch Distributed Overview <https://pytorch.org/tutorials/beginner/dist_overview.html>`__ | ||||
|     for a brief introduction to all features related to distributed training. | ||||
|  | ||||
| .. automodule:: torch.distributed | ||||
| .. currentmodule:: torch.distributed | ||||
|  | ||||
|  | ||||
| @ -46,8 +46,11 @@ Creating TorchScript Code | ||||
|     script | ||||
|     trace | ||||
|     trace_module | ||||
|     fork | ||||
|     wait | ||||
|     ScriptModule | ||||
|     ScriptFunction | ||||
|     freeze | ||||
|     save | ||||
|     load | ||||
|     ignore | ||||
|  | ||||
| @ -316,4 +316,4 @@ operators, see :ref:`name_inference_reference-doc`. | ||||
|           (('N', 'features'), torch.Size([32, 49152])) | ||||
|  | ||||
|       .. warning:: | ||||
|           The named tensor API is experimental and subject to change. | ||||
|           The named tensor API is a prototype feature and subject to change. | ||||
|  | ||||
| @ -11,7 +11,7 @@ programs, and can aid you in debugging. | ||||
| .. _excluding-subgraphs: | ||||
|  | ||||
| Excluding subgraphs from backward | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| --------------------------------- | ||||
|  | ||||
| Every Tensor has a flag: :attr:`requires_grad` that allows for fine grained | ||||
| exclusion of subgraphs from gradient computation and can increase efficiency. | ||||
| @ -19,7 +19,7 @@ exclusion of subgraphs from gradient computation and can increase efficiency. | ||||
| .. _excluding-requires_grad: | ||||
|  | ||||
| ``requires_grad`` | ||||
| ~~~~~~~~~~~~~~~~~ | ||||
| ^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| If there's a single input to an operation that requires gradient, its output | ||||
| will also require gradient. Conversely, only if all inputs don't require | ||||
| @ -61,7 +61,7 @@ will also require them. | ||||
| .. _how-autograd-encodes-history: | ||||
|  | ||||
| How autograd encodes the history | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| -------------------------------- | ||||
|  | ||||
| Autograd is reverse automatic differentiation system.  Conceptually, | ||||
| autograd records a graph recording all of the operations that created | ||||
| @ -87,7 +87,7 @@ every iteration. You don't have to encode all possible paths before you | ||||
| launch the training - what you run is what you differentiate. | ||||
|  | ||||
| In-place operations with autograd | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| --------------------------------- | ||||
|  | ||||
| Supporting in-place operations in autograd is a hard matter, and we discourage | ||||
| their use in most cases. Autograd's aggressive buffer freeing and reuse makes | ||||
| @ -121,7 +121,8 @@ functions and not seeing any errors, you can be sure that the computed | ||||
| gradients are correct. | ||||
|  | ||||
| Multithreaded Autograd | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
| ---------------------- | ||||
|  | ||||
| The autograd engine is responsible for running all the backward operations | ||||
| necessary to compute the backward pass. This section will describe all the details | ||||
| that can help you make the best use of it in a multithreaded environment.(this is | ||||
| @ -156,7 +157,7 @@ does not block on the concurrent backward computations, example code could be: | ||||
| Note that some behaviors that user should be aware of: | ||||
|  | ||||
| Concurrency on CPU | ||||
| ------------------ | ||||
| ^^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| When you run ``backward()`` or ``grad()`` via python or C++ API in multiple | ||||
| threads on CPU, you are expecting to see extra concurrency instead of | ||||
| @ -164,7 +165,7 @@ serializing all the backward calls in a specific order during execution | ||||
| (behavior before PyTorch 1.6). | ||||
|  | ||||
| Non-determinism | ||||
| ------------------ | ||||
| ^^^^^^^^^^^^^^^ | ||||
|  | ||||
| If you are calling ``backward()`` on multiple thread concurrently but with | ||||
| shared inputs (i.e. Hogwild CPU training). Since parameters are automatically | ||||
| @ -180,7 +181,7 @@ to happen. User could use the functional API :func:`torch.autograd.grad` to | ||||
| calculate the gradients instead of ``backward()`` to avoid non-determinism. | ||||
|  | ||||
| Graph retaining | ||||
| ------------------ | ||||
| ^^^^^^^^^^^^^^^ | ||||
|  | ||||
| If part of the autograd graph is shared between threads, i.e. run first | ||||
| part of forward single thread, then run second part in multiple threads, | ||||
| @ -192,7 +193,7 @@ crash in this case. Autograd will error out to the user similar to what call | ||||
| they should use ``retain_graph=True``. | ||||
|  | ||||
| Thread Safety on Autograd Node | ||||
| ------------------------------ | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| Since Autograd allows the caller thread to drive its backward execution for | ||||
| potential parallelism, it's important that we ensure thread safety on CPU with | ||||
| @ -204,7 +205,7 @@ for built-in C++ Autograd Nodes(e.g. AccumulateGrad, CopySlices) and custom | ||||
| thread safety on autograd Nodes that might have state write/read. | ||||
|  | ||||
| No thread safety on C++ hooks | ||||
| ------------------------------ | ||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| Autograd relies on the user to write thread safe C++ hooks. If you want the hook | ||||
| to be correctly applied in multithreading environment, you will need to write | ||||
|  | ||||
| @ -3,6 +3,9 @@ | ||||
| Quantization | ||||
| ============ | ||||
|  | ||||
| .. warning :: | ||||
|      Quantization is in beta and subject to change. | ||||
|  | ||||
| Introduction to Quantization | ||||
| ---------------------------- | ||||
|  | ||||
|  | ||||
| @ -1,8 +1,3 @@ | ||||
| :orphan: | ||||
|  | ||||
| .. contents:: :local: | ||||
|     :depth: 2 | ||||
|  | ||||
| .. _distributed-rpc-framework: | ||||
|  | ||||
| Distributed RPC Framework | ||||
| @ -17,6 +12,9 @@ machines. | ||||
|      APIs in the RPC package are stable. There are multiple ongoing work items | ||||
|      to improve performance and error handling, which will ship in future releases. | ||||
|  | ||||
| .. note :: | ||||
|     Please refer to `PyTorch Distributed Overview <https://pytorch.org/tutorials/beginner/dist_overview.html>`__ | ||||
|     for a brief introduction to all features related to distributed training. | ||||
|  | ||||
| Basics | ||||
| ------ | ||||
| @ -97,7 +95,7 @@ applications can always explicitly move the input tensors to CPU on the caller | ||||
| and move it to the desired devices on the callee if necessary. | ||||
|  | ||||
| .. warning:: | ||||
|   TorchScript support in RPC is experimental and subject to change. Since | ||||
|   TorchScript support in RPC is a prototype feature and subject to change. Since | ||||
|   v1.5.0, ``torch.distributed.rpc`` supports calling TorchScript functions as | ||||
|   RPC target functions, and this will help improve parallelism on the callee | ||||
|   side as executing TorchScript functions does not require GIL. | ||||
| @ -115,7 +113,7 @@ The RPC package also provides decorators which allow applications to specify | ||||
| how a given function should be treated on the callee side. | ||||
|  | ||||
| .. warning:: | ||||
|   The ``rpc.functions`` package is experimental and subject to change. | ||||
|   The ``rpc.functions`` package is a prototype feature and subject to change. | ||||
|  | ||||
| .. autofunction:: torch.distributed.rpc.functions.async_execution | ||||
|  | ||||
| @ -204,12 +202,8 @@ The TensorPipe backend has been introduced in PyTorch v1.6 and is being actively | ||||
| developed. At the moment, it only supports CPU tensors, with GPU support coming | ||||
| soon. It comes with a TCP-based transport, just like Gloo. It is also able to | ||||
| automatically chunk and multiplex large tensors over multiple sockets and | ||||
| threads in order to achieve very high bandwidths. In addition to that, it packs | ||||
| two Linux-specific transports for communication between processes on a same | ||||
| machine (one based on ringbuffers stored in shared memory, the other on the | ||||
| cross-memory attach syscalls) which can achieve lower latencies than TCP. | ||||
| The agent will be able to pick the best transport on its own, with no | ||||
| intervention required. | ||||
| threads in order to achieve very high bandwidths. The agent will be able to pick | ||||
| the best transport on its own, with no intervention required. | ||||
|  | ||||
| Example:: | ||||
|  | ||||
| @ -252,7 +246,7 @@ parameters during training. See :ref:`remote-reference-protocol` for more | ||||
| details. | ||||
|  | ||||
| .. autoclass:: RRef | ||||
|     :inherited-members: | ||||
|     :members: | ||||
|  | ||||
|  | ||||
| .. toctree:: | ||||
| @ -301,4 +295,4 @@ Tutorials | ||||
| The RPC tutorial introduces users to the RPC framework and provides two example applications using :ref:`torch.distributed.rpc<distributed-rpc-framework>` APIs. | ||||
|  | ||||
| -  `Getting started with Distributed RPC Framework <https://pytorch.org/tutorials/intermediate/rpc_tutorial.html>`__ | ||||
| -  `Implementing a Parameter Server using Distributed RPC Framework <https://pytorch.org/tutorials/intermediate/rpc_param_server_tutorial.html>`__ | ||||
| -  `Implementing a Parameter Server using Distributed RPC Framework <https://pytorch.org/tutorials/intermediate/rpc_param_server_tutorial.html>`__ | ||||
|  | ||||
| @ -7,7 +7,7 @@ torch.sparse | ||||
|  | ||||
| .. warning:: | ||||
|  | ||||
|     This API is currently experimental and may change in the near future. | ||||
|     This API is in beta and may change in the near future. | ||||
|  | ||||
| Torch supports sparse tensors in COO(rdinate) format, which can | ||||
| efficiently store and process tensors for which the majority of elements | ||||
|  | ||||
| @ -208,7 +208,7 @@ torch.layout | ||||
|  | ||||
| A :class:`torch.layout` is an object that represents the memory layout of a | ||||
| :class:`torch.Tensor`. Currently, we support ``torch.strided`` (dense Tensors) | ||||
| and have experimental support for ``torch.sparse_coo`` (sparse COO Tensors). | ||||
| and have beta support for ``torch.sparse_coo`` (sparse COO Tensors). | ||||
|  | ||||
| ``torch.strided`` represents dense Tensors and is the memory layout that | ||||
| is most commonly used. Each strided tensor has an associated | ||||
|  | ||||
| @ -63,7 +63,7 @@ targets.each do |target| | ||||
|     target.resources_build_phase.add_file_reference(config_file_ref, true) | ||||
| end | ||||
| puts "Linking static libraries..." | ||||
| libs = ['libc10.a', 'libclog.a', 'libXNNPACK.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a'] | ||||
| libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a'] | ||||
| targets.each do |target| | ||||
|     target.frameworks_build_phases.clear | ||||
|     for lib in libs do | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user
	