Compare commits

...

79 Commits

Author SHA1 Message Date
7c7c9c3aa6 scatter/gather - check that inputs are of the same dimensionality (#41890)
Co-authored-by: Nikita Vedeneev <nik@quansight.com>
2020-07-22 18:33:07 -07:00
a2922f589d [1.6.0] Mark torch.set_deterministic and torch.is_deterministic as experimental (#41870)
This PR:
- renames `torch.set_deterministic` to `torch._set_deterministic`
- renames `torch.is_deterministic` to `torch._is_deterministic`
- modifies the docstrings for both to indicate that the feature is not
yet complete.

We would like to do this because this feature is experimental and the
docstrings before this PR are misleading.

This PR does not have an accompanying change in master. That is because
there still is discussion over what the eventual state of the feature
should be: https://github.com/pytorch/pytorch/issues/15359. I expect
that there will be a better plan for this once 1.7 rolls around.
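
For illustration, a minimal sketch of what the rename means for 1.6 user code (names exactly as listed above):

```
import torch

# 1.6: experimental, underscore-prefixed spellings (per this PR)
torch._set_deterministic(True)    # formerly torch.set_deterministic
print(torch._is_deterministic())  # formerly torch.is_deterministic
```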

Test Plan:
- wait for CI
2020-07-22 18:32:47 -07:00
8acfecaecb [1.6] Add optimize_for_mobile doc into python api root doc (#41491)
* Add optimize_for_mobile doc into python api root doc

* Apply suggestions from code review

Remove all references to `optimization_blacklist` as it's missing in 1.6
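
For reference, a hedged usage sketch of the API this doc covers (the model below is a stand-in; any scriptable module works):

```
import torch
from torch.utils.mobile_optimizer import optimize_for_mobile

model = torch.nn.Sequential(torch.nn.Linear(4, 2)).eval()  # stand-in model
scripted = torch.jit.script(model)
mobile_module = optimize_for_mobile(scripted)  # returns an optimized ScriptModule
mobile_module.save("model_mobile.pt")
```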

Co-authored-by: Nikita Shulga <nshulga@fb.com>
2020-07-22 17:37:45 -07:00
860e18a61b Update torch.set_default_dtype doc (#41263)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/41263

Test Plan: Imported from OSS

Differential Revision: D22482989

Pulled By: anjali411

fbshipit-source-id: 2aadfbb84bbab66f3111970734a37ba74d817ffd
2020-07-22 14:50:15 -07:00
8f804baaa9 Doc note for complex (#41252)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/41252

Test Plan: Imported from OSS

Reviewed By: albanD

Differential Revision: D22553266

Pulled By: anjali411

fbshipit-source-id: f6dc409da048496d72b29b0976dfd3dd6645bc4d
2020-07-22 14:49:51 -07:00
a395e0903e Autograd Doc for Complex Numbers (#41012)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/41012

Test Plan: Imported from OSS

Differential Revision: D22476911

Pulled By: anjali411

fbshipit-source-id: 7da20cb4312a0465272bebe053520d9911475828
2020-07-22 14:40:52 -07:00
2ca55430d2 Add reference documentation for torch/library.h (#41470) (#41602)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/41470

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Test Plan: Imported from OSS

Reviewed By: zou3519

Differential Revision: D22577426

Pulled By: ezyang

fbshipit-source-id: 4bfe5806061e74181a74d161c868acb7c1ecd1e4
2020-07-22 11:10:16 -07:00
b8e77a42bd Add CUDA11 build and test (#40452) (#41543)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/40452

Differential Revision: D22316007

Pulled By: malfet

fbshipit-source-id: 94f4b4ba2a46ff3d3042ba842a615f8392cdc350

Co-authored-by: Gao, Xiang <qasdfgtyuiop@gmail.com>
2020-07-22 09:53:22 -07:00
4081fdd3df Revert "port masked_select from TH to ATen and optimize perf on CPU (#33269)" (#41829)
This reverts commit fe66bdb498efe912d8b9c437a14efa4295c04fdd.

This also requires a change to THTensorEvenMoreMath because sumall was removed; see THTensor_wrap.
2020-07-22 09:52:30 -07:00
cefb9e0cd6 Update pthreadpool to pthreadpool:029c88620802e1361ccf41d1970bd5b07fd6b7bb. (#40524) (#41190)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/40524

Reviewed By: ezyang

Differential Revision: D22215742

Pulled By: AshkanAliabadi

fbshipit-source-id: ef594e0901337a92b21ddd44e554da66c723eb7c
2020-07-10 09:11:32 -07:00
d9e9e0087a [v1.6] [RPC docs] Remove mention of TensorPipe's SHM and CMA backends as they're not built (#41229)
Summary:
In short, we messed up. The SHM and CMA backends of TensorPipe are Linux-specific and thus are guarded by an #ifdef in the agent's code. Due to a mishap with CMake (owing to the fact that TensorPipe has two CMake files, one for PyTorch and a "standalone" one) we were not correctly propagating some flags, and these #ifdefs were always false. This means that these two backends have always been disabled and have thus never been covered by our OSS CI. It would be irresponsible to enable them now in v1.6, so instead we remove any mention of them from the docs.

Note that this is perhaps not as bad as it sounds. These two backends were providing higher performance (latency) when the two endpoints were on the same machine. However, I suspect that most RPC users will only do transfers across machines, for which SHM and CMA wouldn't have played any role.

Original PR against master: #41200 (merged as dde3d5f4a8f713ecc4649d776565b68ca75ae5c8)

Test Plan: Docs only
2020-07-10 09:02:08 -07:00
43d746305c Preserve CUDA gencode flags (#41212)
Summary:
Add `torch._C._cuda_getArchFlags()`, which returns the list of architectures `torch_cuda` was compiled for
Add `torch.cuda.get_arch_list()` and `torch.cuda.get_gencode_flags()` methods, which return the architecture list and gencode flags PyTorch was compiled with
Print a warning if one of the GPUs is not compatible with any of the CUBINs
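
A quick sketch of the new introspection calls (outputs depend on how the binary was built):

```
import torch

if torch.cuda.is_available():
    print(torch.cuda.get_arch_list())      # e.g. ['sm_37', 'sm_61', 'sm_70', ...]
    print(torch.cuda.get_gencode_flags())  # the -gencode flags used at build time
```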

Pull Request resolved: https://github.com/pytorch/pytorch/pull/41173

Differential Revision: D22459998

Pulled By: malfet

fbshipit-source-id: 65d40ae29e54a0ba0f3f2da11b821fdb4d452d95
2020-07-09 17:34:50 -07:00
9409e03903 [ONNX][1.6] Update interpolate recompute_scale_factor default (#41117)
* Update interpolate recompute_scale_factor default

* Update upsampling.h

* Update functional.py
2020-07-09 17:24:53 -07:00
c9a1853d2f [1.6] Make IterableDataset DataLoader.__len__ warning clearer (#41185)
* make IterableDataset DataLoader.__len__ warning clearer

* typo
2020-07-09 14:07:58 -07:00
7fa9b2923b quantizer.cpp: fix cuda memory pinning (#41139) (#41194)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/41139

Fixes the test case in https://github.com/pytorch/pytorch/issues/41115
by using PyTorch's CUDA allocator instead of the old Caffe2 one.

Test Plan:
run the test case from the issue:
https://gist.github.com/vkuzo/6d013aa1645cb986d0d4464a931c779b

let's run CI and see what it uncovers

Imported from OSS

Reviewed By: malfet

Differential Revision: D22438787

fbshipit-source-id: 0853b0115d198a99c43e6176aef34ea951bf5c2e

Co-authored-by: Vasiliy Kuznetsov <vasiliy@fb.com>
2020-07-09 14:06:11 -07:00
40bf15a8ac Remove copy_ warnings for angle and abs for complex tensors (#41152) (#41191)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/41152

fixes https://github.com/pytorch/pytorch/issues/40838

Test Plan: Imported from OSS

Differential Revision: D22444357

Pulled By: anjali411

fbshipit-source-id: 2879d0cffc0a011c624eb8e00c7b64bd33522cc3

Co-authored-by: anjali411 <chourdiaanjali123@gmail.com>
2020-07-09 13:41:15 -07:00
c164fc4d7f Patch #40883 to 1.6 release. (#41033) 2020-07-09 10:25:39 -07:00
e0b7480f34 Revert "make IterableDataset DataLoader.__len__ warning clearer (#41183)"
This reverts commit 89d7f194d8ea19f36c9afb52585a00b5b7d0ffeb.
2020-07-09 08:05:24 -07:00
89d7f194d8 make IterableDataset DataLoader.__len__ warning clearer (#41183) 2020-07-09 08:00:00 -07:00
59bb44a8e8 Add a link in RPC doc page to point to PT Distributed overview (#41108) (#41156)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/41108

Test Plan: Imported from OSS

Differential Revision: D22440751

Pulled By: mrshenli

fbshipit-source-id: 9e7b002091a3161ae385fdfcc26484ae8fc243bb
2020-07-09 07:49:10 -07:00
8f4d01d9f1 Disables unary op casting to output dtype (#41097) (#41160)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/41047.

Some CPU kernel implementations don't call `cast_outputs()`, so when CPU temporaries were created to hold their outputs they weren't copied back to the out parameters correctly. Instead of fixing that issue, for simplicity this PR disables the behavior. The corresponding test in test_type_promotion.py is expanded with more operations to verify that unary ops can no longer have out arguments with different dtypes than their inputs (except in special cases like torch.abs which maps complex inputs to float outputs and torch.deg2rad which is secretly torch.mul).
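
A minimal sketch of the behavior this PR disables (error text illustrative; `torch.neg` stands in for any plain unary op):

```
import torch

x = torch.randn(3)                         # float32 input
out = torch.empty(3, dtype=torch.float64)  # out dtype differs from input
try:
    torch.neg(x, out=out)                  # previously cast on CPU; now an error
except RuntimeError as err:
    print(err)
```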

Pull Request resolved: https://github.com/pytorch/pytorch/pull/41097

Differential Revision: D22422352

Pulled By: mruberry

fbshipit-source-id: 8e61d34ef1c9608790b35cf035302fd226fd9421

Co-authored-by: Mike Ruberry <mruberry@devfair044.maas>
2020-07-08 22:06:48 -07:00
77ffb25925 Add guard for non-default stream in DDP's autograd engine callback (#40115) (#41151)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40115

Closes https://github.com/pytorch/pytorch/issues/37790
Closes https://github.com/pytorch/pytorch/issues/37944

A user may wish to run DDP's forward + backward step under a non-default CUDA stream, such as one entered via `with torch.cuda.stream(stream)`. In this case, the user should be responsible for synchronizing events on this stream with other streams used in the program (per the documentation at https://pytorch.org/docs/stable/notes/cuda.html#cuda-semantics), but currently DDP has a bug which causes DDP under non-default streams to fail.

If a user does the following:
```
model = DDP(...)
loss = model(input).sum()
loss.backward()
grad = model.module.weight.grad
average = dist.all_reduce(grad)
```

There is a chance that `average` and `grad` will not be equal. This is because the CUDA kernels corresponding to the  `all_reduce` call may run before `loss.backward()`'s kernels are finished. Specifically, in DDP we copy the allreduced gradients back to the model parameter gradients in an autograd engine callback, but this callback runs on the default stream. Note that this can also be fixed by the application synchronizing on the current stream, although this should not be expected, since the application is not using the current stream at all.

This PR fixes the issue by passing the current stream into DDP's callback.
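
For completeness, a hedged sketch of the user-side pattern this fix enables (`model` and `inputs` are stand-ins; the explicit wait follows the CUDA-semantics note above):

```
import torch
import torch.distributed as dist

side_stream = torch.cuda.Stream()
with torch.cuda.stream(side_stream):
    loss = model(inputs).sum()  # model: a DDP-wrapped module (assumed)
    loss.backward()
# User-side synchronization before touching grads from another stream.
torch.cuda.current_stream().wait_stream(side_stream)
grad = model.module.weight.grad
dist.all_reduce(grad)  # in-place; now consistent with DDP's allreduced grads
```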

Tested by adding a UT `test_DistributedDataParallel_non_default_stream` that fails without this PR
ghstack-source-id: 106481208

Differential Revision: D22073353

fbshipit-source-id: 70da9b44e5f546ff8b6d8c42022ecc846dff033e
2020-07-08 21:08:17 -07:00
af9600b1f5 [Caffe2] Move in-header virtual function implementation to .cc files (#41090)
* Move OperatorSchema default inference function implementations to .cc file (#40845)

Summary:
This prevents the implementations of those functions (defined as lambdas) from being embedded as weak symbols in every shared library that includes this header.

Combination of this and https://github.com/pytorch/pytorch/pull/40844 reduces size of `libcaffe2_module_test_dynamic.so` from 500kb to 50Kb.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40845

Differential Revision: D22334779

Pulled By: malfet

fbshipit-source-id: 64706918fc2947350a58c0877f294b1b8b085455

* Move `OperatorBase::AddRelatedBlobInfo` implementation to .cc file (#40844)

Summary:
If a virtual function is implemented in a header file, its implementation will be included as a weak symbol in every shared library that includes this header, along with all of its dependencies.

This was one of the reasons why the size of libcaffe2_module_test_dynamic.so was 500Kb (the AddRelatedBlobInfo implementation pulled a quarter of libprotobuf.a in with it)

Combination of this and https://github.com/pytorch/pytorch/issues/40845 reduces size of `libcaffe2_module_test_dynamic.so` from 500kb to 50Kb.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40844

Differential Revision: D22334725

Pulled By: malfet

fbshipit-source-id: 836a4cbb9f344355ddd2512667e77472546616c0
2020-07-07 21:17:11 -07:00
83262b1ba1 torch._six.PY37 should be true for Python-3.8 as well (#40868) (#41091)
Summary:
Right now it is used to check whether `math.remainder` exists, which is the case for both Python-3.7 and 3.8
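
A minimal equivalent of the corrected check (the real flag lives in `torch._six`; this sketch only shows the intent):

```
import math
import sys

PY37 = sys.version_info >= (3, 7)  # true on both 3.7 and 3.8
if PY37:
    print(math.remainder(7, 3))    # math.remainder exists on Python >= 3.7
```
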
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40868

Differential Revision: D22343454

Pulled By: malfet

fbshipit-source-id: 6b6d4869705b64c4b952309120f92c04ac7e39fd
2020-07-07 17:15:01 -07:00
f862a6ba4d Remove unused Logger in get_matching_activations (#41023) (#41087)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/41023

Remove Logger in get_matching_activations since it's not used.
ghstack-source-id: 107237046

Test Plan:
buck test mode/dev caffe2/test:quantization -- 'test_compare_weights_lstm_dynamic'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_stub_lstm_dynamic'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_outputs_lstm_dynamic'
buck test mode/dev caffe2/test:quantization -- 'test_compare_weights_conv_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_weights_linear_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_weights_linear_dynamic'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_stub_conv_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_stub_linear_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_stub_submodule_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_stub_functional_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_stub_linear_dynamic'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_outputs_conv_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_outputs_linear_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_outputs_functional_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_outputs_linear_dynamic'

Differential Revision: D22394957

fbshipit-source-id: 7d59e0f35e9f4c304b8487460d48236ee6e5a872

Co-authored-by: Haixin Liu <haixin@fb.com>
2020-07-07 16:09:37 -07:00
f3c1ea7455 [PyTorch Numeric Suite] Remove unnecessary Logger in input arguments (#40890) (#41086)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40890

Remove unnecessary Logger in input arguments and simplify the API.
ghstack-source-id: 107110487

Test Plan:
buck test mode/dev caffe2/test:quantization -- 'test_compare_weights_lstm_dynamic'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_stub_lstm_dynamic'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_outputs_lstm_dynamic'
buck test mode/dev caffe2/test:quantization -- 'test_compare_weights_conv_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_weights_linear_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_weights_linear_dynamic'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_stub_conv_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_stub_linear_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_stub_submodule_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_stub_functional_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_stub_linear_dynamic'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_outputs_conv_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_outputs_linear_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_outputs_functional_static'
buck test mode/dev caffe2/test:quantization -- 'test_compare_model_outputs_linear_dynamic'

Differential Revision: D22345477

fbshipit-source-id: d8b4eb3d6cb3049aa3296dead8ba29bf5467bd1c

Co-authored-by: Haixin Liu <haixin@fb.com>
2020-07-07 16:09:11 -07:00
2ed3ad2891 fix autodoc for torch.distributed.launch (#40963) (#41089)
Summary:
The doc for `torch.distributed.launch` has been missing since v1.2.0 (see issue https://github.com/pytorch/pytorch/issues/36386) because PR https://github.com/pytorch/pytorch/issues/22501 added some imports at the first line.
542ac74987/torch/distributed/launch.py (L1-L5)
I moved the imports below the docstring to make Sphinx autodoc work normally.
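
A sketch of the structural fix (docstring abridged; the point is that the module docstring must precede the imports for autodoc to see it):

```
r"""
`torch.distributed.launch` is a module that spawns up multiple distributed
training processes on each of the training nodes.
...
"""
import sys  # imports now live below the module docstring
```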

Pull Request resolved: https://github.com/pytorch/pytorch/pull/40963

Differential Revision: D22380816

Pulled By: mrshenli

fbshipit-source-id: ee8406785b9a198bbf3fc65e589854379179496f

Co-authored-by: Xin Yao <yaox12@outlook.com>
2020-07-07 14:23:31 -07:00
a857af50a4 [quant][graphmode][fix] cloning schema in insert_observers (#40624) (#40934)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40624

Previously we didn't clone the schema, so the default schema was used; this was
causing issues for some models

Test Plan: Imported from OSS

Differential Revision: D22259519

fbshipit-source-id: e2a393a54cb18f55da0c7152a74ddc22079ac350
2020-07-07 13:27:36 -07:00
d0045e5520 Some fixes for graph mode quantization (#40935)
* [quant] aten::repeat work for quantized tensor (#40644)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/40644

Test Plan: Imported from OSS

Differential Revision: D22268558

fbshipit-source-id: 3bc9a129bece1b547c519772ecc6b980780fb904

* [quant][graphmode][fix] remove unsupported ops in the list (#40653)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40653

(Note: this ignores all push blocking failures!)

Test Plan: Imported from OSS

Differential Revision: D22271413

fbshipit-source-id: a01611b5d90849ac673fa5a310f910c858e907a3
2020-07-07 13:26:27 -07:00
0406b69b79 [quant][graphmode][fix] Fold conv bn (#40865) (#40970)
* [quant][graphmode][fix] Fold conv bn (#40865)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40865

1. applied a filter for the module types
2. removed the assumption that conv and bn are immediate children of the parent module

Test Plan:
python test/test_quantization.py TestQuantizeJitPasses

Imported from OSS

Differential Revision: D22338074

fbshipit-source-id: 64739a5e56c0a74249a1dbc2c8454b88ec32aa9e

* [quant][graphmode][fix] Print the node in error message (#40889)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/40889

Test Plan: Imported from OSS

Differential Revision: D22348266

fbshipit-source-id: eed2ece5c94fcfaf187d6770bed4a7109f0c0b4a
2020-07-07 13:25:39 -07:00
6220cc4380 [quant][graphmode][fix] dequantize propagation for {add/mul}_scalar + aten::repeat (#40933)
* [quant][graphmode][fix] dequantize propagation for {add/mul}_scalar (#40596)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40596

Previously the fusion patterns for {add/mul}_scalar were inconsistent, since the op pattern
produces a non-quantized tensor while the op replacement graph produces a quantized tensor

Test Plan: Imported from OSS

Differential Revision: D22251072

fbshipit-source-id: e16eb92cf6611578cca1ed8ebde961f8d0610137

* [quant][graphmode] Support quantization for `aten::append` (#40743)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40743

`aten::append` modifies its input in place and the output is ignored. Such ops are not
supported right now, so we first need to make `aten::append` non-inplace
by changing
```
ignored = aten::append(list, x)
```
to
```
x_list = aten::ListConstruct(x)
result = aten::add(list, x_list)
```
and then quantize the aten::add instead.

Test Plan:
TestQuantizeJitOps.test_general_shape_ops

Imported from OSS

Differential Revision: D22302151

fbshipit-source-id: 931000388e7501e9dd17bec2fad8a96b71a5efc5
2020-07-07 13:25:02 -07:00
eaf3f2fd34 Added index_put to promotelist (#41036)
* Added index_put to promotelist

* docstring

Co-authored-by: Michael Carilli <mcarilli@nvidia.com>
2020-07-07 13:00:32 -07:00
c35b4c770b Bucket of shape analysis fixes (#41044)
* [JIT] fix unfold shape analysis (#40749)

Summary:
unfold on a 0-dimensioned tensor returns a 1-dim tensor
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40749

Differential Revision: D22361481

Pulled By: eellison

fbshipit-source-id: 621597e5f97f6e39953eb86f8b85bb4142527a9f

* shape analysis fix for default dtype

ghstack-source-id: 723aa27c2685417715a0891f5ca1ae885d4c9832
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40938

* fix grad thrashing of shape analysis

ghstack-source-id: dd8742b1da52d17e9d6ab6c81ff0b27520b09417
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40939

Co-authored-by: Elias Ellison <eellison@fb.com>
2020-07-07 12:59:47 -07:00
11baccf1b5 [release/1.6] .circleci: Output binary sizes, store binaries (#41075)
We need an easy way to quickly visually grep binary sizes from builds
and then have a way to test out those binaries quickly.

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
(cherry picked from commit 66813515d4dec66f319442ba967c64b87c0286cd)
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
2020-07-07 11:27:00 -07:00
f0f0cbdd4a Docstring changes for dynamic quantized classes (#40931) (#41032)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40931

Fix docstrings for dynamic quantized Linear/LSTM and associated classes
ghstack-source-id: 107064446

Test Plan: Docs show up correctly

Differential Revision: D22360787

fbshipit-source-id: 8e357e081dc59ee42fd7f12ea5079ce5d0cc9df2
2020-07-06 21:37:53 -07:00
11b70b0041 [JIT] Switch executor from Simple to Legacy. (#41017)
* properly skip legacy tests regardless of the default executor (#40381)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/40381

Differential Revision: D22173938

Pulled By: Krovatkin

fbshipit-source-id: 305fc4484977e828cc4cee6e053a1e1ab9f0d6c7

* [JIT] Switch executor from Simple to Legacy.

This is done for 1.6 only, in order to recover from the performance
regressions caused by the Legacy->Simple switch made in 1.5. On master we
still plan to use the Simple executor and fix the performance issues in 1.7
without falling back to the Legacy executor.

Co-authored-by: Nikolay Korovaiko <korovaikon@gmail.com>
2020-07-06 21:35:02 -07:00
01e9562313 [1.6 cherrypick] Fix delegating to jit.load from torch.load (#41013) 2020-07-06 16:55:00 -07:00
3f13c9a2c8 infer tensor properties based on an input tensor rather than defaults for xxx_like ctors (#40895) (#41016)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/40895

Reviewed By: eellison

Differential Revision: D22358878

Pulled By: Krovatkin

fbshipit-source-id: 2db2429aa89c180d8e52a6bb1265308483da46a2
2020-07-06 16:52:59 -07:00
63a94c021a shape inference of undefined for prim::grad (#40866) (#41015)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/40866

Reviewed By: pbelevich

Differential Revision: D22358988

Pulled By: Krovatkin

fbshipit-source-id: 7118d7f8d4eaf056cfb71dc0d588d38b1dfb0fc7
2020-07-06 16:51:37 -07:00
2b175ba909 update requires_grad on loop inputs correctly (master) (#40926) (#41014)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/40926

Reviewed By: eellison

Differential Revision: D22359471

Pulled By: Krovatkin

fbshipit-source-id: 823e87674e2d2917f075255ec926e0485972f4e2
2020-07-06 16:30:14 -07:00
8c3f662224 Update FP16 to FP16:4dfe081cf6bcd15db339cf2680b9281b8451eeb3. (#40956) 2020-07-06 06:59:41 -07:00
0ffdd5aa1d Update cpuinfo to cpuinfo:63b254577ed77a8004a9be6ac707f3dccc4e1fd9. (#40955) 2020-07-06 06:59:30 -07:00
d53427c541 Update FXdiv to FXdiv:b408327ac2a15ec3e43352421954f5b1967701d1. (#40954) 2020-07-06 06:59:17 -07:00
b44b1d868e Update psimd to psimd:072586a71b55b7f8c584153d223e95687148a900 (#40953) 2020-07-06 06:59:01 -07:00
9184c9832e Re-apply PyTorch pthreadpool changes (#40951)
* Re-apply PyTorch pthreadpool changes

Summary:
This re-applies D21232894 (b9d3869df3) and D22162524, plus updates jni_deps in a few places
to avoid breaking host JNI tests.

Test Plan: `buck test @//fbandroid/mode/server //fbandroid/instrumentation_tests/com/facebook/caffe2:host-test`

Reviewed By: xcheng16

Differential Revision: D22199952

fbshipit-source-id: df13eef39c01738637ae8cf7f581d6ccc88d37d5

* Enable XNNPACK ops on iOS and macOS.

Test Plan: buck run aibench:run_bench -- -b aibench/specifications/models/pytorch/pytext/pytext_mobile_inference.json --platform ios --framework pytorch --remote --devices D221AP-12.0.1

Reviewed By: xta0

Differential Revision: D21886736

fbshipit-source-id: ac482619dc1b41a110a3c4c79cc0339e5555edeb

* Respect user set thread count. (#40707)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/40707

Test Plan: Imported from OSS

Differential Revision: D22318197

Pulled By: AshkanAliabadi

fbshipit-source-id: f11b7302a6e91d11d750df100d2a3d8d96b5d1db

* Fix and reenable threaded QNNPACK linear (#40587)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40587

Previously, this was causing divide-by-zero only in the multithreaded
empty-batch case, while calculating tiling parameters for the threads.
In my opinion, the bug here is using a value that is allowed to be zero
(batch size) for an argument that should not be zero (tile size), so I
fixed the bug by bailing out right before the call to
pthreadpool_compute_4d_tiled.

Test Plan: TestQuantizedOps.test_empty_batch

Differential Revision: D22264414

Pulled By: dreiss

fbshipit-source-id: 9446d5231ff65ef19003686f3989e62f04cf18c9

* Fix batch size zero for QNNPACK linear_dynamic (#40588)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40588

Two bugs were preventing this from working.  One was a divide by zero
when multithreading was enabled, fixed similarly to the fix for static
quantized linear in the previous commit.  The other was computation of
min and max to determine qparams.  FBGEMM uses [0,0] for [min,max] of
empty input; do the same.

Test Plan: Added a unit test.

Differential Revision: D22264415

Pulled By: dreiss

fbshipit-source-id: 6ca9cf48107dd998ef4834e5540279a8826bc754

Co-authored-by: David Reiss <dreiss@fb.com>
2020-07-06 06:58:25 -07:00
e89c4f0dec [quant] Fix fuse linear pass (#40549) (#40751)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40549

Previously we didn't check whether %weight_t was produced by `aten::t`; this would fuse some `matmul`/`addmm` ops that are
not 2-d into `aten::linear`, which is incorrect

Test Plan: Imported from OSS

Differential Revision: D22225921

fbshipit-source-id: 9723e82fdbac6d8e1a7ade22f3a9791321ab12b6
2020-07-02 10:23:22 -07:00
ea273c68f9 Inplace construct of TorchScript Module and inplace option for quantization (#40750)
* [WIP][JIT] Add ScriptModule._reconstruct (#39979)

Summary:
**Summary**
This commit adds an instance method `_reconstruct` that permits users
to reconstruct a `ScriptModule` from a given C++ `Module` instance.

**Testing**
This commit adds a unit test for `_reconstruct`.

**Fixes**
This pull request fixes https://github.com/pytorch/pytorch/issues/33912.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/39979

Differential Revision: D22172323

Pulled By: SplitInfinity

fbshipit-source-id: 9aa6551c422a5a324b822a09cd8d7c660f99ca5c

* [quant][graphmode] Enable inplace option for top level API (#40414)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40414

now that `_reconstruct` is supported in RecursiveScriptModule (https://github.com/pytorch/pytorch/pull/39979),
we can support the inplace option in the quantization API

Test Plan: Imported from OSS

Differential Revision: D22178326

fbshipit-source-id: c78bc2bcf2c42b06280c12262bb31aebcadc6c32

Co-authored-by: Meghan Lele <meghanl@fb.com>
2020-07-02 10:22:45 -07:00
4dd37bfbf7 [jit] Remove unnecessary clone APIs for script::Module and RecursiveScriptModule (#40297) (#40748)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/40297

Test Plan: Imported from OSS

Differential Revision: D22191660

fbshipit-source-id: 4b338ca82caaca04784bffe01fdae3d180c192f4
2020-07-02 10:22:27 -07:00
2533b9da83 Fix complex printing for sci_mode=True (#40513) (#40919)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40513

This PR makes the following changes:
1. Complex printing now uses print formatting for the real and imaginary values separately, and they are joined at the end.
2. Change 1 naturally fixes the printing of complex tensors in sci_mode=True

```
>>> torch.tensor(float('inf')+float('inf')*1j)
tensor(nan+infj)
>>> torch.randn(2000, dtype=torch.cfloat)
tensor([ 0.3015-0.2502j, -1.1102+1.2218j, -0.6324+0.0640j,  ...,
        -1.0200-0.2302j,  0.6511-0.1889j, -0.1069+0.1702j])
>>> torch.tensor([1e-3, 3+4j, 1e-5j, 1e-2+3j, 5+1e-6j])
tensor([1.0000e-03+0.0000e+00j, 3.0000e+00+4.0000e+00j, 0.0000e+00+1.0000e-05j,
        1.0000e-02+3.0000e+00j, 5.0000e+00+1.0000e-06j])
>>> torch.randn(3, dtype=torch.cfloat)
tensor([ 1.0992-0.4459j,  1.1073+0.1202j, -0.2177-0.6342j])
>>> x = torch.tensor([1e2, 1e-2])
>>> torch.set_printoptions(sci_mode=False)
>>> x
tensor([  100.0000,     0.0100])
>>> x = torch.tensor([1e2, 1e-2j])
>>> x
tensor([100.+0.0000j,   0.+0.0100j])
```

Test Plan: Imported from OSS

Differential Revision: D22309294

Pulled By: anjali411

fbshipit-source-id: 20edf9e28063725aeff39f3a246a2d7f348ff1e8

Co-authored-by: anjali411 <chourdiaanjali123@gmail.com>
2020-07-02 09:45:35 -07:00
c5c8a85a82 If ninja is being used, force build_ext to run. (#40881)
As ninja has accurate dependency tracking, if there is nothing to do,
then we will very quickly noop.  But this is important for correctness:
if a change was made to a header that is not listed explicitly in
the distutils Extension, then distutils will come to the wrong
conclusion about whether or not recompilation is needed (but Ninja
will work it out.)

This caused https://github.com/pytorch/vision/issues/2367

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

ghstack-source-id: 6409595c8ac091f3863f305c123266b9d3a167ad
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40837
2020-07-02 08:05:25 -07:00
b4b8f5b9d4 Release GIL during DDP construction. (#40877)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40495

As part of debugging flaky ddp_under_dist_autograd tests, I realized
we were running into the following deadlock.

1) Rank 0 would go into DDP construction, hold the GIL, and wait for the broadcast in
DDP construction.
2) Rank 3 is a little slower and performs an RRef fetch call before the DDP
construction.
3) The RRef fetch call is done on Rank 0 and tries to acquire the GIL.
4) We now have a deadlock since Rank 0 is waiting for Rank 3 to enter the
collective and Rank 3 is waiting for Rank 0 to release the GIL.
ghstack-source-id: 106534442

Test Plan:
1) Ran ddp_under_dist_autograd 500 times.
2) waitforbuildbot

Differential Revision: D22205180

fbshipit-source-id: 6afd55342e801b9edb9591ff25158a244a8ea66a

Co-authored-by: Pritam Damania <pritam.damania@fb.com>
2020-07-01 13:36:50 -07:00
41816dc97f [1.6] Fix dictConstruct ordering and enable dict mix (#40797)
A combination of https://github.com/pytorch/pytorch/pull/39601 and
https://github.com/pytorch/pytorch/pull/40424; both are approved and
merged in master
2020-07-01 09:30:16 -07:00
31d9776c04 [1.6] fix autograd doc subsubsection display issue (#40796)
Master branch PR: https://github.com/pytorch/pytorch/pull/40582
2020-07-01 09:28:25 -07:00
ddea6c552f Ports full dtype inference deprecation to 1.6 (#40799)
* ports full deprecation

* fixtures

* Fixes lint

* Trying to fix phantom lint issue

* nuclear lint option

* Paradoxical linter fix

Co-authored-by: Mike Ruberry <mruberry@devfair044.maas>
2020-07-01 09:27:27 -07:00
091537a764 [JIT][1.6] Shape analysis fixes. (#40716)
* [JIT] Update type of the unsqueeze's output in shape analysis.

* [JIT] Fix shape analysis for aten::masked_select.

The reference says that this op always returns a 1-D tensor, even if
the input and the mask are 0-D.
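
A small illustration of that contract:

```
import torch

t = torch.tensor(5.0)      # 0-D input
mask = torch.tensor(True)  # 0-D mask
print(torch.masked_select(t, mask).shape)  # torch.Size([1]): always 1-D
```
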
2020-07-01 08:41:05 -07:00
bf4d905ea1 Fix wrong MSVC version constraint for CUDA 9.2 (#40794) (#40849)
Summary:
Tested with https://github.com/pytorch/pytorch/pull/40782.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40794

Differential Revision: D22318045

Pulled By: malfet

fbshipit-source-id: a737ffd7cb8a6a9efb62b84378318f4c3800ad8f
2020-07-01 08:37:40 -07:00
415e499330 Fix zip serialization for file > 2GiB for Windows (#40852) 2020-07-01 08:36:40 -07:00
eaf7dad5d6 [1.6 cherrypick] Support Pathlike for zipfile serialization (#40793) 2020-06-30 10:38:00 -07:00
75a074abdc 1.6 Port: Dynamic Versioning (#40542)
Co-authored-by: Mike Ruberry <mruberry@devfair044.maas>
2020-06-30 10:18:18 -07:00
dede34eab7 [1.6 cherrypick] Doc fix for complex views
Cherry-pick of https://github.com/pytorch/pytorch/pull/40450

Test Plan: Imported from OSS
2020-06-30 09:37:02 -07:00
0c90b6da5c [1.6 cherrypick] Fix zip serialization for file > 2GiB (#40757)
* [1.6 cherrypick] Fix zip serialization for file > 2GiB

* Update test/test_serialization.py

Co-authored-by: Nikita Shulga <nikita.shulga@gmail.com>
2020-06-30 07:10:02 -07:00
4316199832 Add examples and tests for combining static/class method with async execution (#40619) (#40688)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/40619
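
A hedged sketch of the pattern being exercised (mirrors the shape of the examples added in the PR; `to`, `x`, `y` are stand-ins):

```
import torch
import torch.distributed.rpc as rpc

class AsyncClass:
    @staticmethod  # order matters: staticmethod wraps async_execution
    @rpc.functions.async_execution
    def static_async_add(to, x, y):
        # Returns a Future; the callee replies once it completes.
        return rpc.rpc_async(to, torch.add, args=(x, y))
```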

Test Plan: Imported from OSS

Differential Revision: D22258407

Pulled By: mrshenli

fbshipit-source-id: 036d85a2affc4505efd2df197fc513dba010e359
2020-06-29 19:34:23 -07:00
f993e5ac88 [1.6] Update TensorPipe submodule (#40634)
Upstream PR: #40614

Summary:
This update pulls in a one-liner fix, which sets the TCP_NODELAY option on the TCP sockets of the UV transport. This leads to exceptional performance gains in terms of latency, with about a 25x improvement in one simple benchmark. This thus resolves a regression that TensorPipe had compared to the ProcessGroup agent and, in fact, ends up beating it by 2x.

The benchmark I ran is this, with the two endpoints pinned to different cores of the same machine:
```
@torch.jit.script
def remote_fn(t: int):
    return t

@torch.jit.script
def local_fn():
    for _ in range(1_000_000):
        fut = rpc.rpc_async("rhs", remote_fn, (42,))
        fut.wait()
```

And the average round-trip time (one iteration) is:
- TensorPipe with SHM: 97.2 us
- TensorPipe with UV _after the fix_: 205us
- Gloo: 440us
- TensorPipe with UV _before the fix_: 5ms

Test Plan: Ran PyTorch RPC test suite
2020-06-29 19:33:32 -07:00
c5bd737f0c [JIT] script if tracing fix (#40468) (#40572)
Summary:
Currently, torchvision annotates `batched_nms` with `torch.jit.script` so the function gets compiled when it is traced and ONNX will work. Unfortunately, this means we are eagerly compiling batched_nms, which fails if torchvision isn't built with `torchvision.ops.nms`. As a result, torchvision doesn't work on torch hub right now.

`_script_if_tracing` could solve our problem here, but right now it does not correctly interact with recursive compilation. This PR fixes that bug.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40468
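
A hedged sketch of the decorator's intended use (a private API, named as in the message above):

```
import torch

@torch.jit._script_if_tracing
def helper(x):
    # Compiled only when reached from tracing; plain Python otherwise.
    return x * 2
```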

Reviewed By: jamesr66a

Differential Revision: D22195771

Pulled By: eellison

fbshipit-source-id: 83022ca0bab6d389a48a478aec03052c9282d2b7

Co-authored-by: Elias Ellison <eellison@fb.com>
2020-06-29 19:30:41 -07:00
fe45c2c986 Allow slicing sequential container (#40538)
- fixes #38034
- works around missing slice functionality in Sequential
  by casting to tuple and slicing that instead
- supports iterating on the resulting slice but not call()
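
A hedged sketch of the behavior described above:

```
import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
head = model[:2]    # slice of a Sequential
for layer in head:  # iterating the slice works
    print(layer)
# calling head(...) directly is not supported, per the note above
```
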
2020-06-29 19:29:19 -07:00
a9996bb482 Fixes caffe2 loading issues on Windows (#39513) (#40487)
Summary:
Addresses https://github.com/pytorch/pytorch/issues/27840#issuecomment-638715422.
Contains a bunch of fixes (https://github.com/pytorch/pytorch/pull/39376 + https://github.com/pytorch/pytorch/pull/39334 + https://github.com/pytorch/pytorch/pull/38302 + https://github.com/pytorch/pytorch/pull/35362)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/39513

Differential Revision: D22190761

Pulled By: malfet

fbshipit-source-id: b2d52f6cb16c233d16071e9c0670dfff7da2710e
(cherry picked from commit e2201e2ed8ed7bf9c6226f8c484192949d94c248)
2020-06-29 19:17:34 -07:00
bdfcbfa18c [release/1.6] .jenkins: Install torch from test channel (#40706)
We're on a test branch so we should install from the test channel

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
2020-06-29 13:53:14 -07:00
ea1b0dba18 Remove constexpr for NVCC on Windows (#40676) 2020-06-29 13:48:50 -07:00
6d85b2c989 Pin XLA CI to use r1.6 release branch. (#40721) 2020-06-29 13:41:14 -07:00
44f79651a7 Tweak file_diff_from_base for release/1.6 branch (#40712) 2020-06-29 11:41:46 -07:00
8682ac147b Docs merge (#40569)
Co-authored-by: Elias Ellison <eellison@fb.com>
2020-06-26 12:24:08 -07:00
4cc605e80a (1.6) Update docs feature classifications (#40539)
Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
2020-06-26 12:23:02 -07:00
b0cce716f7 Add beta warning for quant docs (#40540)
Add a beta warning to match stable and master docs: https://github.com/pytorch/pytorch/blob/master/docs/source/quantization.rst
2020-06-26 12:20:06 -07:00
0dc93ac119 [v1.6.0 patch] Install method docstrings from PyRRef to RRef (#40620)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/40461

It turned out `:inherited-members:` (see [doc](https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html#directive-autoclass)) is not really usable, because pybind11 generates a docstring that writes `self` as the parent-class type, `rpc.PyRRef`.

As a workaround, I am pulling the docstrings of the parent class, `PyRRef`, into the subclass, `RRef`, and doing surgery on the docstring generated by pybind11.

{F241283111}

ghstack-source-id: 106472496

P134031188

Differential Revision: D7933834

fbshipit-source-id: c03a8a4c9d98888b64492a8caba1591595bfe247

Co-authored-by: Shihao Xu <shihaoxu@fb.com>
2020-06-26 12:15:28 -07:00
bb848df10b [1.6] Remove table of contents at the top of rpc.rst (#40482)
Master PR: https://github.com/pytorch/pytorch/pull/40205

Remove the table of contents created by the `.. contents:: :local: :depth: 2` since this page isn't one of the large documentation pages (https://github.com/pytorch/pytorch/issues/38010) and is simply a landing page for the Distributed RPC Framework.

Changes made in this original PR: f10fbcc820 (diff-250b9b23fd6f1a5c15aecdb72afb9d7d)
2020-06-26 08:37:49 -07:00
2dc0b84aca Skip test_mem_leak on Windows (#40498)
(cherry picked from commit 3fb6f038256a3a5ce43e857409ce4ffb807d93a5)
2020-06-25 16:45:48 -07:00
168cddf5f1 .circleci: Fix upload to backup directory
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
2020-06-23 20:57:42 -07:00
bc8760b3db .circleci: Fix pip installation of awscli
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
2020-06-23 19:05:48 -07:00
4269b9a8fc .circleci: Fix backup uploads
awscli was not loaded on conda builds and the backup upload did not work
since it was a recursive copy instead of just specifically copying what
we want.

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
2020-06-23 18:18:06 -07:00
214 changed files with 3559 additions and 3154 deletions

View File

@@ -36,6 +36,12 @@ CONFIG_TREE_DATA = [
("libtorch", [XImportant(True)])
]),
]),
("11.0", [
X("3.8"),
("3.8", [
("libtorch", [X(True)])
]),
]),
]),
]),
("bionic", [

View File

@@ -49,7 +49,8 @@ class Conf:
cuda_parms = []
if self.cuda_version:
cuda_parms.extend(["cuda" + self.cuda_version, "cudnn7"])
cudnn = "cudnn8" if self.cuda_version.startswith("11.") else "cudnn7"
cuda_parms.extend(["cuda" + self.cuda_version, cudnn])
result = leading + ["linux", self.distro] + cuda_parms + self.parms
if not for_docker and self.parms_list_ignored_for_docker_image is not None:
result = result + self.parms_list_ignored_for_docker_image
@@ -222,8 +223,7 @@ def instantiate_configs():
python_version = fc.find_prop("pyver")
parms_list[0] = fc.find_prop("abbreviated_pyver")
if cuda_version in ["9.2", "10", "10.1", "10.2"]:
# TODO The gcc version is orthogonal to CUDA version?
if cuda_version:
cuda_gcc_version = fc.find_prop("cuda_gcc_override") or "gcc7"
parms_list.append(cuda_gcc_version)

View File

@@ -958,6 +958,11 @@ jobs:
no_output_timeout: "1h"
command: |
source "/pytorch/.circleci/scripts/binary_linux_build.sh"
- run:
name: Output binary sizes
no_output_timeout: "1m"
command: |
ls -lah /final_pkgs
- run:
name: save binary size
no_output_timeout: "5m"
@@ -972,6 +977,9 @@
root: /
paths: final_pkgs
- store_artifacts:
path: /final_pkgs
# This should really just be another step of the binary_linux_build job above.
# This isn't possible right now b/c the build job uses the docker executor
# (otherwise they'd be really really slow) but this one uses the machine
@@ -7388,6 +7396,54 @@ workflows:
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7:209062ef-ab58-422a-b295-36c4eed6e906"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- pytorch_linux_build:
name: pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
build_environment: "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7:209062ef-ab58-422a-b295-36c4eed6e906"
- pytorch_linux_test:
name: pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test
requires:
- pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
build_environment: "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7:209062ef-ab58-422a-b295-36c4eed6e906"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- pytorch_linux_build:
name: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
build_environment: "pytorch-libtorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7:209062ef-ab58-422a-b295-36c4eed6e906"
- pytorch_linux_test:
name: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test
requires:
- pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
build_environment: "pytorch-libtorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7:209062ef-ab58-422a-b295-36c4eed6e906"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- pytorch_linux_build:
name: pytorch_linux_bionic_py3_6_clang9_build
build_environment: "pytorch-linux-bionic-py3.6-clang9-build"

View File

@@ -14,7 +14,7 @@ mkdir -p ${ZIP_DIR}/src
cp -R ${ARTIFACTS_DIR}/arm64/include ${ZIP_DIR}/install/
# build a FAT binary
cd ${ZIP_DIR}/install/lib
target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a)
target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a)
for lib in ${target_libs[*]}
do
if [ -f "${ARTIFACTS_DIR}/x86_64/lib/${lib}" ] && [ -f "${ARTIFACTS_DIR}/arm64/lib/${lib}" ]; then

View File

@@ -20,6 +20,7 @@ PIP_UPLOAD_FOLDER=${PIP_UPLOAD_FOLDER:-nightly}
CONDA_UPLOAD_CHANNEL=$(echo "${PIP_UPLOAD_FOLDER}" | sed 's:/*$::')
BACKUP_BUCKET="s3://pytorch-backup"
retry pip install -q awscli
# Upload the package to the final location
pushd /home/circleci/project/final_pkgs
if [[ "$PACKAGE_TYPE" == conda ]]; then
@@ -30,14 +31,12 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
subdir=$(tar -xOf ./*.bz2 info/index.json | grep subdir | cut -d ':' -f2 | sed -e 's/[[:space:]]//' -e 's/"//g' -e 's/,//')
BACKUP_DIR="conda/${subdir}"
elif [[ "$PACKAGE_TYPE" == libtorch ]]; then
retry pip install -q awscli
s3_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
for pkg in $(ls); do
retry aws s3 cp "$pkg" "$s3_dir" --acl public-read
done
BACKUP_DIR="libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
else
retry pip install -q awscli
s3_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
retry aws s3 cp "$(ls)" "$s3_dir" --acl public-read
BACKUP_DIR="whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
@@ -45,5 +44,5 @@ fi
if [[ -n "${CIRCLE_TAG:-}" ]]; then
s3_dir="${BACKUP_BUCKET}/${CIRCLE_TAG}/${BACKUP_DIR}"
retry aws s3 cp . "$s3_dir"
retry aws s3 cp --recursive . "$s3_dir"
fi

View File

@@ -21,6 +21,7 @@ PIP_UPLOAD_FOLDER=${PIP_UPLOAD_FOLDER:-nightly}
CONDA_UPLOAD_CHANNEL=$(echo "${PIP_UPLOAD_FOLDER}" | sed 's:/*$::')
BACKUP_BUCKET="s3://pytorch-backup"
retry pip install -q awscli
pushd "$workdir/final_pkgs"
if [[ "$PACKAGE_TYPE" == conda ]]; then
retry conda install -yq anaconda-client
@@ -30,14 +31,12 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
subdir=$(tar -xOf ./*.bz2 info/index.json | grep subdir | cut -d ':' -f2 | sed -e 's/[[:space:]]//' -e 's/"//g' -e 's/,//')
BACKUP_DIR="conda/${subdir}"
elif [[ "$PACKAGE_TYPE" == libtorch ]]; then
retry pip install -q awscli
s3_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
for pkg in $(ls); do
retry aws s3 cp "$pkg" "$s3_dir" --acl public-read
done
BACKUP_DIR="libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
else
retry pip install -q awscli
s3_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
retry aws s3 cp "$(ls)" "$s3_dir" --acl public-read
BACKUP_DIR="whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
@@ -45,5 +44,5 @@ fi
if [[ -n "${CIRCLE_TAG:-}" ]]; then
s3_dir="${BACKUP_BUCKET}/${CIRCLE_TAG}/${BACKUP_DIR}"
retry aws s3 cp . "$s3_dir"
retry aws s3 cp --recursive . "$s3_dir"
fi

View File

@@ -19,6 +19,7 @@ PIP_UPLOAD_FOLDER=${PIP_UPLOAD_FOLDER:-nightly/}
CONDA_UPLOAD_CHANNEL=$(echo "${PIP_UPLOAD_FOLDER}" | sed 's:/*$::')
BACKUP_BUCKET="s3://pytorch-backup"
retry pip install -q awscli
pushd /root/workspace/final_pkgs
# Upload the package to the final location
if [[ "$PACKAGE_TYPE" == conda ]]; then
@@ -29,14 +30,12 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
subdir=$(tar -xOf ./*.bz2 info/index.json | grep subdir | cut -d ':' -f2 | sed -e 's/[[:space:]]//' -e 's/"//g' -e 's/,//')
BACKUP_DIR="conda/${subdir}"
elif [[ "$PACKAGE_TYPE" == libtorch ]]; then
retry conda install -c conda-forge -yq awscli
s3_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
for pkg in $(ls); do
retry aws s3 cp "$pkg" "$s3_dir" --acl public-read
done
BACKUP_DIR="libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
else
retry conda install -c conda-forge -yq awscli
s3_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
retry aws s3 cp "$(ls)" "$s3_dir" --acl public-read
BACKUP_DIR="whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
@@ -44,5 +43,5 @@ fi
if [[ -n "${CIRCLE_TAG:-}" ]]; then
s3_dir="${BACKUP_BUCKET}/${CIRCLE_TAG}/${BACKUP_DIR}"
retry aws s3 cp . "$s3_dir"
retry aws s3 cp --recursive . "$s3_dir"
fi

View File

@@ -41,6 +41,11 @@
no_output_timeout: "1h"
command: |
source "/pytorch/.circleci/scripts/binary_linux_build.sh"
- run:
name: Output binary sizes
no_output_timeout: "1m"
command: |
ls -lah /final_pkgs
- run:
name: save binary size
no_output_timeout: "5m"
@@ -55,6 +60,9 @@
root: /
paths: final_pkgs
- store_artifacts:
path: /final_pkgs
# This should really just be another step of the binary_linux_build job above.
# This isn't possible right now b/c the build job uses the docker executor
# (otherwise they'd be really really slow) but this one uses the machine

View File

@@ -181,7 +181,7 @@ fi
# Patch required to build xla
if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
git clone --recursive https://github.com/pytorch/xla.git
git clone --recursive -b r1.6 https://github.com/pytorch/xla.git
./xla/scripts/apply_patches.sh
fi

View File

@@ -185,9 +185,9 @@ function get_exit_code() {
function file_diff_from_base() {
# The fetch may fail on Docker hosts, but it's not always necessary.
set +e
git fetch origin master --quiet
git fetch origin release/1.6 --quiet
set -e
git diff --name-only "$(git merge-base origin/master HEAD)" > "$1"
git diff --name-only "$(git merge-base origin/release/1.6 HEAD)" > "$1"
}
function get_bazel() {

View File

@@ -289,7 +289,7 @@ test_backward_compatibility() {
pushd test/backward_compatibility
python dump_all_function_schemas.py --filename new_schemas.txt
pip_uninstall torch
pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
pip_install --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_test.html
python check_backward_compatibility.py --new-schemas new_schemas.txt
popd
set +x
@@ -341,8 +341,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 ]]; t
elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
test_bazel
elif [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4* ]]; then
# test cpp extension for xenial + cuda 9.2 + gcc 5.4 to make sure
# cpp extension can be built correctly under this old env
# test cpp extension for xenial + cuda 9.2 + gcc 5.4 to make sure
# cpp extension can be built correctly under this old env
test_cpp_extensions
else
test_torchvision

View File

@@ -1350,7 +1350,6 @@ filegroup(
"caffe2/utils/smart_tensor_printer.cc",
"caffe2/utils/string_utils.cc",
"caffe2/utils/threadpool/ThreadPool.cc",
"caffe2/utils/threadpool/ThreadPoolMobile.cc",
"caffe2/utils/threadpool/pthreadpool.cc",
"caffe2/utils/threadpool/pthreadpool_impl.cc",
],

View File

@@ -481,7 +481,7 @@ if(USE_PYTORCH_QNNPACK)
endif()
if(USE_XNNPACK)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_XNNPACK -DUSE_INTERNAL_THREADPOOL_IMPL")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_XNNPACK")
endif()
if(USE_VULKAN)

View File

@@ -99,6 +99,7 @@ if(ANDROID_ABI)
import_static_lib(libnnpack)
import_static_lib(libXNNPACK)
import_static_lib(libpytorch_qnnpack)
import_static_lib(libpthreadpool)
import_static_lib(libeigen_blas)
import_static_lib(libcpuinfo)
import_static_lib(libclog)
@@ -115,6 +116,7 @@
libnnpack
libXNNPACK
libpytorch_qnnpack
libpthreadpool
libeigen_blas
libcpuinfo
libclog
@@ -129,6 +131,7 @@ else()
nnpack
XNNPACK
pytorch_qnnpack
pthreadpool
cpuinfo
clog
)

View File

@@ -8,8 +8,10 @@
#include "pytorch_jni_common.h"
#if defined(__ANDROID__)
#include <caffe2/utils/threadpool/ThreadPool.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#ifndef USE_PTHREADPOOL
#define USE_PTHREADPOOL
#endif /* USE_PTHREADPOOL */
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#endif
namespace pytorch_jni {
@@ -605,7 +607,7 @@ class PyTorchAndroidJni : public facebook::jni::JavaClass<PyTorchAndroidJni> {
}
static void setNumThreads(facebook::jni::alias_ref<jclass>, jint numThreads) {
caffe2::mobile_threadpool()->setNumThreads(numThreads);
caffe2::pthreadpool()->set_thread_count(numThreads);
}
};
#endif

View File

@@ -56,6 +56,38 @@
- THBoolTensor* mask
- THTensor* source
]]
[[
name: _th_masked_select
cname: maskedSelect
cpu_bool: True
cpu_bfloat16: True
variants:
- function
backends:
- CPU
return: argument 0
arguments:
- arg: THTensor* result
output: True
- THTensor* self
- THByteTensor* mask
]]
[[
name: _th_masked_select_bool
cname: maskedSelectBool
cpu_bool: True
cpu_bfloat16: True
variants:
- function
backends:
- CPU
return: argument 0
arguments:
- arg: THTensor* result
output: True
- THTensor* self
- THBoolTensor* mask
]]
[[
name: _th_nonzero
cname: nonzero

View File

@@ -6,8 +6,7 @@
#ifndef C10_MOBILE
#include <c10/core/thread_pool.h>
#else
#include <caffe2/utils/threadpool/ThreadPool.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#endif // C10_MOBILE
#include <atomic>
@@ -88,15 +87,15 @@ void _run_with_pool(const std::function<void(int, size_t)>& fn, size_t range) {
// Run the first task on the current thread directly.
fn(0, 0);
#else
caffe2::ThreadPool* pool = caffe2::mobile_threadpool();
if (pool) {
// caffe2::ThreadPool can utilize the current thread.
pool->run(fn, range);
} else {
for (size_t i = 0; i < range; ++i) {
fn(0, i);
}
}
caffe2::PThreadPool* const pool = caffe2::pthreadpool();
TORCH_INTERNAL_ASSERT(pool, "Invalid thread pool!");
pool->run(
// PThreadPool::run() is blocking. A std::function [const] reference to
// this lambda cannot go out of scope before PThreadPool::run() returns.
[&fn](const size_t task_id) {
fn(0 /* unused */, task_id);
}, range);
#endif // C10_MOBILE
}
@@ -184,7 +183,7 @@ void init_num_threads() {
#endif
#ifdef C10_MOBILE
caffe2::mobile_threadpool();
caffe2::pthreadpool();
#endif
}
@@ -208,7 +207,9 @@ void set_num_threads(int nthreads) {
}
}
#else
TORCH_CHECK(false, "set_num_threads is not supported for mobile.");
caffe2::PThreadPool* const pool = caffe2::pthreadpool();
TORCH_INTERNAL_ASSERT(pool, "Invalid thread pool!");
pool->set_thread_count(nthreads);
#endif // C10_MOBILE
}
@@ -226,9 +227,9 @@ int get_num_threads() {
return _get_intraop_pool().size() + 1;
}
#else
caffe2::ThreadPool* pool = caffe2::mobile_threadpool();
// caffe2::ThreadPool::getNumThreads() counts the current thread.
return !pool || in_parallel_region() ? 1 /* current thread */ : pool->getNumThreads();
caffe2::PThreadPool* const pool = caffe2::pthreadpool();
TORCH_INTERNAL_ASSERT(pool, "Invalid thread pool!")
return in_parallel_region() ? 1 /* current thread */ : pool->get_thread_count();
#endif // C10_MOBILE
}
@@ -257,8 +258,8 @@ void intraop_launch(std::function<void()> func) {
func();
}
#else
// TODO: caffe2::ThreadPool doesn't support submitting tasks separately and
// running in parallel. Should fix it when this API becomes popular.
// TODO: caffe2::PThreadPool only provides a data-parallel API.
// Task parallelism is not currently supported.
func();
#endif // C10_MOBILE
}
@@ -280,8 +281,8 @@ std::shared_ptr<c10::ivalue::Future> intraop_launch_future(
}
return future;
#else
// TODO: caffe2::ThreadPool doesn't support submitting tasks separately and
// running in parallel. Should fix it when this API becomes popular.
// TODO: caffe2::PThreadPool only provides a data-parallel API.
// Task parallelism is not currently supported.
auto future = std::make_shared<c10::ivalue::Future>(NoneType::get());
func();
future->markCompleted();

View File

@@ -135,6 +135,7 @@ UPTOb( bool , equal , (const Tensor &A, const Tensor &B) )
UPTOb( Tensor, cat , (TensorList A, int64_t B) )
UPTOb( Tensor, cat , (TensorList A, Dimname B) )
UPTOb( Tensor, _cat , (TensorList A, int64_t B) )
UPTOd( Tensor, index_put, (const Tensor &A, TensorList B, const Tensor & C, bool D) )
UPTOb( Tensor, stack , (TensorList A, int64_t B) )
#undef UPTOa

View File

@@ -482,15 +482,16 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
KERNEL(ADD_NS(addcdiv), "addcdiv", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote)
KERNEL(ADD_NS(addcmul), "addcmul", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote)
KERNEL(ADD_NS(atan2), "atan2", Tensor (const Tensor &, const Tensor &), promote)
KERNEL(ADD_NS(cross), "cross", Tensor (const Tensor &, const Tensor &, c10::optional<int64_t>), promote)
KERNEL_UNBOXED_ONLY(ADD_NS(bilinear), "bilinear", Tensor (const Tensor &, const Tensor &, const Tensor &, const Tensor &), promote)
KERNEL_UNBOXED_ONLY(ADD_NS(tensordot), "tensordot", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef), promote)
KERNEL_UNBOXED_ONLY(ADD_NS(dot), "dot", Tensor (const Tensor &, const Tensor &), promote)
KERNEL(ADD_NS(equal), "equal", bool (const Tensor &, const Tensor &), promote)
KERNEL(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote)
KERNEL_UNBOXED_ONLY(ADD_NS(cat), "cat.names", Tensor (TensorList, Dimname), promote)
KERNEL(ADD_NS(_cat), "_cat", Tensor (TensorList, int64_t), promote)
KERNEL(ADD_NS(cross), "cross", Tensor (const Tensor &, const Tensor &, c10::optional<int64_t>), promote)
KERNEL_UNBOXED_ONLY(ADD_NS(dot), "dot", Tensor (const Tensor &, const Tensor &), promote)
KERNEL(ADD_NS(equal), "equal", bool (const Tensor &, const Tensor &), promote)
KERNEL_UNBOXED_ONLY(ADD_NS(index_put), "index_put", Tensor (const Tensor &, TensorList, const Tensor &, bool), promote)
KERNEL(ADD_NS(stack), "stack", Tensor (TensorList, int64_t), promote)
KERNEL_UNBOXED_ONLY(ADD_NS(tensordot), "tensordot", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef), promote)
m.impl_UNBOXED("binary_cross_entropy", &at::autocast::binary_cross_entropy_banned);
}

View File

@@ -188,6 +188,7 @@ namespace c10 {
_(prim, unchecked_unwrap_optional) \
_(aten, __contains__) \
_(prim, BailoutTemplate) \
_(prim, grad) \
_(aten, zero_) \
_(aten, fill_) \
FORALL_ATEN_BASE_SYMBOLS(_) \

View File

@@ -1481,7 +1481,7 @@ inline TypePtr TensorType::fromBoolType() {
inline c10::optional<c10::ScalarType> tryScalarTypeFromJitType(const c10::TypePtr & type) {
if (type == FloatType::get()) {
return at::ScalarType::Double;
return at::typeMetaToScalarType(c10::get_default_dtype());
} else if (type == IntType::get()) {
return at::ScalarType::Long;
} else if (type == BoolType::get()) {

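In practice, sketched below under the stock default dtype of float32, a JIT float type now maps to the current default scalar type instead of unconditionally to double:

#include <ATen/core/jit_type.h>

void jit_type_example() {
  auto st = c10::tryScalarTypeFromJitType(c10::FloatType::get());
  // Previously *st was always at::ScalarType::Double; after this change it
  // follows c10::get_default_dtype(), i.e. at::ScalarType::Float by default.
}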
View File

@ -181,6 +181,10 @@ Allocator* CUDAHooks::getPinnedMemoryAllocator() const {
return at::cuda::getPinnedMemoryAllocator();
}
Allocator* CUDAHooks::getCUDADeviceAllocator() const {
return at::cuda::getCUDADeviceAllocator();
}
bool CUDAHooks::compiledWithCuDNN() const {
return AT_CUDNN_ENABLED();
}

View File

@ -22,6 +22,7 @@ struct CUDAHooks : public at::CUDAHooksInterface {
int64_t current_device() const override;
bool hasPrimaryContext(int64_t device_index) const override;
c10::optional<int64_t> getDevceIndexWithPrimaryContext() const override;
Allocator* getCUDADeviceAllocator() const override;
Allocator* getPinnedMemoryAllocator() const override;
bool compiledWithCuDNN() const override;
bool compiledWithMIOpen() const override;

View File

@ -16,10 +16,15 @@ void destroyCuDNNHandle(cudnnHandle_t handle) {
// happens in fbcode setting. @colesbury and I decided to not destroy
// the handle as a workaround.
// - @soumith
#ifdef NO_CUDNN_DESTROY_HANDLE
#else
cudnnDestroy(handle);
#endif
//
// Further note: this is now disabled globally, because we are seeing
// the same issue as mentioned above in CUDA 11 CI.
// - @zasdfgbnm
//
// #ifdef NO_CUDNN_DESTROY_HANDLE
// #else
// cudnnDestroy(handle);
// #endif
}
using CudnnPoolType = at::cuda::DeviceThreadHandlePool<cudnnHandle_t, createCuDNNHandle, destroyCuDNNHandle>;

View File

@ -121,6 +121,10 @@ struct CAFFE2_API CUDAHooksInterface {
TORCH_CHECK(false, "Pinned memory requires CUDA. ", CUDA_HELP);
}
virtual Allocator* getCUDADeviceAllocator() const {
TORCH_CHECK(false, "CUDADeviceAllocator requires CUDA. ", CUDA_HELP);
}
virtual bool compiledWithCuDNN() const {
return false;
}

View File

@ -262,9 +262,7 @@ auto ConvParams::use_xnnpack(
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& bias) const -> bool {
// Disable the xnnpack operators for both iOS and macOS temporarily due to the crash in pthreadpool
// TODO:T66297472 remove `!defined(__APPLE__)` once we figure out the root cause of the crash.
#if defined(C10_MOBILE) && !defined(__APPLE__)
#if defined(C10_MOBILE)
if (!transposed) {
return (input.size(1) == groups) &&
xnnpack::use_convolution2d(

View File

@ -22,6 +22,32 @@ Tensor & masked_scatter__cpu(Tensor& self, const Tensor & mask, const Tensor & s
}
}
Tensor masked_select_cpu(const Tensor & self, const Tensor & mask) {
namedinference::compute_broadcast_outnames(self, mask);
Tensor b_self, b_mask;
std::tie(b_self, b_mask) = expand_outplace(self, mask, "masked_select");
if (b_mask.dtype() == at::ScalarType::Byte) {
TORCH_WARN("masked_select received a mask with dtype torch.uint8, this behavior is now deprecated," \
"please use a mask with dtype torch.bool instead.");
return legacy::cpu::_th_masked_select(b_self, b_mask);
} else {
return legacy::cpu::_th_masked_select_bool(b_self, b_mask);
}
}
Tensor & masked_select_out_cpu(Tensor & result, const Tensor & self, const Tensor & mask) {
namedinference::compute_broadcast_outnames(self, mask);
Tensor b_self, b_mask;
std::tie(b_self, b_mask) = expand_outplace(self, mask, "masked_select_out");
if (b_mask.dtype() == at::ScalarType::Bool) {
return legacy::cpu::_th_masked_select_bool_out(result, b_self, b_mask);
} else {
return legacy::cpu::_th_masked_select_out(result, b_self, b_mask);
}
}
Tensor argsort(const Tensor & self, int64_t dim, bool descending) {
return std::get<1>(at::sort(self, dim, descending));
}
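For reference, a small sketch of the calling convention these CPU bindings serve; the bool-mask path is the supported one, while a uint8 mask still works but emits the deprecation warning above:

#include <ATen/ATen.h>

void masked_select_example() {
  auto t = at::arange(6).reshape({2, 3});    // [[0, 1, 2], [3, 4, 5]]
  auto mask = t.gt(2);                       // bool mask (preferred)
  auto picked = at::masked_select(t, mask);  // 1-D result: [3, 4, 5]
}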

View File

@ -17,9 +17,7 @@ Tensor linear(const Tensor& input, const Tensor& weight, const Tensor& bias) {
if (input.is_mkldnn()) {
return at::mkldnn_linear(input, weight, bias);
}
// Disable the xnnpack operators for both iOS and macOS temporarily due to the crash in pthreadpool
// TODO:T66297472 remove `!defined(__APPLE__)` once we figure out the root cause of the crash.
#if defined(C10_MOBILE) && !defined(__APPLE__)
#if defined(C10_MOBILE)
if (xnnpack::use_linear(input, weight, bias)) {
return xnnpack::linear(input, weight, bias);
}

View File

@ -58,8 +58,9 @@ bool _nnpack_available() {
#include <nnpack.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/Parallel.h>
namespace at {
namespace native {
@ -87,15 +88,9 @@ static bool init_nnpack() {
}
static pthreadpool_t nnpack_threadpool() {
// Try initializing a threadpool for NNPACK's use. If we fail to
// successfully initialize an implementation, return nullptr which will
// instruct NNPACK to run single threaded.
#ifdef C10_MOBILE
// If building for mobile, use Caffe 2's mobile-friendly threadpool.
return caffe2::mobile_pthreadpool();
return caffe2::pthreadpool_();
#else
// Otherwise, try using pthreadpool if we manage to initialize it successfully.
static pthreadpool_t nnpack_threadpool_ = nullptr;
static bool called_nnpack_threadpool_ = false;

View File

@ -135,9 +135,7 @@ Tensor max_pool2d(
self, kernel_size, stride, padding, dilation, ceil_mode);
}
// Disable the xnnpack operators for both iOS and macOS temporarily due to the crash in pthreadpool
// TODO:T66297472 remove `!defined(__APPLE__)` once we figure out the root cause of the crash.
#if defined(C10_MOBILE) && !defined(__APPLE__)
#if defined(C10_MOBILE)
if(xnnpack::use_max_pool2d(self, kernel_size, padding, stride,
dilation, ceil_mode)) {
return xnnpack::max_pool2d(

View File

@ -34,12 +34,17 @@ static void scatter_gather_dtype_check(
// Test:
// 1. index.size(d) == self.size(d) for all d != dim
// 2. index.size(d) <= src.size(d) for all d != dim
// 3. index.dim() == self.dim() == src.dim()
static void gather_shape_check(const Tensor& self, int64_t dim,
const Tensor& index, const Tensor& src
) {
auto self_dims = ensure_nonempty_dim(self.dim());
TORCH_CHECK(self_dims == ensure_nonempty_dim(index.dim()),
"Index tensor must have the same number of dimensions as out tensor"
);
auto src_dims = ensure_nonempty_dim(src.dim());
TORCH_CHECK(src_dims == ensure_nonempty_dim(index.dim()),
"Index tensor must have the same number of dimensions as input tensor"
);
@ -66,10 +71,16 @@ static void gather_shape_check(const Tensor& self, int64_t dim,
// Tests:
// 1. index.size(d) <= self.size(d) for all d != dim
// 2. index.size(d) <= src.size(d) for all d if src is a Tensor
// 3. index.dim() == self.dim() == src.dim()
static void scatter_shape_check(
const Tensor& self, int64_t dim, const Tensor& index,
const c10::optional<Tensor>& src_opt = c10::nullopt
) {
TORCH_CHECK(
ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()),
"Index tensor must have the same number of dimensions as self tensor"
);
bool is_wrong_shape = false;
int64_t self_dims = ensure_nonempty_dim(self.dim());
@ -97,6 +108,12 @@ static void scatter_shape_check(
if (src_opt.has_value()) {
auto src = src_opt.value();
TORCH_CHECK(
ensure_nonempty_dim(src.dim()) == ensure_nonempty_dim(index.dim()),
"Index tensor must have the same number of dimensions as src tensor"
);
TORCH_CHECK(!is_wrong_shape,
"Expected index ", index.sizes(),
" to be smaller than self ", self.sizes(),

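A minimal sketch of the behavior these checks enforce: the index tensor must have the same dimensionality as the other operands, while the per-dimension size rules stay as described in the comments above:

#include <ATen/ATen.h>

void gather_dim_example() {
  auto src = at::zeros({3, 4});
  auto idx2d = at::zeros({1, 4}, at::kLong);
  auto out = at::gather(src, 0, idx2d);  // same number of dims: accepted
  // auto idx1d = at::zeros({4}, at::kLong);
  // at::gather(src, 0, idx1d);  // now throws: "Index tensor must have the
  //                             // same number of dimensions as input tensor"
}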
View File

@ -71,8 +71,6 @@ DEFINE_DISPATCH(index_put_stub);
DEFINE_DISPATCH(index_put_accum_stub);
DEFINE_DISPATCH(masked_fill_stub);
REGISTER_NO_CPU_DISPATCH(index_put_accum_stub, index_put_accum_fn);
DEFINE_DISPATCH(masked_select_serial_stub);
DEFINE_DISPATCH(masked_select_stub);
DEFINE_DISPATCH(gather_stub);
DEFINE_DISPATCH(scatter_stub);
@ -629,82 +627,6 @@ Tensor masked_fill(const Tensor & self, const Tensor & mask, const Tensor & sour
return result;
}
static Tensor & masked_select_out_impl_cpu(Tensor & result, const Tensor & self, const Tensor & mask) {
NoNamesGuard guard;
TORCH_CHECK(mask.scalar_type() == ScalarType::Byte || mask.scalar_type() == ScalarType::Bool,
"masked_select: expected BoolTensor or ByteTensor for mask");
TORCH_CHECK(self.scalar_type() == result.scalar_type(),
"masked_select(): self and result must have the same scalar type");
if (mask.dtype() == at::ScalarType::Byte) {
TORCH_WARN("masked_select received a mask with dtype torch.uint8, this behavior is now deprecated," \
"please use a mask with dtype torch.bool instead.");
}
Tensor _mask, _self;
std::tie(_mask, _self) = expand_outplace(mask, self);
auto shape = _self.sizes();
int64_t numel = _mask.sum().item().toLong();
result.resize_({numel});
if (numel == 0) {
return result;
}
// Create strided view of result before feeding into TensorIterator
auto strides = DimVector(shape.size(), 0);
auto result_strided = result.as_strided(shape, strides);
// serial kernel
bool use_serial_kernel = self.numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1;
if (use_serial_kernel) {
auto iter = TensorIteratorConfig()
.check_all_same_dtype(false)
.resize_outputs(false)
.add_output(result_strided)
.add_input(_self)
.add_input(_mask)
.build();
masked_select_serial_stub(iter.device_type(), iter);
return result;
}
// Use a prefix sum to record the output locations of the masked elements,
// so that the copy can run in parallel through TensorIterator.
auto mask_long = at::empty(shape, self.options().dtype(at::kLong)).copy_(_mask);
auto mask_prefix_sum = at::empty(shape, self.options().dtype(at::kLong));
auto mask_long_data = mask_long.data_ptr<int64_t>();
auto mask_prefix_sum_data = mask_prefix_sum.data_ptr<int64_t>();
// TODO: C++14 limits us to std::partial_sum here; switch to
// std::exclusive_scan, which has better performance, once PyTorch upgrades to C++17.
// std::exclusive_scan(mask_long_data, mask_long_data + mask_long.numel(), mask_prefix_sum_data, 0);
std::partial_sum(mask_long_data, mask_long_data + mask_long.numel(), mask_prefix_sum_data);
auto iter = TensorIteratorConfig()
.check_all_same_dtype(false)
.resize_outputs(false)
.add_output(result_strided)
.add_input(_self)
.add_input(_mask)
.add_input(mask_prefix_sum)
.build();
masked_select_stub(iter.device_type(), iter);
return result;
}
Tensor & masked_select_out_cpu(Tensor & result, const Tensor & self, const Tensor & mask) {
namedinference::compute_broadcast_outnames(self, mask);
return masked_select_out_impl_cpu(result, self, mask);
}
Tensor masked_select_cpu(const Tensor & self, const Tensor & mask) {
Tensor result = at::empty({0}, self.options());
return masked_select_out_cpu(result, self, mask);
}
Tensor _gather_sparse_backward(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& grad){
// special case scalar input and/or index
if (self.ndimension() == 0) return at::_sparse_coo_tensor_unsafe(at::empty({0,grad.numel()}, index.options()), grad, self.sizes());

View File

@ -15,7 +15,6 @@ using index_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRe
using index_put_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides, bool accumulate);
using index_put_accum_fn = void(*)(Tensor &, TensorList , const Tensor &, bool unsafe);
using masked_fill_fn = void(*)(TensorIterator &, Scalar scalar);
using masked_select_fn = void(*)(TensorIterator &);
using gather_fn = void (*)(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index);
using scatter_fn = void(*)(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
@ -26,8 +25,6 @@ DECLARE_DISPATCH(index_fn, index_stub);
DECLARE_DISPATCH(index_put_fn, index_put_stub);
DECLARE_DISPATCH(index_put_accum_fn, index_put_accum_stub);
DECLARE_DISPATCH(masked_fill_fn, masked_fill_stub);
DECLARE_DISPATCH(masked_select_fn, masked_select_serial_stub);
DECLARE_DISPATCH(masked_select_fn, masked_select_stub);
DECLARE_DISPATCH(gather_fn, gather_stub);
DECLARE_DISPATCH(scatter_fn, scatter_stub);

View File

@ -355,13 +355,12 @@ TensorOptions infer_full_options(
if (!options.has_dtype()) {
if (fill_value.isIntegral(true)) {
TORCH_WARN_ONCE(
"Deprecation warning: In a future PyTorch release torch.full ",
"will no longer return tensors of floating dtype by default. ",
"Instead, a bool fill_value will return a tensor of torch.bool dtype, ",
"and an integral fill_value will return a tensor of torch.long dtype. ",
"Set the optional `dtype` or `out` arguments to suppress this warning."
);
TORCH_CHECK(false,
"Providing a bool or integral fill value without setting the optional ",
"`dtype` or `out` arguments is currently unsupported. In PyTorch 1.7, ",
"when `dtype` and `out` are not set a bool fill value will ",
"return a tensor of torch.bool dtype, and an integral fill value ",
"will return a tensor of torch.long dtype.");
} else if (fill_value.isComplex()) {
auto scalar_type = (get_default_dtype() == ScalarType::Double) ?
ScalarType::ComplexDouble :

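A minimal sketch of the new contract for at::full: an integral fill value now requires an explicit dtype until the 1.7 behavior lands:

#include <ATen/ATen.h>

void full_example() {
  // auto bad = at::full({2, 2}, 7);        // TORCH_CHECK failure after this change
  auto l = at::full({2, 2}, 7, at::kLong);  // explicit dtype: accepted
  auto f = at::full({2, 2}, 7.5);           // floating fill value: unchanged
}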
View File

@ -706,8 +706,9 @@ TensorIterator TensorIterator::unary_op(Tensor& out, const Tensor& a,
.set_check_mem_overlap(check_mem_overlap)
.add_output(out)
.add_input(a)
.cast_common_dtype_to_outputs(true)
.enforce_safe_casting_to_output(true)
.cast_common_dtype_to_outputs(false)
.enforce_safe_casting_to_output(false)
.check_all_same_dtype(true)
.build();
}
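A minimal sketch of what flipping these flags means for out-variants of unary ops: the output must already carry the input's dtype, rather than being cast into silently:

#include <ATen/ATen.h>

void unary_out_example() {
  auto x = at::rand({4});     // float32
  auto out = at::empty({4});  // float32: accepted
  at::neg_out(out, x);
  // auto out64 = at::empty({4}, at::kDouble);
  // at::neg_out(out64, x);  // now throws: all tensors must share one dtype
}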

View File

@ -762,7 +762,12 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {
Tensor xtensor = self.expand(padded_size);
Tensor result = at::empty(target_size, self.options());
Tensor result;
if (self.is_quantized()) {
result = at::empty_quantized(target_size, self);
} else {
result = at::empty(target_size, self.options());
}
// return an empty tensor if one of the repeat dimensions is zero
if (zero_tensor) {

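A minimal sketch, assuming a build with quantized CPU support, of why the quantized branch matters: the result of repeat() now inherits the input's quantization parameters:

#include <ATen/ATen.h>

void quantized_repeat_example() {
  auto x = at::rand({2, 2});
  auto qx = at::quantize_per_tensor(x, /*scale=*/0.1, /*zero_point=*/0, at::kQUInt8);
  // Allocation goes through at::empty_quantized, so qy keeps qx's scale
  // and zero point:
  auto qy = qx.repeat({2, 1});  // quantized tensor of shape {4, 2}
}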
View File

@ -67,7 +67,7 @@ static inline Tensor& unary_op_impl_with_complex_to_float_out(Tensor& result, co
// Copies the complex result to the actual result and returns it
result.resize_(complex_result.sizes());
result.copy_(complex_result);
result.copy_(at::real(complex_result));
return result;
}

View File

@ -163,90 +163,11 @@ void masked_fill_kernel(TensorIterator& iter, Scalar value) {
});
}
template <typename scalar_t, typename mask_t, typename func_t>
void cpu_masked_select_serial_kernel(TensorIterator& iter, const func_t& f) {
auto is_mask_bool = std::is_same<mask_t, bool>::value;
int64_t offset = 0;
auto loop = [&](char** data, const int64_t* strides, int64_t n) {
char* dst = data[0];
char* src = data[1];
char* mask = data[2];
for (int64_t i = 0; i < n; i++) {
mask_t mask_value = *(mask_t*)(mask + strides[2] * i);
if (!is_mask_bool) {
TORCH_CHECK(mask_value == 0 || mask_value == 1, "Mask tensor can take 0 and 1 values only");
}
if (mask_value) {
int64_t offset_bytes = offset * sizeof(scalar_t);
f(dst, src + strides[1] * i, offset_bytes);
offset++;
}
}
};
iter.serial_for_each(loop, {0, iter.numel()});
}
void masked_select_serial_kernel(TensorIterator& iter) {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16,
iter.dtype(), "masked_select", [&] {
auto mask_dtype = iter.input_dtype(1);
if (mask_dtype == at::ScalarType::Bool) {
cpu_masked_select_serial_kernel<scalar_t, bool>(iter, [](char* dst, char* src, int64_t offset) {
*(scalar_t*)(dst + offset) = *(scalar_t*)src;
});
} else {
cpu_masked_select_serial_kernel<scalar_t, unsigned char>(iter, [](char* dst, char* src, int64_t offset) {
*(scalar_t*)(dst + offset) = *(scalar_t*)src;
});
}
});
}
template <typename scalar_t, typename mask_t, typename func_t>
void cpu_masked_select_kernel(TensorIterator& iter, const func_t& f) {
auto is_mask_bool = std::is_same<mask_t, bool>::value;
auto loop = [&](char** data, const int64_t* strides, int64_t n) {
char* dst = data[0];
char* src = data[1];
char* mask = data[2];
char* mask_prefix_sum = data[3];
for (int64_t i = 0; i < n; i++) {
mask_t mask_value = *(mask_t*)(mask + strides[2] * i);
if (!is_mask_bool) {
TORCH_CHECK(mask_value == 0 || mask_value == 1, "Mask tensor can take 0 and 1 values only");
}
if (mask_value) {
int64_t offset = *(int64_t*)(mask_prefix_sum + strides[3] * i);
int64_t offset_bytes = (offset - 1) * sizeof(scalar_t);
f(dst, src + strides[1] * i, offset_bytes);
}
}
};
iter.for_each(loop);
}
void masked_select_kernel(TensorIterator& iter) {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16,
iter.dtype(), "masked_select", [&] {
auto mask_dtype = iter.input_dtype(1);
if (mask_dtype == at::ScalarType::Bool) {
cpu_masked_select_kernel<scalar_t, bool>(iter, [](char* dst, char* src, int64_t offset) {
*(scalar_t*)(dst + offset) = *(scalar_t*)src;
});
} else {
cpu_masked_select_kernel<scalar_t, unsigned char>(iter, [](char* dst, char* src, int64_t offset) {
*(scalar_t*)(dst + offset) = *(scalar_t*)src;
});
}
});
}
} // anonymous namespace
REGISTER_DISPATCH(index_stub, &index_kernel);
REGISTER_DISPATCH(index_put_stub, &index_put_kernel);
REGISTER_DISPATCH(masked_fill_stub, &masked_fill_kernel);
REGISTER_DISPATCH(masked_select_serial_stub, &masked_select_serial_kernel);
REGISTER_DISPATCH(masked_select_stub, &masked_select_kernel);
}} // namespace at::native

View File

@ -1127,6 +1127,12 @@
variants: method
device_guard: False
- func: empty_quantized(int[] size, Tensor qtensor) -> Tensor
variants: function
dispatch:
QuantizedCPU: empty_quantized
QuantizedCUDA: empty_quantized
- func: empty.out(int[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
device_guard: False
@ -5108,6 +5114,8 @@
dispatch:
CPU: unfold
CUDA: unfold
QuantizedCPU: unfold
QuantizedCUDA: unfold
- func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor
variants: function

View File

@ -76,5 +76,28 @@ Tensor empty_per_channel_affine_quantized_other_backends_stub(
TORCH_CHECK(false, "Creation of quantized tensor requires quantized dtype like torch.quint8");
}
// Create an empty quantized Tensor with size, based on the options
// and quantization parameters of the input quantized Tensor
Tensor empty_quantized(IntArrayRef size, const Tensor& qtensor) {
Tensor output;
if (qtensor.qscheme() == kPerTensorAffine) {
output = at::_empty_affine_quantized(size, qtensor.options(),
qtensor.q_scale(),
qtensor.q_zero_point());
} else if (qtensor.qscheme() == kPerChannelAffine) {
output = at::_empty_per_channel_affine_quantized(
size,
qtensor.q_per_channel_scales(),
qtensor.q_per_channel_zero_points(),
qtensor.q_per_channel_axis(),
qtensor.options());
} else {
TORCH_CHECK(false,
"QScheme not supported by empty_quantized:",
toString(qtensor.qscheme()));
}
return output;
}
} // namespace native
} // namespace at

View File

@ -5,7 +5,7 @@
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <ATen/native/quantized/cpu/quantized_ops.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <c10/util/math_compat.h>
#include <algorithm>
@ -375,7 +375,7 @@ Tensor qnnpack_avg_pool2d(
CAFFE_ENFORCE(
setupStatus == pytorch_qnnp_status_success,
"failed to setup QNNPACK Average Pooling operator");
pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
pthreadpool_t threadpool = caffe2::pthreadpool_();
const pytorch_qnnp_status runStatus =
pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
TORCH_INTERNAL_ASSERT(

View File

@ -5,7 +5,6 @@
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <ATen/native/quantized/cpu/quantized_ops.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <c10/util/math_compat.h>
#include <algorithm>

View File

@ -7,7 +7,7 @@
#include <ATen/native/quantized/cpu/quantized_ops.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <algorithm>
@ -194,7 +194,7 @@ Tensor qnnpack_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
setupStatus == pytorch_qnnp_status_success,
"failed to setup QNNPACK Add operator");
pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
pthreadpool_t threadpool = caffe2::pthreadpool_();
const pytorch_qnnp_status runStatus =
pytorch_qnnp_run_operator(qnnpack_operator, threadpool);

View File

@ -8,7 +8,7 @@
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <c10/core/TensorOptions.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <algorithm>
@ -82,7 +82,7 @@ Tensor quantized_channel_shuffle_impl(
setupStatus == pytorch_qnnp_status_success,
"failed to setup QNNPACK ChannelShuffle operator");
pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
pthreadpool_t threadpool = caffe2::pthreadpool_();
const pytorch_qnnp_status runStatus =
pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
TORCH_INTERNAL_ASSERT(

View File

@ -7,7 +7,7 @@
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <ATen/quantized/Quantizer.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <algorithm>
@ -64,7 +64,7 @@ Tensor qnnpack_clamp(Tensor input, Scalar min, Scalar max) {
TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success,
"failed to setup QNNPACK Clamp operator");
pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
pthreadpool_t threadpool = caffe2::pthreadpool_();
const pytorch_qnnp_status runStatus =
pytorch_qnnp_run_operator(clamp_op, threadpool);

View File

@ -10,7 +10,7 @@
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <ATen/native/quantized/cpu/quant_utils.h>
#include <ATen/native/quantized/cpu/conv_packed_params.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
template <int kSpatialDim = 2>
bool ConvDimChecks(
@ -603,7 +603,7 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
output_min,
output_max,
reinterpret_cast<uint8_t*>(output.template data_ptr<c10::quint8>()),
caffe2::mobile_pthreadpool());
caffe2::pthreadpool_());
TORCH_INTERNAL_ASSERT(
run_status == pytorch_qnnp_status_success,

View File

@ -5,7 +5,7 @@
#include <ATen/native/quantized/cpu/quantized_ops.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <algorithm>
@ -57,7 +57,7 @@ Tensor qnnpack_hardsigmoid(Tensor input) {
TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success,
"failed to setup QNNPACK Hardsigmoid operator");
pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
pthreadpool_t threadpool = caffe2::pthreadpool_();
const pytorch_qnnp_status runStatus =
pytorch_qnnp_run_operator(hardsigmoid_op, threadpool);

View File

@ -5,7 +5,7 @@
#include <ATen/native/quantized/cpu/quantized_ops.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <algorithm>
@ -51,7 +51,7 @@ Tensor qnnpack_hardswish(const Tensor& qx, Tensor& qy) {
TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success,
"failed to setup QNNPACK Hardswish operator");
pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
pthreadpool_t threadpool = caffe2::pthreadpool_();
const pytorch_qnnp_status runStatus =
pytorch_qnnp_run_operator(hardswish_op, threadpool);

View File

@ -4,7 +4,7 @@
#include <ATen/native/quantized/cpu/fbgemm_utils.h>
#include <ATen/native/quantized/cpu/packed_params.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <torch/custom_class.h>
#include <torch/library.h>
@ -341,7 +341,9 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl(
packB->getPackedWeights(),
(uint8_t*)output.data_ptr<c10::quint8>(),
rows_w /* output_stride */,
caffe2::mobile_pthreadpool() /* threadpool */);
// TODO (Ashkan): Disabling temporarily.
// Throws a floating point exception with OSS pthreadpool.
caffe2::pthreadpool_() /* threadpool */);
TORCH_INTERNAL_ASSERT(
runStatus == pytorch_qnnp_status_success,

View File

@ -5,7 +5,7 @@
#include <ATen/native/quantized/cpu/packed_params.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <ATen/native/quantized/cpu/quant_utils.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <torch/library.h>
#include <torch/custom_class.h>
@ -241,8 +241,17 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
// Calculate statistics for quantization of input Tensor
// TODO: optimized kernel
float x_min = input_contig.min().item<float>();
float x_max = input_contig.max().item<float>();
float x_min;
float x_max;
if (input.numel() > 0) {
x_min = input_contig.min().item<float>();
x_max = input_contig.max().item<float>();
} else {
// On empty input, no output data will be generated,
// so use arbitrary qparams.
x_min = 0;
x_max = 0;
}
auto q_params = quant_utils::ChooseQuantizationParams(
/*min=*/x_min,
@ -327,7 +336,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
bias_ptr,
output.data_ptr<float>(),
rows_w /* output_stride */,
caffe2::mobile_pthreadpool() /* threadpool */);
caffe2::pthreadpool_() /* threadpool */);
TORCH_INTERNAL_ASSERT(
runStatus == pytorch_qnnp_status_success,

View File

@ -100,6 +100,12 @@ enum pytorch_qnnp_status qnnpackLinearDynamic(
.ukernel = pytorch_qnnp_params.q8conv.gemm_dq,
};
if (output_size == 0) {
// pthreadpool can tolerate a range of 0, but not a tile of 0.
// We use output_size as a tile size, so bail here if it's 0.
return pytorch_qnnp_status_success;
}
pthreadpool_compute_4d_tiled(
threadpool,
(pthreadpool_function_4d_tiled_t)compute_q8gemm_dq,

View File

@ -98,6 +98,12 @@ enum pytorch_qnnp_status qnnpackLinear(
.ukernel = pytorch_qnnp_params.q8conv.gemm,
};
if (output_size == 0) {
// pthreadpool can tolerate a range of 0, but not a tile of 0.
// We use output_size as a tile size, so bail here if it's 0.
return pytorch_qnnp_status_success;
}
pthreadpool_compute_4d_tiled(
threadpool,
(pthreadpool_function_4d_tiled_t) compute_q8gemm,

View File

@ -9,7 +9,7 @@
#include <ATen/native/quantized/cpu/quantized_ops.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <algorithm>
#include <vector>
@ -346,7 +346,7 @@ void check_maxpool2d_params(
setupStatus == pytorch_qnnp_status_success,
"failed to setup QNNPACK MaxPool operator");
pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
pthreadpool_t threadpool = caffe2::pthreadpool_();
const pytorch_qnnp_status runStatus =
pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
TORCH_INTERNAL_ASSERT(

View File

@ -3,7 +3,7 @@
#include <ATen/NativeFunctions.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
namespace at {
namespace native {
@ -66,7 +66,7 @@ Tensor qnnpack_mean(const Tensor& input, IntArrayRef dim) {
CAFFE_ENFORCE(
setupStatus == pytorch_qnnp_status_success,
"failed to setup QNNPACK Global Average Pooling operator");
pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
pthreadpool_t threadpool = caffe2::pthreadpool_();
const pytorch_qnnp_status runStatus =
pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
TORCH_INTERNAL_ASSERT(

View File

@ -6,7 +6,7 @@
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <ATen/native/quantized/cpu/quantized_ops.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <torch/library.h>
#include <algorithm>
@ -69,7 +69,7 @@ Tensor qnnpack_relu(Tensor input) {
setupStatus == pytorch_qnnp_status_success,
"failed to setup QNNPACK Relu operator");
pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
pthreadpool_t threadpool = caffe2::pthreadpool_();
const pytorch_qnnp_status runStatus =
pytorch_qnnp_run_operator(qnnpack_operator, threadpool);

View File

@ -7,7 +7,7 @@
#include <ATen/native/quantized/cpu/quantized_ops.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <algorithm>
@ -66,7 +66,7 @@ Tensor qnnpack_sigmoid(Tensor input) {
TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success,
"failed to setup QNNPACK sigmoid operator");
pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
pthreadpool_t threadpool = caffe2::pthreadpool_();
const pytorch_qnnp_status runStatus =
pytorch_qnnp_run_operator(sigmoid_op, threadpool);

View File

@ -7,7 +7,7 @@
#include <ATen/native/quantized/cpu/quantized_ops.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <algorithm>
@ -64,7 +64,7 @@ Tensor qnnpack_tanh(Tensor input) {
TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success,
"failed to setup QNNPACK TanH operator");
pthreadpool_t threadpool = caffe2::mobile_pthreadpool();
pthreadpool_t threadpool = caffe2::pthreadpool_();
const pytorch_qnnp_status runStatus =
pytorch_qnnp_run_operator(tanh_op, threadpool);

View File

@ -5,7 +5,7 @@
#ifdef USE_XNNPACK
#include <xnnpack.h>
#include <caffe2/utils/threadpool/ThreadPoolXNNPACK.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
namespace at {
namespace native {

View File

@ -208,15 +208,15 @@ Tensor run(
padded_input_nhwc.size(Layout::Activation4D::width), // input_width
padded_input_nhwc.data_ptr<float>(), // input
output.data_ptr<float>(), // output
caffe2::xnnpack_threadpool()); // threadpool
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_convolution2d_nhwc_f32 failed!");
const xnn_status run_status = xnn_run_operator(
context.op.get(), // operator
caffe2::xnnpack_threadpool()); // threadpool
context.op.get(), // operator
caffe2::pthreadpool_()); // threadpool
TORCH_INTERNAL_ASSERT(
xnn_status_success == run_status,

View File

@ -137,15 +137,15 @@ Tensor run(
Layout::ActivationND::batch(padded_input.sizes()), // Batch,
padded_input.data_ptr<float>(), // input
output.data_ptr<float>(), // output
caffe2::xnnpack_threadpool()); // threadpool
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_fully_connected_nc_f32 failed!");
const xnn_status run_status = xnn_run_operator(
context.op.get(), // operator
caffe2::xnnpack_threadpool()); // threadpool
context.op.get(), // operator
caffe2::pthreadpool_()); // threadpool
TORCH_INTERNAL_ASSERT(
xnn_status_success == run_status,

View File

@ -219,15 +219,15 @@ Tensor max_pool2d(
input_padded_contig_nhwc.size(Layout::Activation4D::width), // input_width
input_padded_contig_nhwc.data_ptr<float>(), // input
output_padded_contig_nhwc.data_ptr<float>(), // output
caffe2::xnnpack_threadpool()); // threadpool
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_max_pooling2d_nhwc_f32 failed!");
const xnn_status run_status = xnn_run_operator(
max_pool_op, // operator
caffe2::xnnpack_threadpool()); // threadpool
max_pool_op, // operator
caffe2::pthreadpool_()); // threadpool
TORCH_INTERNAL_ASSERT(
xnn_status_success == run_status,

View File

@ -4,10 +4,10 @@
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
#include <ATen/core/Tensor.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/native/quantized/affine_quantizer.h>
#include <ATen/quantized/QTensorImpl.h>
#include <c10/core/Allocator.h>
#include <c10/core/CPUAllocator.h>
#include <cmath>
#include <typeinfo>
@ -66,7 +66,9 @@ inline Tensor new_qtensor(
const TensorOptions& options,
QuantizerPtr quantizer) {
auto memory_format = options.memory_format_opt().value_or(MemoryFormat::Contiguous);
at::Allocator* allocator = GetAllocator(options.device().type());
at::Allocator* allocator = options.device().type() == DeviceType::CUDA
? at::detail::getCUDAHooks().getCUDADeviceAllocator()
: at::getCPUAllocator();
#ifdef USE_PYTORCH_QNNPACK
if (at::globalContext().qEngine() == at::QEngine::QNNPACK) {

View File

@ -99,6 +99,50 @@ accreal THTensor_(dot)(THTensor *tensor, THTensor *src)
#if !defined(TH_REAL_IS_HALF) /* non half part */
void THTensor_(maskedSelect)(THTensor *tensor, THTensor *src, THByteTensor *mask)
{
at::NoNamesGuard guard;
ptrdiff_t numel = THTensor_wrap(mask).sum().item<int64_t>();
scalar_t *tensor_data;
#ifdef DEBUG
THAssert(numel <= LONG_MAX);
#endif
THTensor_(resize1d)(tensor,numel);
tensor_data = tensor->data<scalar_t>();
TH_TENSOR_APPLY2(scalar_t, src, unsigned char, mask,
if (*mask_data > 1)
{
THFree(mask_counter);
THFree(src_counter);
THError("Mask tensor can take 0 and 1 values only");
}
else if (*mask_data == 1)
{
*tensor_data = *src_data;
tensor_data++;
});
}
void THTensor_(maskedSelectBool)(THTensor *tensor, THTensor *src, THBoolTensor *mask)
{
at::NoNamesGuard guard;
ptrdiff_t numel = THTensor_wrap(mask).sum().item<int64_t>();
scalar_t *tensor_data;
#ifdef DEBUG
THAssert(numel <= LONG_MAX);
#endif
THTensor_(resize1d)(tensor,numel);
tensor_data = tensor->data<scalar_t>();
TH_TENSOR_APPLY2(scalar_t, src, bool, mask,
if (*mask_data)
{
*tensor_data = *src_data;
tensor_data++;
});
}
void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src )
{
THTensor *srct = THTensor_(newContiguous)(src);

View File

@ -9,6 +9,8 @@ TH_API int THTensor_(equal)(THTensor *ta, THTensor *tb);
#if !defined(TH_REAL_IS_HALF)
TH_API void THTensor_(maskedSelect)(THTensor *tensor, THTensor* src, THByteTensor *mask);
TH_API void THTensor_(maskedSelectBool)(THTensor *tensor, THTensor* src, THBoolTensor *mask);
TH_API void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src);
TH_API void THTensor_(maskedCopyBool)(THTensor *tensor, THBoolTensor *mask, THTensor* src);

View File

@ -155,10 +155,16 @@ public:
static std::tuple<int, int> priority_range() {
#ifndef __HIP_PLATFORM_HCC__
// Note: this returns the range of priority **supported by PyTorch**, not
// the range of priority **supported by CUDA**. The former is a subset of
// the latter. Currently PyTorch only supports 0 and -1, which are "low" and
// "high" priority.
int least_priority, greatest_priority;
C10_CUDA_CHECK(
cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
return std::make_tuple(least_priority, greatest_priority);
TORCH_INTERNAL_ASSERT(least_priority >= 0, "Unexpected CUDA stream priority range");
TORCH_INTERNAL_ASSERT(greatest_priority <= -1, "Unexpected CUDA stream priority range");
return std::make_tuple(0, -1);
#else
AT_ERROR("cuDeviceGetStreamPriorityRange with HIP is not supported");
#endif
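A minimal sketch, assuming a CUDA build, of the clamped contract callers now see:

#include <c10/cuda/CUDAStream.h>
#include <tuple>

void priority_example() {
  int least, greatest;
  std::tie(least, greatest) = c10::cuda::CUDAStream::priority_range();
  // least == 0 (low/default) and greatest == -1 (high), regardless of the
  // wider range the device itself may advertise.
  auto s = c10::cuda::getStreamFromPool(/*isHighPriority=*/true);
}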

View File

@ -87,7 +87,6 @@ endif()
# Note: the folders that are being commented out have not been properly
# addressed yet.
# For pthreadpool_new_if_impl. TODO: Remove when threadpools are unitied.
if(NOT MSVC AND USE_XNNPACK)
if(NOT TARGET fxdiv)
set(FXDIV_BUILD_TESTS OFF CACHE BOOL "")
@ -96,10 +95,6 @@ if(NOT MSVC AND USE_XNNPACK)
"${FXDIV_SOURCE_DIR}"
"${CMAKE_BINARY_DIR}/FXdiv")
endif()
if(NOT (INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE))
set_source_files_properties(
utils/threadpool/pthreadpool_new_if_impl.c PROPERTIES COMPILE_FLAGS -fno-openmp)
endif()
endif()
add_subdirectory(core)

View File

@ -818,6 +818,67 @@ c10::optional<int> OperatorBase::argumentIndexWithName(
#endif
}
bool OperatorBase::RunAsync(int stream_id) {
try {
auto result = Run(stream_id);
if (result) {
if (HasAsyncPart()) {
RecordEvent();
} else {
SetEventFinished();
}
} else {
SetEventFinished(getErrorMsg().c_str());
}
return result;
} catch (EnforceNotMet& err) {
SetEventFinishedWithException(err.what());
throw;
} catch (const std::exception& err) {
SetEventFinishedWithException(err.what());
throw;
} catch (...) {
SetEventFinishedWithException(getErrorMsg().c_str());
throw;
}
}
void OperatorBase::AddRelatedBlobInfo(EnforceNotMet* err) {
CAFFE_ENFORCE(
isLegacyOperator(),
"AddRelatedBlobInfo(err) not supported for operators exported to c10.");
if (!has_debug_def()) {
return;
}
bool found_input = false;
bool found_output = false;
if (err->caller() != nullptr) {
std::ostringstream oss;
for (size_t i = 0; i < inputs_.size(); i++) {
if (inputs_[i]->GetRaw() == err->caller()) {
found_input = true;
oss << "while accessing input: " << debug_def().input(i);
break;
}
}
for (size_t i = 0; i < outputs_.size(); i++) {
if (outputs_[i]->GetRaw() == err->caller()) {
found_output = true;
if (found_input) {
oss << " OR ";
}
oss << "while accessing output: " << debug_def().output(i);
break;
}
}
if (found_input || found_output) {
err->add_context(oss.str());
}
}
}
OperatorBase::~OperatorBase() noexcept = default;
#ifndef C10_MOBILE

View File

@ -480,70 +480,13 @@ class CAFFE2_API OperatorBase : public Observable<OperatorBase> {
virtual void CancelAsyncCallback() {}
// RunAsync, if implemenented by the specific operators, will schedule the
// RunAsync, if implemented by the specific operators, will schedule the
// computation on the corresponding context and record the event in its
// event_ member object. If the specific operator does not support RunAsync,
// it will simply be synchronous as a fallback.
virtual bool RunAsync(int stream_id = 0) {
try {
auto result = Run(stream_id);
if (result) {
if (HasAsyncPart()) {
RecordEvent();
} else {
SetEventFinished();
}
} else {
SetEventFinished(getErrorMsg().c_str());
}
return result;
} catch (EnforceNotMet& err) {
SetEventFinishedWithException(err.what());
throw;
} catch (const std::exception& err) {
SetEventFinishedWithException(err.what());
throw;
} catch (...) {
SetEventFinishedWithException(getErrorMsg().c_str());
throw;
}
}
virtual bool RunAsync(int stream_id = 0);
virtual void AddRelatedBlobInfo(EnforceNotMet* err) {
CAFFE_ENFORCE(
isLegacyOperator(),
"AddRelatedBlobInfo(err) not supported for operators exported to c10.");
if (!has_debug_def()) {
return;
}
bool found_input = false;
bool found_output = false;
if (err->caller() != nullptr) {
std::ostringstream oss;
for (size_t i = 0; i < inputs_.size(); i++) {
if (inputs_[i]->GetRaw() == err->caller()) {
found_input = true;
oss << "while accessing input: " << debug_def().input(i);
break;
}
}
for (size_t i = 0; i < outputs_.size(); i++) {
if (outputs_[i]->GetRaw() == err->caller()) {
found_output = true;
if (found_input) {
oss << " OR ";
}
oss << "while accessing output: " << debug_def().output(i);
break;
}
}
if (found_input || found_output) {
err->add_context(oss.str());
}
}
}
virtual void AddRelatedBlobInfo(EnforceNotMet* err);
virtual std::string debug_info_string() const {
return "";

View File

@ -3,6 +3,25 @@
namespace caffe2 {
OpSchema::OpSchema(const string& type, const string& file, const int line)
: type_(type), file_(file), line_(line), tensor_inference_function_(
[](const OperatorDef& def, const vector<TensorShape>&) {
vector<TensorShape> out;
for (int i = 0; i < def.output_size(); i++) {
TensorShape ts;
ts.set_unknown_shape(true);
out.push_back(ts);
}
return out;
}), device_inference_function_(
[](const OperatorDef& def) {
auto op_device =
def.has_device_option() ? def.device_option() : DeviceOption();
vector<DeviceOption> in_dev(def.input_size(), op_device);
vector<DeviceOption> out_dev(def.output_size(), op_device);
return std::make_pair(in_dev, out_dev);
}) {}
bool OpSchema::Verify(const OperatorDef& def) const {
// Check the number of inputs.
if (def.input_size() < min_input_ || def.input_size() > max_input_) {

View File

@ -39,9 +39,8 @@ constexpr int kCannotComputeNumOutputs = -1;
*/
class CAFFE2_API OpSchema {
public:
OpSchema() : type_("unknown"), file_("unknown"), line_(0) {}
OpSchema(const string& type, const string& file, const int line)
: type_(type), file_(file), line_(line) {}
OpSchema() : OpSchema("unknown", "unknown", 0) {}
OpSchema(const string& type, const string& file, const int line);
/**
* @brief Returns the file that the op schema is registered from.
@ -443,25 +442,9 @@ class CAFFE2_API OpSchema {
std::function<bool(int, int)> inplace_enforced_ = [](int, int) {
return false;
};
TensorInferenceFunctionType tensor_inference_function_ =
[](const OperatorDef& def, const vector<TensorShape>&) {
vector<TensorShape> out;
for (int i = 0; i < def.output_size(); i++) {
TensorShape ts;
ts.set_unknown_shape(true);
out.push_back(ts);
}
return out;
};
TensorInferenceFunctionType tensor_inference_function_;
std::unique_ptr<CostInferenceFunctionType> cost_inference_function_ = nullptr;
DeviceInferenceFunctionType device_inference_function_ =
[](const OperatorDef& def) {
auto op_device =
def.has_device_option() ? def.device_option() : DeviceOption();
vector<DeviceOption> in_dev(def.input_size(), op_device);
vector<DeviceOption> out_dev(def.output_size(), op_device);
return std::make_pair(in_dev, out_dev);
};
DeviceInferenceFunctionType device_inference_function_;
std::function<std::vector<TensorFiller>(
const std::vector<std::vector<int64_t>>&)>

View File

@ -88,7 +88,7 @@ class Int8AddOp final : public Operator<CPUContext> {
setupStatus == qnnp_status_success,
"failed to setup QNNPACK add operator");
#ifdef FBCODE_CAFFE2
#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
const qnnp_status runStatus =
qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
#else

View File

@ -80,7 +80,7 @@ class Int8AveragePoolOp final : public ConvPoolOpBase<CPUContext> {
setupStatus == qnnp_status_success,
"failed to setup QNNPACK Global Average Pooling operator");
#ifdef FBCODE_CAFFE2
#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
const qnnp_status runStatus =
qnnp_run_operator(this->qnnpackGlobalOperator_,
nullptr /* thread pool */);
@ -122,7 +122,7 @@ class Int8AveragePoolOp final : public ConvPoolOpBase<CPUContext> {
setupStatus == qnnp_status_success,
"failed to setup QNNPACK Average Pooling operator");
#ifdef FBCODE_CAFFE2
#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
const qnnp_status runStatus =
qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
#else

View File

@ -72,7 +72,7 @@ class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> {
setupStatus == qnnp_status_success,
"failed to setup QNNPACK channel shuffle operator");
#ifdef FBCODE_CAFFE2
#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
const qnnp_status runStatus =
qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
#else

View File

@ -141,7 +141,7 @@ class Int8ConvOp final : public ConvPoolOpBase<CPUContext> {
lastOutputPointer_ = Y->t.template mutable_data<uint8_t>();
}
#ifdef FBCODE_CAFFE2
#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
const qnnp_status runStatus =
qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */);
#else

View File

@ -140,7 +140,7 @@ class Int8ConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> {
lastOutputPointer_ = Y->t.template mutable_data<uint8_t>();
}
#ifdef FBCODE_CAFFE2
#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
const qnnp_status runStatus =
qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */);
#else

View File

@ -104,7 +104,7 @@ class Int8FCOp final : public Operator<CPUContext> {
lastOutputPointer_ = Y->t.template mutable_data<uint8_t>();
}
#ifdef FBCODE_CAFFE2
#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
const qnnp_status runStatus =
qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */);
#else

View File

@ -80,7 +80,7 @@ class Int8LeakyReluOp final : public Operator<CPUContext> {
setupStatus == qnnp_status_success,
"failed to setup QNNPACK Leaky ReLU operator");
#ifdef FBCODE_CAFFE2
#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
const qnnp_status runStatus =
qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
#else

View File

@ -74,7 +74,7 @@ class Int8MaxPoolOp final : public ConvPoolOpBase<CPUContext> {
setupStatus == qnnp_status_success,
"failed to setup QNNPACK Max Pooling operator");
#ifdef FBCODE_CAFFE2
#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
const qnnp_status runStatus =
qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
#else

View File

@ -65,7 +65,7 @@ class Int8ReluOp final : public Operator<CPUContext> {
setupStatus == qnnp_status_success,
"failed to setup QNNPACK Clamp operator");
#ifdef FBCODE_CAFFE2
#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
const qnnp_status runStatus =
qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
#else

View File

@ -73,7 +73,7 @@ class Int8SigmoidOp final : public Operator<CPUContext> {
setupStatus == qnnp_status_success,
"failed to setup QNNPACK Sigmoid operator");
#ifdef FBCODE_CAFFE2
#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
const qnnp_status runStatus =
qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
#else

View File

@ -73,7 +73,7 @@ class Int8SoftmaxOp final : public Operator<CPUContext> {
setupStatus == qnnp_status_success,
"failed to setup QNNPACK SoftArgMax operator");
#ifdef FBCODE_CAFFE2
#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
const qnnp_status runStatus =
qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
#else

View File

@ -42,13 +42,48 @@ if platform.system() == 'Windows':
else:
cuda_path = ''
# Removed (the previous DLL path handling):
if not is_conda and sys.version_info >= (3, 8):
    dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path]))
    for dll_path in dll_paths:
        os.add_dll_directory(dll_path)
else:
    dll_paths = [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path]
    dll_paths = list(filter(os.path.exists, dll_paths)) + [os.environ['PATH']]
    os.environ['PATH'] = ';'.join(dll_paths)
# Added (load the DLLs explicitly through kernel32):
import ctypes
kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True)
dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path]))
with_load_library_flags = hasattr(kernel32, 'AddDllDirectory')
prev_error_mode = kernel32.SetErrorMode(0x0001)
kernel32.LoadLibraryW.restype = ctypes.c_void_p
if with_load_library_flags:
    kernel32.AddDllDirectory.restype = ctypes.c_void_p
    kernel32.LoadLibraryExW.restype = ctypes.c_void_p
for dll_path in dll_paths:
    if sys.version_info >= (3, 8):
        os.add_dll_directory(dll_path)
    elif with_load_library_flags:
        res = kernel32.AddDllDirectory(dll_path)
        if res is None:
            err = ctypes.WinError(ctypes.get_last_error())
            err.strerror += ' Error adding "{}" to the DLL directories.'.format(dll_path)
            raise err
dlls = glob.glob(os.path.join(th_dll_path, '*.dll'))
path_patched = False
for dll in dlls:
    is_loaded = False
    if with_load_library_flags:
        res = kernel32.LoadLibraryExW(dll, None, 0x00001100)
        last_error = ctypes.get_last_error()
        if res is None and last_error != 126:
            err = ctypes.WinError(last_error)
            err.strerror += ' Error loading "{}" or one of its dependencies.'.format(dll)
            raise err
        elif res is not None:
            is_loaded = True
    if not is_loaded:
        if not path_patched:
            os.environ['PATH'] = ';'.join(dll_paths + [os.environ['PATH']])
            path_patched = True
        res = kernel32.LoadLibraryW(dll)
        if res is None:
            err = ctypes.WinError(ctypes.get_last_error())
            err.strerror += ' Error loading "{}" or one of its dependencies.'.format(dll)
            raise err
kernel32.SetErrorMode(prev_error_mode)

View File

@ -4,6 +4,7 @@
#include <istream>
#include <ostream>
#include <fstream>
#include <algorithm>
#include <c10/core/Allocator.h>
#include <c10/core/Backend.h>
@ -303,10 +304,10 @@ void PyTorchStreamWriter::setup(const string& file_name) {
mz_zip_writer_init_v2(ar_.get(), 0, MZ_ZIP_FLAG_WRITE_ZIP64);
valid("initializing archive ", file_name.c_str());
}
std::string version = c10::to_string(kProducedFileFormatVersion);
version.push_back('\n');
writeRecord("version", version.c_str(), version.size());
void PyTorchStreamWriter::setMinVersion(const uint64_t version) {
version_ = std::max(version, version_);
}
void PyTorchStreamWriter::writeRecord(
@ -339,6 +340,11 @@ void PyTorchStreamWriter::writeRecord(
}
void PyTorchStreamWriter::writeEndOfFile() {
// Writes version info
std::string version = c10::to_string(version_);
version.push_back('\n');
writeRecord("version", version.c_str(), version.size());
AT_ASSERT(!finalized_);
finalized_ = true;
mz_zip_writer_finalize_archive(ar_.get());

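A minimal sketch of the writer-side flow this enables; the archive path and record name below are hypothetical:

#include <caffe2/serialize/inline_container.h>

void write_example(const void* data, size_t size) {
  caffe2::serialize::PyTorchStreamWriter writer("model.pt");  // hypothetical path
  writer.setMinVersion(0x4L);  // e.g. the module serializes a versioned symbol
  writer.writeRecord("data.pkl", data, size);
  // The "version" record is now emitted here rather than in setup(), so it
  // reflects max(kProducedFileFormatVersion, every setMinVersion() call):
  writer.writeEndOfFile();
}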
View File

@ -94,14 +94,45 @@ constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L;
constexpr uint64_t kMaxSupportedFileFormatVersion = 0x5L;
// Versions (i.e. why was the version number bumped?)
// Note [Dynamic Versions and torch.jit.save vs. torch.save]
//
// Our versioning scheme has a "produced file format version" which
// describes how an archive is to be read. The version written in an archive
// is at least this current produced file format version, but may be greater
// if it includes certain symbols. We refer to these conditional versions
// as "dynamic," since they are identified at runtime.
//
// Dynamic versioning is useful when an operator's semantics are updated.
// When using torch.jit.save we want those semantics to be preserved. If
// we bumped the produced file format version on every change, however,
// then older versions of PyTorch couldn't read even simple archives, like
// a single tensor, from newer versions of PyTorch. Instead, we
// assign dynamic versions to these changes that override the
// produced file format version as needed. That is, when the semantics
// of torch.div changed, that change was assigned dynamic version 4, and
// archives produced by torch.jit.save from modules that use torch.div also
// carry (at least) version 4. This prevents earlier versions of PyTorch
// from accidentally performing the wrong kind of division. Modules
// that don't use torch.div or other operators with dynamic versions
// can write the produced file format version, and these programs will
// run as expected on earlier versions of PyTorch.
//
// While torch.jit.save attempts to preserve operator semantics,
// torch.save does not. torch.save is analogous to pickling Python objects, so
// a function that uses torch.div can behave differently if saved with torch.save
// and loaded with torch.load across PyTorch versions. From a technical perspective,
// torch.save ignores dynamic versioning.
// 1. Initial version
// 2. Removed op_version_set version numbers
// 3. Added type tags to pickle serialization of container types
// 4. Stopped integer division using torch.div
// 4. (Dynamic) Stopped integer division using torch.div
// (a versioned symbol preserves the historic behavior of versions 1--3)
// 5. (Read-only) Stops torch.full inferring a floating point dtype
// when given integer fill values.
constexpr uint64_t kProducedFileFormatVersion = 0x4L;
// 5. (Dynamic) Stops torch.full inferring a floating point dtype
// when given bool or integer fill values.
// (a versioned symbol preserves the historic behavior of versions 1--4)
constexpr uint64_t kProducedFileFormatVersion = 0x3L;
// Writer-specific constants
constexpr uint64_t kFieldAlignment = 64;
@ -144,6 +175,8 @@ class CAFFE2_API PyTorchStreamWriter final {
explicit PyTorchStreamWriter(
const std::function<size_t(const void*, size_t)>& writer_func);
void setMinVersion(const uint64_t version);
void writeRecord(
const std::string& name,
const void* data,
@ -171,6 +204,7 @@ class CAFFE2_API PyTorchStreamWriter final {
std::string padding_;
std::ofstream file_stream_;
std::function<size_t(const void*, size_t)> writer_func_;
uint64_t version_ = kProducedFileFormatVersion;
bool finalized_ = false;
bool err_seen_ = false;
friend size_t ostream_write_func(

View File

@ -195,7 +195,12 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
const nnp_size output_subsample = {.width = static_cast<size_t>(stride_w()),
.height = static_cast<size_t>(stride_h())};
initNNPACK();
#if !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
pthreadpool_t pool = nullptr;
#else
pthreadpool_t pool = reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
#endif
runWithSharedBuffer<CPUContext>(ws_, [&](Tensor* buffer) {
if (transformStrategy_ == nnp_convolution_transform_strategy_precompute) {

View File

@ -1,15 +1,8 @@
# TODO: Add ThreadPoolXNNPACK.cc when XNNPACK integration is updated
# to pass the actual threadpool ptr instead of nullptr.
if(INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE)
add_definitions(-DUSE_INTERNAL_THREADPOOL_IMPL)
list(APPEND Caffe2_CPU_SRCS
utils/string_utils.cc
utils/threadpool/pthreadpool.cc
utils/threadpool/pthreadpool_impl.cc
utils/threadpool/pthreadpool_new_if_impl.c
utils/threadpool/pthreadpool-cpp.cc
utils/threadpool/ThreadPool.cc
utils/threadpool/ThreadPoolMobile.cc
utils/threadpool/ThreadPoolXNNPACK.cc
)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
return()
@ -28,23 +21,19 @@ list(APPEND Caffe2_CPU_SRCS
utils/proto_convert.cc
utils/proto_utils.cc
utils/proto_wrap.cc
utils/threadpool/ThreadPool.cc
utils/signal_handler.cc
utils/smart_tensor_printer.cc
utils/string_utils.cc
utils/threadpool/ThreadPool.cc)
utils/string_utils.cc)
# ---[ threadpool/pthreadpool* is a local modification of the NNPACK
# pthreadpool with a very similar interface. Neither NNPACK, nor this
# thread pool supports Windows.
if(NOT MSVC AND USE_XNNPACK)
add_definitions(-DUSE_INTERNAL_THREADPOOL_IMPL)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS}
utils/threadpool/pthreadpool.cc
utils/threadpool/pthreadpool_impl.cc
utils/threadpool/pthreadpool_new_if_impl.c
utils/threadpool/ThreadPoolMobile.cc
utils/threadpool/ThreadPoolXNNPACK.cc
)
if(USE_PTHREADPOOL)
list(APPEND Caffe2_CPU_SRCS
utils/threadpool/pthreadpool-cpp.cc)
if(USE_INTERNAL_PTHREADPOOL_IMPL)
list(APPEND Caffe2_CPU_SRCS
utils/threadpool/pthreadpool.cc
utils/threadpool/pthreadpool_impl.cc)
endif()
endif()
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS}

View File

@ -1,21 +0,0 @@
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/ThreadPool.h>
#include <caffe2/utils/threadpool/pthreadpool.h>
namespace caffe2 {
caffe2::ThreadPool* mobile_threadpool() {
#ifdef C10_MOBILE
static std::unique_ptr<caffe2::ThreadPool> thread_pool =
caffe2::ThreadPool::defaultThreadPool();
return thread_pool.get();
#else
return nullptr;
#endif
}
pthreadpool_t mobile_pthreadpool() {
return reinterpret_cast<pthreadpool_t>(mobile_threadpool());
}
} // namespace caffe2

View File

@ -1,24 +0,0 @@
#pragma once
#include <caffe2/utils/threadpool/pthreadpool.h>
// TODO Implement a parallel_for version for Mobile here, add to Aten/Parallel.h
namespace caffe2 {
class ThreadPool;
// Return a singleton instance of caffe2::ThreadPool for ATen/TH multithreading.
ThreadPool* mobile_threadpool();
// NOTE: This interface is temporary and should not be used.
// Please use Aten/Parallel.h for parallel primitives in pytorch.
// This implementation will be used by pytorch mobile, specifically
// NNPACK/QNNPACK. For mobile we need to use caffe2::ThreadPool instead of the
// 3rd party pthreadpool. Future work (TODO) Implement a mobile version of
// "at::parallel_for" using caffe2::ThreadPool so all ATen/TH multithreading
// usage is mobile friendly; refactor QNNPACK or pthreadpool to explicitly use the
// "at::parallel_for" primitive, replacing pthreadpool_compute_1d for PyTorch.
pthreadpool_t mobile_pthreadpool();
size_t getDefaultNumThreads();
} // namespace caffe2

View File

@ -1,22 +0,0 @@
#include <caffe2/utils/threadpool/pthreadpool.h>
#include <caffe2/utils/threadpool/ThreadPoolMobile.h>
#include <caffe2/utils/threadpool/ThreadPoolXNNPACK.h>
#include <memory>
namespace caffe2 {
// Will be unified.
pthreadpool_t xnnpack_threadpool() {
// Depending on internal implementation vs. OSS we will link against pthreadpool_create_xnnpack
// or pthreadpool_create. This is only temporary. It will be unified soon.
#ifdef USE_INTERNAL_THREADPOOL_IMPL
static std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy_xnnpack)>
threadpool(pthreadpool_create_xnnpack(getDefaultNumThreads()), pthreadpool_destroy_xnnpack);
#else
static std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)>
threadpool(pthreadpool_create(getDefaultNumThreads()), pthreadpool_destroy);
#endif
return threadpool.get();
}
} // namespace caffe2

View File

@ -1,7 +0,0 @@
#pragma once
// Creating a separate .h/.cc file for creating threadpool for XNNPACK
// to avoid touching existing internal builds.
// When we unify threadpools this should all go away.
namespace caffe2 {
pthreadpool_t xnnpack_threadpool();
} // namespace caffe2

View File

@ -0,0 +1,71 @@
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <c10/util/Exception.h>
namespace caffe2 {
PThreadPool::PThreadPool(const size_t thread_count)
: threadpool_(pthreadpool_create(thread_count), pthreadpool_destroy) {}
size_t PThreadPool::get_thread_count() const {
std::lock_guard<std::mutex> lock{mutex_};
TORCH_INTERNAL_ASSERT(threadpool_.get(), "Invalid threadpool!");
return pthreadpool_get_threads_count(threadpool_.get());
}
void PThreadPool::set_thread_count(const size_t thread_count) {
std::lock_guard<std::mutex> lock{mutex_};
// As it stands, pthreadpool is an entirely data parallel framework with no
// support for task parallelism. Hence, all functions are blocking, and no
// user-provided tasks can be in flight when control is returned to the
// user of the API, which means re-initializing the library, without the
// need to wait on any pending tasks, is all one needs to do to re-adjust
// the thread count.
threadpool_.reset(pthreadpool_create(thread_count));
}
void PThreadPool::run(
const std::function<void(size_t)>& fn,
const size_t range) {
std::lock_guard<std::mutex> lock{mutex_};
TORCH_INTERNAL_ASSERT(threadpool_.get(), "Invalid threadpool!");
struct Context final {
const std::function<void(size_t)>& fn;
} context{
fn,
};
pthreadpool_parallelize_1d(
threadpool_.get(),
// Note: pthreadpool_parallelize_1d() is a blocking function. The
// function pointer to this lambda passed on to
// pthreadpool_parallelize_1d() cannot go out of scope until
// pthreadpool_parallelize_1d() returns.
[](void* const context, const size_t item) {
reinterpret_cast<Context*>(context)->fn(item);
},
&context,
range,
0u);
}
// Forward declaration
size_t getDefaultNumThreads();
PThreadPool* pthreadpool() {
static std::unique_ptr<PThreadPool> threadpool =
std::make_unique<PThreadPool>(getDefaultNumThreads());
return threadpool.get();
}
pthreadpool_t pthreadpool_() {
PThreadPool* const threadpool = pthreadpool();
TORCH_INTERNAL_ASSERT(
threadpool, "Failed to acquire an instance of PThreadPool!");
return threadpool->threadpool_.get();
}
} // namespace caffe2
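A minimal usage sketch of the PThreadPool API introduced above, assuming a build with USE_PTHREADPOOL defined; the workload itself is illustrative:

#include <atomic>
#include <cstdio>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>

int main() {
  caffe2::PThreadPool* const pool = caffe2::pthreadpool();
  std::printf("threads: %zu\n", pool->get_thread_count());

  // run() is blocking: every task_id in [0, range) has been processed
  // by the time it returns, so a plain atomic suffices for aggregation.
  std::atomic<int> sum{0};
  pool->run([&sum](size_t task_id) { sum += static_cast<int>(task_id); }, 100);
  std::printf("sum: %d\n", sum.load()); // 4950

  // Resizing re-creates the underlying pthreadpool; since all work is
  // synchronous, there are no in-flight tasks to drain first.
  pool->set_thread_count(2);
  return 0;
}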


@ -0,0 +1,54 @@
#pragma once
#ifdef USE_PTHREADPOOL
#ifdef USE_INTERNAL_PTHREADPOOL_IMPL
#include <caffe2/utils/threadpool/pthreadpool.h>
#else
#include <pthreadpool.h>
#endif
#include <functional>
#include <memory>
#include <mutex>
namespace caffe2 {
class PThreadPool final {
public:
explicit PThreadPool(size_t thread_count);
~PThreadPool() = default;
PThreadPool(const PThreadPool&) = delete;
PThreadPool& operator=(const PThreadPool&) = delete;
PThreadPool(PThreadPool&&) = delete;
PThreadPool& operator=(PThreadPool&&) = delete;
size_t get_thread_count() const;
void set_thread_count(size_t thread_count);
// Run, in parallel, function fn(task_id) over task_id in range [0, range).
// This function is blocking. All input is processed by the time it returns.
void run(const std::function<void(size_t)>& fn, size_t range);
private:
friend pthreadpool_t pthreadpool_();
private:
mutable std::mutex mutex_;
std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> threadpool_;
};
// Return a singleton instance of PThreadPool for ATen/TH multithreading.
PThreadPool* pthreadpool();
// Exposes the underlying implementation of PThreadPool.
// Only for use in external libraries so as to unify threading across
// internal (i.e. ATen, etc.) and external (e.g. NNPACK, QNNPACK, XNNPACK)
// use cases.
pthreadpool_t pthreadpool_();
} // namespace caffe2
#endif /* USE_PTHREADPOOL */
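The raw-handle accessor pthreadpool_() is what lets third-party consumers share the pool. A sketch, assuming USE_INTERNAL_PTHREADPOOL_IMPL is off so that <pthreadpool.h> is the third-party header and pthreadpool_parallelize_1d is available; square/square_all are illustrative names:

#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <pthreadpool.h>

namespace {
// Plain C-style task, the shape expected by the third-party pthreadpool API.
void square(void* argument, size_t i) {
  float* const data = static_cast<float*>(argument);
  data[i] *= data[i];
}
} // namespace

void square_all(float* data, size_t n) {
  // pthreadpool_() hands out the raw pthreadpool_t so that external
  // libraries (NNPACK, QNNPACK, XNNPACK) run on the same worker threads.
  pthreadpool_parallelize_1d(caffe2::pthreadpool_(), square, data, n, 0u /* flags */);
}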


@ -32,7 +32,7 @@ static inline size_t min(size_t a, size_t b) {
}
struct compute_1d_tiled_context {
pthreadpool_function_1d_tiled_t function;
legacy_pthreadpool_function_1d_tiled_t function;
void* argument;
size_t range;
size_t tile;
@ -46,9 +46,9 @@ static void compute_1d_tiled(void* context_, size_t linear_index) {
context->function(context->argument, index, tile);
}
void pthreadpool_compute_1d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_1d_tiled_t function,
void legacy_pthreadpool_compute_1d_tiled(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_1d_tiled_t function,
void* argument,
size_t range,
size_t tile)
@ -65,12 +65,12 @@ void pthreadpool_compute_1d_tiled(
/*.argument = */ argument,
/*.range = */ range,
/*.tile = */ tile};
pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range);
legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range);
}
}
struct compute_2d_context {
pthreadpool_function_2d_t function;
legacy_pthreadpool_function_2d_t function;
void* argument;
caffe2::FixedDivisor<int32_t> range_j;
};
@ -85,9 +85,9 @@ static void compute_2d(void* context_, size_t linear_index) {
context->function(context->argument, q, r);
}
void pthreadpool_compute_2d(
struct pthreadpool* threadpool,
pthreadpool_function_2d_t function,
void legacy_pthreadpool_compute_2d(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_2d_t function,
void* argument,
size_t range_i,
size_t range_j)
@ -106,12 +106,12 @@ void pthreadpool_compute_2d(
/*.function = */ function,
/*.argument = */ argument,
/*.range_j = */ caffe2::FixedDivisor<int32_t>(range_j)};
pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j);
legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j);
}
}
struct compute_2d_tiled_context {
pthreadpool_function_2d_tiled_t function;
legacy_pthreadpool_function_2d_tiled_t function;
void* argument;
caffe2::FixedDivisor<int32_t> tile_range_j;
size_t range_i;
@ -135,9 +135,9 @@ static void compute_2d_tiled(void* context_, size_t linear_index) {
context->function(context->argument, index_i, index_j, tile_i, tile_j);
}
void pthreadpool_compute_2d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_2d_tiled_t function,
void legacy_pthreadpool_compute_2d_tiled(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_2d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
@ -166,12 +166,12 @@ void pthreadpool_compute_2d_tiled(
/*.range_j = */ range_j,
/*.tile_i = */ tile_i,
/*.tile_j = */ tile_j};
pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
legacy_pthreadpool_compute_1d(threadpool, (legacy_pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
}
}
struct compute_3d_tiled_context {
pthreadpool_function_3d_tiled_t function;
legacy_pthreadpool_function_3d_tiled_t function;
void* argument;
caffe2::FixedDivisor<int32_t> tile_range_j;
caffe2::FixedDivisor<int32_t> tile_range_k;
@ -205,9 +205,9 @@ static void compute_3d_tiled(
context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k);
}
void pthreadpool_compute_3d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_3d_tiled_t function,
void legacy_pthreadpool_compute_3d_tiled(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_3d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
@ -251,16 +251,16 @@ void pthreadpool_compute_3d_tiled(
/*.tile_i = */ tile_i,
/*.tile_j = */ tile_j,
/*.tile_k = */ tile_k};
pthreadpool_compute_1d(
legacy_pthreadpool_compute_1d(
threadpool,
(pthreadpool_function_1d_t)compute_3d_tiled,
(legacy_pthreadpool_function_1d_t)compute_3d_tiled,
&context,
tile_range_i * tile_range_j * tile_range_k);
}
}
struct compute_4d_tiled_context {
pthreadpool_function_4d_tiled_t function;
legacy_pthreadpool_function_4d_tiled_t function;
void* argument;
caffe2::FixedDivisor<int32_t> tile_range_kl;
caffe2::FixedDivisor<int32_t> tile_range_j;
@ -310,9 +310,9 @@ static void compute_4d_tiled(
tile_l);
}
void pthreadpool_compute_4d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_4d_tiled_t function,
void legacy_pthreadpool_compute_4d_tiled(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_4d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
@ -367,9 +367,9 @@ void pthreadpool_compute_4d_tiled(
/*.tile_j = */ tile_j,
/*.tile_k = */ tile_k,
/*.tile_l = */ tile_l};
pthreadpool_compute_1d(
legacy_pthreadpool_compute_1d(
threadpool,
(pthreadpool_function_1d_t)compute_4d_tiled,
(legacy_pthreadpool_function_1d_t)compute_4d_tiled,
&context,
tile_range_i * tile_range_j * tile_range_k * tile_range_l);
}


@ -5,49 +5,16 @@
#include "ThreadPoolCommon.h"
#include <stddef.h> // for size_t
typedef struct pthreadpool* pthreadpool_t;
typedef void (*pthreadpool_function_1d_t)(void*, size_t);
typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);
typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t);
typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_function_3d_tiled_t)(
void*,
size_t,
size_t,
size_t,
size_t,
size_t,
size_t);
typedef void (*pthreadpool_function_4d_tiled_t)(
void*,
size_t,
size_t,
size_t,
size_t,
size_t,
size_t,
size_t,
size_t);
#include <stdint.h> // for uint32_t
typedef void (*pthreadpool_task_1d_t)(void*, size_t);
typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t);
typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t);
typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t);
typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_task_3d_tile_2d_t)(
void*,
size_t,
size_t,
size_t,
size_t,
size_t);
typedef void (*pthreadpool_task_4d_tile_2d_t)(
typedef struct pthreadpool* legacy_pthreadpool_t;
typedef void (*legacy_pthreadpool_function_1d_t)(void*, size_t);
typedef void (*legacy_pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);
typedef void (*legacy_pthreadpool_function_2d_t)(void*, size_t, size_t);
typedef void (*legacy_pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t);
typedef void (*legacy_pthreadpool_function_3d_tiled_t)(
void*,
size_t,
size_t,
@ -55,16 +22,7 @@ typedef void (*pthreadpool_task_4d_tile_2d_t)(
size_t,
size_t,
size_t);
typedef void (*pthreadpool_task_5d_tile_2d_t)(
void*,
size_t,
size_t,
size_t,
size_t,
size_t,
size_t,
size_t);
typedef void (*pthreadpool_task_6d_tile_2d_t)(
typedef void (*legacy_pthreadpool_function_4d_tiled_t)(
void*,
size_t,
size_t,
@ -90,8 +48,8 @@ extern "C" {
* On error the function returns NULL and sets errno accordingly.
*/
//Returns internal threadpool impl.
pthreadpool_t pthreadpool_create(size_t threads_count);
// Returns internal threadpool impl.
legacy_pthreadpool_t legacy_pthreadpool_create(size_t threads_count);
/**
* Queries the number of threads in a thread pool.
@ -100,7 +58,7 @@ pthreadpool_t pthreadpool_create(size_t threads_count);
*
* @returns The number of threads in the thread pool.
*/
size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
size_t legacy_pthreadpool_get_threads_count(legacy_pthreadpool_t threadpool);
/**
* Processes items in parallel using threads from a thread pool.
@ -117,38 +75,45 @@ size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
* @param[in] items The number of items to process. The @a function
* will be called once for each item.
*/
void pthreadpool_compute_1d(
pthreadpool_t threadpool,
pthreadpool_function_1d_t function,
void legacy_pthreadpool_compute_1d(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_1d_t function,
void* argument,
size_t range);
void pthreadpool_compute_1d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_1d_tiled_t function,
void legacy_pthreadpool_parallelize_1d(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_1d_t function,
void* argument,
size_t range,
uint32_t flags);
void legacy_pthreadpool_compute_1d_tiled(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_1d_tiled_t function,
void* argument,
size_t range,
size_t tile);
void pthreadpool_compute_2d(
pthreadpool_t threadpool,
pthreadpool_function_2d_t function,
void legacy_pthreadpool_compute_2d(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_2d_t function,
void* argument,
size_t range_i,
size_t range_j);
void pthreadpool_compute_2d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_2d_tiled_t function,
void legacy_pthreadpool_compute_2d_tiled(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_2d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t tile_i,
size_t tile_j);
void pthreadpool_compute_3d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_3d_tiled_t function,
void legacy_pthreadpool_compute_3d_tiled(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_3d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
@ -157,9 +122,9 @@ void pthreadpool_compute_3d_tiled(
size_t tile_j,
size_t tile_k);
void pthreadpool_compute_4d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_4d_tiled_t function,
void legacy_pthreadpool_compute_4d_tiled(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_4d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
@ -178,129 +143,29 @@ void pthreadpool_compute_4d_tiled(
*
* @param[in,out] threadpool The thread pool to destroy.
*/
void pthreadpool_destroy(pthreadpool_t threadpool);
void legacy_pthreadpool_destroy(legacy_pthreadpool_t threadpool);
// New interface copied from third-party pthreadpool.
// We will merge the internal and third-party pthreadpool eventually.
// For now, copy-paste to get past build issues.
#ifdef USE_INTERNAL_PTHREADPOOL_IMPL
#define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001
#define pthreadpool_t legacy_pthreadpool_t
#define pthreadpool_function_1d_t legacy_pthreadpool_function_1d_t
#define pthreadpool_function_1d_tiled_t legacy_pthreadpool_function_1d_tiled_t
#define pthreadpool_function_2d_t legacy_pthreadpool_function_2d_t
#define pthreadpool_function_2d_tiled_t legacy_pthreadpool_function_2d_tiled_t
#define pthreadpool_function_3d_tiled_t legacy_pthreadpool_function_3d_tiled_t
#define pthreadpool_function_4d_tiled_t legacy_pthreadpool_function_4d_tiled_t
#define pthreadpool_create legacy_pthreadpool_create
#define pthreadpool_destroy legacy_pthreadpool_destroy
#define pthreadpool_get_threads_count legacy_pthreadpool_get_threads_count
#define pthreadpool_compute_1d legacy_pthreadpool_compute_1d
#define pthreadpool_parallelize_1d legacy_pthreadpool_parallelize_1d
#define pthreadpool_compute_1d_tiled legacy_pthreadpool_compute_1d_tiled
#define pthreadpool_compute_2d legacy_pthreadpool_compute_2d
#define pthreadpool_compute_2d_tiled legacy_pthreadpool_compute_2d_tiled
#define pthreadpool_compute_3d_tiled legacy_pthreadpool_compute_3d_tiled
#define pthreadpool_compute_4d_tiled legacy_pthreadpool_compute_4d_tiled
// Returns the copied threadpool impl of third-party/pthreadpool
pthreadpool_t pthreadpool_create_xnnpack(size_t threads_count);
// Copied third-party impl.
size_t pthreadpool_get_threads_count_xnnpack(pthreadpool_t threadpool);
// Copied third-party impl.
void pthreadpool_destroy_xnnpack(pthreadpool_t threadpool);
/**
* Processes items in parallel using threads from a thread pool.
*
* When the call returns, all items have been processed and the thread pool is
* ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param[in] threadpool The thread pool to use for parallelisation.
* @param[in] function The function to call for each item.
* @param[in] argument The first argument passed to the @a function.
* @param[in] items The number of items to process. The @a function
* will be called once for each item.
*/
void pthreadpool_parallelize_1d(
pthreadpool_t threadpool,
pthreadpool_task_1d_t function,
void* argument,
size_t range,
uint32_t flags);
void pthreadpool_parallelize_1d_tile_1d(
pthreadpool_t threadpool,
pthreadpool_task_1d_tile_1d_t function,
void* argument,
size_t range,
size_t tile,
uint32_t flags);
void pthreadpool_parallelize_2d(
pthreadpool_t threadpool,
pthreadpool_task_2d_t function,
void* argument,
size_t range_i,
size_t range_j,
uint32_t flags);
void pthreadpool_parallelize_2d_tile_1d(
pthreadpool_t threadpool,
pthreadpool_task_2d_tile_1d_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t tile_j,
uint32_t flags);
void pthreadpool_parallelize_2d_tile_2d(
pthreadpool_t threadpool,
pthreadpool_task_2d_tile_2d_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t tile_i,
size_t tile_j,
uint32_t flags);
void pthreadpool_parallelize_3d_tile_2d(
pthreadpool_t threadpool,
pthreadpool_task_3d_tile_2d_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t range_k,
size_t tile_j,
size_t tile_k,
uint32_t flags);
void pthreadpool_parallelize_4d_tile_2d(
pthreadpool_t threadpool,
pthreadpool_task_4d_tile_2d_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t tile_k,
size_t tile_l,
uint32_t flags);
void pthreadpool_parallelize_5d_tile_2d(
pthreadpool_t threadpool,
pthreadpool_task_5d_tile_2d_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t range_m,
size_t tile_l,
size_t tile_m,
uint32_t flags);
void pthreadpool_parallelize_6d_tile_2d(
pthreadpool_t threadpool,
pthreadpool_task_6d_tile_2d_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t range_m,
size_t range_n,
size_t tile_m,
size_t tile_n,
uint32_t flags);
#endif /* USE_INTERNAL_PTHREADPOOL_IMPL */
#ifdef __cplusplus
} /* extern "C" */
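The #define block above is the whole migration story for old call sites: under USE_INTERNAL_PTHREADPOOL_IMPL, code written against the original names compiles unchanged but binds to the legacy_ symbols. A sketch of the effect (the caller code below is hypothetical):

// Compiled with -DUSE_INTERNAL_PTHREADPOOL_IMPL, each call below is
// rewritten by the macros: pthreadpool_create -> legacy_pthreadpool_create,
// pthreadpool_compute_1d -> legacy_pthreadpool_compute_1d, and so on.
#include <caffe2/utils/threadpool/pthreadpool.h>

static void fill_one(void* argument, size_t i) {
  static_cast<int*>(argument)[i] = 1;
}

void fill(int* data, size_t n) {
  pthreadpool_t pool = pthreadpool_create(4);
  pthreadpool_compute_1d(pool, fill_one, data, n);
  pthreadpool_destroy(pool);
}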


@ -6,9 +6,9 @@
// External API
//
void pthreadpool_compute_1d(
pthreadpool_t threadpool,
pthreadpool_function_1d_t function,
void legacy_pthreadpool_compute_1d(
legacy_pthreadpool_t threadpool,
legacy_pthreadpool_function_1d_t function,
void* argument,
size_t range) {
if (threadpool == nullptr) {
@ -27,30 +27,31 @@ void pthreadpool_compute_1d(
range);
}
size_t pthreadpool_get_threads_count(pthreadpool_t threadpool) {
// The current fix is only useful when XNNPACK calls pthreadpool_get_threads_count with nullptr.
void legacy_pthreadpool_parallelize_1d(
const legacy_pthreadpool_t threadpool,
const legacy_pthreadpool_function_1d_t function,
void* const argument,
const size_t range,
uint32_t) {
legacy_pthreadpool_compute_1d(threadpool, function, argument, range);
}
size_t legacy_pthreadpool_get_threads_count(legacy_pthreadpool_t threadpool) {
// The current fix is only useful when XNNPACK calls legacy_pthreadpool_get_threads_count with nullptr.
if (threadpool == nullptr) {
return 1;
}
return reinterpret_cast<caffe2::ThreadPool*>(threadpool)->getNumThreads();
// TODO: Future fix: If we keep maintaining two different threadpools
// (the old C2 one and the new one for XNNPACK), then we have two different pthreadpool pointer
// types. One is caffe2::ThreadPool*, the other is pthreadpool* (pthreadpool_new_if_impl.c).
// XNNPACK calls pthreadpool_get_threads_count during op setup using pthreadpool*, and
// uses the _parallelize_ interface for actual work,
// while NNPACK uses caffe2::ThreadPool*.
// Thus, if pthreadpool_get_threads_count is called from XNNPACK, we cannot
// reinterpret_cast the argument to caffe2::ThreadPool*. It will segfault or, worse, have undefined behavior.
}
pthreadpool_t pthreadpool_create(size_t threads_count) {
legacy_pthreadpool_t legacy_pthreadpool_create(size_t threads_count) {
std::mutex thread_pool_creation_mutex_;
std::lock_guard<std::mutex> guard(thread_pool_creation_mutex_);
return reinterpret_cast<pthreadpool_t>(new caffe2::ThreadPool(threads_count));
return reinterpret_cast<legacy_pthreadpool_t>(new caffe2::ThreadPool(threads_count));
}
void pthreadpool_destroy(pthreadpool_t pthreadpool) {
void legacy_pthreadpool_destroy(legacy_pthreadpool_t pthreadpool) {
if (pthreadpool) {
caffe2::ThreadPool* threadpool =
reinterpret_cast<caffe2::ThreadPool*>(pthreadpool);
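The implementation above is a type-punning bridge: legacy_pthreadpool_create hands out a caffe2::ThreadPool* disguised as legacy_pthreadpool_t, and every call casts it back. A sketch of that round trip and the nullptr fallback, distilled from the code shown (wrap/threads are illustrative helper names):

#include <caffe2/utils/threadpool/ThreadPool.h>
#include <caffe2/utils/threadpool/pthreadpool.h>

// The handle is nothing but a disguised caffe2::ThreadPool pointer.
legacy_pthreadpool_t wrap(caffe2::ThreadPool* pool) {
  return reinterpret_cast<legacy_pthreadpool_t>(pool);
}

size_t threads(legacy_pthreadpool_t handle) {
  if (handle == nullptr) {
    return 1; // same single-thread fallback as above
  }
  // Only valid if the handle really wraps a caffe2::ThreadPool; per the
  // TODO above, a handle produced by the copied third-party
  // implementation would make this cast undefined behavior.
  return reinterpret_cast<caffe2::ThreadPool*>(handle)->getNumThreads();
}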

File diff suppressed because it is too large.


@ -1,62 +0,0 @@
#pragma once
#include <stdint.h>
#if defined(__SSE__) || defined(__x86_64__)
#include <xmmintrin.h>
#endif
struct fpu_state {
#if defined(__SSE__) || defined(__x86_64__)
uint32_t mxcsr;
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
uint32_t fpscr;
#elif defined(__aarch64__)
uint64_t fpcr;
#else
char unused;
#endif
};
static inline struct fpu_state get_fpu_state() {
struct fpu_state state = { 0 };
#if defined(__SSE__) || defined(__x86_64__)
state.mxcsr = (uint32_t) _mm_getcsr();
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
__asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r" (state.fpscr));
#elif defined(__aarch64__)
__asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (state.fpcr));
#endif
return state;
}
static inline void set_fpu_state(const struct fpu_state state) {
#if defined(__SSE__) || defined(__x86_64__)
_mm_setcsr((unsigned int) state.mxcsr);
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
__asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r" (state.fpscr));
#elif defined(__aarch64__)
__asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r" (state.fpcr));
#endif
}
static inline void disable_fpu_denormals() {
#if defined(__SSE__) || defined(__x86_64__)
_mm_setcsr(_mm_getcsr() | 0x8040);
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
uint32_t fpscr;
__asm__ __volatile__(
"VMRS %[fpscr], fpscr\n"
"ORR %[fpscr], #0x1000000\n"
"VMSR fpscr, %[fpscr]\n"
: [fpscr] "=r" (fpscr));
#elif defined(__aarch64__)
uint64_t fpcr;
__asm__ __volatile__(
"MRS %[fpcr], fpcr\n"
"ORR %w[fpcr], %w[fpcr], 0x1000000\n"
"ORR %w[fpcr], %w[fpcr], 0x80000\n"
"MSR fpcr, %[fpcr]\n"
: [fpcr] "=r" (fpcr));
#endif
}
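The 0x8040 mask in the deleted x86 path above sets MXCSR bit 15 (flush-to-zero) and bit 6 (denormals-are-zero). A standalone sketch of the observable effect, x86-with-SSE only; the printed values are illustrative:

#include <cstdio>
#include <xmmintrin.h>

int main() {
  volatile float tiny = 1e-41f; // subnormal: below FLT_MIN (~1.18e-38)
  volatile float one = 1.0f;
  std::printf("before: %g\n", tiny * one); // ~1e-41

  // Same mask as disable_fpu_denormals(): FTZ (bit 15) | DAZ (bit 6).
  _mm_setcsr(_mm_getcsr() | 0x8040);
  std::printf("after:  %g\n", tiny * one); // 0: subnormal input treated as zero
  return 0;
}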


@ -239,10 +239,10 @@ if(USE_NNPACK OR USE_QNNPACK OR USE_PYTORCH_QNNPACK OR USE_XNNPACK)
endif()
if(DISABLE_NNPACK_AND_FAMILY)
set(USE_NNPACK OFF)
set(USE_QNNPACK OFF)
set(USE_PYTORCH_QNNPACK OFF)
set(USE_XNNPACK OFF)
caffe2_update_option(USE_NNPACK OFF)
caffe2_update_option(USE_QNNPACK OFF)
caffe2_update_option(USE_PYTORCH_QNNPACK OFF)
caffe2_update_option(USE_XNNPACK OFF)
else()
set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
@ -261,11 +261,9 @@ if(USE_NNPACK OR USE_QNNPACK OR USE_PYTORCH_QNNPACK OR USE_XNNPACK)
if(NOT DEFINED PTHREADPOOL_SOURCE_DIR)
set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory")
endif()
set(CPUINFO_LIBRARY_TYPE "static" CACHE STRING "")
set(CPUINFO_LOG_LEVEL "error" CACHE STRING "")
set(PTHREADPOOL_LIBRARY_TYPE "static" CACHE STRING "")
endif()
else()
set(DISABLE_NNPACK_AND_FAMILY ON)
endif()
set(CONFU_DEPENDENCIES_SOURCE_DIR ${PROJECT_BINARY_DIR}/confu-srcs
@ -281,45 +279,48 @@ if(INTERN_BUILD_MOBILE AND INTERN_USE_EIGEN_BLAS)
endif()
# ---[ pthreadpool
# QNNPACK and NNPACK both depend on pthreadpool, but when building with libtorch
# they should use the pthreadpool implementation under caffe2/utils/threadpool
# instead of the default implementation. To avoid confusion, add the pthreadpool
# subdirectory explicitly with the EXCLUDE_FROM_ALL property before QNNPACK/NNPACK
# do so, which prevents them from installing the default pthreadpool library.
if(INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE AND (USE_QNNPACK OR USE_NNPACK OR USE_XNNPACK))
if(NOT DEFINED PTHREADPOOL_SOURCE_DIR)
set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory")
endif()
# Only add a dependency on pthreadpool if we are on a mobile build
# or are building any of the libraries in the {Q/X}NNPACK family.
if(INTERN_BUILD_MOBILE OR NOT DISABLE_NNPACK_AND_FAMILY)
set(USE_PTHREADPOOL ON CACHE BOOL "" FORCE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_PTHREADPOOL")
# Always use third_party/pthreadpool.
set(USE_INTERNAL_PTHREADPOOL_IMPL OFF CACHE BOOL "" FORCE)
if(NOT TARGET pthreadpool)
set(PTHREADPOOL_BUILD_TESTS OFF CACHE BOOL "")
set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE BOOL "")
add_subdirectory(
"${PTHREADPOOL_SOURCE_DIR}"
"${CONFU_DEPENDENCIES_BINARY_DIR}/pthreadpool"
EXCLUDE_FROM_ALL)
endif()
endif()
if(USE_SYSTEM_PTHREADPOOL)
add_library(pthreadpool SHARED IMPORTED)
find_library(PTHREADPOOL_LIBRARY pthreadpool)
set_property(TARGET pthreadpool PROPERTY IMPORTED_LOCATION "${PTHREADPOOL_LIBRARY}")
if(NOT PTHREADPOOL_LIBRARY)
message(FATAL_ERROR "Cannot find pthreadpool")
endif()
message("-- Found pthreadpool: ${PTHREADPOOL_LIBRARY}")
elseif(NOT USE_INTERNAL_PTHREADPOOL_IMPL)
if(NOT DEFINED PTHREADPOOL_SOURCE_DIR)
set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory")
endif()
# XNNPACK has no option like QNNPACK_CUSTOM_THREADPOOL
# that allows us to hijack the pthreadpool interface.
# Thus not doing this ends up building pthreadpool as well as
# the internal implementation of pthreadpool, which results in symbol conflicts.
if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
if(NOT DEFINED PTHREADPOOL_SOURCE_DIR)
set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory")
endif()
set(PTHREADPOOL_BUILD_TESTS OFF CACHE BOOL "")
set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE BOOL "")
set(PTHREADPOOL_LIBRARY_TYPE "static" CACHE STRING "")
set(PTHREADPOOL_ALLOW_DEPRECATED_API ON CACHE BOOL "")
add_subdirectory(
"${PTHREADPOOL_SOURCE_DIR}"
"${CONFU_DEPENDENCIES_BINARY_DIR}/pthreadpool")
set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON)
endif()
if(NOT TARGET pthreadpool)
set(PTHREADPOOL_BUILD_TESTS OFF CACHE BOOL "")
set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE BOOL "")
add_subdirectory(
"${PTHREADPOOL_SOURCE_DIR}"
"${CONFU_DEPENDENCIES_BINARY_DIR}/pthreadpool"
EXCLUDE_FROM_ALL)
if(USE_INTERNAL_PTHREADPOOL_IMPL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_INTERNAL_PTHREADPOOL_IMPL")
else()
list(APPEND Caffe2_DEPENDENCY_LIBS pthreadpool)
endif()
endif()
else()
set(USE_PTHREADPOOL OFF CACHE BOOL "" FORCE)
endif()
# ---[ Caffe2 uses cpuinfo library in the thread pool
@ -369,9 +370,12 @@ if(USE_QNNPACK)
endif()
if(NOT TARGET qnnpack)
if(NOT USE_SYSTEM_PTHREADPOOL AND USE_INTERNAL_PTHREADPOOL_IMPL)
set(QNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
endif()
set(QNNPACK_BUILD_TESTS OFF CACHE BOOL "")
set(QNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "")
set(QNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
set(QNNPACK_LIBRARY_TYPE "static" CACHE STRING "")
add_subdirectory(
"${QNNPACK_SOURCE_DIR}"
@ -379,8 +383,29 @@ if(USE_QNNPACK)
# We build static versions of QNNPACK and pthreadpool but link
# them into a shared library for Caffe2, so they need PIC.
set_property(TARGET qnnpack PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON)
if(QNNPACK_CUSTOM_THREADPOOL)
target_compile_definitions(
qnnpack PRIVATE
pthreadpool_t=legacy_pthreadpool_t
pthreadpool_function_1d_t=legacy_pthreadpool_function_1d_t
pthreadpool_function_1d_tiled_t=legacy_pthreadpool_function_1d_tiled_t
pthreadpool_function_2d_t=legacy_pthreadpool_function_2d_t
pthreadpool_function_2d_tiled_t=legacy_pthreadpool_function_2d_tiled_t
pthreadpool_function_3d_tiled_t=legacy_pthreadpool_function_3d_tiled_t
pthreadpool_function_4d_tiled_t=legacy_pthreadpool_function_4d_tiled_t
pthreadpool_create=legacy_pthreadpool_create
pthreadpool_destroy=legacy_pthreadpool_destroy
pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count
pthreadpool_compute_1d=legacy_pthreadpool_compute_1d
pthreadpool_parallelize_1d=legacy_pthreadpool_parallelize_1d
pthreadpool_compute_1d_tiled=legacy_pthreadpool_compute_1d_tiled
pthreadpool_compute_2d=legacy_pthreadpool_compute_2d
pthreadpool_compute_2d_tiled=legacy_pthreadpool_compute_2d_tiled
pthreadpool_compute_3d_tiled=legacy_pthreadpool_compute_3d_tiled
pthreadpool_compute_4d_tiled=legacy_pthreadpool_compute_4d_tiled)
endif()
endif()
list(APPEND Caffe2_DEPENDENCY_LIBS qnnpack)
@ -400,9 +425,12 @@ if(USE_PYTORCH_QNNPACK)
endif()
if(NOT TARGET pytorch_qnnpack)
if(NOT USE_SYSTEM_PTHREADPOOL AND USE_INTERNAL_PTHREADPOOL_IMPL)
set(PYTORCH_QNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
endif()
set(PYTORCH_QNNPACK_BUILD_TESTS OFF CACHE BOOL "")
set(PYTORCH_QNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "")
set(PYTORCH_QNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
set(PYTORCH_QNNPACK_LIBRARY_TYPE "static" CACHE STRING "")
add_subdirectory(
"${PYTORCH_QNNPACK_SOURCE_DIR}"
@ -410,10 +438,29 @@ if(USE_PYTORCH_QNNPACK)
# We build static versions of QNNPACK and pthreadpool but link
# them into a shared library for Caffe2, so they need PIC.
set_property(TARGET pytorch_qnnpack PROPERTY POSITION_INDEPENDENT_CODE ON)
if(NOT USE_SYSTEM_PTHREADPOOL)
set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON)
endif()
set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON)
if(PYTORCH_QNNPACK_CUSTOM_THREADPOOL)
target_compile_definitions(
pytorch_qnnpack PRIVATE
pthreadpool_t=legacy_pthreadpool_t
pthreadpool_function_1d_t=legacy_pthreadpool_function_1d_t
pthreadpool_function_1d_tiled_t=legacy_pthreadpool_function_1d_tiled_t
pthreadpool_function_2d_t=legacy_pthreadpool_function_2d_t
pthreadpool_function_2d_tiled_t=legacy_pthreadpool_function_2d_tiled_t
pthreadpool_function_3d_tiled_t=legacy_pthreadpool_function_3d_tiled_t
pthreadpool_function_4d_tiled_t=legacy_pthreadpool_function_4d_tiled_t
pthreadpool_create=legacy_pthreadpool_create
pthreadpool_destroy=legacy_pthreadpool_destroy
pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count
pthreadpool_compute_1d=legacy_pthreadpool_compute_1d
pthreadpool_parallelize_1d=legacy_pthreadpool_parallelize_1d
pthreadpool_compute_1d_tiled=legacy_pthreadpool_compute_1d_tiled
pthreadpool_compute_2d=legacy_pthreadpool_compute_2d
pthreadpool_compute_2d_tiled=legacy_pthreadpool_compute_2d_tiled
pthreadpool_compute_3d_tiled=legacy_pthreadpool_compute_3d_tiled
pthreadpool_compute_4d_tiled=legacy_pthreadpool_compute_4d_tiled)
endif()
endif()
list(APPEND Caffe2_DEPENDENCY_LIBS pytorch_qnnpack)
@ -447,7 +494,6 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
endif()
if(NOT TARGET XNNPACK)
set(XNNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
set(XNNPACK_LIBRARY_TYPE "static" CACHE STRING "")
set(XNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "")
set(XNNPACK_BUILD_TESTS OFF CACHE BOOL "")
@ -457,15 +503,6 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
"${CONFU_DEPENDENCIES_BINARY_DIR}/XNNPACK")
set_property(TARGET XNNPACK PROPERTY POSITION_INDEPENDENT_CODE ON)
# Context: the pthreadpool_get_threads_count implementation built into pytorch uses the
# implementation defined in caffe2/utils/threadpool/pthreadpool_impl.cc. This implementation
# assumes the pthreadpool* passed in is of type caffe2::ThreadPool* and thus does a reinterpret_cast.
# This is not valid when we create the pthreadpool via caffe2::xnnpack_threadpool, which is of a type
# compatible with the new pthreadpool interface and is used in PT's XNNPACK integration.
# Thus all calls to pthreadpool_get_threads_count originating from XNNPACK must be routed
# appropriately to pthreadpool_get_threads_count_xnnpack, which does not do the aforementioned
# cast to caffe2::ThreadPool. Once the threadpools are unified, we will not need this.
target_compile_definitions(XNNPACK PRIVATE -Dpthreadpool_get_threads_count=pthreadpool_get_threads_count_xnnpack)
endif()
include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR})


@ -59,9 +59,12 @@ if(ANDROID OR IOS OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NAM
set(GOOGLETEST_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/googletest" CACHE STRING "Google Test source directory")
if(NOT TARGET nnpack)
if(NOT USE_SYSTEM_PTHREADPOOL AND USE_INTERNAL_PTHREADPOOL_IMPL)
set(NNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
endif()
set(NNPACK_BUILD_TESTS OFF CACHE BOOL "")
set(NNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "")
set(NNPACK_CUSTOM_THREADPOOL ON CACHE BOOL "")
set(NNPACK_LIBRARY_TYPE "static" CACHE STRING "")
set(PTHREADPOOL_LIBRARY_TYPE "static" CACHE STRING "")
set(CPUINFO_LIBRARY_TYPE "static" CACHE STRING "")
@ -73,6 +76,28 @@ if(ANDROID OR IOS OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NAM
set_property(TARGET nnpack PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON)
if(NNPACK_CUSTOM_THREADPOOL)
target_compile_definitions(
nnpack PRIVATE
pthreadpool_t=legacy_pthreadpool_t
pthreadpool_function_1d_t=legacy_pthreadpool_function_1d_t
pthreadpool_function_1d_tiled_t=legacy_pthreadpool_function_1d_tiled_t
pthreadpool_function_2d_t=legacy_pthreadpool_function_2d_t
pthreadpool_function_2d_tiled_t=legacy_pthreadpool_function_2d_tiled_t
pthreadpool_function_3d_tiled_t=legacy_pthreadpool_function_3d_tiled_t
pthreadpool_function_4d_tiled_t=legacy_pthreadpool_function_4d_tiled_t
pthreadpool_create=legacy_pthreadpool_create
pthreadpool_destroy=legacy_pthreadpool_destroy
pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count
pthreadpool_compute_1d=legacy_pthreadpool_compute_1d
pthreadpool_parallelize_1d=legacy_pthreadpool_parallelize_1d
pthreadpool_compute_1d_tiled=legacy_pthreadpool_compute_1d_tiled
pthreadpool_compute_2d=legacy_pthreadpool_compute_2d
pthreadpool_compute_2d_tiled=legacy_pthreadpool_compute_2d_tiled
pthreadpool_compute_3d_tiled=legacy_pthreadpool_compute_3d_tiled
pthreadpool_compute_4d_tiled=legacy_pthreadpool_compute_4d_tiled)
endif()
endif()
set(NNPACK_FOUND TRUE)


@ -69,6 +69,11 @@ if(NOT @BUILD_SHARED_LIBS@)
list(APPEND TORCH_LIBRARIES ${XNNPACK_LIBRARY})
endif()
if(NOT @USE_INTERNAL_PTHREADPOOL_IMPL@)
find_library(PTHREADPOOL_LIBRARY pthreadpool PATHS "${TORCH_INSTALL_PREFIX}/lib")
list(APPEND TORCH_LIBRARIES ${PTHREADPOOL_LIBRARY})
endif()
if(@INTERN_USE_EIGEN_BLAS@)
find_library(EIGEN_BLAS_LIBRARY eigen_blas PATHS "${TORCH_INSTALL_PREFIX}/lib")
list(APPEND TORCH_LIBRARIES ${EIGEN_BLAS_LIBRARY})

Some files were not shown because too many files have changed in this diff.