Compare commits

..

82 Commits

Author SHA1 Message Date
26004dc2e5 Enable dynamo traced test_param_group_with_lrscheduler_goes_right_direction
ghstack-source-id: 73462085c1665607b0ca6cc09a1c4924de8116e6
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124544
2024-05-10 18:10:30 -07:00
7fb495730a Fix capturable enablement conditions
ghstack-source-id: 0681111bbafcf6c47a4b086b95079d75c5d1a47f
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125826
2024-05-10 18:10:30 -07:00
595c67e8ea Tighten fallback conditions for compiled optim
ghstack-source-id: 1e738b29711afc6013d781802e698cffbd40c458
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125825
2024-05-10 18:10:29 -07:00
a5c93a6899 Speed up _extract_graph_with_inputs_outputs (#125937)
_extract_graph_with_inputs_outputs() does membership testing on the input nodes, but often that collection is a list, so each test is O(n).  Ensure it's a set before looping over all the nodes.
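
A minimal sketch of the pattern (illustrative only; names are simplified from the actual pass):
```python
# Illustrative sketch of the fix: make membership tests O(1) before the main loop.
def _extract_graph_with_inputs_outputs(graph_nodes, input_nodes):
    input_nodes = set(input_nodes)   # often arrives as a list; convert once up front
    kept = []
    for node in graph_nodes:         # each `in` check is now O(1) instead of O(n)
        if node in input_nodes:
            kept.append(node)
    return kept
```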

This change speeds up the internal repro (D57090987) by about 18%:
before:
```
708.88user 15.86system 12:16.19elapsed 98%CPU (0avgtext+0avgdata 12898628maxresident)k
0inputs+91968outputs (3major+3532970minor)pagefaults 0swaps
```
after:
```
583.39user 15.98system 10:10.11elapsed 98%CPU (0avgtext+0avgdata 12895108maxresident)k
0inputs+87488outputs (4major+3374582minor)pagefaults 0swaps
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125937
Approved by: https://github.com/oulgen, https://github.com/anijain2305
2024-05-11 00:20:39 +00:00
cyy
4457cd9a30 [Distributed] [7/N] Fix clang-tidy warnings in torch/csrc/distributed/c10d (#124987)
This PR continues to clean clang-tidy warnings in torch/csrc/distributed/c10d, following #124701.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124987
Approved by: https://github.com/malfet
2024-05-11 00:03:52 +00:00
31946c10d0 Add missing parameter doc of Adagrad (#125886)
Add the missing documentation for `initial_accumulator_value` parameter in Adagrad, and update the algorithm description in the documentation (adjusted to reflect the implementation).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125886
Approved by: https://github.com/janeyx99
2024-05-10 22:55:22 +00:00
ee804d256b Revert "[caffe2] Make all get_backtrace() implementations lazy (#125750)"
This reverts commit cc4da72b47ef63b7c448f0de4cdbdd792e9195ea.

Reverted https://github.com/pytorch/pytorch/pull/125750 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/125750#issuecomment-2105285301))
2024-05-10 21:23:10 +00:00
cyy
45628e3b66 Remove Caffe2 python (#125143)
This PR tries to decompose https://github.com/pytorch/pytorch/pull/122527 into a smaller one. The Caffe2 python build scripts were removed, along with some TensorBoard code that used Caffe2.
Of note, this was inspired by and co-developed with @r-barnes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125143
Approved by: https://github.com/r-barnes, https://github.com/albanD
2024-05-10 21:15:43 +00:00
b08072f645 [CI] Relax per proc memory by a little bit, mark a test as serial (#125960)
The test failure is here: https://github.com/pytorch/pytorch/actions/runs/9036789873/job/24836020415

* OOMs etc. related to https://github.com/pytorch/pytorch/pull/125598
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125960
Approved by: https://github.com/huydhn
2024-05-10 21:11:39 +00:00
c61bfd24c1 [PT2] Register fake impl for quantized embedding bag ops (#125884)
Summary: Register fake impls for quantized embedding bag ops (e.g. quantized::embedding_bag_4bit_rowwise_offsets), and skip registration if a fake impl has already been registered.
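
A rough sketch of registering a fake (meta) impl for such an op via `torch.library.impl_abstract`; the argument list and output-shape math below are assumptions for illustration, not the actual registration from this diff:
```python
import torch

# Assumed/illustrative: a fake impl only computes output metadata so the op can
# run under FakeTensor/Meta tracing; the real signature and shape math may differ.
@torch.library.impl_abstract("quantized::embedding_bag_4bit_rowwise_offsets")
def _fake_embedding_bag_4bit(weight, indices, offsets=None, *args, **kwargs):
    num_bags = offsets.numel() if offsets is not None else indices.size(0)
    out_dim = weight.size(1)  # placeholder output width (the real 4-bit packing math differs)
    return weight.new_empty((num_bags, out_dim), dtype=torch.float32)
```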

Test Plan:
Before:
```
NotImplementedError: quantized::embedding_bag_4bit_rowwise_offsets: attempted to run this operator with Meta tensors, but there was no fake impl or Meta kernel registered
```
See context here -
https://fb.workplace.com/groups/1075192433118967/permalink/1423106614994212/

After:
Snapshot was published successfully with PT2Archive.
```
AIMP_DISABLE_PRUNING=false  fdb buck2 run mode/opt-split-dwarf -c python.package_style=inplace -c fbcode.enable_gpu_sections=true  lego/scripts:lego_cli -- debug-locally --model_entity_id 545861329  --config_version 14 --publish_context OFFLINE_PUBLISH    --lego_pipeline aiplatform.modelstore.model_generation.lego.lego_pipeline_builder.gmpp_lego_pipeline --gmpp_config '{"gmpp_pipeline_descriptor": "aiplatform.modelstore.model_generation.v1.ads_pipelines.aimp_pyper_pipeline.model_generation_pipeline", "worker_process_number":24, "worker_thread_per_process_number": 12, "use_work_assignment": true}' --publish_config_overrides '{"gpu_inference_options": "{\"submodules_to_lower\": []}"}'  2>&1 | tee ./gmpp_lc_aimp.txt
```

Reviewed By: ydwu4

Differential Revision: D57172944

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125884
Approved by: https://github.com/ydwu4
2024-05-10 21:11:22 +00:00
538877d204 [AOTI] Fix convolution_backward (#125730)
Summary: for https://github.com/pytorch/pytorch/issues/125922

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125730
Approved by: https://github.com/chenyang78
ghstack dependencies: #125291
2024-05-10 20:13:34 +00:00
aca0807101 [AOTI] Use random inputs to autotune the backward pass (#125291)
Summary: This is for JIT Inductor with cpp wrapper, fixing https://github.com/pytorch/pytorch/issues/117367.

In the backward pass, we don't have real inputs with which to execute and autotune the kernels. We have 3 options here: 1) use random tensor inputs; 2) store the forward outputs and feed them to the backward (non-trivial because of parameter re-ordering); 3) autotune each kernel with random inputs in a subprocess (similar to select_algorithm). This PR uses the easiest option, 1. Option 3 is where we are going as the next step, which will simplify the cpp wrapper codegen for the CUDA backend.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125291
Approved by: https://github.com/chenyang78, https://github.com/angelayi
2024-05-10 20:13:34 +00:00
9e85d3d830 Add "accurate" FlopCounter implementations for NestedTensor SDPA kernels (#125776)
This adds implementations for:
* _flash_attention_forward
* _efficient_attention_forward
* _flash_attention_backward
* _efficient_attention_backward

These flop counts are implemented as follows:
* Unbind the batch elements
* Calculate flops individually for each element in the batch
* Sum the final result

This means that we are accessing the concrete sequence lengths (which could be slow, and may trigger a GPU/CPU sync), but the FLOP numbers will vary with the sparsity of the NestedTensor, making them more accurate than if we just assumed everything was padded.
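
A rough sketch of that counting scheme (illustrative only; the 2·S_q·S_kv·D-per-matmul estimate below is the usual SDPA approximation, not necessarily the exact formula used by the FlopCounter):
```python
# Illustrative: count flops for a jagged batch by unbinding and summing per element.
def sdpa_forward_flops_nested(query_nt, key_nt, value_nt):
    total = 0
    # Reading per-element shapes relies on the concrete (data-dependent) sequence
    # lengths, which is what may trigger the GPU/CPU sync mentioned above.
    for q, k, _v in zip(query_nt.unbind(), key_nt.unbind(), value_nt.unbind()):
        heads, s_q, d = q.shape[-3], q.shape[-2], q.shape[-1]
        s_kv = k.shape[-2]
        total += 2 * heads * s_q * s_kv * d   # Q @ K^T
        total += 2 * heads * s_q * s_kv * d   # softmax(QK^T) @ V
    return total
```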

Differential Revision: [D57120139](https://our.internmc.facebook.com/intern/diff/D57120139)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125776
Approved by: https://github.com/Chillee
2024-05-10 19:49:37 +00:00
4dad988822 Revert "Remove vision packages from CI scripts (#125546)"
This reverts commit f42ea14c3f795082138421fcef90d24f64c6fd35.

Reverted https://github.com/pytorch/pytorch/pull/125546 on behalf of https://github.com/huydhn due to I think we are using vision in inductor tests with their various models there ([comment](https://github.com/pytorch/pytorch/pull/125546#issuecomment-2105174723))
2024-05-10 19:43:23 +00:00
0e853327cb Implement wrappers for aot_dedup and aot_synthetic_base (#125764)
It's kind of gross that aot_synthetic_base requires storing the *old* fw_metadata's InputInfo, but it is what it is. After this change, aot_dispatch_base's runtime wrappers should all be implemented. Next, I'll start working on aot_dispatch_autograd's remaining runtime wrapping changes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125764
Approved by: https://github.com/bdhirsh
ghstack dependencies: #125610
2024-05-10 19:33:35 +00:00
c520929c83 add typing in torch.optim.lr_scheduler (#125556)
Merge torch/optim/lr_scheduler.pyi into torch/optim/lr_scheduler.py
Co-authored-by: Jane (Yuan) Xu <31798555+janeyx99@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125556
Approved by: https://github.com/janeyx99
2024-05-10 19:28:00 +00:00
59f2e716cc Test foreach functions with all dtypes except qints (#125527)
Set `dtypes` and the others to all dtypes except qints, with some required xfails

Related to #124726.

Co-authored-by: janeyx99 <janeyx99@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125527
Approved by: https://github.com/eqy, https://github.com/janeyx99
2024-05-10 18:56:37 +00:00
10c17b13d7 fix cudnn attention check (#122391)
For cuDNN attention, besides the packed QKV layout with limited sequence-length support (seq_len <= 512) and head-dim requirements, this also supports the more generic "arbitrary sequence length with flash attention" path, as described in `Transformer Engine`: 8e672ff075/transformer_engine/common/fused_attn/fused_attn.cpp (L126)

More about "fused flash attention" in CUDNN graph api: https://docs.nvidia.com/deeplearning/cudnn/developer/graph-api.html#fused-flash-attention-fprop

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122391
Approved by: https://github.com/eqy, https://github.com/drisspg
2024-05-10 18:52:38 +00:00
bef7d650c4 [CI] 3 procs on sm86 (#125598)
yolo
IIRC the a10g/sm86 runners have ~21 GB of GPU memory, so we can increase parallelism on them to 3 procs. This results in about 6 GB of CUDA memory per proc; the previous calculation with 2 procs resulted in about 8 GB.

Also fixes the calc for per-proc memory, assuming that the CUDA context plus anything else takes a little under 1 GB (the previous calc reserved 0.11 of the roughly 7.5-8 GB per proc, i.e. <= 0.9 GB).
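
The arithmetic behind the numbers above, spelled out (values taken from this description):
```python
total_gpu_mem_gb = 21        # approximate memory on an a10g/sm86 runner
per_proc_overhead_gb = 1     # CUDA context + everything else, a bit under 1 GB
num_procs = 3

per_proc_budget_gb = (total_gpu_mem_gb - num_procs * per_proc_overhead_gb) / num_procs
print(per_proc_budget_gb)    # ~6 GB per proc, vs ~8 GB with the previous 2-proc setup
```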

Times on main are about 1.9-2.5hr per shard
This commit is around 1.6-2hr per shard

Risks: increase in flaky tests due to OOM

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125598
Approved by: https://github.com/huydhn
2024-05-10 18:48:43 +00:00
ff98731803 Speedup convert<float>(Vectorized<half>::loadu(ptr, 8)) on ARM (#125889)
By replacing `vdupq_n_f16(0)` with simple `std::memset`

Otherwise Apple's clang fails to dead-code eliminate that instruction, which results in a slower codepath.

I.e., the following [snippet](https://godbolt.org/z/c757TaM1Y) (which mimics parts of the vec library)
```cpp
#include <arm_neon.h>
#include <tuple>
#include <cstring>

struct Foo {
  Foo() = default;
  Foo(float16x8x2_t v) : values(v) {}
  operator float16x8x2_t() const { return values; }
  float16x8x2_t values;
};

struct Bar {
  Bar() = default;
  Bar(float32x4x2_t v) : values(v) {}
  Bar(float32x4_t val0, float32x4_t val1) : values{val0, val1} {}
  inline void store(float *ptr) {
    vst1q_f32(ptr, values.val[0]);
    vst1q_f32(ptr + 4, values.val[1]);
  }
  float32x4x2_t values;
};

inline Foo loadu(const void* ptr, int64_t count) {
  if (count == 16) {
    return vld1q_f16_x2(reinterpret_cast<const float16_t*>(ptr));
  } else if (count == 8) {
    Foo res;
    res.values.val[0] = vld1q_f16(reinterpret_cast<const float16_t*>(ptr));
    //res.values.val[1] = vdupq_n_f16(0);
    std::memset(&res.values.val[1], 0, sizeof(res.values.val[1]));
    return res;
  }
  float16_t tmp_values[16];
  for (auto i = 0; i < 16; ++i) {
    tmp_values[i] = 0;
  }
  std::memcpy(
    tmp_values,
    reinterpret_cast<const float16_t*>(ptr),
    count * sizeof(float16_t));
  return vld1q_f16_x2(reinterpret_cast<const float16_t*>(tmp_values));
}

inline std::tuple<Bar, Bar> convert_half_float(const Foo& a) {
  float16x8x2_t arr = a;
  float16x8_t x = arr.val[0];
  float16x8_t y = arr.val[1];
  float32x4_t x1 = vcvt_f32_f16(vget_low_f16(x));
  float32x4_t x2 = vcvt_f32_f16(vget_high_f16(x));
  float32x4_t y1 = vcvt_f32_f16(vget_low_f16(y));
  float32x4_t y2 = vcvt_f32_f16(vget_high_f16(y));
  return { Bar(x1, x2), Bar(y1, y2) };

}

inline Bar cvt(const Foo& x) {
  auto rc = convert_half_float(x);
  return std::get<0>(rc);
}

void convert(const float16_t* inp, float* outp) {
    for(auto idx = 0; idx < 1024; idx += 8) {
        auto tmp0 = loadu(inp + idx, 8);
        auto tmp1 = cvt(tmp0);
        tmp1.store(outp + idx);
    }
}

Foo load8(const float16_t* inp) {
    return loadu(inp, 8);
}
```
if compiled with `-O3 -fno-unsafe-math-optimizations` produces
```asm
convert(half const*, float*):
0000000000000000	add	x8, x1, #0x10
0000000000000004	mov	x9, #-0x8
0000000000000008	ldr	q0, [x0], #0x10                 ; Latency: 4
000000000000000c	fcvtl	v1.4s, v0.4h                    ; Latency: 2
0000000000000010	fcvtl2	v0.4s, v0.8h                    ; Latency: 2
0000000000000014	stp	q1, q0, [x8, #-0x10]            ; Latency: 4
0000000000000018	add	x9, x9, #0x8
000000000000001c	add	x8, x8, #0x20
0000000000000020	cmp	x9, #0x3f8
0000000000000024	b.lo	0x8
0000000000000028	ret
load8(half const*):
000000000000002c	ldr	q0, [x0]                        ; Latency: 4
0000000000000030	movi.2d	v1, #0000000000000000           ; Latency: 2
0000000000000034	ret
```
but with `vdupq_n_f16` same yielded
```asm
convert(half const*, float*):
0000000000000000	add	x8, x1, #0x10
0000000000000004	mov	x9, #-0x8
0000000000000008	ldr	q0, [x0], #0x10                 ; Latency: 4
000000000000000c	scvtf	s1, wzr                         ; Latency: 10
0000000000000010	fcvt	h1, s1                          ; Latency: 4
0000000000000014	fcvtl	v1.4s, v0.4h                    ; Latency: 2
0000000000000018	fcvtl2	v0.4s, v0.8h                    ; Latency: 2
000000000000001c	stp	q1, q0, [x8, #-0x10]            ; Latency: 4
0000000000000020	add	x9, x9, #0x8
0000000000000024	add	x8, x8, #0x20
0000000000000028	cmp	x9, #0x3f8
000000000000002c	b.lo	0x8
0000000000000030	ret
load8(half const*):
0000000000000034	scvtf	s1, wzr                         ; Latency: 10
0000000000000038	ldr	q0, [x0]                        ; Latency: 4
000000000000003c	fcvt	h1, s1                          ; Latency: 4
0000000000000040	dup.8h	v1, v1[0]                       ; Latency: 7
0000000000000044	ret
```
(note that `scvtf` is completely eliminated from the `convert` code and replaced with the faster `movi.2d` in `load8`)
Fixes https://github.com/pytorch/pytorch/issues/125735

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125889
Approved by: https://github.com/desertfire
2024-05-10 18:18:30 +00:00
f25c7c9699 functionalize storage resizing, minimal ppFSDP traceable forward (#122434)
More details further down, but first a more high-level description of "how do we functionalize storage resizing"

Today, dynamo converts `param.untyped_storage().resize_(x)` calls that it sees from fsdp into a custom op, `ops.inductor.resize_storage_bytes_(x)`

So given this setup, there are 3 main cases that I think we want to handle:

(1) graph input starts with a real storage size, gets resized down to zero in the graph
(2) graph input starts with 0 storage size, gets resized up in the graph
(3) graph input starts with 0 storage size, gets resized up and used in some compute, then resized back down to 0

For case (1) we need to emit a `resize_storage_bytes_` at the end of the graph, similar to how we emit `copy_()` for data mutations.

For case (2), we need to emit a `resize_storage_bytes_` in the graph, and we **also** need to emit a `copy_()` (the input had its storage resized up, and filled in with data, which is we need to reflect as an input mutation)

For case (3), the net effect is that the input had no data on entry and exit of the function, so we don't need to emit any mutable ops in the end of the graph.

The main thing to call out is that: we need to write a functionalization rule for `resize_storage_byte_`, (`FunctionalTensorWrapper::storage_resize_()`) and this rule actually does very little. We would like to **not** emit any new ops in the graph (like say, a functional resize op). Instead, we should expect / rely on the fact that any resize up will be immediately followed by a `copy_()`/`foreach_copy_`/`out=` op, that will fill in the data of the tensor. So `FunctionalTensor` can temporarily live in a state where its data is invalid, until the `x.copy_(y)` "updates" its data with the new tensor.

So effectively, all that this rule does is:

(1) it stores metadata on the storage, indicating that the tensor was resized, as well as the updated storage size. We need this info in AOTAutograd, so it knows whether to emit a mutable resize_() op in the graph epilogue

(2) There is also a corner case: if we are resizing down to zero, but our tensor had **previously** had a zero size storage, then we update `value_` to point to the original value of the tensor. The reason this seems safe is because if we have a zero storage sized tensor `x`, and we resize it up, use it in some compute, resize it back down to zero, and use it somewhere, we would want the functional version of this code to use the original `x` after the second resize. For FSDP, this is important because we end up saving parameters (graph inputs) for backward, and we want to make sure that the thing we save (and the output to the forward graph) is the original, zero-storage-sized parameter, and not the "version 2" of the parameter after the first resize_()

I think a good order to look at changes in this PR would be:

(1) `test_aotdispatch.py` shows the 3 main cases I focused on as well as the expected functionalized graphs

(2) In `FunctionalStorageImpl.h/cpp`, I had to add a notion of "original base", and "original/curr_size". The first is so I can re-use the zero-size tensor after multiple resizes, and the second is so I can tell in AOTAutograd whether any resizes canceled each other out into a no-op

(3) FunctionalTensorWrapper.h/cpp has the new resize functionalizion rule + some extra utils

(4) `_functorch/_autograd`: the main changes in this folder were around adding the logic at trace-time to detect when we need to put a resize_() in the graph. I also have some assertions to check that any inputs that experience storage resizing will **always be in the graph** and not the opaque epilogue, and I also limited the resize_() mutation case so that you can only ever start with zero storage, or end with zero storage (you can't do e.g. `torch.ones(2).storage().resize_(3)`), and banned it on tensor subclasses

(5) `fake_tensor.py`/`meta_utils.py`: we now need to be able to fakeify tensors with zero storage, so I added a quick version of it in meta_utils.py. This also.. has ramifications for fake tensor caching that I need to fix (include the storage size on the cache key, maybe?)

------------------

This PR subsumes https://github.com/pytorch/pytorch/pull/120971.

This PR is enough to **almost** get a simple ppFSDP forward pass tracing with a functionalized resize_() properly. It also attempts to do the updated version from @jansel, where we don't have any notion of `resize_()` in the graph at all, post functionalization. It would probably be good to test it with @yf225 's FSDP changes, and see how many of the FX passes it allows us to remove. I think that in theory, it should allow us to remove all FX passes that affect the forward graph / partitioner, **except** the one that forces views to be recomputed in the backward (more details below).

There are a few things worth calling out:

(1) failed attempt at functionalizing `aten.copy_()`. I originally wanted to get a version takes these operations:
```
param.storage().resize_(all_gather_size)
param.copy_(all_gather_buffer)
out = aten.matmul(param, param)
```
and functionalizes them into:
```
out = aten.matmul(all_gather_buffer, all_gather_buffer)
```

This would involve getting functionalization to turn `x.copy_(y)` into a giant no-op that just returns `y`. Unfortunately, we can't actually do this in a reasonable way within functionalization (instead, there's a functional `aten.copy` in the graph - see the test case graph expecttest for details). Why? In order for that transformation to be safe, `x` and `y` need to have the same metadata. However, it's possible for `x` and `y` to be subclasses of different types. This is not something we can easily tell from within functionalization, and would be a layering violation. So for now I'm leaving it to downstream code to optimize away the `aten.copy` (this is already the case today, so I think inductor can handle this)

(2) The forward doesn't **actually** run successfully in this PR (see the `assertRaisesRegex` in the test). Why?

The final forward graph looks like this:
```
def forward(self, primals_1, primals_2):
    _foreach_copy = torch.ops.aten._foreach_copy.default([primals_1], [primals_2]);  primals_2 = None
    getitem = _foreach_copy[0];  _foreach_copy = None
    mm = torch.ops.aten.mm.default(getitem, getitem);  getitem = None
    t_1 = torch.ops.aten.t.default(primals_1);  primals_1 = None
    return [mm, t_1]
```

Where `primals_1` starts out as a secretly-zero-storage-size parameter, and gets resized up and back down within the forward (these are functionalized away).

Importantly, the matmul happens on the result of the `foreach_copy`, **but** the activation that we save for backward (`t_1`) is the result of transposing the **original parameter** (the zero-storage-size param). This is exactly the optimization in fsdp that allows us to have good peak memory usage.

The problem is that the min-cut partitioner decides to save `t_1` for backward. Running this code in eager breaks, because the kernel for `aten.permute(x)` is not happy when `x` has secretly-zero-sized-storage.

The real problem here is that in eager mode the `permute` kernel runs during the backward, after backward hooks have properly resized the saved activation. Here, we are running the transpose in the forward.

One option would be to turn off the checks in our view kernels and allow them to work on zero-storage-sized tensors, which feels pretty bad. Another option is to tweak the partitioner (or use one of Will's FX passes) to force the partitioner to not save views for backward, and allow the views to be recomputed in the backward. This seems kind of silly, but is also probably harmless.

(3) The backward is still broken. To be fair, this issue is pretty separable from "functionalizing storage resize calls", and can be fixed later (either by a real fix to our tracing infra, or via another hacky FX pass). More description of this problem is described at issue (8) of my PR description in https://github.com/pytorch/pytorch/pull/120971

(4) I only added support for "full graph" resizing: basically, the limited case where a param starts with zero storage size, and gets resized up and back down. I think we can add support for the graph break case, but I think we can keep that add-on separate from this PR unless we need it immediately. I also added asserts so we should fail loudly when we hit this case

(5) I have a change to FakeTensor creation when inputs have zero storage size that.. is probably ok. But I also removed FakeTensor caching on view ops, which I probably need to fix before I can land this PR

(6) I added a notion of "original_base" to `FunctionalStorageImpl`. More details are in the comments, but my rational for this was that we basically need it to ensure that autograd saves the **original**, zero-storage-sized param for backward, after resizing up and back down

(7) I had to update our eager kernels for `aten.copy` and `aten._foreach_copy`, to handle the case where the `self` argument has secretly-zero-storage. Inductor can probably generate correct code for this case, but we need these ops to work properly in this situation for the `aot_eager` backend to do the right thing

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122434
Approved by: https://github.com/jansel
2024-05-10 18:09:10 +00:00
cyy
f42ea14c3f Remove vision packages from CI scripts (#125546)
Because they were solely used by Caffe2.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125546
Approved by: https://github.com/r-barnes, https://github.com/kit1980, https://github.com/albanD
2024-05-10 17:53:48 +00:00
d7fe3c4123 [RELAND] Switch default behavior of export IR to be predispatch (#125860)
This PR switches export IR from aot-dispatch to pre-dispatch IR.

**What is pre-dispatch IR and why should you care?**

Currently the default IR returned by torch.export can contain only functional ATen operators after ALL pytorch dispatcher decompositions (for example, CompositeImplicitAutograd) run.

In contrast, pre-dispatch IR refers to an IR that can contain all functional ATen operators (i.e., not just from the core subset), before any decomposition happens, as well as operators that manipulate autograd state. Pre-dispatch IR closely resembles eager PyTorch computation, but is still functional and serializable by torch.export. As a result:

- You can train the pre-dispatch IR in eager mode as the IR contains necessary information for the autograd engine to automatically generate a backward graph.
- You can write sound graph transformations more easily as the IR is functional.
- Since it is an ATen IR, it is still normalized. For example, torch.add has multiple overloads, but aten.add.Tensor is unique in this IR.

If you want to get the core aten IR out of torch.export, you will need to:
```
ep = torch.export.export(M(), inputs)
ep_for_core_aten = ep.run_decompositions()
```

Differential Revision: [D57172986](https://our.internmc.facebook.com/intern/diff/D57172986)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125860
Approved by: https://github.com/zhxchen17
2024-05-10 17:36:53 +00:00
4996a3fda3 [BE][Easy] Remove usage of deprecated ast.Str, ast.Ellipsis and ast.NameConstant (#125912)
`ast.Str`, `ast.Ellipsis`, and `ast.NameConstant` are deprecated in Python 3.8 and will be removed in Python 3.14. Replace them with `ast.Constant`.

Ref: https://docs.python.org/3/library/ast.html#node-classes

> **Changed in version 3.8:** Class [ast.Constant](https://docs.python.org/3/library/ast.html#ast.Constant) is now used for all constants.
>
> **Deprecated since version 3.8:** Old classes ast.Num, ast.Str, ast.Bytes, ast.NameConstant and ast.Ellipsis are still available, but they will be removed in future Python releases. In the meantime, instantiating them will return an instance of a different class.
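
A small sketch of the kind of replacement involved (illustrative, not taken from this diff):
```python
import ast

# Deprecated classes (removed in Python 3.14):
old = ast.Str(s="hello")           # also ast.Num, ast.Bytes, ast.NameConstant, ast.Ellipsis

# Replacement: ast.Constant covers strings, numbers, bytes, None/True/False and Ellipsis.
new = ast.Constant(value="hello")

# When inspecting nodes, match on ast.Constant and look at .value:
node = ast.parse("x = ...").body[0].value
assert isinstance(node, ast.Constant) and node.value is Ellipsis
```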

CI log: https://github.com/metaopt/torchopt/actions/runs/9031146681/job/24816802280?pr=216#step:11:6706
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125912
Approved by: https://github.com/soulitzer
2024-05-10 17:35:35 +00:00
53a64e446f STRONG_NODISCARD -> [[nodiscard]] (#125873)
Test Plan: Sandcastle

Differential Revision: D57158864

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125873
Approved by: https://github.com/Skylion007
2024-05-10 17:10:53 +00:00
5f58cf65d1 Refactor other post compile wrappers in forward functions (#125610)
This continues the refactor by creating CompilerWrappers for FakifiedOut, RngFunctionalization, and aot_dispatch_subclass_wrapper.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125610
Approved by: https://github.com/bdhirsh
2024-05-10 17:01:40 +00:00
cc4da72b47 [caffe2] Make all get_backtrace() implementations lazy (#125750)
Summary: #125682 (D56586844) added support for lazy symbolization to `Error` and adopted it for internal use cases; this commit adopts it for `get_backtrace()` as well.

Test Plan: Sandcastle and GH CI.

Differential Revision: D56881683

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125750
Approved by: https://github.com/ezyang
2024-05-10 16:02:40 +00:00
31372fa842 Support generic stream/event on CUDA/HIP backend (#125757)
# Motivation
Following [#123611](https://github.com/pytorch/pytorch/pull/123611), this adds support for the generic stream/event APIs on the CUDA backend.

# Additional Context
New methods/attributes on `torch.Event` for CUDA:
- torch.Event.event_id
- torch.Event.elapsed_time
- torch.Event.synchronize

New methods on `c10::Event` for the CUDA backend:
- c10.Event.event_id
- c10.Event.elapsed_time
- c10.Event.synchronize
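
A minimal usage sketch of the newly exposed pieces; the generic `torch.Event` constructor arguments shown here are an assumption based on the device-agnostic API, not something verified against this diff:
```python
import torch

if torch.cuda.is_available():
    # Assumed constructor signature for the generic (device-agnostic) Event.
    start = torch.Event(device="cuda", enable_timing=True)
    end = torch.Event(device="cuda", enable_timing=True)

    start.record()
    torch.randn(1024, 1024, device="cuda") @ torch.randn(1024, 1024, device="cuda")
    end.record()
    end.synchronize()                        # newly supported on the CUDA backend
    print(end.event_id)                      # newly exposed attribute
    print(start.elapsed_time(end), "ms")     # newly supported on the CUDA backend
```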

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125757
Approved by: https://github.com/albanD, https://github.com/jgong5, https://github.com/EikanWang
2024-05-10 13:34:09 +00:00
946b96fd54 [AOTI] Add a failing test case (#123235)
Summary: Reported in https://github.com/pytorch/pytorch/issues/123210; this can be reproduced in AOTI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/123235
Approved by: https://github.com/chenyang78
2024-05-10 11:22:06 +00:00
f87fbfdb01 GPT-fast benchmark: remove Embedding layer from model size (#125901)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125901
Approved by: https://github.com/Chillee
2024-05-10 08:18:13 +00:00
d81db9c1df GitHub workflows / Dynamic rollout (#125680)
This PR introduces a tool to dynamically switch between ARC runners and old runners without having to update the PR to the latest version.

There is also a third option - use both runners at the same time (aka shadow deployment). In this case, failed workflows using ARC launchers will not block the merge process.

The GitHub issue is used to control access to ARC launchers - [Access Rules Issue](https://github.com/pytorch/test-infra/issues/5132):

* In the FIRST comment you can specify who will use the ARC runners:
   * Add a GitHub username to use ARC runners.
   * Add "*" at the beginning to switch ALL users to ARC runners.
   * Add "!" at the beginning to switch ALL users to old runners.
* In the SECOND comment you can specify whether to run ARC runners and old runners at the same time.
   * To use both runners, add a second comment with the word "both".
   * If we want to use only one type of runners, just remove the second comment.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125680
Approved by: https://github.com/ZainRizvi
2024-05-10 07:32:10 +00:00
cyy
2ed17e0b1e Remove binaries using caffe2 functionality (#125885)
This PR removes some binaries that use deleted or soon-to-be-deleted Caffe2 functions.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125885
Approved by: https://github.com/r-barnes, https://github.com/Chillee
2024-05-10 06:21:10 +00:00
013722bcb8 Allow symbols to reach conv_layout stride argument (#125829)
Fixes https://github.com/pytorch/pytorch/issues/125638

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125829
Approved by: https://github.com/anijain2305
2024-05-10 06:13:36 +00:00
fcbf2b61e6 Memoize local_scalar_dense calls, refactor all memos (#125623)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125623
Approved by: https://github.com/eellison
2024-05-10 01:52:55 +00:00
8be4104cf3 Update conda to latest version for Docker release builds (#125887)
Fixes https://github.com/pytorch/pytorch/issues/125879

Issue is somewhat similar to this issue: https://github.com/pytorch/pytorch/issues/106470
doing:
```
conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch-nightly -c nvidia
```
pulls the CPU versions of pytorch, torchvision and torchaudio here: https://github.com/pytorch/pytorch/actions/runs/9014006158/job/24795924934#step:11:6849
```
16 37.21     mpmath-1.2.1               |          py311_0         1.2 MB  pytorch-nightly
#16 37.21     nettle-3.7.3               |       hbbd107a_1         809 KB
#16 37.21     networkx-3.1               |  py311h06a4308_0         3.3 MB
#16 37.21     openh264-2.1.1             |       h4ff587b_0         711 KB
#16 37.21     pillow-9.3.0               |  py311h3fd9d12_2         874 KB  pytorch-nightly
#16 37.21     pytorch-2.4.0.dev20240509  |     py3.11_cpu_0        87.1 MB  pytorch-nightly
#16 37.21     pytorch-cuda-12.1          |       ha16c6d3_6           7 KB  pytorch-nightly
#16 37.21     pytorch-mutex-1.0          |              cpu           3 KB  pytorch-nightly
#16 37.21     sympy-1.12                 |  py311h06a4308_0        14.4 MB
#16 37.21     torchaudio-2.2.0.dev20240509|        py311_cpu         5.1 MB  pytorch-nightly
#16 37.21     torchvision-0.19.0.dev20240509|        py311_cpu         7.3 MB  pytorch-nightly
```
Updating conda to latest and rebuilding solved this issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125887
Approved by: https://github.com/huydhn
2024-05-10 01:43:59 +00:00
d14d6127f6 [BE] Rename macos-12 to macos-13/macos- jobs (#125859)
As CI does not have any macOS 12 runners anymore.
Also cleans up misleading references to cross-compilation, as M1 builds have been done natively for quite some time.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125859
Approved by: https://github.com/ZainRizvi
2024-05-10 01:30:29 +00:00
2ad794550a Support generic stream/event on XPU backend (#125751)
# Motivation
Following [#123611](https://github.com/pytorch/pytorch/pull/123611), this adds support for the generic stream/event APIs on the XPU backend.

# Additional Context
New methods/attributes on `torch.Event` for XPU:
- torch.Event.event_id
- torch.Event.elapsed_time
- torch.Event.synchronize

New methods on `c10::Event` for the XPU backend:
- c10.Event.event_id
- c10.Event.elapsed_time
- c10.Event.synchronize

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125751
Approved by: https://github.com/jgong5, https://github.com/albanD
2024-05-10 01:27:30 +00:00
d19d932183 update pointwise cat heuristics (#125772)
Fix for https://github.com/pytorch/pytorch/issues/122871. There are two cases where we emit pointwise cat:

- fusing into a pointwise use
- horizontally fusing copy_ kernels

The regression I looked into previously was due to being overly aggressive in the latter case. I've updated the logic there so that we only emit the horizontal fusion in the case that we would have to emit separate copy_ kernels anyway.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125772
Approved by: https://github.com/Chillee
2024-05-10 01:07:39 +00:00
978b572652 Add registration API for torch.compile-eager (#121387)
This PR is a follow-up of RFC https://github.com/pytorch/pytorch/issues/115545.

In this PR, we intend to provide a registration API dedicated to eager-through-torch.compile. The major workflow of this API will be as follows.

- Load cache
- Check cache according to the input tensors
  - Cache Hit: Run the cached kernel directly
  - Cache Miss: Run the AOTI to produce kernel and run the produced kernel. If AOTI fails to produce the kernel, invoke the python fallback function.

Currently, this PR always falls back to the python kernel; the cache mechanism will be implemented in another PR - https://github.com/pytorch/pytorch/pull/116368
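
A schematic sketch of that flow in pseudo-code (all helper names here are hypothetical; the real registration API lives elsewhere):
```python
# Illustrative pseudo-code of the eager-through-torch.compile kernel lookup flow.
_kernel_cache = {}  # (op name, input signature) -> compiled kernel

def run_op(op_name, python_fallback, compile_with_aoti, *tensors):
    key = (op_name, tuple((t.dtype, t.device.type, tuple(t.shape)) for t in tensors))
    kernel = _kernel_cache.get(key)
    if kernel is not None:                        # cache hit: run the cached kernel
        return kernel(*tensors)
    try:                                          # cache miss: produce a kernel via AOTI
        kernel = compile_with_aoti(python_fallback, tensors)
        _kernel_cache[key] = kernel
        return kernel(*tensors)
    except Exception:                             # AOTI failed: python fallback
        return python_fallback(*tensors)
```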

Differential Revision: [D57164385](https://our.internmc.facebook.com/intern/diff/D57164385)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121387
Approved by: https://github.com/desertfire, https://github.com/jansel, https://github.com/zou3519, https://github.com/jgong5
2024-05-10 00:30:27 +00:00
c9a258e474 [export] handle constant aliasing for export (#125509)
Summary: Currently export will [error out](2b5ae2611e/torch/export/_trace.py (L477)) if a constant is aliased. This PR adds support for that case by modifying ConstantAttrMap to map constants to a list of FQNs instead of a single FQN, populating the ExportedProgram constants dict with multiple entries for the same constant.

Test Plan: added test case in test_export.py

Differential Revision: D56955654

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125509
Approved by: https://github.com/angelayi, https://github.com/ydwu4
2024-05-10 00:14:37 +00:00
fd816bf630 Add script for removing Inductor dependencies from Inductor generated code (#125811)
Usage:
```python
TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1 python foo.py
TORCHINDUCTOR_DUMP_LAUNCH_PARAMS=1 python /tmp/torchinductor_chilli/js/cjsbczkf6fj36nhaxxypll6cy4fmwmkoauklrgrvuody2mn7oeef.py
python remove_inductor_deps.py /tmp/torchinductor_chilli/js/cjsbczkf6fj36nhaxxypll6cy4fmwmkoauklrgrvuody2mn7oeef.py
```

Example generated code: https://pastebin.com/m6Ae8heB

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125811
Approved by: https://github.com/chenyang78
2024-05-10 00:00:25 +00:00
3267814d53 [inductor] refactor: device dispatch inside do_bench (#125736)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125736
Approved by: https://github.com/shunting314
2024-05-09 23:50:02 +00:00
13545fe68a [export] Don't create a new fake mode if dynamo tracing (#125185)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125185
Approved by: https://github.com/mikekgfb
2024-05-09 23:43:08 +00:00
23e71ffd82 Remove unused caffe2 subdirs (#125818)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125818
Approved by: https://github.com/Skylion007, https://github.com/albanD
2024-05-09 22:57:55 +00:00
350a3ed82f Fix unused variable 'kEps' (#125870)
Summary:
> fbcode/caffe2/caffe2/utils/math_gpu_test.cc:227:17: error: unused variable 'kEps' [-Werror,-Wunused-const-variable]

See https://www.internalfb.com/intern/test/844425000398735?ref_report_id=0

Created from CodeHub with https://fburl.com/edit-in-codehub

Test Plan: Sandcastle run

Reviewed By: r-barnes

Differential Revision: D56731004

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125870
Approved by: https://github.com/seemethere, https://github.com/atalman
2024-05-09 22:57:37 +00:00
477612c0f6 [dynamo] Clear GenerationTracker on dynamo reset (#125855)
Fixes https://github.com/pytorch/pytorch/issues/125567

Not doing this causes modules to be unspecialized when tests run in sequence, and specialized when run alone.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125855
Approved by: https://github.com/jansel
2024-05-09 22:47:54 +00:00
52fad83335 [onnx.export] Avoid linear look up in env for exist_in_env (#124909)
This PR is part of a series of PRs to significantly speed up torch.onnx.export for models with many nodes (e.g. LLM). See #121422 for more analysis.

- As part of torch.onnx.export, a reverse look-up is made in env. This is done for each node, and this look-up costs time proportional to the graph size, which incurs an overall O(N^2) time complexity.
- A pragmatic solution is simply to keep a separate data structure to make this de facto constant time. So, this introduces a set containing all the values of env. Open to other ideas. Ideally `exist_in_env` wouldn't be needed at all, but to preserve current behavior exactly I'm not sure how that can be done.
- Resolves (4) in #121422.
- This code change and the choice of py::set looks a bit more natural on top of #123063, where the env is changed from a std::unordered_map to a py::dict.

Partially fixes #121422
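
The underlying pattern is a mirrored set for O(1) reverse-membership checks; the actual change is in the C++ export code (py::dict/py::set), so this Python sketch is only an illustration:
```python
# Illustrative only: keep a set of env's values so "does this value already
# exist in env?" is O(1) instead of a linear scan per node.
env = {}            # key -> value (conceptually: jit value -> exported value)
env_values = set()  # mirror of env.values()

def add_to_env(key, value):
    env[key] = value
    env_values.add(value)

def exist_in_env(value):
    return value in env_values   # previously: a scan over env, O(N) per node
```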
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124909
Approved by: https://github.com/srikris-sridhar, https://github.com/justinchuby
2024-05-09 22:38:00 +00:00
37d2ecd123 Only log toplevel torchscript calls. (#125714)
Summary: as title.

Test Plan: CI

Reviewed By: gmagogsfm

Differential Revision: D57069719

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125714
Approved by: https://github.com/SherlockNoMad
2024-05-09 22:29:53 +00:00
e43d656921 FakeTensor speedup: minor cleanups (#124224)
A few cleanup tasks that didn't really fit into the other diffs in this stack.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124224
Approved by: https://github.com/oulgen
ghstack dependencies: #122911, #124223
2024-05-09 22:11:51 +00:00
a08be4b705 FakeTensor speedup: Split cache_key so we only validate once (#124223)
When dispatching a fake tensor op we cache the result with `(op, args)` as the key. There are some args (such as ones with a dynamic output shape) whose output can't be cached. Instead of validating the args every time we compute the cache key, only validate the args the first time we see a new cache key.
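
The pattern is essentially "validate on first sight of a key, trust it afterwards"; a minimal illustrative sketch (not the actual FakeTensor code):
```python
# Illustrative sketch: args are only validated the first time a cache key is seen.
_cache = {}

def cached_dispatch(op, args, validate, compute):
    key = (op, args)
    if key not in _cache:
        # validate() is the expensive check (e.g. rejecting uncacheable args such
        # as dynamic output shapes); it runs once per new key rather than per call.
        validate(op, args)
        _cache[key] = compute(op, args)
    return _cache[key]
```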

18.3% FakeTensor perf win on the microbenchmark (21.7% cumulative)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124223
Approved by: https://github.com/oulgen, https://github.com/masnesral
ghstack dependencies: #122911
2024-05-09 22:11:51 +00:00
6a8b1da18d FakeTensor speedup: Delay formatting stack trace until it's actually asked for. (#122911)
When constructing a `FakeTensorMode`, instead of immediately formatting a full stack trace, grab the traceback and only format it on demand.
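
A minimal sketch of the lazy pattern using only the standard library (the actual change lives inside FakeTensorMode and may differ):
```python
import traceback

class LazyStackTrace:
    """Capture the stack cheaply now; pay the string-formatting cost only if asked."""

    def __init__(self):
        self._summary = traceback.extract_stack()   # no string formatting yet

    def __str__(self):
        return "".join(self._summary.format())      # formatted only on demand
```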

4.2% FakeTensor perf win on the microbenchmark.

```
import time
import torch
import torch._dynamo as dynamo
from torch._subclasses.fake_tensor import FakeTensorMode
import numpy as np

def toy_example(a, b):
    x = a / (torch.abs(a) + 1)
    b = b * -1
    return x * b

def run_test1():
    dynamo.reset()
    j = [1, 2, 3]
    toy_example(torch.randn(j), torch.randn(j))

def run_test2():
    dynamo.reset()
    j = [1, 2, 3]
    with FakeTensorMode():
        toy_example(torch.randn(j), torch.randn(j))

ITERATIONS = 500000
FORMAT_STRING = "{name:12}: TOT: {tot:10.3f}, AVG: {avg:10.3f}, MIN: {min:10.3f}, P50: {p50:10.3f}, P90: {p90:10.3f}, P99: {p99:10.3f}"

def run_tests(name, step):
    step()
    timings = []
    start = time.time()
    for i in range(ITERATIONS):
        a = time.perf_counter_ns()
        step()
        b = time.perf_counter_ns()
        timings.append(b - a)
    end = time.time()
    fmt = {
        "best": min(timings),
        "tot": end - start,
        "avg": np.average(timings),
        "min": min(timings),
        "p50": np.percentile(timings, 50),
        "p90": np.percentile(timings, 90),
        "p99": np.percentile(timings, 99)
    }
    print(FORMAT_STRING.format(name=name, **fmt))
    return fmt

ts = run_tests("tensor", run_test1)
fs = run_tests("fake tensor", run_test2)
ratio = {k: a / b for ((k, a), (_, b)) in zip(fs.items(), ts.items())}
print(FORMAT_STRING.format(name="ratio", **ratio))
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122911
Approved by: https://github.com/oulgen, https://github.com/eellison
2024-05-09 22:11:51 +00:00
eaaf0f3299 Print capture_pre_autograd_graph warning only once (#125848)
Summary: Print this warning only once to avoid flooding the logs of workflows where this is called frequently.
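
A minimal sketch of the "warn only once" pattern (illustrative; the message text is a placeholder and the diff may implement this differently):
```python
import functools
import warnings

@functools.lru_cache(maxsize=None)
def _warn_capture_pre_autograd_graph_once() -> None:
    # lru_cache on a zero-arg function means the body runs at most once per process.
    warnings.warn(
        "capture_pre_autograd_graph() warning (placeholder text)",
        stacklevel=2,
    )
```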

Test Plan: CI

Differential Revision: D57163341

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125848
Approved by: https://github.com/zhxchen17
2024-05-09 22:04:05 +00:00
20271f0a3b Drop caffe2-linux-jammy-py3_8-gcc11-build (#125857)
Removes more caffe2 testing
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125857
Approved by: https://github.com/albanD, https://github.com/Skylion007
2024-05-09 21:52:27 +00:00
ae5e2ab92e [dynamo][fsdp] Use Tensor match for FSDP modules (#125827)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125827
Approved by: https://github.com/yf225, https://github.com/jansel
ghstack dependencies: #125828, #125805
2024-05-09 21:26:15 +00:00
0d4fdb0bb7 Revert "[ROCm] amdsmi library integration (#119182)"
This reverts commit 85447c41e32b1e43a025ea19ac812a0c7f88ff57.

Reverted https://github.com/pytorch/pytorch/pull/119182 on behalf of https://github.com/huydhn due to Sorry for reverting your change, but the ROCm failed test is legit 85447c41e3 ([comment](https://github.com/pytorch/pytorch/pull/119182#issuecomment-2103433197))
2024-05-09 21:18:21 +00:00
966ebd2e24 Add --warm-start-latency to benchmark harness (#125353)
Summary: This change introduces a new flag to perform a "warm start" test from the benchmark harness. The idea is to test a model twice: first with a fresh inductor cache (i.e., a "cold start"), and then a second run in a fresh process with the cache available (i.e., a "warm start"). We can later add this mode to CI runs to collect compile times for warm start.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125353
Approved by: https://github.com/eellison, https://github.com/desertfire
2024-05-09 21:12:15 +00:00
ee00349780 [dynamo][logs] move recompilation reason within compile_id scope (#125805)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125805
Approved by: https://github.com/ezyang
ghstack dependencies: #125828
2024-05-09 20:37:23 +00:00
a7575e8bd5 [dynamo] Use correct source for custom getattr (#125828)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125828
Approved by: https://github.com/williamwen42
2024-05-09 20:37:23 +00:00
7c00635125 [CI] Move gha artifact download before xml parsing for test stat uploads (#125609)
Move the gha artifact download to before any xml parsing is done for upload-test-stats.

Do not download gha artifacts during xml parsing, since they got uploaded to S3 in the step above and will be downloaded when all the artifacts are downloaded from S3.

The previous method resulted in dups if you ran the script again.

TODO: write a deduper so we don't have to worry at all
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125609
Approved by: https://github.com/huydhn
2024-05-09 20:35:09 +00:00
1ecea513b6 Fix common_methods_invocations example inputs to _efficient_attention_forward (#125788)
Fixes #120693

This tries to fix the sample input in common_methods_invocations.py:
* I think the arange was intended to skip every other integer in the range; previously, we'd have one length that was -1.
* k, v tensors were too small - updated the sizes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125788
Approved by: https://github.com/drisspg, https://github.com/Aidyn-A
2024-05-09 20:08:49 +00:00
6fd745255e Revert "add uuid in cudaDeviceProperties (#125083)"
This reverts commit 3f36145db298f7305b3b4df6c82c9101025a049a.

Reverted https://github.com/pytorch/pytorch/pull/125083 on behalf of https://github.com/izaitsevfb due to Fails internal builds with: no member named 'uuid' in 'hipDeviceProp_t' ([comment](https://github.com/pytorch/pytorch/pull/125083#issuecomment-2103315320))
2024-05-09 19:52:45 +00:00
74a0ef8f8c Enable UFMT format on test/test_package.py test/test_per_overload_api.py (#125834)
Fixes some files in https://github.com/pytorch/pytorch/issues/123062

Run lintrunner on files:
test/test_package.py
test/test_per_overload_api.py

```bash
$ lintrunner -a --take UFMT --all-files
ok No lint issues.
Successfully applied all patches.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125834
Approved by: https://github.com/malfet
2024-05-09 19:48:22 +00:00
ed8a560845 Update Release Calendar for 2.3.1 and 2.4 releases (#125794)
As per:
- https://dev-discuss.pytorch.org/t/pytorch-release-2-4-0-call-for-features/2051
- https://dev-discuss.pytorch.org/t/pytorch-release-2-3-1-planning/2052

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125794
Approved by: https://github.com/malfet
2024-05-09 18:31:52 +00:00
85447c41e3 [ROCm] amdsmi library integration (#119182)
Adds monitoring support for ROCm using amdsmi in place of pynvml.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/119182
Approved by: https://github.com/jeffdaily, https://github.com/malfet, https://github.com/xw285cornell
2024-05-09 18:21:38 +00:00
0e419b9146 Fix graph partitioner and make runtime assertion work with submodules in export (#125793)
Summary: This fix does three things:

1. When we add inputs from the partitioner to the top-level graph module, we insert them in the partitioner's order, which is not guaranteed to match the order of the original graph inputs. This PR fixes that.
2. When we replace autograd ops with HOPs, we create new submodules and access their outputs via getitem calls. As a result, the node names previously associated with getitem get updated, resulting in the graph differing from the produced graph signature, so I update the graph signature accordingly.
3. We run the runtime_assertion pass before the autograd HOP pass because otherwise the constraints won't be populated correctly.

Differential Revision: [D57130314](https://our.internmc.facebook.com/intern/diff/D57130314)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125793
Approved by: https://github.com/zhxchen17
2024-05-09 18:13:46 +00:00
98821b3d92 Disable various flaky tests in test_foreach (#125783)
* Similar to #125046
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125783
Approved by: https://github.com/huydhn
2024-05-09 18:08:39 +00:00
ae20f15941 [dynamo] trace through nn parametrize (#125771)
Fix https://github.com/pytorch/pytorch/issues/120914

Example dynamo output graph (from test_nn_parametrize):
```
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code] TRACED GRAPH
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]  ===== __compiled_fn_1 =====
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]  /data/users/williamwen/pytorch2/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module):
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]     def forward(self, L_x_: "f32[10, 10]"):
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]         l_x_ = L_x_
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]         # File: /data/users/williamwen/pytorch2/torch/nn/utils/parametrize.py:275 in forward, code: x = self[0](self.original)
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]         l__self___parametrizations__param___original: "f32[10, 10]" = self.L__self___parametrizations__param___original
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]         # File: /data/users/williamwen/pytorch2/test/dynamo/test_repros.py:4759 in forward, code: return torch.sin(x)
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]         x: "f32[10, 10]" = torch.sin(l__self___parametrizations__param___original);  l__self___parametrizations__param___original = None
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]         # File: /data/users/williamwen/pytorch2/test/dynamo/test_repros.py:4755 in forward, code: return self.param @ x
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]         matmul: "f32[10, 10]" = x @ l_x_;  x = l_x_ = None
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]         return (matmul,)
V0508 11:16:26.687000 140092517021504 torch/_dynamo/output_graph.py:1272] [0/0] [__graph_code]
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125771
Approved by: https://github.com/jbschlosser
ghstack dependencies: #125710, #125724
2024-05-09 17:43:48 +00:00
6ea226b99c Fix DDP no_sync when find_unused_parameters is True (#124193)
Fixes #69031, #42793

This PR fixes the bug introduced in #54981 where parameters used within a `no_sync` scope are not respected when `find_unused_parameters` is set to `True`. The `local_used_map_` and `numGradHooksTriggeredMap_` variables should be updated regardless of the `no_sync` state.

Tested and verified with fairseq2 and wav2vec2 ASR finetuning recipe. All gradients are correctly synced across workers as expected after applying this fix.

Co-authored-by: Kaushik Ram Sadagopan <kaushikram2811@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124193
Approved by: https://github.com/rohan-varma
2024-05-09 17:33:33 +00:00
8fb3ff2a4e Revert "[profiler] enable CUPTI range profiler in build (#125685)"
This reverts commit 2deea9e6e9faf5eacebefa2336861d129c598c99.

Reverted https://github.com/pytorch/pytorch/pull/125685 on behalf of https://github.com/atalman due to Broke nightly ([comment](https://github.com/pytorch/pytorch/pull/125685#issuecomment-2103093237))
2024-05-09 17:28:02 +00:00
26b942c4fc [C10D] Document destroy_process_group usage (#122358)
This API was not documented. It has already been a source of confusion,
but recently has become more urgent as improper destruction can lead to
hangs due to ncclCommAbort's requirement of being called collectively.

Fixes #48203
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122358
Approved by: https://github.com/shuqiangzhang
2024-05-09 16:51:31 +00:00
257d40ba2e Docker release - push nightly tags only for amd64 builds (#125845)
Fixes failure: https://github.com/pytorch/pytorch/actions/runs/9014006158/job/24765880791#step:12:43
```
Unable to find image 'ghcr.io/pytorch/pytorch-nightly:2.4.0.dev20240509-runtime' locally
2.4.0.dev20240509-runtime: Pulling from pytorch/pytorch-nightly
docker: no matching manifest for linux/amd64 in the manifest list entries.
```
This CPU image does not exist for amd64 and is not uploaded to Docker Hub, hence don't tag it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125845
Approved by: https://github.com/malfet, https://github.com/huydhn
2024-05-09 16:42:15 +00:00
3ccf107f01 [export] remove upgrader. (#125625)
Summary: Talked to the ExecuTorch team; it seems we can remove this now.

Test Plan: CI

Differential Revision: D57013451

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125625
Approved by: https://github.com/larryliu0820
2024-05-09 16:30:12 +00:00
0241ed9331 Fix sparse fake tensors detach (#125679)
As in the title.

Fixes a bug reported in https://github.com/pytorch/pytorch/pull/117907#discussion_r1589581536

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125679
Approved by: https://github.com/amjames, https://github.com/lezcano
2024-05-09 15:40:57 +00:00
7e86a7c015 Lint: Update older-python test to 3.6 (#125843)
As python-3.5 can no longer connect to pypi after today's cert update
Fixes https://github.com/pytorch/pytorch/issues/125841
2024-05-09 07:23:59 -07:00
b8a706a321 [EZ][BE] Use untyped_storage in tests (#125838)
Gets rid of the following warning:
```
/Users/shenke/workspace/pytorch/test/test_mps.py:9229: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
  if base.storage().data_ptr() != other.storage().data_ptr():
```

(noticed while looking at https://github.com/pytorch/pytorch/issues/96153#issuecomment-2101876484 )

Respective change to view ops was landed back in 2022, see https://github.com/pytorch/pytorch/pull/91414

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125838
Approved by: https://github.com/albanD
2024-05-09 14:04:21 +00:00
4e29e80bf0 Run MPS tests on MacOS Sonoma (#125801)
Those ones are running 14.4.1, so I wonder if they actually pass CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125801
Approved by: https://github.com/kit1980, https://github.com/huydhn
2024-05-09 13:43:12 +00:00
b9588101c4 [Inductor][Quant] Fix PT2E Dynamic Quant regression (#125207)
**Summary**
Fix 2 regression issues caused by previous refactor:

- Fix the issue in the dequant promotion pass with dynamic quant when the dequant node uses the `tensor` overload.
- Fix a numerical issue in dynamic quant: with the previous implementation, the input was converted to the scales' dtype (which is `double`) to do the quant operation.

**TestPlan**
```
clear && python -u -m pytest -s -v test/inductor/test_mkldnn_pattern_matcher.py -k test_dynamic_qlinear_input_dim_exceeds_2
clear && python -u -m pytest -s -v test/inductor/test_mkldnn_pattern_matcher.py -k test_qlinear_dequant_promotion_dynamic_cpu
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125207
Approved by: https://github.com/peterbell10, https://github.com/jgong5
ghstack dependencies: #124041, #124246
2024-05-09 08:47:24 +00:00
c337395cdb [Inductor][Quant] Change the QConv output scale name (#124246)
**Summary**
Change the name of QConv output scale from `inv_output_scale` to `output_scale` after we move the optimization of quant/dequant from decomposition to lowering phase.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124246
Approved by: https://github.com/jgong5, https://github.com/peterbell10
ghstack dependencies: #124041
2024-05-09 08:44:00 +00:00
d83ab88f81 [Inductor] [Quant] Enable lowering of quant per tensor and refactor quant pattern (#124041)
**Summary**
Per the discussion in https://github.com/pytorch/pytorch/pull/123444, the `decomposed quant/dequant` patterns changed after https://github.com/pytorch/pytorch/pull/123445. We can move the optimization of `decomposed quant/dequant` from the inductor decomposition into the lowering phase to avoid those changes. In this way, we can:

- Avoid the pattern matcher failure introduced in https://github.com/pytorch/pytorch/pull/123445
- Make the quantization pattern clearer in the pattern matcher phase, since the `quant/dequant` nodes have not been decomposed.

**Changes in this PR**

- Move the optimization of `decomposed quant/dequant` from the inductor decomposition into the lowering phase.
- Make the corresponding changes in the quantization pattern matcher to ensure nothing is BC-breaking.

**TestPlan**
```
python -u -m pytest -s -v test/inductor/test_mkldnn_pattern_matcher.py -k test_q
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124041
Approved by: https://github.com/peterbell10, https://github.com/jgong5
2024-05-09 08:40:44 +00:00
96c8447001 change error message to avoid failing when nn modules inlined (#125612)
Addresses https://github.com/pytorch/pytorch/issues/125605

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125612
Approved by: https://github.com/mlazos, https://github.com/anijain2305
2024-05-09 08:34:31 +00:00
da2f4bbc33 remove empty partition (#124920)
In some rare scenarios the partitioner will produce an empty partition, and it is a waste of time to compile an empty graph; see the sketch below.
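
A minimal, hypothetical sketch of the idea (not the actual partitioner code): filter out proposed partitions that contain no compute nodes before handing them to the backend compiler.

```
# `partitions` is assumed to be a list of objects exposing a `nodes` collection,
# as in torch.fx partitioner-style data structures; the helper name is made up.
def drop_empty_partitions(partitions):
    return [
        p for p in partitions
        if any(n.op not in ("placeholder", "output") for n in p.nodes)
    ]
```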

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124920
Approved by: https://github.com/ezyang
2024-05-09 07:39:47 +00:00
e5766f02d0 [onnx.export] Avoid dict <-> unordered_map implicit copies (#123063)
This PR is part of an effort to speed up torch.onnx.export (#121422).

- Avoid [implicit copy](https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html#automatic-conversion) between `pybind11::dict` and `std::unordered_map` that
  happens for every node that gets processed. The copy scales with N
  (number of nodes), so this creates a quadratic time complexity.
  The solution is to always use `pybind11::dict`.
- This alone speeds up exports by 2x for large models.
- Resolves (1) in #121422.

(partial fix of #121422)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123063
Approved by: https://github.com/justinchuby
2024-05-09 07:34:47 +00:00
1522 changed files with 6553 additions and 255758 deletions

View File

@ -310,3 +310,5 @@ lxml==5.0.0.
#Description: This is a requirement of unittest-xml-reporting
# Python-3.9 binaries
PyGithub==2.3.0

View File

@ -21,6 +21,7 @@ self-hosted-runner:
- linux.rocm.gpu
- macos-m1-stable
- macos-m1-13
- macos-m1-14
- macos-12-xl
- macos-12
- macos12.3-m1

.github/scripts/get_workflow_type.py (new file, 99 lines, vendored)
View File

@ -0,0 +1,99 @@
import json
from argparse import ArgumentParser
from typing import Any
from github import Auth, Github
from github.Issue import Issue
WORKFLOW_TYPE_LABEL = "label"
WORKFLOW_TYPE_RG = "rg"
WORKFLOW_TYPE_BOTH = "both"
def parse_args() -> Any:
parser = ArgumentParser("Get dynamic rollout settings")
parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
parser.add_argument(
"--github-repo",
type=str,
required=False,
default="pytorch/test-infra",
help="GitHub repo to get the issue",
)
parser.add_argument(
"--github-issue", type=int, required=True, help="GitHub issue umber"
)
parser.add_argument(
"--github-user", type=str, required=True, help="GitHub username"
)
parser.add_argument(
"--github-branch", type=str, required=True, help="Current GitHub branch"
)
return parser.parse_args()
def get_gh_client(github_token: str) -> Github:
auth = Auth.Token(github_token)
return Github(auth=auth)
def get_issue(gh: Github, repo: str, issue_num: int) -> Issue:
repo = gh.get_repo(repo)
return repo.get_issue(number=issue_num)
def is_exception_branch(branch: str) -> bool:
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
def get_workflow_type(issue: Issue, username: str) -> str:
user_list = issue.get_comments()[0].body.split("\r\n")
try:
run_option = issue.get_comments()[1].body.split("\r\n")[0]
except Exception as e:
run_option = "single"
if user_list[0] == "!":
# Use old runners for everyone
return WORKFLOW_TYPE_LABEL
elif user_list[1] == "*":
if run_option == WORKFLOW_TYPE_BOTH:
# Use ARC runners and old runners for everyone
return WORKFLOW_TYPE_BOTH
else:
# Use only ARC runners for everyone
return WORKFLOW_TYPE_RG
elif username in user_list:
if run_option == WORKFLOW_TYPE_BOTH:
# Use ARC runners and old runners for a specific user
return WORKFLOW_TYPE_BOTH
else:
# Use only ARC runners for a specific user
return WORKFLOW_TYPE_RG
else:
# Use old runners by default
return WORKFLOW_TYPE_LABEL
def main() -> None:
args = parse_args()
if is_exception_branch(args.github_branch):
output = {"workflow_type": WORKFLOW_TYPE_LABEL}
else:
try:
gh = get_gh_client(args.github_token)
issue = get_issue(gh, args.github_repo, args.github_issue)
output = {"workflow_type": get_workflow_type(issue, args.github_user)}
except Exception as e:
output = {"workflow_type": WORKFLOW_TYPE_LABEL}
json_output = json.dumps(output)
print(json_output)
if __name__ == "__main__":
main()
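
A hedged smoke test of the new script; the issue number and user below are illustrative, and with an invalid token the script falls back to the "label" workflow type by design.

```
import json
import subprocess

out = subprocess.check_output([
    "python3", ".github/scripts/get_workflow_type.py",
    "--github-token", "not-a-real-token",
    "--github-issue", "5132",
    "--github-user", "some-user",
    "--github-branch", "my-feature-branch",
])
print(json.loads(out)["workflow_type"])  # one of "label", "rg", "both"
```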

View File

@ -24,11 +24,6 @@ on:
default: "3.8"
description: |
The python version to be used. Will be 3.8 by default
arch:
required: true
type: string
description: |
Contains the architecture to run the tests with
timeout-minutes:
required: false
type: number
@ -44,7 +39,7 @@ jobs:
# Also ensure that we always run with the right architecture
defaults:
run:
shell: arch -arch ${{ inputs.arch }} bash -e -l {0}
shell: bash -e -l {0}
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
@ -133,12 +128,6 @@ jobs:
test-matrix: ${{ inputs.test-matrix }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Pre-process arm64 wheels
if: inputs.build-environment == 'macos-12-py3-arm64'
run: |
# As wheels are cross-compiled they are reported as x86_64 ones
ORIG_WHLNAME=$(ls -1 dist/*.whl); ARM_WHLNAME=${ORIG_WHLNAME/x86_64/arm64}; mv "${ORIG_WHLNAME}" "${ARM_WHLNAME}"
- name: Set Test step time
id: test-timeout
shell: bash

View File

@ -0,0 +1,58 @@
name: Check whether the workflow owner can use ARC runners
on:
workflow_call:
inputs:
user_name:
required: true
type: string
description: The name of the workflow owner.
curr_branch:
required: true
type: string
description: Current branch.
issue_number:
required: false
type: string
default: "5132"
outputs:
workflow-type:
description: Type of runners to use
value: ${{ jobs.runner-determinator.outputs.workflow-type }}
jobs:
runner-determinator:
runs-on: linux.4xlarge
outputs:
workflow-type: ${{ steps.set-condition.outputs.workflow-type }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
ISSUE_NUMBER: ${{ inputs.issue_number }}
USERNAME: ${{ inputs.user_name }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
fetch-depth: 1
submodules: true
- name: Install dependencies
run: python3 -m pip install urllib3==1.26.18 PyGithub==2.3.0
- name: Get the workflow type for the current user
id: set-condition
run: |
curr_branch="${{ inputs.curr_branch }}"
echo "Current branch is '$curr_branch'"
output="$(python3 .github/scripts/get_workflow_type.py \
--github-token "$GITHUB_TOKEN" \
--github-issue "$ISSUE_NUMBER" \
--github-branch "$curr_branch" \
--github-user "$USERNAME")"
echo "Output: '${output}'"
WORKFLOW_TYPE=$(echo "${output}" | jq -r '.workflow_type')
echo "workflow-type=$WORKFLOW_TYPE" >> "$GITHUB_OUTPUT"

View File

@ -127,15 +127,10 @@ jobs:
run: |
make -f docker.Makefile "${BUILD_IMAGE_TYPE}-image"
- name: Push nightly tags
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' }}
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.build_platforms == 'linux/amd64' }}
run: |
PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime"
CUDA_SUFFIX="-cu${CUDA_VERSION}"
if [[ ${CUDA_VERSION_SHORT} == "cpu" ]]; then
PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-runtime"
CUDA_SUFFIX=""
fi
PYTORCH_NIGHTLY_COMMIT=$(docker run ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \
python -c 'import torch; print(torch.version.git_version[:7],end="")')

View File

@ -230,11 +230,11 @@ jobs:
with:
submodules: false
fetch-depth: 1
- name: Setup Python 3.5
- name: Setup Python 3.6
if: matrix.test_type == 'older_python_version'
uses: actions/setup-python@v4
with:
python-version: '3.5'
python-version: '3.6'
architecture: x64
check-latest: false
cache: pip

View File

@ -13,33 +13,29 @@ concurrency:
permissions: read-all
jobs:
macos-12-py3-arm64-build:
name: macos-12-py3-arm64
macos-13-py3-arm64-build:
name: macos-13-py3-arm64
uses: ./.github/workflows/_mac-build.yml
with:
sync-tag: macos-12-py3-arm64-build
build-environment: macos-12-py3-arm64
sync-tag: macos-py3-arm64-build
build-environment: macos-13-py3-arm64
runner-type: macos-m1-stable
build-generates-artifacts: true
# To match the one pre-installed in the m1 runners
python-version: 3.9.12
# We need to set the environment file here instead of trying to detect it automatically because
# MacOS arm64 is cross-compiled from x86-64. Specifically, it means that arm64 conda environment
# is needed when building PyTorch MacOS arm64 from x86-64
environment-file: .github/requirements/conda-env-macOS-ARM64
test-matrix: |
{ include: [
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-14" },
]}
macos-12-py3-arm64-mps-test:
name: macos-12-py3-arm64-mps
macos-py3-arm64-mps-test:
name: macos-py3-arm64-mps
uses: ./.github/workflows/_mac-test-mps.yml
needs: macos-12-py3-arm64-build
needs: macos-13-py3-arm64-build
with:
sync-tag: macos-12-py3-arm64-mps-test
build-environment: macos-12-py3-arm64
sync-tag: macos-py3-arm64-mps-test
build-environment: macos-13-py3-arm64
# Same as the build job
python-version: 3.9.12
test-matrix: ${{ needs.macos-12-py3-arm64-build.outputs.test-matrix }}
test-matrix: ${{ needs.macos-13-py3-arm64-build.outputs.test-matrix }}

View File

@ -34,18 +34,6 @@ jobs:
id-token: write
contents: read
# Build PyTorch with BUILD_CAFFE2=ON
caffe2-linux-jammy-py3_8-gcc11-build:
name: caffe2-linux-jammy-py3.8-gcc11
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: caffe2-linux-jammy-py3.8-gcc11
docker-image-name: pytorch-linux-jammy-py3.8-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
linux-focal-cuda12_1-py3_10-gcc9-build:
name: linux-focal-cuda12.1-py3.10-gcc9
uses: ./.github/workflows/_linux-build.yml
@ -106,20 +94,16 @@ jobs:
{ config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
]}
macos-12-py3-arm64-build:
name: macos-12-py3-arm64
macos-13-py3-arm64-build:
name: macos-13-py3-arm64
uses: ./.github/workflows/_mac-build.yml
with:
sync-tag: macos-12-py3-arm64-build
build-environment: macos-12-py3-arm64
sync-tag: macos-py3-arm64-build
build-environment: macos-13-py3-arm64
runner-type: macos-m1-stable
build-generates-artifacts: true
# To match the one pre-installed in the m1 runners
python-version: 3.9.12
# We need to set the environment file here instead of trying to detect it automatically because
# MacOS arm64 is cross-compiled from x86-64. Specifically, it means that arm64 conda environment
# is needed when building PyTorch MacOS arm64 from x86-64
environment-file: .github/requirements/conda-env-macOS-ARM64
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" },
@ -127,33 +111,34 @@ jobs:
{ config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" },
]}
macos-12-py3-arm64-mps-test:
name: macos-12-py3-arm64-mps
macos-py3-arm64-mps-test:
name: macos-py3-arm64-mps
uses: ./.github/workflows/_mac-test-mps.yml
needs: macos-12-py3-arm64-build
if: needs.macos-12-py3-arm64-build.outputs.build-outcome == 'success'
needs: macos-13-py3-arm64-build
if: needs.macos-13-py3-arm64-build.outputs.build-outcome == 'success'
with:
sync-tag: macos-12-py3-arm64-mps-test
build-environment: macos-12-py3-arm64
sync-tag: macos-py3-arm64-mps-test
build-environment: macos-13-py3-arm64
# Same as the build job
python-version: 3.9.12
test-matrix: |
{ include: [
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
]}
macos-12-py3-arm64-test:
name: macos-12-py3-arm64
macos-13-py3-arm64-test:
name: macos-13-py3-arm64
uses: ./.github/workflows/_mac-test.yml
needs:
- macos-12-py3-arm64-build
- macos-13-py3-arm64-build
- target-determination
with:
build-environment: macos-12-py3-arm64
build-environment: macos-13-py3-arm64
# Same as the build job
python-version: 3.9.12
test-matrix: ${{ needs.macos-12-py3-arm64-build.outputs.test-matrix }}
arch: arm64
test-matrix: ${{ needs.macos-13-py3-arm64-build.outputs.test-matrix }}
win-vs2019-cpu-py3-build:
name: win-vs2019-cpu-py3

View File

@ -49,22 +49,6 @@ jobs:
- run: |
pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
- name: Upload test stats
env:
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
WORKFLOW_RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
WORKFLOW_URL: ${{ github.event.workflow_run.html_url }}
HEAD_REPOSITORY: ${{ github.event.workflow_run.head_repository.full_name }}
HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
run: |
echo "${WORKFLOW_URL}"
python3 -m tools.stats.upload_test_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --head-branch "${HEAD_BRANCH}" --head-repository "${HEAD_REPOSITORY}"
python3 -m tools.stats.upload_sccache_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}"
- name: Upload test artifacts
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
@ -81,6 +65,22 @@ jobs:
# anything on GitHub to upload. The command should return right away
python3 -m tools.stats.upload_artifacts --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}"
- name: Upload test stats
env:
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
WORKFLOW_RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
WORKFLOW_URL: ${{ github.event.workflow_run.html_url }}
HEAD_REPOSITORY: ${{ github.event.workflow_run.head_repository.full_name }}
HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
run: |
echo "${WORKFLOW_URL}"
python3 -m tools.stats.upload_test_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --head-branch "${HEAD_BRANCH}" --head-repository "${HEAD_REPOSITORY}"
python3 -m tools.stats.upload_sccache_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}"
- name: Analyze disabled tests rerun
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}

View File

@ -1114,8 +1114,6 @@ exclude_patterns = [
'test/test_optim.py',
'test/test_out_dtype_op.py',
'test/test_overrides.py',
'test/test_package.py',
'test/test_per_overload_api.py',
'test/test_prims.py',
'test/test_proxy_tensor.py',
'test/test_pruning_op.py',

View File

@ -65,8 +65,9 @@ ARG CUDA_VERSION=12.1
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch-nightly
# Automatically set by buildx
# Note conda needs to be pinned to 23.5.2 see: https://github.com/pytorch/pytorch/issues/106470
RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=23.5.2
RUN /opt/conda/bin/conda update -y -n base -c defaults conda
RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION}
ARG TARGETPLATFORM
# On arm64 we can only install wheel packages.

View File

@ -65,8 +65,8 @@ Following is the release cadence for year 2023/2024. All dates below are tentative.
| --- | --- | --- | --- | --- |
| 2.1 | Aug 2023 | Oct 2023 | Nov 2023 | Dec 2023 |
| 2.2 | Dec 2023 | Jan 2024 | Feb 2024 | Mar 2024 |
| 2.3 | Mar 2024 | Apr 2024 | May 2024 | Jun 2024 |
| 2.4 | May 2024 | Jul 2024 | Aug 2024 | Sep 2024 |
| 2.3 | Mar 2024 | Apr 2024 | Jun 2024 | Not planned |
| 2.4 | Jun 2024 | Jul 2024 | Aug 2024 | Sep 2024 |
| 2.5 | Aug 2024 | Oct 2024 | Nov 2024 | Dec 2024 |
## General Overview

View File

@ -97,7 +97,16 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
/*resizable=*/true
),
base_(base)
{
{
// SparseTensorImpl has no storage, so we cannot query its nbytes.
// (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse)
// Same for XLA
if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) {
original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
} else {
original_storage_size_ = -1;
}
curr_storage_size_ = original_storage_size_;
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
}

View File

@ -105,6 +105,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
frozen_ = true;
}
c10::SymInt get_storage_size(bool before) {
if (before) {
return original_storage_size_;
} else {
return curr_storage_size_;
}
}
~FunctionalStorageImpl() override = default;
void mark_mutation() {
@ -132,6 +140,15 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
return mutation_counter_ <= mutation_counter_hidden_from_autograd_;
}
void mark_inductor_storage_resize(c10::SymInt new_size) {
inductor_storage_resized_ = true;
curr_storage_size_ = new_size;
}
bool was_inductor_storage_resized() {
return inductor_storage_resized_;
}
private:
// NB: base_ should always point to a tensor BELOW the current
// functionalization layer. This is mainly to avoid reference cycles. e.g.
@ -172,6 +189,13 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
uint64_t mutation_counter_during_no_grad_or_inference_mode_ = 0;
uint64_t mutation_counter_ = 0;
uint64_t mutation_counter_hidden_from_autograd_ = 0;
// Used to tell if:
// (1) There were any storage resizes on a graph input
// (2) The original/curr storage size tell us if these resizes result in a nop
bool inductor_storage_resized_ = false;
c10::SymInt original_storage_size_;
c10::SymInt curr_storage_size_;
};
} // namespace at::functionalization

View File

@ -276,6 +276,32 @@ void FunctionalTensorWrapper::set__impl(const FunctionalTensorWrapper* other) {
set_sizes_and_strides(sizes_, strides_, storage_offset_);
}
void FunctionalTensorWrapper::storage_resize_(c10::SymInt new_size) {
auto curr_storage_size = value_.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
// storage resizing is severely limited: we only support resizing either to zero, or from zero bytes.
TORCH_CHECK(new_size == 0 || curr_storage_size == 0, "new_size: ", new_size, ". curr_storage_size: ", curr_storage_size);
// The "functionalization rule" for storage resizing is a giant no-op, mainly because we don't want
// resize_() calls to actually emit any ops in the functional graph.
// How does it work?
// Resizing up (old size == 0):
// We do nothing in this case.
// The expectation is that, for the user code to be valid, the next op to run against the current tensor "x"
// will be an x.copy_(y) (or similar) that fully overwrites the data of x.
// If there are any outstanding aliases of x, we expect them not to be used until after the copy_() call
// (otherwise the eager code would be invalid),
// and therefore functionalization will regenerate the aliases off of the result of `x.copy_(y)`.
// Resizing down (new size == 0):
// We also do nothing in this case. The assumption is that after resizing a tensor down,
// it is fully unused in the program (unless it is later resized back up first and has data copied in),
// although it might be saved for backward, which happens in FSDP.
// The expected pattern is that the param will then be resized back up from zero in the backward.
// Mark the tensor as having its storage resized.
// This is so we can detect it for inputs in AOTAutograd and error / emit
// an input mutation resize_() appropriately
functional_storage_impl()->mark_inductor_storage_resize(new_size);
}
void FunctionalTensorWrapper::maybe_replace_storage(const Tensor& other) {
// Note [resize_() in functionalization pass]
// resize_() is a special operator in functionalization because it can reallocate its underlying storage.
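
The comments in `storage_resize_` above describe the FSDP-style resize-to-zero / resize-back-up pattern. A minimal eager-mode sketch of that pattern (outside `torch.compile`; the parameter and sizes are illustrative):

```
import torch

p = torch.randn(4)
nbytes = p.untyped_storage().nbytes()

p.untyped_storage().resize_(0)        # resize down: metadata stays, storage is freed
assert p.untyped_storage().nbytes() == 0

p.untyped_storage().resize_(nbytes)   # resize back up: storage reallocated, contents undefined
p.copy_(torch.randn(4))               # the next op fully overwrites the data, as assumed above
```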

View File

@ -141,6 +141,9 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
// Custom implementation of self.set_(src)
void set__impl(const FunctionalTensorWrapper* other);
// Custom implementation of resize_storage_bytes_(self, new_size)
void storage_resize_(c10::SymInt new_size);
// Returns whether the current tensor's data was ever mutated
bool has_data_mutation();
//
@ -150,6 +153,16 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
return was_storage_changed_;
}
c10::SymInt get_storage_size(bool before) {
return functional_storage_impl()->get_storage_size(before);
}
// Returns whether the FunctionalTensor experienced an
// untyped_storage().resize_() call
bool was_inductor_storage_resized() {
return functional_storage_impl()->was_inductor_storage_resized();
}
// The functionalization pass can be used to remove mutations.
// It does so by replacing any mutation op with it's corresponding
// out-of-place op, followed by a call to replace_(). e.g:

View File

@ -335,6 +335,9 @@ static at::Tensor& set__functionalize(at::Tensor& self, const at::Tensor& src) {
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(src));
auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
auto src_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(src);
// See Note [Ordering of resize_() and set_()]
TORCH_CHECK(!self_impl->was_inductor_storage_resized(),
"storage_resize_() followed by set_() in torch.compile is not supported today");
self_impl->set__impl(src_impl);
return self;
}

View File

@ -57,6 +57,8 @@ SparseCsrTensorImpl::SparseCsrTensorImpl(
TORCH_INTERNAL_ASSERT(((key_set.has(DispatchKey::SparseCsrCPU) && device().type() == kCPU)
|| (key_set.has(DispatchKey::SparseCsrCUDA) && device().type() == kCUDA)
|| (key_set.has(DispatchKey::SparseCsrMeta) && device().type() == kMeta)
|| (key_set.has(DispatchKey::SparseCsrCPU) && device().type() == kMeta) // fake tensor
|| (key_set.has(DispatchKey::SparseCsrCUDA) && device().type() == kMeta) // fake tensor
|| (key_set.has(DispatchKey::SparseCsrPrivateUse1) && device().type() == kPrivateUse1)),
"Inconsistent key_set (=", key_set, ") and device (=", device(), ")");

View File

@ -2,6 +2,7 @@
#include <ATen/Tensor.h>
#include <c10/core/TensorImpl.h>
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/Exception.h>
namespace at {
@ -107,6 +108,39 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
}
}
template <typename VariableVersion>
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach_core(
VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const {
const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len();
c10::impl::PyInterpreter&& interpreter = nullptr;
if (mode_stack_len > 0 &&
!c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) {
const auto& cur_torch_dispatch_mode_state =
c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1);
interpreter = cur_torch_dispatch_mode_state->pyinterpreter();
} else if (
key_set_.has(DispatchKey::Python) &&
!c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) {
interpreter = pyobj_slot_.load_pyobj_interpreter();
} else {
// otherwise just copy the SparseTensorImpl and not the PyObject.
auto impl = c10::make_intrusive<SparseCsrTensorImpl>(
key_set(), device(), layout_impl(), dtype());
copy_tensor_metadata(
/*src_sparse_impl=*/this,
/*dest_sparse_impl=*/impl.get(),
/*version_counter=*/version_counter,
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
}
auto r = interpreter->detach(this);
r->set_version_counter(std::forward<VariableVersion>(version_counter));
r->set_allow_tensor_metadata_change(allow_tensor_metadata_change);
return r;
}
/**
* Return a TensorImpl that is a shallow-copy of this TensorImpl.
*
@ -116,15 +150,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
const c10::VariableVersion& version_counter,
bool allow_tensor_metadata_change) const override {
auto impl = c10::make_intrusive<SparseCsrTensorImpl>(
key_set(), device(), layout_impl(), dtype());
copy_tensor_metadata(
/*src_sparse_impl=*/this,
/*dest_sparse_impl=*/impl.get(),
/*version_counter=*/version_counter,
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
return shallow_copy_and_detach_core(
version_counter, allow_tensor_metadata_change);
}
/**
@ -136,15 +163,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
c10::VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const override {
auto impl = c10::make_intrusive<SparseCsrTensorImpl>(
key_set(), device(), layout_impl(), dtype());
copy_tensor_metadata(
/*src_sparse_impl=*/this,
/*dest_sparse_impl=*/impl.get(),
/*version_counter=*/std::move(version_counter),
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
return shallow_copy_and_detach_core(
std::move(version_counter), allow_tensor_metadata_change);
}
private:

View File

@ -2,6 +2,7 @@
#include <ATen/Tensor.h>
#include <c10/core/TensorImpl.h>
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
@ -306,6 +307,38 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
const Tensor& indices,
const Tensor& values);
template <typename VariableVersion>
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach_core(
VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const {
const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len();
c10::impl::PyInterpreter&& interpreter = nullptr;
if (mode_stack_len > 0 &&
!c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) {
const auto& cur_torch_dispatch_mode_state =
c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1);
interpreter = cur_torch_dispatch_mode_state->pyinterpreter();
} else if (
key_set_.has(DispatchKey::Python) &&
!c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) {
interpreter = pyobj_slot_.load_pyobj_interpreter();
} else {
// otherwise just copy the SparseTensorImpl and not the PyObject.
auto impl = c10::make_intrusive<SparseTensorImpl>(key_set(), dtype());
copy_tensor_metadata(
/*src_sparse_impl=*/this,
/*dest_sparse_impl=*/impl.get(),
/*version_counter=*/version_counter,
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
}
auto r = interpreter->detach(this);
r->set_version_counter(std::forward<VariableVersion>(version_counter));
r->set_allow_tensor_metadata_change(allow_tensor_metadata_change);
return r;
}
/**
* Return a TensorImpl that is a shallow-copy of this TensorImpl.
*
@ -315,14 +348,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
const c10::VariableVersion& version_counter,
bool allow_tensor_metadata_change) const override {
auto impl = c10::make_intrusive<SparseTensorImpl>(key_set(), dtype());
copy_tensor_metadata(
/*src_sparse_impl=*/this,
/*dest_sparse_impl=*/impl.get(),
/*version_counter=*/version_counter,
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
return shallow_copy_and_detach_core(
version_counter, allow_tensor_metadata_change);
}
/**
@ -334,14 +361,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
c10::VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const override {
auto impl = c10::make_intrusive<SparseTensorImpl>(key_set(), dtype());
copy_tensor_metadata(
/*src_sparse_impl=*/this,
/*dest_sparse_impl=*/impl.get(),
/*version_counter=*/std::move(version_counter),
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
return shallow_copy_and_detach_core(
std::move(version_counter), allow_tensor_metadata_change);
}
/**

View File

@ -298,19 +298,18 @@ class Vectorized<c10::Half> {
} else if (count == (size() >> 1)) {
Vectorized<c10::Half> res;
res.values.val[0] = vld1q_f16(reinterpret_cast<const float16_t*>(ptr));
res.values.val[1] = vdupq_n_f16(0);
std::memset(&res.values.val[1], 0, sizeof(res.values.val[1]));
return res;
} else {
__at_align__ float16_t tmp_values[size()];
for (const auto i : c10::irange(size())) {
tmp_values[i] = 0;
}
std::memcpy(
tmp_values,
reinterpret_cast<const float16_t*>(ptr),
count * sizeof(float16_t));
return vld1q_f16_x2(reinterpret_cast<const float16_t*>(tmp_values));
}
__at_align__ float16_t tmp_values[size()];
for (const auto i : c10::irange(size())) {
tmp_values[i] = 0;
}
std::memcpy(
tmp_values,
reinterpret_cast<const float16_t*>(ptr),
count * sizeof(float16_t));
return vld1q_f16_x2(reinterpret_cast<const float16_t*>(tmp_values));
}
void store(void* ptr, int64_t count = size()) const {
if (count == size()) {

View File

@ -213,12 +213,36 @@ struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplI
hip_stream.synchronize();
}
void synchronizeEvent(void* event) const override {
if (!event)
return;
hipEvent_t hip_event = static_cast<hipEvent_t>(event);
C10_HIP_CHECK(hipEventSynchronize(hip_event));
}
void recordDataPtrOnStream(
const c10::DataPtr& data_ptr,
const Stream& stream) const override {
HIPStreamMasqueradingAsCUDA hip_stream{stream};
HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA(data_ptr, hip_stream);
}
double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
const override {
TORCH_CHECK(
event1 && event2,
"Both events must be recorded before calculating elapsed time.");
int orig_device;
C10_HIP_CHECK(hipGetDevice(&orig_device));
C10_HIP_CHECK(hipSetDevice(device_index));
hipEvent_t hip_event1 = static_cast<hipEvent_t>(event1);
hipEvent_t hip_event2 = static_cast<hipEvent_t>(event2);
float time_ms = 0;
// raise hipErrorNotReady if either event is recorded but not yet completed
C10_HIP_CHECK(hipEventElapsedTime(&time_ms, hip_event1, hip_event2));
C10_HIP_CHECK(hipSetDevice(orig_device));
return static_cast<double>(time_ms);
}
};
// All of the guards which have HIPGuardImpl burned in need to also have
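
At the Python level this backs `torch.cuda.Event.elapsed_time`, which works the same on ROCm builds where HIP masquerades as CUDA. A small usage sketch (requires a CUDA or ROCm device):

```
import torch

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()
torch.mm(torch.randn(512, 512, device="cuda"), torch.randn(512, 512, device="cuda"))
end.record()
torch.cuda.synchronize()

print(start.elapsed_time(end), "ms")  # errors if either event was never recorded
```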

View File

@ -1,5 +1,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Copy.h>
#include <ATen/native/Copy.h>
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
@ -25,8 +26,12 @@
#include <ATen/ops/_copy_from.h>
#include <ATen/ops/_propagate_xla_data.h>
#include <ATen/ops/_propagate_xla_data_native.h>
#include <ATen/ops/copy.h>
#include <ATen/ops/copy_native.h>
#include <ATen/ops/_foreach_copy.h>
#include <ATen/ops/_foreach_copy_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_strided.h>
#include <ATen/ops/expand_copy.h>
#endif
@ -303,15 +308,45 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
return self;
}
Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) {
// copy() is the "functional" form of copy_(). It exists so we can properly functionalize copy_(), but:
// (1) It isn't exposed to the frontend (no python bindings)
// (2) It isn't exposed to the backend (it's a composite, that decomposes into to() and expand_as() calls.
Tensor copy_meta(const Tensor& self, const Tensor& src, bool non_blocking) {
// Must directly use self(), so we can dispatch properly if self is a subclass
auto r = clone_preserve_strides(self);
r.copy_(src, non_blocking);
return r;
}
Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) {
at::Tensor r;
// copy() is the "functional" form of copy_(). It exists so we can properly functionalize copy_(), but:
// (1) It isn't exposed to the frontend (no python bindings)
// (2) It isn't exposed to the backend (it's a composite that decomposes into to() and expand_as() calls).
auto self_storage = self.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl();
// If self has no real storage, we can't actually clone it.
// Instead, generate an empty tensor with the right sizes/strides, since we should be able to assume
// that copy_() will fully overwrite all data with that of src
if (self_storage->nbytes() == 0) {
r = at::empty_strided(self.sizes(), self.strides());
} else {
r = clone_preserve_strides(self);
}
r.copy_(src, non_blocking);
return r;
}
::std::vector<at::Tensor> _foreach_copy(at::TensorList self, at::TensorList src, bool non_blocking) {
std::vector<at::Tensor> outs;
outs.reserve(self.size());
// This is a very slow implementation, but needs to directly call the copy() kernel above to handle
// when self has zero storage.
// This kernel should never really be run, except when debugging with compile(backend="aot_eager").
for (const auto i : c10::irange(src.size())) {
auto curr_src = src[i];
auto curr_self = self[i];
outs.push_back(at::copy(curr_self, curr_src, non_blocking));
}
return outs;
}
Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) {
auto maybe_outnames = namedinference::compute_broadcast_outnames(self, src);
{
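
The in-place op that the new functional `_foreach_copy` variant mirrors can be exercised directly; per the comment above, the out-of-place kernel itself mainly exists for functionalization and debugging with `compile(backend="aot_eager")`. A small sketch of the in-place form:

```
import torch

dst = [torch.zeros(3), torch.zeros(3)]
src = [torch.ones(3), torch.full((3,), 2.0)]

torch._foreach_copy_(dst, src)   # copies each src[i] into dst[i]
print(dst[0], dst[1])
```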

View File

@ -178,6 +178,12 @@ inline bool _check_tensors_do_type_promotion_with_scalars(
// - All tensors must be non-overlapping and dense
// - Resulting tensor must have the same dtype as the input one
// [note: what's ``does_op_promote_integer_inputs_to_float=true``?]
// ``does_op_promote_integer_inputs_to_float=true`` means that the result of
// the op will be float even if the inputs are integer or boolean, which
// the fast path currently does not support. In short, when this flag is
// turned on, it gatekeeps the op from going down the fast path.
// Please make sure to call check_foreach_api_restrictions before calling this
// method. There is a set of preconditions that have to be satisfied.
inline bool check_fast_path_restrictions(
@ -231,6 +237,7 @@ inline std::vector<c10::Scalar> convert_tensor_to_scalar_list(
return scalarList;
}
// see: [note: what's ``does_op_promote_integer_inputs_to_float=true``?]
inline bool can_use_fast_route(
ArrayRef<TensorList> tensorLists,
ArrayRef<Scalar> scalarList = {},
@ -239,6 +246,7 @@ inline bool can_use_fast_route(
tensorLists, scalarList, does_op_promote_integer_inputs_to_float);
}
// see: [note: what's ``does_op_promote_integer_inputs_to_float=true``?]
inline bool can_use_fast_route(
TensorList tensors1,
TensorList tensors2,
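
A small sketch of what `does_op_promote_integer_inputs_to_float=true` guards against: division promotes integer inputs to floating point, so such calls take the slow path rather than the multi-tensor fast path.

```
import torch

ints = [torch.arange(4), torch.arange(4)]
out = torch._foreach_div(ints, 2)   # integer inputs are promoted to float
print(out[0].dtype)                 # torch.float32
```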

View File

@ -230,16 +230,18 @@ FOREACH_BINARY_OP_LIST(
div,
std::divides,
/*division_op*/ true);
// NOTE(crcrpar): `all_types_half_bfloat16` does not cover bool, so temporarily
// set `division_op` to true.
FOREACH_BINARY_OP_LIST(
all_types_half_bfloat16,
clamp_max,
minimum,
/*division_op*/ false);
/*division_op*/ true);
FOREACH_BINARY_OP_LIST(
all_types_half_bfloat16,
clamp_min,
maximum,
/*division_op*/ false);
/*division_op*/ true);
// NOTE(crcrpar): [Why is foreach_pow's division_op=true?]
// To push integer inputs to slow path. This is because with integer type inputs
// the fast path behaves differently from the slow one. Need to investigate

View File

@ -239,7 +239,9 @@ std::vector<Tensor> foreach_tensor_sub_scalar_kernel_cuda(
});
}
FOREACH_BINARY_OP_SCALAR(all_types_half_bfloat16, clamp_max, minimum, false);
FOREACH_BINARY_OP_SCALAR(all_types_half_bfloat16, clamp_min, maximum, false);
// NOTE(crcrpar): `all_types_half_bfloat16` does not cover bool, so temporarily
// set `division_op` to true.
FOREACH_BINARY_OP_SCALAR(all_types_half_bfloat16, clamp_max, minimum, true);
FOREACH_BINARY_OP_SCALAR(all_types_half_bfloat16, clamp_min, maximum, true);
} // namespace at::native

View File

@ -235,15 +235,7 @@ std::vector<Tensor> foreach_tensor_sub_scalarlist_kernel_cuda(
});
}
FOREACH_BINARY_OP_SCALARLIST(
all_types_half_bfloat16,
clamp_max,
minimum,
false);
FOREACH_BINARY_OP_SCALARLIST(
all_types_half_bfloat16,
clamp_min,
maximum,
false);
FOREACH_BINARY_OP_SCALARLIST(all_types_half_bfloat16, clamp_max, minimum, true);
FOREACH_BINARY_OP_SCALARLIST(all_types_half_bfloat16, clamp_min, maximum, true);
} // namespace at::native

View File

@ -27,7 +27,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
TensorList tensors2,
TensorList tensors3) {
check_foreach_api_restrictions(tensors1, tensors2, tensors3);
if (!can_use_fast_route({tensors1, tensors2, tensors3})) {
if (!can_use_fast_route({tensors1, tensors2, tensors3}, {}, true)) {
return foreach_tensor_ternary_lerp_slow(tensors1, tensors2, tensors3);
}
@ -64,7 +64,7 @@ void foreach_tensor_lerp_ternary_cuda_(
TensorList tensors2,
TensorList tensors3) {
check_foreach_api_restrictions(tensors1, tensors2, tensors3);
if (!can_use_fast_route({tensors1, tensors2, tensors3})) {
if (!can_use_fast_route({tensors1, tensors2, tensors3}, {}, true)) {
return foreach_tensor_ternary_lerp_slow_(tensors1, tensors2, tensors3);
}
@ -94,7 +94,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
TensorList tensors2,
const Scalar& weight) {
check_foreach_api_restrictions(tensors1, tensors2);
if (!can_use_fast_route({tensors1, tensors2})) {
if (!can_use_fast_route({tensors1, tensors2}, {}, true)) {
return foreach_tensor_lerp_list_kernel_slow(tensors1, tensors2, weight);
}
@ -132,7 +132,7 @@ void foreach_tensor_lerp_list_cuda_(
TensorList tensors2,
const Scalar& weight) {
check_foreach_api_restrictions(tensors1, tensors2);
if (!can_use_fast_route({tensors1, tensors2})) {
if (!can_use_fast_route({tensors1, tensors2}, {}, true)) {
return foreach_tensor_lerp_list_kernel_slow_(tensors1, tensors2, weight);
}

View File

@ -1750,6 +1750,7 @@
- func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
variants: function
dispatch:
Meta: copy_meta
CompositeExplicitAutogradNonFunctional: copy
tags: core
@ -11357,7 +11358,13 @@
dispatch:
CPU: foreach_tensor_copy_list_kernel_slow_
CUDA: foreach_tensor_copy_list_kernel_cuda_
autogen: _foreach_copy, _foreach_copy.out
autogen: _foreach_copy.out
- func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out
device_check: NoCheck
variants: function
dispatch:
CompositeExplicitAutograd: _foreach_copy
- func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
dispatch:

View File

@ -485,7 +485,7 @@ static at::Tensor _quantized_convolution_onednn(
torch::List<int64_t> dilation,
bool transposed,
int64_t groups,
double inv_output_scale,
double output_scale,
int64_t output_zero_point,
c10::optional<at::Tensor> accum=c10::nullopt, // accum to fused with conv add
double accum_scale=1.0,

View File

@ -1397,7 +1397,7 @@ static at::Tensor _quantized_convolution_onednn(
torch::List<int64_t> dilation,
bool transposed,
int64_t groups,
double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant
double output_scale,
int64_t output_zero_point,
c10::optional<at::Tensor> accum, // accum to fused with conv add
double accum_scale,
@ -1420,10 +1420,10 @@ static at::Tensor _quantized_convolution_onednn(
bool bfloat16_output = output_dtype.has_value() && (output_dtype.value() == c10::kBFloat16);
if (fp32_output || bfloat16_output) {
// When fp32 or bf16 output, oneDNN expects op_attr doesn't set_scales and set_zero_points.
// So, we will use default inv_output_scale as 1.0 and output_zero_point as 0, since
// when inv_output_scale is 1.0, we will skip invoking of op_attr.set_scales in ideep;
// So, we will use default output_scale as 1.0 and output_zero_point as 0, since
// when output_scale is 1.0, we will skip invoking of op_attr.set_scales in ideep;
// when output_zero_point is 0, we will skip invoking of op_attr.set_zero_points in ideep.
TORCH_CHECK(inv_output_scale == 1.0, " (ONEDNN): fp32 or bf16 output, inv_output_scale must be 1.0.");
TORCH_CHECK(output_scale == 1.0, " (ONEDNN): fp32 or bf16 output, output_scale must be 1.0.");
TORCH_CHECK(output_zero_point == 0, " (ONEDNN): fp32 or bf16 output, output_zero_point must be 0");
}
@ -1634,7 +1634,7 @@ static at::Tensor _quantized_convolution_onednn(
int oc_per_group = packed_weight.get_dim(0) / groups;
int wei_scale_mask = ideep::utils::conv_weight_scale_mask(weight_scales.numel(), oc_per_group, groups, false);
op_attr.set_scales_mask(DNNL_ARG_WEIGHTS, wei_scale_mask);
if (inv_output_scale != 1.0f) {
if (output_scale != 1.0f) {
op_attr.set_scales_mask(DNNL_ARG_DST, 0);
}
if (output_zero_point != 0) {
@ -1671,13 +1671,13 @@ static at::Tensor _quantized_convolution_onednn(
}
tensor src_scales_t = tensor(ideep::scale_t(1, act_scale));
tensor wei_scales_t = tensor(weights_scales);
tensor dst_scales_t = tensor(ideep::scale_t(1, 1.0/inv_output_scale));
tensor dst_scales_t = tensor(ideep::scale_t(1, output_scale));
tensor src_zp_t = tensor(ideep::zero_point_t(1, act_zero_point));
tensor dst_zp_t = tensor(ideep::zero_point_t(1, output_zero_point));
if (act_scale != 1.0f) {
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales_t});
}
if (inv_output_scale != 1.0f) {
if (output_scale != 1.0f) {
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales_t});
}
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_scales_t});
@ -1697,7 +1697,7 @@ static at::Tensor _quantized_convolution_onednn(
const ideep::scale_t accum_ideep_scale = ideep::scale_t(1, 1.0/accum_scale);
const ideep::zero_point_t accum_ideep_zero_points = ideep::zero_point_t(1, accum_zero_point);
// Set the dst scale and zero point with the value of accum.
// The true scale and zero point is stored in ideep::scale_t(scale_size, inv_output_scale) and dst_zero_points.
// The true scale and zero point are stored in ideep::scale_t(scale_size, output_scale) and dst_zero_points.
dst.set_scale(accum_ideep_scale);
dst.set_zero_point(accum_ideep_zero_points);
}
@ -1707,7 +1707,7 @@ static at::Tensor _quantized_convolution_onednn(
ideep::convolution_forward::prepare(
params, src, packed_weight, expected_bias, dst_dims, dst,
stride.vec(), dilation.vec(), padding.vec(), padding.vec(), groups,
src_scales, weights_scales, ideep::scale_t(1, inv_output_scale),
src_scales, weights_scales, ideep::scale_t(1, 1.0f / output_scale),
src_zero_points, dst_zero_points,
op_attr, dnnl::algorithm::convolution_direct,
dnnl::prop_kind::forward_inference,
@ -1872,7 +1872,7 @@ class QConvoneDNN final {
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant
double output_scale,
int64_t output_zero_point,
c10::optional<c10::ScalarType> output_dtype,
c10::string_view attr,
@ -1900,7 +1900,7 @@ class QConvoneDNN final {
act, act_scale, act_zero_point,
weight, weight_scales, weight_zero_points,
bias, stride, padding, dilation, /*transposed*/false,
groups, inv_output_scale, output_zero_point,
groups, output_scale, output_zero_point,
/*accum*/c10::nullopt, /*accum_scale*/0.0, /*accum_zero_point*/0,
/*output_dtype*/output_dtype, /*binary_attr*/c10::nullopt, /*binary_alpha*/c10::nullopt,
/*unary_attr*/attr, /*unary_scalars*/scalars, /*unary_algorithm*/algorithm
@ -1924,7 +1924,7 @@ class QConvoneDNN final {
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant
double output_scale,
int64_t output_zero_point,
c10::optional<c10::ScalarType> output_dtype,
c10::string_view binary_attr,
@ -1952,7 +1952,7 @@ class QConvoneDNN final {
act, act_scale, act_zero_point,
weight, weight_scales, weight_zero_points,
bias, stride, padding, dilation, /*transposed*/false,
groups, inv_output_scale, output_zero_point,
groups, output_scale, output_zero_point,
accum, accum_scale, accum_zero_point,
/*output_dtype*/output_dtype, binary_attr, alpha,
unary_attr, unary_scalars, unary_algorithm

View File

@ -931,7 +931,6 @@ static at::Tensor linear_int8_with_onednn_weight(
c10::string_view& unary_post_op_algorithm) {
using ideep::tensor;
const int64_t dim = input.dim();
output_scale = 1.0f / output_scale;
TORCH_CHECK(input.scalar_type() == c10::ScalarType::Byte,
"qlinear with mkldnn tensor: data type of input should be uint8 (unsigned char).");
TORCH_CHECK(onednn_weight.scalar_type() == c10::ScalarType::Char,

View File

@ -10,6 +10,7 @@ extern template int register_conv_params<3>();
int register_embedding_params();
TORCH_LIBRARY(quantized, m) {
m.set_python_module("caffe2.torch.fb.model_transform.splitting.split_dispatcher");
register_linear_params();
register_conv_params<2>();
register_conv_params<3>();
@ -257,12 +258,12 @@ TORCH_LIBRARY(onednn, m) {
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv_prepack(Tensor weight, Tensor w_scales, float x_scale, int x_zp, int[] stride, int[] padding, int[] dilation, int groups, int[]? x_shape=None) -> Tensor"));
// Conv1D/2D/3D with unary postop
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv1d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv3d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv1d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv3d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
// Conv2D with binary postop
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise.binary(Tensor qx, float x_scale, int x_zero_point, Tensor qaccum, float accum_scale, int accum_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise.binary(Tensor qx, float x_scale, int x_zero_point, Tensor qaccum, float accum_scale, int accum_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor"));
// Linear prepack
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_prepack(Tensor weight, int[]? x_shape) -> Tensor"));

View File

@ -301,18 +301,55 @@ bool check_all_tensors_on_device(sdp_params const& params, bool debug) {
}
bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) {
const auto num_heads{params.query.sym_size(1)},
query_lengths{params.query.sym_size(2)},
head_dim{params.query.sym_size(3)};
const bool ok = query_lengths % 64 == 0 && head_dim % 64 == 0;
if (!ok) {
const auto s_q = params.query.sym_size(2);
const auto s_k = params.key.sym_size(2);
const auto head_dim = params.query.sym_size(3);
long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
if (cudnn_version >= 90000) {
if (head_dim % 8 != 0 || head_dim > 256) {
if (debug) {
TORCH_WARN("head_dim should be a multiple of 8 and no more than 256");
}
return false;
}
} else {
if (head_dim % 8 != 0 || head_dim > 128) {
if (debug) {
TORCH_WARN("head_dim should be a multiple of 8 and no more than 128");
}
return false;
}
}
if (cudnn_version < 8903) {
if (debug) {
TORCH_WARN(
"CuDNN requires sequence length and head dim to be divisible by 64. Got sequence length: ",
query_lengths,
", head dim: ",
head_dim,
".");
TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher");
}
return false;
}
if (params.dropout != 0.0 && cudnn_version < 8906) {
if (debug) {
TORCH_WARN("Dropout reference is only supported on 8.9.6 onwards.");
}
return false;
}
if (cudnn_version < 90000) {
if (s_q < 64) {
if (debug) {
TORCH_WARN("s_q less than 64 is not supported before cudnn 9.0.0");
}
return false;
}
if ((s_q % 64 != 0 || s_k % 64 != 0) && params.dropout != 0.0) {
if (debug) {
TORCH_WARN(
"s_q not a multiple of 64 with padding/dropout is not supported with cudnn version 9.0.0");
}
return false;
}
}
if (s_k % 64 != 0 && cudnn_version < 8906) {
if (debug) {
TORCH_WARN("not-multiple-of-64 seq_kv is not supported below 8.9.6");
}
return false;
}
@ -326,24 +363,64 @@ bool check_cudnn_layout(sdp_params const& params, bool debug) {
const int64_t s_k = params.key.size(2);
const int64_t s_v = params.value.size(2);
// corresponds to cuDNN's "packed QKV" layout
const bool query_layout_ok = (params.query.stride(0) == s_q * 3 * h * d) &&
const bool packed_query_layout_ok = (params.query.stride(0) == s_q * 3 * h * d) &&
(params.query.stride(1) == d) &&
(params.query.stride(2) == 3 * h * d) &&
(params.query.stride(3) == 1);
const bool key_layout_ok = (params.key.stride(0) == s_k * 3 * h * d) &&
const bool packed_key_layout_ok = (params.key.stride(0) == s_k * 3 * h * d) &&
(params.key.stride(1) == d) &&
(params.key.stride(2) == 3 * h * d) &&
(params.key.stride(3) == 1);
const bool value_layout_ok = (params.value.stride(0) == s_v * 3 * h * d) &&
const bool packed_value_layout_ok = (params.value.stride(0) == s_v * 3 * h * d) &&
(params.value.stride(1) == d) &&
(params.value.stride(2) == 3 * h * d) &&
(params.value.stride(3) == 1);
if (debug) {
if (!query_layout_ok) { TORCH_WARN("Query tensor was not in cuDNN-supported packed QKV layout", params.query.strides()); }
if (!key_layout_ok) { TORCH_WARN("Key tensor was not in cuDNN-supported packed QKV layout"); }
if (!value_layout_ok) { TORCH_WARN("Value tensor was not in cuDNN-supported packed QKV layout"); }
const bool packed_layout_ok = packed_query_layout_ok && packed_key_layout_ok && packed_value_layout_ok;
const bool query_layout_ok = (params.query.stride(0) == s_q * h * d) &&
(params.query.stride(1) == d) &&
(params.query.stride(2) == h * d) &&
(params.query.stride(3) == 1);
const bool key_layout_ok = (params.key.stride(0) == s_k * h * d) &&
(params.key.stride(1) == d) &&
(params.key.stride(2) == h * d) &&
(params.key.stride(3) == 1);
const bool value_layout_ok = (params.value.stride(0) == s_v * h * d) &&
(params.value.stride(1) == d) &&
(params.value.stride(2) == h * d) &&
(params.value.stride(3) == 1);
const bool layout_ok = query_layout_ok && key_layout_ok && value_layout_ok;
if (!packed_value_layout_ok && !layout_ok) {
if (debug) {
if (!packed_layout_ok) {
if (!packed_query_layout_ok) {
TORCH_WARN("Query tensor was not in cuDNN-supported packed QKV layout", params.query.strides());
}
if (!packed_key_layout_ok) {
TORCH_WARN("Key tensor was not in cuDNN-supported packed QKV layout", params.key.strides());
}
if (!packed_value_layout_ok) {
TORCH_WARN("Value tensor was not in cuDNN-supported packed QKV layout", params.value.strides());
}
}
if (!layout_ok) {
if (!query_layout_ok) {
TORCH_WARN("Query tensor was not in cuDNN-supported unpacked QKV layout", params.query.strides());
}
if (!key_layout_ok) {
TORCH_WARN("Key tensor was not in cuDNN-supported unpacked QKV layout", params.key.strides());
}
if (!value_layout_ok) {
TORCH_WARN("Value tensor was not in cuDNN-supported unpacked QKV layout", params.value.strides());
}
}
}
return false;
}
return query_layout_ok && key_layout_ok && value_layout_ok;
return true;
}
bool check_cudnn_hardware_support(sdp_params const& params, bool debug) {
@ -434,14 +511,14 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
constexpr auto general_constraints =
array_of<bool (*)(sdp_params const&, bool)>(
check_runtime_enabled_cudnn,
check_cudnn_hardware_support);
// check_all_tensors_on_device,
// check_cudnn_tensor_shapes,
// check_cudnn_layout,
check_cudnn_hardware_support,
check_all_tensors_on_device,
check_cudnn_tensor_shapes,
check_cudnn_layout,
// check_is_causal,
// check_for_nested_inputs,
// check_cudnn_requires_grad,
// check_dtypes_low_precision
check_for_nested_inputs,
check_cudnn_requires_grad,
check_dtypes_low_precision);
for (auto& constraint : general_constraints) {
if (!constraint(params, debug)) {
return false;
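
A quick Python-side check mirroring the version gates above; the version encoding matches what `versionCuDNN()` returns, and the limits follow the constraints in this diff.

```
import torch

v = torch.backends.cudnn.version()  # e.g. 90000 for cuDNN 9.0, or None if unavailable
if v is not None:
    head_dim_limit = 256 if v >= 90000 else 128
    print(f"cuDNN {v}: head_dim must be a multiple of 8 and <= {head_dim_limit}")
```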

View File

@ -2005,33 +2005,6 @@ def get_dynamo_stats():
)
def maybe_fresh_cache(fn, is_cold_start):
def inner(*args, **kwargs):
cache_minder = contextlib.nullcontext()
if is_cold_start:
cache_entries = {}
cache_minder = fresh_inductor_cache(cache_entries)
try:
with cache_minder:
return fn(*args, **kwargs)
finally:
dump_cache = False
if dump_cache and is_cold_start:
output_csv(
output_filename[:-4] + "_triton_cache.csv",
["dev", "name", "batch_size", "triton_cache"],
[
current_device,
current_name,
current_batch_size,
cache_entries,
],
)
return inner
@contextmanager
def maybe_init_distributed(should_init_distributed, rank, world_size, port="6789"):
try:
@ -3297,12 +3270,6 @@ def parse_args(args=None):
action="store_true",
help="print dataframe result used for calculating accuracy",
)
parser.add_argument(
"--cold-start-latency",
"--cold_start_latency",
action="store_true",
help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
)
parser.add_argument(
"--disable-cudagraphs",
action="store_true",
@ -3415,6 +3382,19 @@ def parse_args(args=None):
help="Enables Memory Snapshot tool for memory deep dives: https://pytorch.org/blog/understanding-gpu-memory-1/",
)
group_latency = parser.add_mutually_exclusive_group()
group_latency.add_argument(
"--cold-start-latency",
"--cold_start_latency",
action="store_true",
help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
)
group_latency.add_argument(
"--warm-start-latency",
action="store_true",
help="Run model(s) twice and preseve caches in between to enable a 'warm start' on the 2nd run",
)
group_fuser = parser.add_mutually_exclusive_group()
# --nvfuser is now the default, keep the option to not break scripts
group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS)
@ -3571,9 +3551,17 @@ def process_entry(rank, runner, original_dir, args):
world_size=args.world_size,
port=args.distributed_master_port,
):
return maybe_fresh_cache(
run, (args.cold_start_latency and args.only) or args.ci
)(runner, args, original_dir)
return run(runner, args, original_dir)
def maybe_fresh_cache(args):
cache_dir_assigned = "TORCHINDUCTOR_CACHE_DIR" in os.environ
if not cache_dir_assigned and (
args.cold_start_latency or args.warm_start_latency or args.ci
):
return fresh_inductor_cache()
else:
return contextlib.nullcontext()
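maybe_fresh_cache now hands back either fresh_inductor_cache() or a no-op context. As a hedged usage sketch (the helper name and the torch._inductor.utils import path are assumptions based on how the function is used above):

```
import contextlib
import os

import torch
from torch._inductor.utils import fresh_inductor_cache  # import path assumed

def compile_once_cold(fn, *example_args):
    # Compile under a throwaway inductor cache dir so nothing from earlier runs is
    # reused (a "cold start"), unless the user pinned TORCHINDUCTOR_CACHE_DIR.
    ctx = (
        contextlib.nullcontext()
        if "TORCHINDUCTOR_CACHE_DIR" in os.environ
        else fresh_inductor_cache()
    )
    with ctx:
        return torch.compile(fn)(*example_args)
```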
def main(runner, original_dir=None, args=None):
@ -3598,23 +3586,39 @@ def main(runner, original_dir=None, args=None):
f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?"
)
args.init_distributed = args.only and args.multiprocess
if args.init_distributed:
# NB: Do NOT query device count before CUDA initialization; we're
# going to overwrite CUDA_VISIBLE_DEVICES and this will result in
# https://github.com/pytorch/pytorch/issues/107300
device_count = torch.cuda.device_count()
if device_count <= 1:
log.warning(
"The multiprocess flag is set but <= 1 devices are available."
with maybe_fresh_cache(args):
args.init_distributed = args.only and args.multiprocess
if args.init_distributed:
# NB: Do NOT query device count before CUDA initialization; we're
# going to overwrite CUDA_VISIBLE_DEVICES and this will result in
# https://github.com/pytorch/pytorch/issues/107300
device_count = torch.cuda.device_count()
if device_count <= 1:
log.warning(
"The multiprocess flag is set but <= 1 devices are available."
)
# multiprocess path
args.world_size = device_count
mp.spawn(
process_entry, args=(runner, original_dir, args), nprocs=device_count
)
# multiprocess path
args.world_size = device_count
mp.spawn(process_entry, args=(runner, original_dir, args), nprocs=device_count)
else:
# single process path just uses the main process
args.world_size = 1
process_entry(0, runner, original_dir, args)
elif args.only and args.warm_start_latency:
# Warm start mode. Enable FX graph caching and perform back-to-back runs in
# separate processes (but ensure the inductor cache is preserved across runs).
env = os.environ.copy()
env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
cmd = [sys.executable] + sys.argv
cmd.remove("--warm-start-latency")
print(f"Executing cold-start run for {args.only}")
subprocess.check_call(cmd, timeout=args.timeout, env=env)
print(f"Executing warm-start run for {args.only}")
subprocess.check_call(cmd, timeout=args.timeout, env=env)
else:
# single process path just uses the main process
args.world_size = 1
process_entry(0, runner, original_dir, args)
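The warm-start branch above re-executes the benchmark script twice in separate processes while sharing cache state through the environment. A generic sketch of that pattern, with the helper name invented here for illustration:

```
import os
import subprocess
import sys

def run_cold_then_warm(timeout=None):
    # Re-exec the current script twice with FX graph caching enabled; the first run
    # populates the caches, the second run measures compile time with warm caches.
    env = os.environ.copy()
    env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
    cmd = [sys.executable] + [a for a in sys.argv if a != "--warm-start-latency"]
    subprocess.check_call(cmd, timeout=timeout, env=env)  # cold run
    subprocess.check_call(cmd, timeout=timeout, env=env)  # warm run
```

Running the two passes as separate processes keeps the second measurement free of in-process compilation state, so only the on-disk caches carry over between them.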
def write_csv_when_exception(args, name: str, status: str, device=None):


@ -173,6 +173,19 @@ def _load_model(x: Experiment, device="cuda", precision=torch.bfloat16):
return model.eval()
def _get_model_size(model):
model_size = 0
for name, child in model.named_children():
if not isinstance(child, torch.nn.Embedding):
model_size += sum(
[
p.numel() * p.dtype.itemsize
for p in itertools.chain(child.parameters(), child.buffers())
]
)
return model_size
def run_experiment(
x: Experiment,
num_samples: int = 5,
@ -193,10 +206,7 @@ def run_experiment(
prompt_length = prompt.size(0)
torch.manual_seed(1234)
model_size = sum(
p.numel() * p.dtype.itemsize
for p in itertools.chain(model.parameters(), model.buffers())
)
model_size = _get_model_size(model)
aggregate_metrics = {"tokens_per_sec": []}
start = -1
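_get_model_size above sums numel() * dtype.itemsize over parameters and buffers, skipping nn.Embedding children, presumably to keep the embedding table out of the reported size. A small sketch of the per-tensor accounting:

```
import torch

def param_bytes(module: torch.nn.Module) -> int:
    # In-memory footprint of parameters and buffers, in bytes.
    return sum(
        t.numel() * t.dtype.itemsize
        for t in list(module.parameters()) + list(module.buffers())
    )

layer = torch.nn.Linear(1024, 1024, dtype=torch.bfloat16)
# 1024*1024 weight elements + 1024 bias elements, 2 bytes each for bfloat16.
assert param_bytes(layer) == (1024 * 1024 + 1024) * 2
```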


@ -37,24 +37,12 @@ endif()
if(USE_CUDA)
caffe2_binary_target("inspect_gpu.cc")
target_link_libraries(inspect_gpu ${CUDA_LIBRARIES})
caffe2_binary_target("print_core_object_sizes_gpu.cc")
if(BUILD_TEST)
# Core overhead benchmark
caffe2_binary_target("core_overhead_benchmark_gpu.cc")
target_link_libraries(core_overhead_benchmark_gpu benchmark ${CUDA_curand_LIBRARY})
endif()
endif()
if(USE_ROCM)
caffe2_hip_binary_target("hip/inspect_gpu.cc")
caffe2_hip_binary_target("hip/print_core_object_sizes_gpu.cc")
if(BUILD_TEST)
# Core overhead benchmark
caffe2_hip_binary_target("hip/core_overhead_benchmark_gpu.cc")
target_link_libraries(core_overhead_benchmark_gpu benchmark)
endif()
endif()
if(USE_MPI)


@ -1,222 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "benchmark/benchmark.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#define CAFFE2_SKIP_IF_NO_GPU \
if (!caffe2::NumCudaDevices()) { \
state.SkipWithError("No CUDA available, skipping benchmark."); \
return; \
}
using namespace caffe2;
static void BM_CUDAContextCreation(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
volatile CUDAContext context_so_we_do_initialization_work;
while (state.KeepRunning()) {
volatile CUDAContext context;
}
}
BENCHMARK(BM_CUDAContextCreation);
static void BM_CUDAContextStreamAccess(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
CUDAContext context;
while (state.KeepRunning()) {
volatile cudaStream_t stream = context.cuda_stream();
}
}
BENCHMARK(BM_CUDAContextStreamAccess);
static void BM_cudaGetDevice(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
int id;
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaGetDevice(&id));
}
}
BENCHMARK(BM_cudaGetDevice);
static void BM_cudaSetDevice(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
int total = NumCudaDevices();
int i = 0;
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaSetDevice((i++) % total));
}
}
BENCHMARK(BM_cudaSetDevice);
static void BM_cudaSetAndGetDevice(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
int total = NumCudaDevices();
int i = 0;
int id;
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaSetDevice((i++) % total));
CUDA_ENFORCE(cudaGetDevice(&id));
}
}
BENCHMARK(BM_cudaSetAndGetDevice);
static void BM_cudaSetSameDevice(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaSetDevice(0));
}
}
BENCHMARK(BM_cudaSetSameDevice);
static void BM_cudaStreamCreateSyncDelete(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
cudaStream_t stream;
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaStreamCreate(&stream));
CUDA_ENFORCE(cudaStreamSynchronize(stream));
CUDA_ENFORCE(cudaStreamDestroy(stream));
}
}
BENCHMARK(BM_cudaStreamCreateSyncDelete);
static void BM_cudaStreamSynchronize(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
cudaStream_t stream;
CUDA_ENFORCE(cudaStreamCreate(&stream));
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaStreamSynchronize(stream));
}
}
BENCHMARK(BM_cudaStreamSynchronize);
static void BM_cudaEventRecord(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
cudaStream_t stream;
cudaEvent_t event;
CUDA_ENFORCE(cudaStreamCreate(&stream));
CUDA_ENFORCE(cudaEventCreateWithFlags(
&event, cudaEventDefault | cudaEventDisableTiming));
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaEventRecord(event, stream));
}
}
BENCHMARK(BM_cudaEventRecord);
static void BM_cudaStreamWaitEventThenStreamSynchronize(
benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
cudaStream_t stream;
cudaEvent_t event;
CUDA_ENFORCE(cudaStreamCreate(&stream));
CUDA_ENFORCE(cudaEventCreateWithFlags(
&event, cudaEventDefault | cudaEventDisableTiming));
CUDA_ENFORCE(cudaEventRecord(event, stream));
CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
CUDA_ENFORCE(cudaStreamSynchronize(stream));
while (state.KeepRunning()) {
CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
CUDA_ENFORCE(cudaStreamSynchronize(stream));
}
}
BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
static void BM_CudaPointerAffinity(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
Tensor tensor(vector<int64_t>{1, 2, 3, 4}, CUDA);
float* ptr = tensor.mutable_data<float>();
while (state.KeepRunning()) {
volatile int id = GetGPUIDForPointer(ptr);
}
}
BENCHMARK(BM_CudaPointerAffinity);
namespace {
template <class Context>
class DummyEmptyOp : public Operator<Context> {
public:
DummyEmptyOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {}
bool RunOnDevice() final { return true; }
};
REGISTER_CPU_OPERATOR(DummyEmpty, DummyEmptyOp<CPUContext>);
REGISTER_CUDA_OPERATOR(DummyEmpty, DummyEmptyOp<CUDAContext>);
OPERATOR_SCHEMA(DummyEmpty);
} // namespace
static void BM_OperatorCreationCPU(benchmark::State& state) {
std::unique_ptr<OperatorBase> op;
OperatorDef def;
Workspace ws;
def.set_type("DummyEmpty");
def.mutable_device_option()->set_device_type(PROTO_CPU);
while (state.KeepRunning()) {
op = CreateOperator(def, &ws);
}
}
BENCHMARK(BM_OperatorCreationCPU);
static void BM_OperatorCreationCUDA(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
std::unique_ptr<OperatorBase> op;
OperatorDef def;
Workspace ws;
def.set_type("DummyEmpty");
def.mutable_device_option()->set_device_type(PROTO_CUDA);
while (state.KeepRunning()) {
op = CreateOperator(def, &ws);
}
}
BENCHMARK(BM_OperatorCreationCUDA);
static void BM_RawAllocDeallocCPU(benchmark::State& state) {
while (state.KeepRunning()) {
// Allocating only 1 byte in order to measure the overhead.
auto data_ptr = GetCPUAllocator()->allocate(1);
// Deallocated when it's out of scope
}
}
BENCHMARK(BM_RawAllocDeallocCPU);
static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
Tensor tensor(CPU);
// small allocation
tensor.Resize(32, 32);
while (state.KeepRunning()) {
CHECK(tensor.mutable_data<float>());
tensor.FreeMemory();
}
}
BENCHMARK(BM_TensorAllocDeallocCPU);
static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
CAFFE2_SKIP_IF_NO_GPU;
Tensor tensor(CUDA);
// small allocation
tensor.Resize(32, 32);
while (state.KeepRunning()) {
CHECK(tensor.mutable_data<float>());
tensor.FreeMemory();
}
}
BENCHMARK(BM_TensorAllocDeallocCUDA);
BENCHMARK_MAIN();


@ -1,40 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include "caffe2/core/operator.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/proto/caffe2_pb.h"
#define PRINT_SIZE(cls) \
std::cout << "Size of " #cls ": " << sizeof(cls) << " bytes." \
<< std::endl;
int main(int /* unused */, char** /* unused */) {
PRINT_SIZE(caffe2::Blob);
PRINT_SIZE(caffe2::Tensor);
PRINT_SIZE(caffe2::CPUContext);
PRINT_SIZE(caffe2::CUDAContext);
PRINT_SIZE(caffe2::OperatorBase);
PRINT_SIZE(caffe2::OperatorDef);
PRINT_SIZE(caffe2::Operator<caffe2::CPUContext>);
PRINT_SIZE(caffe2::Operator<caffe2::CUDAContext>);
PRINT_SIZE(caffe2::TypeMeta);
PRINT_SIZE(caffe2::Workspace);
return 0;
}


@ -1,49 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <sstream>
#include <string>
#include "caffe2/core/blob_serialization.h"
#include "caffe2/core/db.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/utils/proto_utils.h"
C10_DEFINE_string(f_in, "", "The input data file name.");
C10_DEFINE_string(f_out, "", "The output data file name.");
int main(int argc, char** argv) {
caffe2::GlobalInit(&argc, &argv);
std::ifstream f_in(FLAGS_f_in);
std::ofstream f_out(FLAGS_f_out);
std::string line;
caffe2::TensorProtos tensor_protos;
while (std::getline(f_in, line)) {
caffe2::TensorProto* data = tensor_protos.add_protos();
data->set_data_type(caffe2::TensorProto::STRING);
data->add_dims(1);
data->add_string_data(line);
data->set_name("text");
}
f_in.close();
std::string output_str;
tensor_protos.SerializeToString(&output_str);
f_out << output_str;
f_out.close();
return 0;
}


@ -824,6 +824,7 @@ libtorch_python_core_sources = [
"torch/csrc/mps/Module.cpp",
"torch/csrc/mtia/Module.cpp",
"torch/csrc/inductor/aoti_runner/pybind.cpp",
"torch/csrc/inductor/aoti_eager/kernel_holder.cpp",
"torch/csrc/jit/backends/backend_init.cpp",
"torch/csrc/jit/python/init.cpp",
"torch/csrc/jit/passes/onnx.cpp",


@ -16,7 +16,7 @@ namespace c10 {
class DataPtr;
/**
* Flags defining the behavior of events.
* Note [Flags defining the behavior of events]
*
* PYTORCH_DEFAULT and BACKEND_DEFAULT are valid for all backends. The
* BACKEND_DEFAULT is what a particular backend would select if no
@ -223,7 +223,10 @@ struct C10_API DeviceGuardImplInterface {
/**
* Fetch the elapsed time between two recorded events.
*/
virtual double elapsedTime(void* /*event1*/, void* /*event2*/) const {
virtual double elapsedTime(
void* /*event1*/,
void* /*event2*/,
const DeviceIndex /*device_index*/) const {
TORCH_CHECK(false, "Backend doesn't support elapsedTime.");
}


@ -118,7 +118,7 @@ struct InlineEvent final {
" does not match other's device type ",
DeviceTypeName(other.device_type()),
".");
return backend_.elapsedTime(event_, other.event_);
return backend_.elapsedTime(event_, other.event_, device_index_);
}
void synchronize() const {


@ -87,8 +87,9 @@ class VirtualGuardImpl final : public DeviceGuardImplInterface {
impl_->recordDataPtrOnStream(data_ptr, stream);
}
double elapsedTime(void* event1, void* event2) const override {
return impl_->elapsedTime(event1, event2);
double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
const override {
return impl_->elapsedTime(event1, event2, device_index);
}
void synchronizeEvent(void* event) const override {


@ -184,6 +184,7 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
if (!event)
return true;
cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
// Note: cudaEventQuery can be safely called from any device
const cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(cuda_event));
if (err != cudaErrorNotReady) {
C10_CUDA_CHECK(err);
@ -205,11 +206,44 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
cuda_stream.synchronize();
}
void synchronizeEvent(void* event) const override {
if (!event)
return;
cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_synchronization(
c10::kCUDA, reinterpret_cast<uintptr_t>(cuda_event));
}
// Note: cudaEventSynchronize can be safely called from any device
C10_CUDA_CHECK(cudaEventSynchronize(cuda_event));
}
void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream)
const override {
CUDAStream cuda_stream{stream};
CUDACachingAllocator::recordStream(data_ptr, cuda_stream);
}
double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
const override {
TORCH_CHECK(
event1 && event2,
"Both events must be recorded before calculating elapsed time.");
// Even though cudaEventElapsedTime can be safely called from any device, if
// the current device is not initialized, it will create a new cuda context,
// which will consume a lot of memory.
DeviceIndex orig_device{-1};
C10_CUDA_CHECK(c10::cuda::GetDevice(&orig_device));
C10_CUDA_CHECK(c10::cuda::SetDevice(device_index));
cudaEvent_t cuda_event1 = static_cast<cudaEvent_t>(event1);
cudaEvent_t cuda_event2 = static_cast<cudaEvent_t>(event2);
float time_ms = 0;
// raise cudaErrorNotReady if either event is recorded but not yet completed
C10_CUDA_CHECK(cudaEventElapsedTime(&time_ms, cuda_event1, cuda_event2));
C10_CUDA_CHECK(c10::cuda::SetDevice(orig_device));
return static_cast<double>(time_ms);
}
};
} // namespace c10::cuda::impl
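The CUDA elapsedTime implementation above temporarily switches to the recording device before calling cudaEventElapsedTime, so that timing never spins up a CUDA context on an uninitialized device. The same measurement is available from Python via torch.cuda.Event (a sketch; requires a CUDA build and device):

```
import torch

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

x = torch.randn(4096, 4096, device="cuda")
start.record()
y = x @ x
end.record()
torch.cuda.synchronize()              # both events must have completed before timing
print(f"matmul took {start.elapsed_time(end):.2f} ms")
```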


@ -20,8 +20,6 @@
#include <type_traits>
#include <utility>
#define STRONG_NODISCARD [[nodiscard]]
#if defined(_MSC_VER) && !defined(__clang__) && __MSC_VER < 1922
#define STRONG_CONSTEXPR
#else
@ -127,18 +125,18 @@ public:
swap(a.val, b.val);
}
STRONG_NODISCARD
[[nodiscard]]
constexpr T& value_of() & noexcept { return val;}
STRONG_NODISCARD
[[nodiscard]]
constexpr const T& value_of() const & noexcept { return val;}
STRONG_NODISCARD
[[nodiscard]]
constexpr T&& value_of() && noexcept { return std::move(val);}
STRONG_NODISCARD
[[nodiscard]]
friend constexpr T& value_of(type& t) noexcept { return t.val;}
STRONG_NODISCARD
[[nodiscard]]
friend constexpr const T& value_of(const type& t) noexcept { return t.val;}
STRONG_NODISCARD
[[nodiscard]]
friend constexpr T&& value_of(type&& t) noexcept { return std::move(t).val;}
private:
T val;
@ -192,7 +190,7 @@ namespace impl {
template <
typename T,
typename = impl::WhenStrongType<T>>
STRONG_NODISCARD
[[nodiscard]]
constexpr
auto
access(T&& t)
@ -215,7 +213,7 @@ class equality::modifier<::strong::type<T, Tag, M...>>
{
using type = ::strong::type<T, Tag, M...>;
public:
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto
@ -228,7 +226,7 @@ public:
return value_of(lh) == value_of(rh);
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto
@ -251,7 +249,7 @@ namespace impl
using TT = underlying_type_t<T>;
using OT = underlying_type_t<Other>;
public:
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto operator==(const T& lh, const Other& rh)
@ -260,7 +258,7 @@ namespace impl
{
return value_of(lh) == impl::access(rh);
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto operator==(const Other& lh, const T& rh)
@ -269,7 +267,7 @@ namespace impl
{
return impl::access(lh) == value_of(rh) ;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto operator!=(const T& lh, const Other rh)
@ -278,7 +276,7 @@ namespace impl
{
return value_of(lh) != impl::access(rh);
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto operator!=(const Other& lh, const T& rh)
@ -307,7 +305,7 @@ namespace impl
using TT = underlying_type_t<T>;
using OT = underlying_type_t<Other>;
public:
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto operator<(const T& lh, const Other& rh)
@ -316,7 +314,7 @@ namespace impl
{
return value_of(lh) < impl::access(rh);
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto operator<(const Other& lh, const T& rh)
@ -326,7 +324,7 @@ namespace impl
return impl::access(lh) < value_of(rh) ;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto operator<=(const T& lh, const Other& rh)
@ -335,7 +333,7 @@ namespace impl
{
return value_of(lh) <= impl::access(rh);
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto operator<=(const Other& lh, const T& rh)
@ -345,7 +343,7 @@ namespace impl
return impl::access(lh) <= value_of(rh) ;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto operator>(const T& lh, const Other& rh)
@ -354,7 +352,7 @@ namespace impl
{
return value_of(lh) > impl::access(rh);
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto operator>(const Other& lh, const T& rh)
@ -364,7 +362,7 @@ namespace impl
return impl::access(lh) > value_of(rh) ;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto operator>=(const T& lh, const Other& rh)
@ -373,7 +371,7 @@ namespace impl
{
return value_of(lh) >= impl::access(rh);
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto operator>=(const Other& lh, const T& rh)
@ -487,7 +485,7 @@ class ordered::modifier<::strong::type<T, Tag, M...>>
{
using type = ::strong::type<T, Tag, M...>;
public:
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto
@ -500,7 +498,7 @@ public:
return value_of(lh) < value_of(rh);
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto
@ -513,7 +511,7 @@ public:
return value_of(lh) <= value_of(rh);
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto
@ -526,7 +524,7 @@ public:
return value_of(lh) > value_of(rh);
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
@ -829,7 +827,7 @@ class affine_point<D>::modifier<::strong::type<T, Tag, M...>>
public:
using difference = std::conditional_t<std::is_same<D, void>{}, strong::type<base_diff_type, Tag, strong::difference>, D>;
static_assert(std::is_constructible<difference, base_diff_type>::value, "");
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
difference
@ -864,7 +862,7 @@ public:
return lh;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
type
@ -875,7 +873,7 @@ public:
return lh += d;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
type
@ -886,7 +884,7 @@ public:
return rh+= d;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
type
@ -911,7 +909,7 @@ class pointer::modifier<::strong::type<T, Tag, M...>>
using type = strong::type<T, Tag, M...>;
public:
template <typename TT = T>
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto
@ -925,7 +923,7 @@ public:
}
template <typename TT = T>
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto
@ -939,7 +937,7 @@ public:
}
template <typename TT = T>
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto
@ -953,7 +951,7 @@ public:
}
template <typename TT = T>
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
auto
@ -966,7 +964,7 @@ public:
return value_of(t) != nullptr;
}
STRONG_NODISCARD
[[nodiscard]]
STRONG_CONSTEXPR
decltype(*std::declval<const T&>())
operator*()
@ -976,7 +974,7 @@ public:
return *value_of(self);
}
STRONG_NODISCARD
[[nodiscard]]
STRONG_CONSTEXPR
decltype(&(*std::declval<const T&>())) operator->() const { return &operator*();}
};
@ -987,7 +985,7 @@ struct arithmetic
class modifier
{
public:
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
T
@ -1058,7 +1056,7 @@ struct arithmetic
return lh;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
T
@ -1070,7 +1068,7 @@ struct arithmetic
return lh;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
T
@ -1082,7 +1080,7 @@ struct arithmetic
return lh;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
T
@ -1094,7 +1092,7 @@ struct arithmetic
return lh;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
T
@ -1107,7 +1105,7 @@ struct arithmetic
}
template <typename TT = T, typename = decltype(value_of(std::declval<TT>()) % value_of(std::declval<TT>()))>
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
T
@ -1191,7 +1189,7 @@ struct bitarithmetic
return lh;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
T
@ -1203,7 +1201,7 @@ struct bitarithmetic
return T(v);
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
T
@ -1215,7 +1213,7 @@ struct bitarithmetic
return lh;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
T
@ -1227,7 +1225,7 @@ struct bitarithmetic
return lh;
}
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
T
@ -1240,7 +1238,7 @@ struct bitarithmetic
}
template <typename C>
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
T
@ -1253,7 +1251,7 @@ struct bitarithmetic
}
template <typename C>
STRONG_NODISCARD
[[nodiscard]]
friend
STRONG_CONSTEXPR
T
@ -1286,7 +1284,7 @@ struct indexed<void> {
using type = strong::type<T, Tag, Ms...>;
public:
template<typename I>
STRONG_NODISCARD
[[nodiscard]]
auto
operator[](
const I &i)
@ -1298,7 +1296,7 @@ struct indexed<void> {
}
template<typename I>
STRONG_NODISCARD
[[nodiscard]]
auto
operator[](
const I &i)
@ -1310,7 +1308,7 @@ struct indexed<void> {
}
template<typename I>
STRONG_NODISCARD
[[nodiscard]]
auto
operator[](
const I &i)
@ -1322,7 +1320,7 @@ struct indexed<void> {
}
template<typename I, typename C = cref>
STRONG_NODISCARD
[[nodiscard]]
auto
at(
const I &i)
@ -1333,7 +1331,7 @@ struct indexed<void> {
}
template<typename I, typename R = ref>
STRONG_NODISCARD
[[nodiscard]]
auto
at(
const I &i)
@ -1344,7 +1342,7 @@ struct indexed<void> {
}
template<typename I, typename R = rref>
STRONG_NODISCARD
[[nodiscard]]
auto
at(
const I &i)
@ -1362,7 +1360,7 @@ class indexed<I>::modifier<type<T, Tag, M...>>
{
using type = ::strong::type<T, Tag, M...>;
public:
STRONG_NODISCARD
[[nodiscard]]
auto
operator[](
const I& i)
@ -1374,7 +1372,7 @@ public:
return value_of(self)[impl::access(i)];
}
STRONG_NODISCARD
[[nodiscard]]
auto
operator[](
const I& i)
@ -1386,7 +1384,7 @@ public:
return value_of(self)[impl::access(i)];
}
STRONG_NODISCARD
[[nodiscard]]
auto
operator[](
const I& i)
@ -1399,7 +1397,7 @@ public:
}
template <typename TT = T>
STRONG_NODISCARD
[[nodiscard]]
auto
at(
const I& i)
@ -1411,7 +1409,7 @@ public:
}
template <typename TT = T>
STRONG_NODISCARD
[[nodiscard]]
auto
at(
const I& i)
@ -1423,7 +1421,7 @@ public:
}
template <typename TT = T>
STRONG_NODISCARD
[[nodiscard]]
auto
at(
const I& i)


@ -48,6 +48,10 @@ struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
return getCurrentXPUStream(d.index()).unwrap();
}
Stream getNewStream(Device d, int priority = 0) const override {
return getStreamFromPool(priority, d.index());
}
Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false)
const override {
return getStreamFromPool(isHighPriority, d.index());
@ -100,6 +104,7 @@ struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
if (xpu_event)
delete xpu_event;
xpu_event = new sycl::event(xpu_stream.queue().ext_oneapi_submit_barrier());
*event = reinterpret_cast<void*>(xpu_event);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
@ -146,11 +151,29 @@ struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
xpu_stream.synchronize();
}
void synchronizeEvent(void* event) const override {
if (!event)
return;
auto* xpu_event = reinterpret_cast<sycl::event*>(event);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_synchronization(
c10::kXPU, reinterpret_cast<uintptr_t>(xpu_event));
}
xpu_event->wait_and_throw();
}
void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream)
const override {
const XPUStream xpu_stream{stream};
XPUCachingAllocator::recordStream(data_ptr, xpu_stream);
}
double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
const override {
TORCH_CHECK_NOT_IMPLEMENTED(
false, "elapsedTime is not supported by XPU backend.");
}
};
} // namespace c10::xpu::impl


@ -54,7 +54,9 @@ TEST(XPUGuardTest, EventBehavior) {
c10::impl::VirtualGuardImpl impl(device.type());
c10::Stream stream1 = impl.getStream(device);
c10::Stream stream2 = impl.getStream(device);
c10::Event event(device.type());
c10::Event event1(device.type());
// event is lazily created.
EXPECT_FALSE(event1.eventId());
constexpr int numel = 1024;
int hostData1[numel];
@ -63,32 +65,54 @@ TEST(XPUGuardTest, EventBehavior) {
clearHostData(hostData2, numel);
auto xpu_stream1 = c10::xpu::XPUStream(stream1);
int* deviceData = sycl::malloc_device<int>(numel, xpu_stream1);
int* deviceData1 = sycl::malloc_device<int>(numel, xpu_stream1);
// Copy hostData1 to deviceData via stream1, and then copy deviceData to
// Copy hostData1 to deviceData1 via stream1, and then copy deviceData1 to
// hostData2 via stream2.
xpu_stream1.queue().memcpy(deviceData, hostData1, sizeof(int) * numel);
xpu_stream1.queue().memcpy(deviceData1, hostData1, sizeof(int) * numel);
// stream2 wait on stream1's completion.
event.record(stream1);
event.block(stream2);
event1.record(stream1);
event1.block(stream2);
auto xpu_stream2 = c10::xpu::XPUStream(stream2);
xpu_stream2.queue().memcpy(hostData2, deviceData, sizeof(int) * numel);
xpu_stream2.queue().memcpy(hostData2, deviceData1, sizeof(int) * numel);
xpu_stream2.synchronize();
EXPECT_TRUE(event.query());
EXPECT_TRUE(event1.query());
validateHostData(hostData2, numel);
event.record(stream2);
EXPECT_TRUE(event.query());
event1.record(stream2);
event1.synchronize();
EXPECT_TRUE(event1.query());
clearHostData(hostData2, numel);
xpu_stream1.queue().memcpy(deviceData, hostData1, sizeof(int) * numel);
xpu_stream1.queue().memcpy(deviceData1, hostData1, sizeof(int) * numel);
// stream2 wait on stream1's completion.
event.record(stream1);
event.block(stream2);
// event will overwrite the previously captured state.
event.record(stream2);
xpu_stream2.queue().memcpy(hostData2, deviceData, sizeof(int) * numel);
event1.record(stream1);
event1.block(stream2);
// event1 will overwrite the previously captured state.
event1.record(stream2);
xpu_stream2.queue().memcpy(hostData2, deviceData1, sizeof(int) * numel);
xpu_stream2.synchronize();
EXPECT_TRUE(event.query());
EXPECT_TRUE(event1.query());
validateHostData(hostData2, numel);
clearHostData(hostData2, numel);
// ensure deviceData1 and deviceData2 are different buffers.
int* deviceData2 = sycl::malloc_device<int>(numel, xpu_stream1);
sycl::free(deviceData1, c10::xpu::get_device_context());
c10::Event event2(device.type());
// Copy hostData1 to deviceData2 via stream1, and then copy deviceData2 to
// hostData1 via stream1.
xpu_stream1.queue().memcpy(deviceData2, hostData1, sizeof(int) * numel);
event2.record(xpu_stream1);
event2.synchronize();
EXPECT_TRUE(event2.query());
clearHostData(hostData1, numel);
xpu_stream1.queue().memcpy(hostData1, deviceData2, sizeof(int) * numel);
event2.record(xpu_stream1);
event2.synchronize();
EXPECT_TRUE(event2.query());
EXPECT_NE(event1.eventId(), event2.eventId());
ASSERT_THROW(event1.elapsedTime(event2), c10::Error);
sycl::free(deviceData2, c10::xpu::get_device_context());
}


@ -123,7 +123,6 @@ if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
if(BUILD_CAFFE2_OPS)
endif()
add_subdirectory(proto)
add_subdirectory(python)
endif()
if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
add_subdirectory(proto)
@ -1998,158 +1997,10 @@ if(BUILD_PYTHON)
set_source_files_properties(../aten/src/ATen/native/EmbeddingBag.cpp PROPERTIES COMPILE_FLAGS -Wno-attributes)
set_source_files_properties(${TORCH_SRC_DIR}/../caffe2/operators/box_with_nms_limit_op.cc PROPERTIES COMPILE_FLAGS -Wno-attributes)
endif()
# Allow different install locations for libcaffe2
# For setuptools installs (that all build Python), install libcaffe2 into
# site-packages, alongside the torch libraries. The pybind11 library needs
# an rpath to the torch library folder
# For cmake installs, including c++ only installs, install libcaffe2 into
# CMAKE_INSTALL_PREFIX/lib . The pybind11 library can have a hardcoded
# rpath
set(caffe2_pybind11_rpath "${_rpath_portable_origin}")
if(${BUILDING_WITH_TORCH_LIBS})
# site-packages/caffe2/python/caffe2_pybind11_state
# site-packages/torch/lib
set(caffe2_pybind11_rpath "${_rpath_portable_origin}/../../torch/lib")
endif(${BUILDING_WITH_TORCH_LIBS})
# Must also include `CMAKE_SHARED_LINKER_FLAGS` in linker flags for
# `caffe2_pybind11_state_*` targets because paths to required libraries may
# need to be found there (e.g., specifying path to `libiomp5` with `LDFLAGS`).
set(_caffe2_pybind11_state_linker_flags "${CMAKE_SHARED_LINKER_FLAGS}")
if(APPLE)
set(_caffe2_pybind11_state_linker_flags "${_caffe2_pybind11_state_linker_flags} -undefined dynamic_lookup")
endif()
# ---[ Python.
if(BUILD_CAFFE2)
add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS})
target_compile_definitions(torch PRIVATE BUILD_CAFFE2)
target_compile_definitions(torch_python PRIVATE BUILD_CAFFE2)
if(USE_NUMPY)
target_compile_options(caffe2_pybind11_state PRIVATE "-DUSE_NUMPY")
target_link_libraries(caffe2_pybind11_state PRIVATE numpy::numpy)
endif()
if(NOT MSVC)
set_target_properties(caffe2_pybind11_state PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
endif()
set_target_properties(caffe2_pybind11_state PROPERTIES PREFIX "" DEBUG_POSTFIX "")
set_target_properties(caffe2_pybind11_state PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "${_caffe2_pybind11_state_linker_flags}")
target_include_directories(caffe2_pybind11_state PRIVATE $<INSTALL_INTERFACE:include>)
target_include_directories(caffe2_pybind11_state PRIVATE ${Caffe2_CPU_INCLUDE})
target_link_libraries(caffe2_pybind11_state PRIVATE
torch_library python::python pybind::pybind11)
if(USE_MKLDNN)
target_link_libraries(caffe2_pybind11_state PRIVATE caffe2::mkldnn)
endif()
if(WIN32)
target_link_libraries(caffe2_pybind11_state PRIVATE onnx_proto)
endif(WIN32)
# Install caffe2_pybind11_state(_gpu|hip) in site-packages/caffe2/python,
# so it needs an rpath to find libcaffe2
set_target_properties(
caffe2_pybind11_state PROPERTIES LIBRARY_OUTPUT_DIRECTORY
${CMAKE_BINARY_DIR}/caffe2/python)
install(TARGETS caffe2_pybind11_state DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
if(MSVC AND BUILD_SHARED_LIBS)
install(FILES $<TARGET_PDB_FILE:caffe2_pybind11_state> DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python" OPTIONAL)
endif()
set_target_properties(caffe2_pybind11_state PROPERTIES INSTALL_RPATH "${caffe2_pybind11_rpath}")
if(USE_CUDA)
add_library(caffe2_pybind11_state_gpu MODULE ${Caffe2_GPU_PYTHON_SRCS})
if(USE_NUMPY)
target_compile_options(caffe2_pybind11_state_gpu PRIVATE "-DUSE_NUMPY")
target_link_libraries(caffe2_pybind11_state_gpu PRIVATE numpy::numpy)
endif()
if(NOT MSVC)
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
endif()
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES PREFIX "" DEBUG_POSTFIX "")
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "${_caffe2_pybind11_state_linker_flags}")
target_include_directories(caffe2_pybind11_state_gpu PRIVATE $<INSTALL_INTERFACE:include>)
target_include_directories(caffe2_pybind11_state_gpu PRIVATE ${Caffe2_CPU_INCLUDE})
target_link_libraries(caffe2_pybind11_state_gpu PRIVATE
torch_library python::python pybind::pybind11)
if(USE_MKLDNN)
target_link_libraries(caffe2_pybind11_state_gpu PRIVATE caffe2::mkldnn)
endif()
if(WIN32)
target_link_libraries(caffe2_pybind11_state_gpu PRIVATE onnx_proto)
endif(WIN32)
# Install with same rpath as non-gpu caffe2_pybind11_state
set_target_properties(
caffe2_pybind11_state_gpu PROPERTIES LIBRARY_OUTPUT_DIRECTORY
${CMAKE_BINARY_DIR}/caffe2/python)
install(TARGETS caffe2_pybind11_state_gpu DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
if(MSVC AND BUILD_SHARED_LIBS)
install(FILES $<TARGET_PDB_FILE:caffe2_pybind11_state_gpu> DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python" OPTIONAL)
endif()
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES INSTALL_RPATH "${caffe2_pybind11_rpath}")
endif()
if(USE_ROCM)
add_library(caffe2_pybind11_state_hip MODULE ${Caffe2_HIP_PYTHON_SRCS})
if(USE_NUMPY)
target_compile_options(caffe2_pybind11_state_hip PRIVATE "-DUSE_NUMPY")
target_link_libraries(caffe2_pybind11_state_hip PRIVATE numpy::numpy)
endif()
if(NOT MSVC)
target_compile_options(caffe2_pybind11_state_hip PRIVATE ${HIP_CXX_FLAGS} -fvisibility=hidden)
endif()
set_target_properties(caffe2_pybind11_state_hip PROPERTIES PREFIX "")
set_target_properties(caffe2_pybind11_state_hip PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "${_caffe2_pybind11_state_linker_flags}")
target_include_directories(caffe2_pybind11_state_hip PRIVATE $<INSTALL_INTERFACE:include>)
target_include_directories(caffe2_pybind11_state_hip PRIVATE ${Caffe2_CPU_INCLUDE} ${Caffe2_HIP_INCLUDE})
target_link_libraries(caffe2_pybind11_state_hip PRIVATE
torch_library python::python pybind::pybind11)
# Install with same rpath as non-hip caffe2_pybind11_state
set_target_properties(
caffe2_pybind11_state_hip PROPERTIES LIBRARY_OUTPUT_DIRECTORY
${CMAKE_BINARY_DIR}/caffe2/python)
install(TARGETS caffe2_pybind11_state_hip DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
set_target_properties(caffe2_pybind11_state_hip PROPERTIES INSTALL_RPATH "${caffe2_pybind11_rpath}")
endif()
if(MSVC AND CMAKE_GENERATOR MATCHES "Visual Studio")
# If we are building under windows, we will copy the file from
# build/caffe2/python/{Debug,Release}/caffe2_pybind11_state.pyd
# to its parent folder so that we can do in-build execution.
add_custom_target(windows_python_copy_lib ALL)
add_dependencies(windows_python_copy_lib caffe2_pybind11_state)
add_custom_command(
TARGET windows_python_copy_lib POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:caffe2_pybind11_state>
${CMAKE_BINARY_DIR}/caffe2/python)
if(USE_CUDA)
add_dependencies(windows_python_copy_lib caffe2_pybind11_state_gpu)
add_custom_command(
TARGET windows_python_copy_lib POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:caffe2_pybind11_state_gpu>
${CMAKE_BINARY_DIR}/caffe2/python)
endif()
if(USE_ROCM)
add_dependencies(windows_python_copy_lib caffe2_pybind11_state_hip)
add_custom_command(
TARGET windows_python_copy_lib POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:caffe2_pybind11_state_hip>
${CMAKE_BINARY_DIR}/caffe2/python)
endif()
endif()
# Finally, Copy all python files to build directory
# Create a custom target that copies all python files.
file(GLOB_RECURSE PYTHON_SRCS RELATIVE ${PROJECT_SOURCE_DIR}
"${PROJECT_SOURCE_DIR}/caffe2/*.py")
endif()
# generated pb files are copied from build/caffe2 to caffe2


@ -1,20 +0,0 @@
if(USE_MKLDNN)
message(STATUS "Including IDEEP operators")
# ---[ CPU files.
file(GLOB_RECURSE tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
# exclude test files and gpu files
file(GLOB_RECURSE tmp *_test.cc)
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
# ---[ CPU test files - currently none but just to be safe
file(GLOB_RECURSE tmp *_test.cc)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})
# ---[ Send the lists to the parent scope.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
else()
message(STATUS "Excluding ideep operators as we are not using ideep")
endif()


@ -1,48 +0,0 @@
#pragma once
#include <caffe2/core/macros.h> // For caffe2 macros.
#include <caffe2/utils/eigen_utils.h>
// All caffe2 ideep related headers
#include <ideep.hpp>
#include <caffe2/ideep/utils/ideep_context.h>
#include <caffe2/ideep/utils/ideep_operator.h>
namespace caffe2 {
enum ConvAlgorithm {
CONV_ALGORITHM_AUTO = 0,
CONV_ALGORITHM_WINOGRAD = 1,
CONV_ALGORITHM_MAX
};
enum FusionType {
FUSION_UNKNOWN = 0,
FUSION_CONV_RELU = 1,
FUSION_CONV_SUM = 2,
FUSION_CONV_SUM_RELU = 3,
FUSION_MAX
};
#define USE_IDEEP_DEF_ALIASES() \
/* the tensor type created/handled by iDEEP */ \
using itensor = ideep::tensor; \
/* the data layout of iDEEP tensor */ \
using iformat = ideep::format_tag; \
/* the scales for iDEEP tensor with different data type */ \
using iscale = ideep::scale_t; \
/* the detailed algorithm for iDEEP operators, e.g. winograd */ \
using ialgo = ideep::algorithm; \
/* the kind of propagation for iDEEP operators, e.g. forward, training */ \
using iprop = ideep::prop_kind; \
/* the kind of low precision operators, e.g. signed/unsigned activation */ \
using ilowp_kind = ideep::lowp_kind; \
/* the data type of iDEEP tensor, e.g. f32, u8, s8 */ \
using idtype = ideep::tensor::data_type; \
/* the descriptor of iDEEP tensor */ \
using itdesc = ideep::tensor::descriptor; \
/* the attribute for operator to describe the details of inputs&fusion */ \
using iattr = ideep::attr_t; \
/* the detail flags for batch normalization */ \
using ibn_flag = ideep::batch_normalization_flag;
} // namespace caffe2


@ -1,160 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
void adam_ideep_compute(
int N,
const float* w,
const float* g,
const float* m,
const float* v,
float* nw,
float* nm,
float* nv,
float beta1,
float beta2,
float eps_hat,
float correction,
const float* lr) {
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
for (auto i = 0; i < N; ++i) {
float gi = g[i];
float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
nw[i] = w[i] + lr[0] * correction * mi / (std::sqrt(vi) + eps_hat);
}
}
void adam_ideep_compute_output_grad(
int N,
const float* w,
const float* g,
const float* m,
const float* v,
float* nw,
float* nm,
float* nv,
float* ng,
float beta1,
float beta2,
float eps_hat,
float correction,
const float* lr) {
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
for (auto i = 0; i < N; ++i) {
float gi = g[i];
float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
float ngi = ng[i] = correction * mi / (std::sqrt(vi) + eps_hat);
nw[i] = w[i] + lr[0] * ngi;
}
}
template <typename T>
class IDEEPAdamOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPAdamOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
beta1_(OperatorBase::GetSingleArgument<float>("beta1", 0.9f)),
beta2_(OperatorBase::GetSingleArgument<float>("beta2", 0.999f)),
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5f)) {}
bool RunOnDevice() override {
// Iter live on the CPU
CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU));
const auto& params = Input(PARAM);
const auto& moment_1 = Input(MOMENT_1);
const auto& moment_2 = Input(MOMENT_2);
const auto& grad = Input(GRAD);
// TODO: Use itensor after 0-dim is supported. Now use CPU tensor.
const auto& lr = OperatorBase::Input<TensorCPU>(LR, CPU);
auto* out_params = Output(OUTPUT_PARAM);
auto* out_moment1 = Output(OUTPUT_MOMENT_1);
auto* out_moment2 = Output(OUTPUT_MOMENT_2);
CAFFE_ENFORCE(lr.size() == 1);
CAFFE_ENFORCE(grad.get_nelems() == params.get_nelems());
CAFFE_ENFORCE(grad.get_nelems() == moment_1.get_nelems());
CAFFE_ENFORCE(grad.get_nelems() == moment_2.get_nelems());
if (params != *out_params)
out_params->init(params.get_descriptor());
if (moment_1 != *out_moment1)
out_moment1->init(moment_1.get_descriptor());
if (moment_2 != *out_moment2)
out_moment2->init(moment_2.get_descriptor());
const auto w = static_cast<float *>(params.get_data_handle());
const auto g = static_cast<float *>(grad.get_data_handle());
const auto m = static_cast<float *>(moment_1.get_data_handle());
const auto v = static_cast<float *>(moment_2.get_data_handle());
auto nw = static_cast<float *>(out_params->get_data_handle());
auto nm = static_cast<float *>(out_moment1->get_data_handle());
auto nv = static_cast<float *>(out_moment2->get_data_handle());
const auto nlr = lr.template data<T>();
const auto iter =
OperatorBase::Input<TensorCPU>(ITER, CPU).template data<int64_t>()[0];
const auto t = iter + 1;
const auto correction =
std::sqrt(T(1.) - std::pow(beta2_, t)) / (T(1.) - std::pow(beta1_, t));
if (OutputSize() == 3) {
adam_ideep_compute(
grad.get_nelems(),
w,
g,
m,
v,
nw,
nm,
nv,
beta1_,
beta2_,
epsilon_,
correction,
nlr);
} else {
auto* out_grad = Output(OUTPUT_GRAD);
if (grad != *out_grad)
out_grad->init(grad.get_descriptor());
auto ng = static_cast<float *>(out_grad->get_data_handle());
adam_ideep_compute_output_grad(
grad.get_nelems(),
w,
g,
m,
v,
nw,
nm,
nv,
ng,
beta1_,
beta2_,
epsilon_,
correction,
nlr);
}
return true;
}
protected:
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes,cppcoreguidelines-avoid-magic-numbers)
T beta1_{0.9};
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes,cppcoreguidelines-avoid-magic-numbers)
T beta2_{0.999};
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes,cppcoreguidelines-avoid-magic-numbers)
T epsilon_{1e-8};
INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, GRAD, LR, ITER);
OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_GRAD);
};
REGISTER_IDEEP_OPERATOR(Adam, IDEEPAdamOp<float>);
} // namespace


@ -1,55 +0,0 @@
#include <caffe2/ideep/operators/conv_pool_base_op.h>
using namespace caffe2;
namespace {
class ChannelShuffleOp final : public IDEEPConvPoolOpBase {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
ChannelShuffleOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPConvPoolOpBase(operator_def, ws) {}
bool RunOnDeviceWithOrderNCHW() override {
const auto& X = Input(INPUT);
auto* Y = Output(OUTPUT);
ideep::channel_shuffle_forward::compute(X, *Y, group_);
return true;
}
private:
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT);
};
class ChannelShuffleGradientOp final : public IDEEPConvPoolOpBase {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
ChannelShuffleGradientOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPConvPoolOpBase(operator_def, ws) {}
bool RunOnDeviceWithOrderNCHW() override {
const auto& dY = Input(OUTPUT_GRAD);
auto* dX = Output(INPUT_GRAD);
ideep::channel_shuffle_backward::compute(dY, *dX, group_);
return true;
}
private:
INPUT_TAGS(OUTPUT_GRAD);
OUTPUT_TAGS(INPUT_GRAD);
};
REGISTER_IDEEP_OPERATOR(ChannelShuffle, ChannelShuffleOp);
REGISTER_IDEEP_OPERATOR(ChannelShuffleGradient, ChannelShuffleGradientOp);
} // namespace


@ -1,176 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include <caffe2/operators/concat_split_op.h>
using namespace caffe2;
namespace {
class IDEEPConcatOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
using FALLBACK_OP = IDEEPFallbackOp<ConcatOp<CPUContext>, SkipIndices<0>>;
IDEEPConcatOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
fallback_(operator_def, ws) {
CAFFE_ENFORCE(
!(OperatorBase::HasArgument("axis") && OperatorBase::HasArgument("order")),
"You shouldn't specify both the dim to concat, and the order "
"in the case of 4-D images.");
if (OperatorBase::HasArgument("axis")) {
axis_ = OperatorBase::GetSingleArgument<int>("axis", -1);
add_axis_ = OperatorBase::GetSingleArgument<int>("add_axis", 0);
} else {
axis_ = 1;
add_axis_ = 0;
}
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPConcatOp() override {}
bool RunOnDevice() override {
bool fallback_to_cpu = false;
vector<itensor> inputs_itensor;
for (int i = 0; i < InputSize(); ++i) {
if (OperatorBase::InputBlob(i).template IsType<itensor>()) {
auto& tensor_ideep = Input(i);
if (tensor_ideep.ndims() == 0 || tensor_ideep.get_nelems() == 0)
continue;
inputs_itensor.emplace_back(tensor_ideep);
} else {
CAFFE_ENFORCE(
BlobIsTensorType(OperatorBase::InputBlob(i), CPU),
"Expect cpu tensor if not itensor");
auto& tensor_cpu = OperatorBase::Input<Tensor>(i, CPU);
if (tensor_cpu.sizes().size() == 0 || tensor_cpu.numel() == 0)
continue;
fallback_to_cpu = true;
break;
}
}
if (!fallback_to_cpu) {
int adj_size = inputs_itensor[0].ndims() + (add_axis_ ? 1 : 0);
int canonical_axis = canonical_axis_index_(axis_, adj_size);
auto* output = Output(OUTPUT);
Tensor* axis_info = OutputTensor(AXIS_INFO,
vector<int64_t>(1, InputSize()), at::dtype<int>().device(CPU));
auto* axis_data = axis_info->template mutable_data<int>();
auto axis_vdata =
ideep::concat::compute(inputs_itensor, canonical_axis, add_axis_, *output);
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int i = 0; i < axis_vdata.size(); i++) {
axis_data[i] = axis_vdata[i];
}
return true;
}
return fallback_.Run(0);
}
private:
int axis_;
int add_axis_;
FALLBACK_OP fallback_;
INPUT_TAGS(INPUT0);
OUTPUT_TAGS(OUTPUT, AXIS_INFO);
};
class IDEEPSplitOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPSplitOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
axis_offset_(OperatorBase::GetRepeatedArgument<int>("split")) {
CAFFE_ENFORCE(
!(OperatorBase::HasArgument("axis") && OperatorBase::HasArgument("order")),
"You shouldn't specify both the dim to split, and the order "
"in the case of 4-D images.");
if (OperatorBase::HasArgument("axis")) {
axis_ = OperatorBase::GetSingleArgument<int>("axis", -1);
// only exists for computing the gradient of a Concat with 'add_axis'
add_axis_ = OperatorBase::GetSingleArgument<int>("add_axis", 0);
} else {
axis_ = 1;
add_axis_ = 0;
}
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPSplitOp() override {}
bool RunOnDevice() override {
const auto& input = Input(INPUT);
int canonical_axis = canonical_axis_index_(axis_, input.ndims());
const int input_channels = input.get_dim(canonical_axis);
vector<int> axis_vdata(OutputSize(), 0);
if (InputSize() == 2) {
// We obtain split from the input tensor.
CAFFE_ENFORCE_EQ(
axis_offset_.size(),
0,
"If you set split with an input blob, do not pass in "
"split in the argument.");
auto& axis_info = OperatorBase::Input<Tensor>(AXIS_INFO, CPU);
CAFFE_ENFORCE_EQ(axis_info.numel(), OutputSize());
auto* axis_data = axis_info.template data<int>();
axis_vdata.assign(axis_data, axis_data + OutputSize());
} else if (axis_offset_.size() == 0) {
CAFFE_ENFORCE_EQ(
input_channels % OutputSize(),
0,
"If you did not specify split explicitly, the number of "
"input channels should be divisible by the output size.");
axis_vdata.assign(OutputSize(), input_channels / OutputSize());
} else {
// We obtain split from the parameters.
CAFFE_ENFORCE_EQ(
axis_offset_.size(),
OutputSize(),
"The number of splits specified should be equal to the "
"number of outputs.");
axis_vdata = axis_offset_;
}
CAFFE_ENFORCE_EQ(
add_axis_ ? OutputSize()
: std::accumulate(
axis_vdata.data(), axis_vdata.data() + OutputSize(), 0),
input_channels,
"Sum of split dimensions does not match: should be ",
input_channels);
auto iten_vector = ideep::spliter::compute(
input, axis_vdata, canonical_axis, add_axis_);
CAFFE_ENFORCE_EQ(
iten_vector.size(),
OutputSize(),
"Output size does not match: should be ",
OutputSize());
for (int i = 0; i < OutputSize(); i++) {
auto* output = Output(i);
*output = iten_vector[i];
}
return true;
}
private:
int axis_;
int add_axis_;
vector<int> axis_offset_;
INPUT_TAGS(INPUT, AXIS_INFO);
};
REGISTER_IDEEP_OPERATOR(Concat, IDEEPConcatOp);
REGISTER_IDEEP_OPERATOR(Split, IDEEPSplitOp);
} // namespace


@ -1,357 +0,0 @@
#include <caffe2/ideep/operators/conv_pool_base_op.h>
using namespace caffe2;
namespace {
class IDEEPConvOp : public IDEEPConvPoolOpBase {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
IDEEPConvOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPConvPoolOpBase(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(
order_ == StorageOrder::NCHW, "Unsupported storage order.");
OPERATOR_NEEDS_FEATURE(
pad_l() == pad_r() && pad_t() == pad_b(),
"Uneven padding not supported.");
fusion_type_ = FUSION_UNKNOWN;
last_input_ = BIAS_OR_INPUT_S;
training_mode_ = OperatorBase::GetSingleArgument<int>("training_mode", 0);
pk_ = training_mode_ ? iprop::forward_training : iprop::forward_inference;
algo_ = ialgo::convolution_direct;
auto conv_algorithm = OperatorBase::GetSingleArgument<int>(
"conv_algorithm", CONV_ALGORITHM_AUTO);
if (conv_algorithm == CONV_ALGORITHM_WINOGRAD) {
algo_ = ialgo::convolution_winograd;
}
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPConvOp() override {}
bool RunOnDeviceWithOrderNCHW() override {
const auto& X = Input(INPUT_X);
const auto& filter = Input(FILTER);
auto* Y = Output(OUTPUT);
CAFFE_ENFORCE(4 == X.ndims());
CAFFE_ENFORCE(4 == filter.ndims());
CAFFE_ENFORCE_EQ(filter.get_dim(2), kernel_h());
CAFFE_ENFORCE_EQ(filter.get_dim(3), kernel_w());
CAFFE_ENFORCE(
X.get_dim(1) == filter.get_dim(1) * group_,
"Convolution op: input channels does not match: # of input channels ",
X.get_dim(1),
" is not equal to kernel channels * group:",
filter.get_dim(1),
"*",
group_);
bool input_changed = (cached_X_descriptor_ != X.get_descriptor());
if (input_changed) {
cached_X_descriptor_ = X.dup_descriptor();
}
bool weights_changed = (cached_weights_descriptor_ != filter.get_descriptor());
if (!training_mode_ && weights_changed) {
cached_weights_descriptor_ = filter.dup_descriptor();
auto expected_descriptor =
ideep::convolution_forward::expected_weights_desc(
filter.get_dims(),
idtype::f32,
{stride_.begin(), stride_.end()},
pad_tl(),
pad_br(),
{dilation_.begin(), dilation_.end()},
group_,
algo_,
pk_,
idtype::f32,
X.get_dims());
if (filter.get_descriptor() != expected_descriptor) {
filter_.init(expected_descriptor);
filter_.feed_from(filter);
} else {
filter_ = filter;
}
}
bool with_bias = InputSize() > last_input_;
auto filter_in = training_mode_ ? filter : filter_;
if (training_mode_ || input_changed || weights_changed) {
auto Y_dims_conv = CalcOutputDims(X, filter.get_dim(0));
if (with_bias) {
ideep::convolution_forward::prepare(
conv_param,
X,
filter_in,
Input(BIAS_OR_INPUT_S),
Y_dims_conv,
*Y,
{stride_.begin(), stride_.end()},
{dilation_.begin(), dilation_.end()},
pad_tl(),
pad_br(),
group_,
dummy_scale_,
dummy_scale_,
dummy_scale_,
attr_,
algo_,
pk_);
} else {
ideep::convolution_forward::prepare(
conv_param,
X,
filter_in,
Y_dims_conv,
*Y,
{stride_.begin(), stride_.end()},
{dilation_.begin(), dilation_.end()},
pad_tl(),
pad_br(),
group_,
dummy_scale_,
dummy_scale_,
dummy_scale_,
attr_,
algo_,
pk_);
}
}
if (with_bias) {
ideep::convolution_forward::compute(conv_param, X, filter_in,
Input(BIAS_OR_INPUT_S), *Y);
} else {
ideep::convolution_forward::compute(conv_param, X, filter_in, *Y);
}
if (fusion_type_ == FUSION_CONV_SUM
|| fusion_type_ == FUSION_CONV_SUM_RELU) {
CAFFE_ENFORCE_EQ(Y, &(Input(InputSize() - 1)),
"Convolution fusion op: InPlace is enforced for sum fusion.");
}
return true;
}
protected:
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
iprop pk_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
ialgo algo_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
iattr attr_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
int last_input_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
bool training_mode_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
FusionType fusion_type_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
itensor filter_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
iscale dummy_scale_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
itensor::descriptor cached_X_descriptor_, cached_weights_descriptor_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
ideep::convolution_forward_params conv_param;
INPUT_TAGS(INPUT_X, FILTER, BIAS_OR_INPUT_S, INPUT_S);
OUTPUT_TAGS(OUTPUT);
};
class IDEEPConvFusionOp final : public IDEEPConvOp {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
IDEEPConvFusionOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPConvOp(operator_def, ws) {
CAFFE_ENFORCE(OperatorBase::HasArgument("fusion_type"),
"You should specify the fusion type");
fusion_type_ = static_cast<FusionType>(
OperatorBase::GetSingleArgument<int>("fusion_type", FUSION_UNKNOWN));
OPERATOR_NEEDS_FEATURE(
fusion_type_ > FUSION_UNKNOWN && fusion_type_ < FUSION_MAX,
"Undefined Conv fusion type.",
fusion_type_);
switch (fusion_type_) {
case FUSION_CONV_RELU:
attr_ = iattr::fuse_relu();
last_input_ = BIAS_OR_INPUT_S;
break;
case FUSION_CONV_SUM:
attr_ = iattr::fuse_sum();
last_input_ = INPUT_S;
break;
case FUSION_CONV_SUM_RELU:
attr_ = iattr::residual();
last_input_ = INPUT_S;
break;
default:
CAFFE_THROW("Unsupported conv fusion type!");
}
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPConvFusionOp() override {}
};
const char* kConvFusionDoc = R"DOC(
Note that other parameters, such as the stride and
kernel size, or the pads' sizes in each direction are not necessary for input
because they are provided by the ConvPoolOpBase operator. Various dimension
checks are done implicitly, and the sizes are specified in the Input docs for
this operator. As is expected, the filter is convolved with a subset of the
image and the bias is added; this is done throughout the image data and the
output is computed. As a side note on the implementation layout:
conv_op_impl.h is the templated implementation of the conv_op.h file, which is
why they are separate files.
)DOC";
std::function<void(OpSchema&)> ConvFusionDocGenerator(const char* dim) {
return [=](OpSchema& schema) {
string doc = R"DOC(
The convolution fusion operator consumes an input vector, a {dim}filter blob,
a bias blob and another input vector and computes the output. This operator
gives the chance to fuse the ReLU or element-wise Sum with a convolution
operator. {conv_fusion_doc})DOC";
c10::ReplaceAll(doc, "{dim}", dim);
c10::ReplaceAll(doc, "{conv_fusion_doc}", kConvFusionDoc);
schema.SetDoc(doc);
schema.Input(
0,
"X",
"Input data blob from previous layer; has size (N x C x H x W), "
"where N is the batch size, C is the number of channels, "
"and H and W are the height and width. Note that this is for the NCHW "
"usage. On the other hand, the NHWC Op has a different set of "
"dimension constraints. ");
schema.Input(
1,
"filter",
"The filter blob that will be used in the "
"convolutions; has size (M x C x kH x kW), where C is the number of "
"channels, and kH and kW are the height and width of the kernel.");
schema.Input(
2,
"bias",
"The 1D bias blob that is added through the "
"convolution; has size (M).");
schema.Input(
3,
"S",
"Input data blob for element-wise Sum fusion from previous layer; "
"has the same size of convolution output. Its input index should "
"be 2 if no bias for this convolution, and it MUST be inplace with "
"output Y.");
schema.Output(
0,
"Y",
"Output data blob that contains the result of the "
"convolution fusion. The output dimensions are functions of the kernel "
"size, stride size, and pad lengths."
"");
};
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,clang-diagnostic-unused-function)
OPERATOR_SCHEMA(ConvFusion)
.NumInputs(2, 4)
.NumOutputs(1)
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForConv)
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
ConvPoolOpBase<CPUContext>::CostInferenceForConv))
.Arg("fusion_type", "Which fusion type is used")
.AllowInplace({{2, 0}, {3, 0}})
.FillUsing(ConvFusionDocGenerator(""));
class IDEEPConvGradientOp final : public IDEEPConvPoolOpBase {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
IDEEPConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPConvPoolOpBase(operator_def, ws),
no_bias_(OperatorBase::GetSingleArgument<int>("no_bias", 0)) {
OPERATOR_NEEDS_FEATURE(
pad_l() == pad_r() && pad_t() == pad_b(),
"Uneven padding not supported.");
CAFFE_ENFORCE(
!(no_bias_ && OutputSize() == 3),
"If bias is not present, you should not have 3 grad output.");
CAFFE_ENFORCE(
OperatorBase::GetSingleArgument<int>("training_mode", 0),
"In order to backward propagate weights correctly, "
"please set training_mode=1");
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPConvGradientOp() override {}
bool RunOnDeviceWithOrderNCHW() override {
const auto& X = Input(INPUT);
const auto& filter = Input(FILTER);
const auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
if (no_bias_) {
ideep::convolution_backward_weights::compute(
X,
dY,
filter.get_dims(),
*dfilter,
{stride_.begin(), stride_.end()},
{dilation_.begin(), dilation_.end()},
pad_tl(),
pad_br(),
group_);
} else {
auto* dbias = Output(BIAS_OR_INPUT_GRAD);
ideep::convolution_backward_weights::compute(
X,
dY,
filter.get_dims(),
*dfilter,
*dbias,
{stride_.begin(), stride_.end()},
{dilation_.begin(), dilation_.end()},
pad_tl(),
pad_br(),
group_);
}
if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
ideep::convolution_backward_data::compute(
dY,
filter,
X.get_dims(),
*dX,
{stride_.begin(), stride_.end()},
{dilation_.begin(), dilation_.end()},
pad_tl(),
pad_br(),
group_);
}
return true;
}
private:
bool no_bias_;
INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
};
REGISTER_IDEEP_OPERATOR(Conv, IDEEPConvOp);
REGISTER_IDEEP_OPERATOR(ConvFusion, IDEEPConvFusionOp);
REGISTER_IDEEP_OPERATOR(ConvGradient, IDEEPConvGradientOp);
} // namespace
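As a quick aid to the ConvFusion doc and schema above, here is a minimal, editor-added sketch of what each fusion type computes for a single output element. The enum, function, and values below are illustrative only and are not part of the Caffe2 API.

#include <algorithm>
#include <cstdio>

// Per-element semantics selected by fusion_type_: FUSION_CONV_RELU applies
// ReLU to the convolution result, FUSION_CONV_SUM adds the extra input S in
// place, and FUSION_CONV_SUM_RELU does both (mirroring iattr::fuse_relu(),
// iattr::fuse_sum(), and iattr::residual() above).
enum class Fusion { ConvRelu, ConvSum, ConvSumRelu };

float fuse(float conv_out, float s, Fusion type) {
  switch (type) {
    case Fusion::ConvRelu:    return std::max(conv_out, 0.0f);
    case Fusion::ConvSum:     return conv_out + s;
    case Fusion::ConvSumRelu: return std::max(conv_out + s, 0.0f);
  }
  return conv_out;
}

int main() {
  // Conv output -1.5 fused with S element 2.0 under sum+ReLU -> 0.5.
  std::printf("%f\n", fuse(-1.5f, 2.0f, Fusion::ConvSumRelu));
  return 0;
}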



@ -1,77 +0,0 @@
#ifndef CAFFE2_IDEEP_OPERATORS_CONV_POOL_BASE_OP_H_
#define CAFFE2_IDEEP_OPERATORS_CONV_POOL_BASE_OP_H_
#include <vector>
#include "caffe2/ideep/ideep_utils.h"
#include "caffe2/operators/conv_pool_op_base.h"
namespace caffe2 {
class IDEEPConvPoolOpBase : public ConvPoolOpBase<IDEEPContext> {
public:
IDEEPConvPoolOpBase(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<IDEEPContext>(operator_def, ws) {}
~IDEEPConvPoolOpBase() override {}
inline const ideep::tensor& Input(int index) {
return OperatorBase::template Input<ideep::tensor>(index);
}
inline ideep::tensor* Output(int index) {
return OperatorBase::template Output<ideep::tensor>(index);
}
ideep::tensor::dims pad_tl() const {
return {pad_t(), pad_l()};
}
ideep::tensor::dims pad_br() const {
return {pad_b(), pad_r()};
}
ideep::tensor::dims CalcOutputDims(
const ideep::tensor& input,
int output_channel) {
CAFFE_ENFORCE_GT(input.get_size(), 0);
std::vector<int> output_dims;
const auto input_dims = input.get_dims();
std::vector<std::int64_t> input_Tdims(
input_dims.cbegin(), input_dims.cend());
InferOutputSize(
input_Tdims,
output_channel,
StorageOrder::NCHW, //order_,
global_pooling_,
legacy_pad_,
dilation_,
stride_,
&kernel_,
&pads_,
&output_dims);
return {output_dims.begin(), output_dims.end()};
}
bool RunOnDevice() override {
if (!global_pooling_) {
for (const auto dim : c10::irange(kernel_.size())) {
CAFFE_ENFORCE_GT(kernel_[dim], 0);
}
}
try {
return RunOnDeviceWithOrderNCHW();
} catch (ideep::error& e) {
LOG(ERROR) << "IDEEP error:" << e.message;
throw;
}
}
};
#define USE_IDEEP_CONV_POOL_BASE_FUNCTIONS() \
USE_OPERATOR_BASE_FUNCTIONS; \
/* using override */ using IDEEPConvPoolOpBase::Input; \
/* using override */ using IDEEPConvPoolOpBase::Output;
} // namespace caffe2
#endif // CAFFE2_IDEEP_OPERATORS_CONV_POOL_BASE_OP_H_
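CalcOutputDims above delegates to ConvPoolOpBase::InferOutputSize, whose body is not shown here. As a hedged illustration only, the standalone snippet below uses the textbook output-size formula that explicit (NOTSET) padding is expected to follow; legacy VALID/SAME and CAFFE_LEGACY_POOLING padding are handled differently and are not covered.

#include <cstdio>

// Textbook spatial output size for explicit padding (an assumption about what
// InferOutputSize computes in the NOTSET case, not the actual implementation).
int conv_out_size(int in, int kernel, int stride, int dilation,
                  int pad_head, int pad_tail) {
  const int effective_kernel = dilation * (kernel - 1) + 1;
  return (in + pad_head + pad_tail - effective_kernel) / stride + 1;
}

int main() {
  // 224-wide input, 3-wide kernel, stride 2, dilation 1, pad 1/1 -> 112.
  std::printf("%d\n", conv_out_size(224, 3, 2, 1, 1, 1));
  return 0;
}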


@ -1,160 +0,0 @@
#include "caffe2/operators/conv_transpose_op.h"
#include "caffe2/ideep/operators/conv_transpose_unpool_base_op.h"
using namespace caffe2;
namespace {
class IDEEPConvTransposeOp final : public IDEEPConvTransposeUnpoolBase {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS();
IDEEPConvTransposeOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPConvTransposeUnpoolBase(operator_def, ws),
training_mode_(
OperatorBase::GetSingleArgument<int>("training_mode", 0)) {
OPERATOR_NEEDS_FEATURE(
pad_l() == pad_r() && pad_t() == pad_b(),
"Uneven padding not supported.");
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPConvTransposeOp() override {}
bool RunOnDeviceWithOrderNCHW() override {
const auto& X = Input(INPUT);
const auto& filter = Input(FILTER);
auto* Y = Output(OUTPUT);
CAFFE_ENFORCE_EQ(X.ndims(), 4);
CAFFE_ENFORCE_EQ(filter.ndims(), 4);
CAFFE_ENFORCE_EQ(filter.get_dim(2), kernel_h());
CAFFE_ENFORCE_EQ(filter.get_dim(3), kernel_w());
CAFFE_ENFORCE_EQ(filter.get_dim(0), X.get_dim(1),
"filter number must be equal to input channel number");
auto Y_dims = CalcOutputDims(X, filter.get_dim(1));
bool weights_changed = (cached_weights_descriptor_ != filter.get_descriptor());
if (!training_mode_ && weights_changed) {
cached_weights_descriptor_ = filter.dup_descriptor();
// NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
auto filter_in = filter;
auto expected_descriptor =
ideep::convolution_transpose_forward::expected_weights_desc(
filter.get_dims(),
filter.get_data_type(),
{stride_.begin(), stride_.end()},
pad_tl(),
pad_br());
if (filter_in.get_descriptor() != expected_descriptor) {
filter_.init(expected_descriptor);
filter_.feed_from(filter_in, /*is_deconv_weights=*/true);
} else {
filter_ = filter_in;
}
}
auto transposed_filter = training_mode_ ? filter : filter_;
transposed_filter.transpose_(0, 1);
if (InputSize() > BIAS) {
const auto& bias = Input(BIAS);
CAFFE_ENFORCE_EQ(bias.ndims(), 1, "bias must be 1D tensor");
CAFFE_ENFORCE_EQ(
bias.get_dim(0), filter.get_dim(1),
"bias dimension must be equal to output channel number");
ideep::convolution_transpose_forward::compute(
X, transposed_filter, bias, Y_dims, *Y,
{stride_.begin(), stride_.end()} , pad_tl(), pad_br());
} else {
ideep::convolution_transpose_forward::compute(
X, transposed_filter, Y_dims, *Y,
{stride_.begin(), stride_.end()}, pad_tl(), pad_br());
}
return true;
}
private:
INPUT_TAGS(INPUT, FILTER, BIAS);
OUTPUT_TAGS(OUTPUT);
const bool training_mode_;
ideep::tensor filter_;
ideep::tensor::descriptor cached_weights_descriptor_;
};
class IDEEPConvTransposeGradientOp final : public IDEEPConvTransposeUnpoolBase {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS();
IDEEPConvTransposeGradientOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPConvTransposeUnpoolBase(operator_def, ws),
no_bias_(OperatorBase::GetSingleArgument<int>("no_bias", false)) {
OPERATOR_NEEDS_FEATURE(
pad_l() == pad_r() && pad_t() == pad_b(),
"Uneven padding not supported.");
CAFFE_ENFORCE(
!(no_bias_ && OutputSize() == 3),
"If bias is not present, you should not have 3 grad output.");
CAFFE_ENFORCE(
OperatorBase::GetSingleArgument<int>("training_mode", 0),
"In order to backward propagate weights correctly, "
"please set training_mode=1");
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPConvTransposeGradientOp() override {}
bool RunOnDeviceWithOrderNCHW() override {
const auto& X = Input(INPUT);
const auto& filter = Input(FILTER);
const auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
auto transposed_filter = filter;
transposed_filter.transpose_(0, 1);
if (no_bias_) {
ideep::convolution_transpose_backward_weights::compute(
X,
dY,
filter.get_dims(),
*dfilter,
{stride_.begin(), stride_.end()},
pad_tl(),
pad_br());
} else {
auto* dbias = Output(BIAS_OR_INPUT_GRAD);
ideep::convolution_transpose_backward_weights::compute(
X,
dY,
filter.get_dims(),
*dfilter,
*dbias,
{stride_.begin(), stride_.end()},
pad_tl(),
pad_br());
}
if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
ideep::convolution_transpose_backward_data::compute(
dY, transposed_filter, X.get_dims(), *dX,
{stride_.begin(), stride_.end()}, pad_tl(), pad_br());
}
return true;
}
private:
bool no_bias_;
INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
};
REGISTER_IDEEP_OPERATOR(ConvTranspose, IDEEPConvTransposeOp);
REGISTER_IDEEP_OPERATOR(ConvTransposeGradient, IDEEPConvTransposeGradientOp);
} // namespace


@ -1,261 +0,0 @@
#pragma once
#include "caffe2/ideep/ideep_utils.h"
#include "caffe2/proto/caffe2_legacy.pb.h"
namespace {
class IDEEPConvTransposeUnpoolBase : public caffe2::IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPConvTransposeUnpoolBase(const caffe2::OperatorDef& operator_def, caffe2::Workspace* ws)
: IDEEPOperator(operator_def, ws),
legacy_pad_(
static_cast<caffe2::LegacyPadding>(OperatorBase::GetSingleArgument<int>(
"legacy_pad",
caffe2::LegacyPadding::NOTSET))),
kernel_(OperatorBase::GetRepeatedArgument<int>("kernels")),
stride_(OperatorBase::GetRepeatedArgument<int>("strides")),
pads_(OperatorBase::GetRepeatedArgument<int>("pads")),
adj_(OperatorBase::GetRepeatedArgument<int>("adjs")),
shared_buffer_(
OperatorBase::GetSingleArgument<int>("shared_buffer", 0)) {
// The padding must either use a legacy padding strategy (VALID or SAME)
// or be given as explicit, non-negative values.
if (legacy_pad_ == caffe2::LegacyPadding::VALID ||
legacy_pad_ == caffe2::LegacyPadding::SAME) {
CAFFE_ENFORCE(
!OperatorBase::HasArgument("pads"),
"If you use legacy padding VALID or SAME, you should not specify "
"any specific padding values.");
}
// Get old arguments values.
if (OperatorBase::HasArgument("kernel")) {
kernel_.resize(2, OperatorBase::GetSingleArgument<int>("kernel", 0));
} else if (
OperatorBase::HasArgument("kernel_h") &&
OperatorBase::HasArgument("kernel_w")) {
kernel_.push_back(OperatorBase::GetSingleArgument<int>("kernel_h", 0));
kernel_.push_back(OperatorBase::GetSingleArgument<int>("kernel_w", 0));
}
if (OperatorBase::HasArgument("stride")) {
stride_.resize(2, OperatorBase::GetSingleArgument<int>("stride", 0));
} else if (
OperatorBase::HasArgument("stride_h") &&
OperatorBase::HasArgument("stride_w")) {
stride_.push_back(OperatorBase::GetSingleArgument<int>("stride_h", 0));
stride_.push_back(OperatorBase::GetSingleArgument<int>("stride_w", 0));
}
if (OperatorBase::HasArgument("adj")) {
adj_.resize(2, OperatorBase::GetSingleArgument<int>("adj", 0));
} else if (
OperatorBase::HasArgument("adj_h") &&
OperatorBase::HasArgument("adj_w")) {
adj_.push_back(OperatorBase::GetSingleArgument<int>("adj_h", 0));
adj_.push_back(OperatorBase::GetSingleArgument<int>("adj_w", 0));
}
if (OperatorBase::HasArgument("pad")) {
CAFFE_ENFORCE(
legacy_pad_ != caffe2::LegacyPadding::VALID &&
legacy_pad_ != caffe2::LegacyPadding::SAME,
"If you use legacy padding VALID or SAME, you should not specify "
"any specific padding values.");
pads_.resize(4, OperatorBase::GetSingleArgument<int>("pad", 0));
} else if (
OperatorBase::HasArgument("pad_t") &&
OperatorBase::HasArgument("pad_l") &&
OperatorBase::HasArgument("pad_b") &&
OperatorBase::HasArgument("pad_r")) {
CAFFE_ENFORCE(
legacy_pad_ != caffe2::LegacyPadding::VALID &&
legacy_pad_ != caffe2::LegacyPadding::SAME,
"If you use legacy padding VALID or SAME, you should not specify "
"any specific padding values.");
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_t", 0));
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_l", 0));
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_b", 0));
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_r", 0));
}
// Fill default values.
if (kernel_.empty()) {
kernel_.assign({0, 0});
}
if (stride_.empty()) {
stride_.assign(kernel_.size(), 1);
}
if (pads_.empty()) {
pads_.assign(kernel_.size() * 2, 0);
}
if (adj_.empty()) {
adj_.assign(kernel_.size(), 0);
}
CAFFE_ENFORCE_EQ(stride_.size(), kernel_.size());
CAFFE_ENFORCE_EQ(adj_.size(), kernel_.size());
if (legacy_pad_ != caffe2::LegacyPadding::VALID &&
legacy_pad_ != caffe2::LegacyPadding::SAME) {
CAFFE_ENFORCE_EQ(pads_.size(), 2 * kernel_.size());
}
for (const auto dim : c10::irange(kernel_.size())) {
CAFFE_ENFORCE_GT(kernel_[dim], 0);
CAFFE_ENFORCE_GT(stride_[dim], 0);
CAFFE_ENFORCE_GE(adj_[dim], 0);
CAFFE_ENFORCE_LE(adj_[dim], stride_[dim]);
}
}
~IDEEPConvTransposeUnpoolBase() override {}
const ideep::tensor& Input(int index) {
return OperatorBase::template Input<ideep::tensor>(index);
}
ideep::tensor* Output(int index) {
return OperatorBase::template Output<ideep::tensor>(index);
}
ideep::tensor::dims pad_tl() const {
return {pad_t(), pad_l()};
}
ideep::tensor::dims pad_br() const {
return {pad_b(), pad_r()};
}
ideep::tensor::dims CalcOutputDims(
const ideep::tensor& input,
int output_channel) {
CAFFE_ENFORCE_GT(input.get_size(), 0);
int N = input.get_dim(0);
ideep::tensor::dims output_dims;
auto input_dims = input.get_dims();
itensor::dims dims;
dims.assign(input_dims.begin() + 2, input_dims.end());
for (const auto dim : c10::irange(dims.size())) {
int dim_size = 0;
ComputeSizeAndPad(
dims[dim],
stride_[dim],
kernel_[dim],
adj_[dim],
&pads_[dim],
&pads_[dim + 2],
&dim_size);
output_dims.push_back(dim_size);
}
output_dims.insert(output_dims.begin(), {N, output_channel});
return output_dims;
}
bool RunOnDevice() override {
try {
return RunOnDeviceWithOrderNCHW();
} catch (ideep::error& e) {
LOG(ERROR) << "IDEEP error:" << e.message;
throw;
}
}
virtual bool RunOnDeviceWithOrderNCHW() {
CAFFE_THROW("Not implemented");
}
private:
caffe2::LegacyPadding legacy_pad_;
protected:
std::vector<int> kernel_;
std::vector<int> stride_;
std::vector<int> pads_;
std::vector<int> adj_;
bool shared_buffer_;
// Accessors for 2D conv params.
inline int pad_t() const {
return pads_[0];
}
inline int pad_l() const {
return pads_[1];
}
inline int pad_b() const {
return pads_[2];
}
inline int pad_r() const {
return pads_[3];
}
inline int kernel_h() const {
return kernel_[0];
}
inline int kernel_w() const {
return kernel_[1];
}
inline int stride_h() const {
return stride_[0];
}
inline int stride_w() const {
return stride_[1];
}
inline int adj_h() const {
return adj_[0];
}
inline int adj_w() const {
return adj_[1];
}
inline void ComputeSizeAndPad(
const int in_size,
const int stride,
const int kernel,
const int adj,
int* pad_head,
int* pad_tail,
int* out_size) {
switch (legacy_pad_) {
case caffe2::LegacyPadding::NOTSET:
CAFFE_ENFORCE_GE(*pad_head, 0);
CAFFE_ENFORCE_GE(*pad_tail, 0);
*out_size =
(in_size - 1) * stride + kernel + adj - *pad_head - *pad_tail;
break;
// We handle cases of LegacyPadding::VALID and LegacyPadding::SAME
// the same way
case caffe2::LegacyPadding::VALID:
case caffe2::LegacyPadding::SAME:
*pad_head = 0;
*pad_tail = 0;
*out_size = (in_size - 1) * stride + kernel + adj;
break;
case caffe2::LegacyPadding::CAFFE_LEGACY_POOLING:
LOG(FATAL) << "CAFFE_LEGACY_POOLING is no longer supported.";
break;
}
}
};
#define USE_IDEEP_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS() \
USE_OPERATOR_BASE_FUNCTIONS; \
/* using override */ using IDEEPConvTransposeUnpoolBase::Input; \
/* using override */ using IDEEPConvTransposeUnpoolBase::Output;
} // namespace
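For the NOTSET branch of ComputeSizeAndPad above, the deconvolution output size is out = (in - 1) * stride + kernel + adj - pad_head - pad_tail. The editor-added snippet below is just that formula with a worked number, not code from the original file.

#include <cstdio>

// Deconvolution (ConvTranspose) output size, as computed by the NOTSET branch
// of ComputeSizeAndPad above.
int deconv_out_size(int in, int stride, int kernel, int adj,
                    int pad_head, int pad_tail) {
  return (in - 1) * stride + kernel + adj - pad_head - pad_tail;
}

int main() {
  // 4-wide input, stride 2, 3-wide kernel, no output adjustment, pad 1/1 -> 7.
  std::printf("%d\n", deconv_out_size(4, 2, 3, 0, 1, 1));
  return 0;
}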


@ -1,94 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
class IDEEPDropoutOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPDropoutOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)),
is_test_(
OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
CAFFE_ENFORCE_GE(ratio_, 0);
CAFFE_ENFORCE_LT(ratio_, 1);
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPDropoutOp() override {}
bool RunOnDevice() override {
const auto& X = Input(INPUT);
auto* Y = Output(OUTPUT);
if (is_test_) {
if (Y != &X) {
ideep::direct_copy::compute(X, *Y);
}
return true;
}
auto* mask = Output(MASK);
ideep::dropout_forward::compute(X, ratio_, *Y, *mask);
return true;
}
private:
float ratio_;
bool is_test_;
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT, MASK);
};
class IDEEPDropoutGradientOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPDropoutGradientOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)),
is_test_(
OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
CAFFE_ENFORCE_GE(ratio_, 0);
CAFFE_ENFORCE_LT(ratio_, 1);
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPDropoutGradientOp() override {}
bool RunOnDevice() override {
const auto& dY = Input(OUTPUT_GRAD);
auto* dX = Output(INPUT_GRAD);
if (is_test_) {
if (dX != &dY) {
ideep::direct_copy::compute(dY, *dX);
}
return true;
}
const auto& mask = Input(MASK);
ideep::dropout_backward::compute(mask, dY, *dX);
return true;
}
protected:
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
float ratio_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
bool is_test_;
INPUT_TAGS(OUTPUT_GRAD , MASK);
OUTPUT_TAGS(INPUT_GRAD);
};
REGISTER_IDEEP_OPERATOR(Dropout, IDEEPDropoutOp);
REGISTER_IDEEP_OPERATOR(DropoutGrad, IDEEPDropoutGradientOp);
} // namespace
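The dropout op above copies its input when is_test_ is set and otherwise delegates to ideep::dropout_forward. The standalone sketch below shows inverted dropout, the convention Caffe2's CPU DropoutOp uses; assuming that ideep's implementation scales its mask the same way is an editor's assumption, and the helper names here are illustrative only.

#include <cstdio>
#include <random>
#include <vector>

// Inverted dropout: kept activations are scaled by 1/(1 - ratio) at training
// time so inference needs no rescaling (it is just a copy, as with is_test_).
void dropout_forward(const std::vector<float>& x, float ratio, bool is_test,
                     std::vector<float>* y, std::vector<float>* mask) {
  y->resize(x.size());
  mask->assign(x.size(), 1.0f);
  if (is_test) {
    *y = x;  // inference: pass-through
    return;
  }
  std::mt19937 rng(0);
  std::bernoulli_distribution drop(ratio);
  const float scale = 1.0f / (1.0f - ratio);
  for (size_t i = 0; i < x.size(); ++i) {
    (*mask)[i] = drop(rng) ? 0.0f : scale;
    (*y)[i] = x[i] * (*mask)[i];
  }
}

int main() {
  std::vector<float> x{1.0f, 2.0f, 3.0f, 4.0f}, y, mask;
  dropout_forward(x, /*ratio=*/0.5f, /*is_test=*/false, &y, &mask);
  for (float v : y) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}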


@ -1,82 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include "caffe2/operators/utility_ops.h"
#include "caffe2/operators/elementwise_add_op.h"
using namespace caffe2;
namespace {
class IDEEPSumOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
using FALLBACK_SUM = IDEEPFallbackOp<SumOp<CPUContext>, SkipIndices<0>>;
using FALLBACK_ADD = IDEEPFallbackOp<BinaryElementwiseOp<
NumericTypes, CPUContext, AddFunctor<CPUContext>>, SkipIndices<0>>;
IDEEPSumOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
fallback_sum_(operator_def, ws),
fallback_add_(operator_def, ws) {}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPSumOp() override {}
bool RunOnDevice() override {
itensor::dims input_dims;
bool fallback_to_cpu = false;
vector<itensor> inputs_itensor;
// We only support element-wise sum for ideep tensors here.
// If a CPU tensor is detected in the input list, we have to fall back
// to the corresponding CPU operator.
for (int i = 0; i < InputSize(); ++i) {
if (OperatorBase::InputBlob(i).template IsType<itensor>()) {
auto& tensor_ideep = Input(i);
if (input_dims.empty()) {
input_dims = tensor_ideep.get_dims();
} else if (input_dims != tensor_ideep.get_dims()) {
fallback_to_cpu = true;
break;
}
inputs_itensor.emplace_back(tensor_ideep);
} else {
CAFFE_ENFORCE(
BlobIsTensorType(OperatorBase::InputBlob(i), CPU),
"Expect cpu tensor if not itensor");
fallback_to_cpu = true;
break;
}
}
if (!fallback_to_cpu) {
auto* Y = Output(OUTPUT);
if (InputSize() == 1) {
const auto& X = Input(INPUT0);
ideep::direct_copy::compute(X, *Y);
} else {
const vector<float> scales(InputSize(), 1.0);
ideep::sum::compute(scales, inputs_itensor, *Y);
}
return true;
}
if (InputSize() == 2) {
return fallback_add_.Run(0);
}
return fallback_sum_.Run(0);
}
private:
FALLBACK_SUM fallback_sum_;
FALLBACK_ADD fallback_add_;
INPUT_TAGS(INPUT0);
OUTPUT_TAGS(OUTPUT);
};
REGISTER_IDEEP_OPERATOR(Sum, IDEEPSumOp);
REGISTER_IDEEP_OPERATOR(Add, IDEEPSumOp);
} // namespace


@ -1,133 +0,0 @@
#include "caffe2/operators/expand_squeeze_dims_op.h"
#include <caffe2/ideep/ideep_utils.h>
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
using namespace caffe2;
namespace {
class IDEEPExpandDimsOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
using FALLBACK_OP = IDEEPFallbackOp<ExpandDimsOp<CPUContext>, SkipIndices<0>>;
IDEEPExpandDimsOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
fallback_(operator_def, ws) {
dims_ = OperatorBase::GetRepeatedArgument<int>("dims");
auto originalSize = dims_.size();
CAFFE_ENFORCE_GT(originalSize, 0, "Parameter `dims` must be provided.");
std::sort(dims_.begin(), dims_.end());
dims_.erase(std::unique(dims_.begin(), dims_.end()), dims_.end());
if (dims_.size() < originalSize) {
LOG(WARNING) << "Parameter `dims` has repeated dimensions.";
}
CAFFE_ENFORCE_GE(dims_.front(), 0, "Dimension ids must be non-negative.");
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPExpandDimsOp() override {}
bool RunOnDevice() override {
if (!OperatorBase::InputBlob(INPUT).template IsType<itensor>()) {
return fallback_.Run(0);
}
const auto& X = Input(INPUT);
auto* Y = Output(OUTPUT);
if (&X != Y) {
// Copy if not inplace
ideep::direct_copy::compute(X, *Y);
}
if (dims_.empty()) {
return true;
}
auto newDims = X.get_dims();
CAFFE_ENFORCE_GE(
newDims.size() + dims_.size(),
dims_.back() + 1,
"Input needs at least ",
(1 + dims_.back() - dims_.size()),
" dimensions given `dims`.");
for (const auto dim : dims_) {
newDims.insert(newDims.begin() + dim, 1);
}
Y->reshape(newDims);
return true;
}
private:
std::vector<int> dims_;
FALLBACK_OP fallback_;
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT);
};
class IDEEPSqueezeOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
using FALLBACK_OP = IDEEPFallbackOp<SqueezeOp<CPUContext>, SkipIndices<0>>;
IDEEPSqueezeOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
fallback_(operator_def, ws) {
dims_ = OperatorBase::GetRepeatedArgument<int>("dims");
auto originalSize = dims_.size();
CAFFE_ENFORCE_GT(originalSize, 0, "Parameter `dims` must be provided.");
std::sort(dims_.begin(), dims_.end());
dims_.erase(std::unique(dims_.begin(), dims_.end()), dims_.end());
if (dims_.size() < originalSize) {
LOG(WARNING) << "Parameter `dims` has repeated dimensions.";
}
CAFFE_ENFORCE_GE(dims_.front(), 0, "Dimension ids must be non-negative.");
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPSqueezeOp() override {}
bool RunOnDevice() override {
if (!OperatorBase::InputBlob(INPUT).template IsType<itensor>()) {
return fallback_.Run(0);
}
const auto& X = Input(INPUT);
auto* Y = Output(OUTPUT);
CAFFE_ENFORCE_GT(
X.ndims(),
dims_.back(),
"Input needs at least ",
(dims_.back() + 1),
" dimensions.");
const auto& ideep_dims = X.get_dims();
std::vector<int64_t> dims(ideep_dims.begin(), ideep_dims.end());
const auto new_dims = SqueezeOp<IDEEPContext>::ComputeDims(dims, dims_);
itensor::dims new_dims_ideep(new_dims.begin(), new_dims.end());
if (&X != Y) {
// Copy if not inplace
ideep::direct_copy::compute(X, *Y);
}
Y->reshape(new_dims_ideep);
return true;
}
private:
std::vector<int> dims_;
FALLBACK_OP fallback_;
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT);
};
REGISTER_IDEEP_OPERATOR(ExpandDims, IDEEPExpandDimsOp);
REGISTER_IDEEP_OPERATOR(Squeeze, IDEEPSqueezeOp);
} // namespace


@ -1,145 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
class IDEEPFullyConnectedOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPFullyConnectedOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)),
axis_w_(OperatorBase::GetSingleArgument<int32_t>("axis_w", 1)),
training_mode_(OperatorBase::GetSingleArgument<int>("training_mode", 0)) {}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPFullyConnectedOp() override {}
bool RunOnDevice() override {
const auto& X = Input(INPUT);
const auto& filter = Input(FILTER);
auto* Y = Output(OUTPUT);
itensor X_in = X;
auto X_dims = CanonicalDims(X_in.get_dims(), axis_);
if (X_in.get_dims() != X_dims) {
X_in.reshape(X_dims);
}
if (training_mode_) {
filter_ = filter;
auto filter_dims = CanonicalDims(filter_.get_dims(), axis_w_);
if (filter_.get_dims() != filter_dims) {
filter_.reshape(filter_dims);
}
if (InputSize() > BIAS) {
bias_ = Input(BIAS);
}
} else {
if (cached_X_descriptor_ != X.get_descriptor()) {
cached_X_descriptor_ = X.dup_descriptor();
}
if (cached_weights_descriptor_ != filter.get_descriptor()) {
cached_weights_descriptor_ = filter.dup_descriptor();
filter_ = filter.has_scale() ? filter.to_public() : filter;
auto filter_dims = CanonicalDims(filter_.get_dims(), axis_w_);
if (filter_.get_dims() != filter_dims) {
filter_.reshape(filter_dims);
}
if (InputSize() > BIAS) {
const auto& bias = Input(BIAS);
bias_ = bias.has_scale() ? bias.to_public() : bias;
}
}
}
if (InputSize() > BIAS) {
ideep::inner_product_forward::compute(
X_in, filter_, bias_, *Y);
} else {
ideep::inner_product_forward::compute(X_in, filter_, *Y);
}
return true;
}
private:
size_t axis_{1};
size_t axis_w_{1};
bool training_mode_;
itensor filter_, bias_;
itensor::descriptor cached_X_descriptor_, cached_weights_descriptor_;
INPUT_TAGS(INPUT, FILTER, BIAS);
OUTPUT_TAGS(OUTPUT);
};
class IDEEPFullyConnectedGradientOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPFullyConnectedGradientOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)),
axis_w_(OperatorBase::GetSingleArgument<int32_t>("axis_w", 1)) {}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPFullyConnectedGradientOp() override {}
bool RunOnDevice() override {
const auto& X = Input(INPUT);
const auto& filter = Input(FILTER);
const auto& dY = Input(OUTPUT_GRAD);
auto* dfilter = Output(FILTER_GRAD);
auto* dbias = Output(BIAS_GRAD);
itensor X_in = X;
auto X_dims = CanonicalDims(X_in.get_dims(), axis_);
if (X_in.get_dims() != X_dims) {
X_in.reshape(X_dims);
}
itensor filter_in = filter;
auto filter_dims = CanonicalDims(filter_in.get_dims(), axis_w_);
if (filter_in.get_dims() != filter_dims) {
filter_in.reshape(filter_dims);
}
ideep::inner_product_backward_weights::compute(X_in, dY, *dfilter, *dbias);
dfilter->to_default_format();
/**
* In MKL-DNN, the weight gradient shape is determined by X_in,
* so we must ensure that the weight gradient shape is consistent with the weight shape.
*/
if (dfilter->get_dims() != filter.get_dims()) {
dfilter->reshape(filter.get_dims());
}
if (OutputSize() > INPUT_GRAD) {
ideep::inner_product_backward_data::compute(
dY, filter_in, X.get_dims(), *Output(INPUT_GRAD));
}
return true;
}
private:
size_t axis_{1};
size_t axis_w_{1};
INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
OUTPUT_TAGS(FILTER_GRAD, BIAS_GRAD, INPUT_GRAD);
};
REGISTER_IDEEP_OPERATOR(FC, IDEEPFullyConnectedOp);
REGISTER_IDEEP_OPERATOR(FCGradient, IDEEPFullyConnectedGradientOp);
} // namespace


@ -1,87 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
class IDEEPLRNOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPLRNOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
size_(OperatorBase::GetSingleArgument<int>("size", 0)),
alpha_(OperatorBase::GetSingleArgument<float>("alpha", 0)),
beta_(OperatorBase::GetSingleArgument<float>("beta", 0)),
bias_(OperatorBase::GetSingleArgument<float>("bias", 1)) {
TORCH_DCHECK_GT(size_, 0);
TORCH_DCHECK_EQ(size_ % 2, 1);
TORCH_DCHECK_GT(alpha_, 0);
TORCH_DCHECK_GT(beta_, 0);
}
~IDEEPLRNOp() override = default;
bool RunOnDevice() override {
auto& X = Input(INPUT);
auto* Y = Output(OUTPUT);
ideep::lrn_forward::compute(X, *Y, size_, alpha_, beta_, bias_);
return true;
}
private:
const int size_;
const float alpha_;
const float beta_;
const float bias_;
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT);
};
class IDEEPLRNGradientOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPLRNGradientOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
size_(OperatorBase::GetSingleArgument<int>("size", 0)),
alpha_(OperatorBase::GetSingleArgument<float>("alpha", 0)),
beta_(OperatorBase::GetSingleArgument<float>("beta", 0)),
bias_(OperatorBase::GetSingleArgument<float>("bias", 1)) {
TORCH_DCHECK_GT(size_, 0);
TORCH_DCHECK_EQ(size_ % 2, 1);
TORCH_DCHECK_GT(alpha_, 0);
TORCH_DCHECK_GT(beta_, 0);
}
~IDEEPLRNGradientOp() override = default;
bool RunOnDevice() override {
const auto& X = Input(INPUT);
const auto& Y = Input(FILTER);
const auto& dY = Input(OUTPUT_GRAD);
auto* dX = Output(INPUT_GRAD);
ideep::lrn_backward::compute(X, dY, Y, *dX, size_, alpha_, beta_, bias_);
return true;
}
private:
const int size_;
const float alpha_;
const float beta_;
const float bias_;
INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
OUTPUT_TAGS(INPUT_GRAD);
};
REGISTER_IDEEP_OPERATOR(LRN, IDEEPLRNOp);
REGISTER_IDEEP_OPERATOR(LRNGradient, IDEEPLRNGradientOp);
} // namespace


@ -1,131 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
void momentum_sgd_update(
const int N,
const float* g,
const float* m,
float* ng,
float* nm,
const float* lr,
const float momentum,
const bool nesterov,
float* param) {
const float LR = lr[0];
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
for (auto i = 0; i < N; ++i) {
if (!nesterov) {
const float adjusted_gradient = LR * g[i] + momentum * m[i];
nm[i] = adjusted_gradient;
ng[i] = adjusted_gradient;
} else {
const float mi = m[i];
const float mi_new = momentum * mi + LR * g[i];
nm[i] = mi_new;
ng[i] = (1 + momentum) * mi_new - momentum * mi;
}
if (param) {
param[i] -= ng[i];
}
}
}
class IDEEPMomentumSGDOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPMomentumSGDOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
momentum_(OperatorBase::GetSingleArgument<float>("momentum", 0.0)),
nesterov_(OperatorBase::GetSingleArgument<int>("nesterov", 0)) {}
bool RunOnDevice() override {
CAFFE_ENFORCE(Input(GRAD).get_nelems() == Input(MOMENTUM).get_nelems());
if (Input(GRAD) != *Output(OUTPUT_GRAD)) {
Output(OUTPUT_GRAD)->init(Input(GRAD).get_descriptor());
}
if (Input(MOMENTUM) != *Output(OUTPUT_MOMENTUM)) {
Output(OUTPUT_MOMENTUM)->init(Input(MOMENTUM).get_descriptor());
}
// TODO: Use itensor after 0-dim is supported. Now use CPU tensor.
const auto& lr = OperatorBase::Input<TensorCPU>(LR, CPU);
CAFFE_ENFORCE(lr.numel() == 1);
momentum_sgd_update(
Input(GRAD).get_nelems(),
static_cast<float*>(Input(GRAD).get_data_handle()),
static_cast<float*>(Input(MOMENTUM).get_data_handle()),
static_cast<float*>(Output(OUTPUT_GRAD)->get_data_handle()),
static_cast<float*>(Output(OUTPUT_MOMENTUM)->get_data_handle()),
lr.template data<float>(),
momentum_,
nesterov_,
nullptr);
return true;
}
protected:
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-non-private-member-variables-in-classes)
float momentum_ = 0.9f;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
bool nesterov_;
INPUT_TAGS(GRAD, MOMENTUM, LR);
OUTPUT_TAGS(OUTPUT_GRAD, OUTPUT_MOMENTUM);
};
class IDEEPMomentumSGDUpdateOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPMomentumSGDUpdateOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
momentum_(OperatorBase::GetSingleArgument<float>("momentum", 0.0)),
nesterov_(OperatorBase::GetSingleArgument<int>("nesterov", 0)) {}
bool RunOnDevice() override {
CAFFE_ENFORCE(Input(GRAD).get_nelems() == Input(MOMENTUM).get_nelems());
if (Input(GRAD) != *Output(OUTPUT_GRAD)) {
Output(OUTPUT_GRAD)->init(Input(GRAD).get_descriptor());
}
if (Input(MOMENTUM) != *Output(OUTPUT_MOMENTUM)) {
Output(OUTPUT_MOMENTUM)->init(Input(MOMENTUM).get_descriptor());
}
// TODO: Use itensor after 0-dim is supported. Now use CPU tensor.
const auto& lr = OperatorBase::Input<TensorCPU>(LR, CPU);
CAFFE_ENFORCE(lr.numel() == 1);
momentum_sgd_update(
Input(GRAD).get_nelems(),
static_cast<float*>(Input(GRAD).get_data_handle()),
static_cast<float*>(Input(MOMENTUM).get_data_handle()),
static_cast<float*>(Output(OUTPUT_GRAD)->get_data_handle()),
static_cast<float*>(Output(OUTPUT_MOMENTUM)->get_data_handle()),
lr.template data<float>(),
momentum_,
nesterov_,
static_cast<float*>(Output(OUTPUT_PARAM)->get_data_handle()));
return true;
}
protected:
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-non-private-member-variables-in-classes)
float momentum_ = 0.9f;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
bool nesterov_;
INPUT_TAGS(GRAD, MOMENTUM, LR, PARAM);
OUTPUT_TAGS(OUTPUT_GRAD, OUTPUT_MOMENTUM, OUTPUT_PARAM);
};
REGISTER_IDEEP_OPERATOR(MomentumSGD, IDEEPMomentumSGDOp);
REGISTER_IDEEP_OPERATOR(MomentumSGDUpdate, IDEEPMomentumSGDUpdateOp);
} // namespace
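The update rule implemented by momentum_sgd_update above is restated below as a tiny standalone program for a single parameter; the struct and function names are illustrative, and MomentumSGDUpdate additionally applies param -= step, as in the code above.

#include <cstdio>

// Single-element restatement of momentum_sgd_update:
//   standard: m' = lr*g + momentum*m;   step = m'
//   nesterov: m' = momentum*m + lr*g;   step = (1 + momentum)*m' - momentum*m
struct SgdStep { float new_momentum; float step; };

SgdStep momentum_step(float g, float m, float lr, float momentum,
                      bool nesterov) {
  if (!nesterov) {
    const float adjusted = lr * g + momentum * m;
    return {adjusted, adjusted};
  }
  const float m_new = momentum * m + lr * g;
  return {m_new, (1.0f + momentum) * m_new - momentum * m};
}

int main() {
  float param = 1.0f, m = 0.0f;
  const float lr = 0.1f, momentum = 0.9f, grad = 0.5f;
  for (int iter = 0; iter < 3; ++iter) {
    const SgdStep s = momentum_step(grad, m, lr, momentum, /*nesterov=*/false);
    m = s.new_momentum;
    param -= s.step;  // what MomentumSGDUpdate does with OUTPUT_PARAM
    std::printf("iter %d: param=%g momentum=%g\n", iter, param, m);
  }
  return 0;
}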


@ -1,279 +0,0 @@
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include <caffe2/ideep/utils/ideep_operator.h>
#include <caffe2/operators/abs_op.h>
#include <caffe2/operators/accuracy_op.h>
#include <caffe2/operators/affine_channel_op.h>
#include <caffe2/operators/atan_op.h>
#include <caffe2/operators/batch_matmul_op.h>
#include <caffe2/operators/cast_op.h>
#include <caffe2/operators/clip_op.h>
#include <caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h>
#include <caffe2/operators/cross_entropy_op.h>
#include <caffe2/operators/ctc_beam_search_decoder_op.h>
#include <caffe2/operators/ctc_greedy_decoder_op.h>
#include <caffe2/operators/distance_op.h>
#include <caffe2/operators/dropout_op.h>
#include <caffe2/operators/elementwise_add_op.h>
#include <caffe2/operators/elementwise_div_op.h>
#include <caffe2/operators/elementwise_mul_op.h>
#include <caffe2/operators/elementwise_ops.h>
#include <caffe2/operators/elementwise_sub_op.h>
#include <caffe2/operators/expand_op.h>
#include <caffe2/operators/filler_op.h>
#include <caffe2/operators/flatten_op.h>
#include <caffe2/operators/gather_op.h>
#include <caffe2/operators/generate_proposals_op.h>
#include <caffe2/operators/given_tensor_fill_op.h>
#include <caffe2/operators/load_save_op.h>
#include <caffe2/operators/loss_op.h>
#include <caffe2/operators/normalize_op.h>
#include <caffe2/operators/pad_op.h>
#include <caffe2/operators/prelu_op.h>
#include <caffe2/operators/reduce_ops.h>
#include <caffe2/operators/rmac_regions_op.h>
#include <caffe2/operators/roi_align_op.h>
#include <caffe2/operators/roi_align_rotated_op.h>
#include <caffe2/operators/roi_pool_op.h>
#include <caffe2/operators/scale_op.h>
#include <caffe2/operators/slice_op.h>
#include <caffe2/operators/softmax_op.h>
#include <caffe2/operators/softmax_with_loss_op.h>
#include <caffe2/operators/sqrt_op.h>
#include <caffe2/operators/stop_gradient.h>
#include <caffe2/operators/tanh_op.h>
#include <caffe2/operators/tensor_protos_db_input.h>
#include <caffe2/operators/utility_ops.h>
#include <caffe2/queue/queue_ops.h>
#include <caffe2/sgd/iter_op.h>
#include <caffe2/sgd/learning_rate_op.h>
#include "caffe2/operators/bbox_transform_op.h"
#include "caffe2/operators/box_with_nms_limit_op.h"
// can add more non-IDEEP operators if needed
namespace caffe2 {
// Boolean operators
REGISTER_IDEEP_COMPARE_OPERATOR(EQ);
REGISTER_IDEEP_COMPARE_OPERATOR(GT);
REGISTER_IDEEP_COMPARE_OPERATOR(GE);
REGISTER_IDEEP_COMPARE_OPERATOR(LT);
REGISTER_IDEEP_COMPARE_OPERATOR(LE);
REGISTER_IDEEP_COMPARE_OPERATOR(NE);
REGISTER_IDEEP_OPERATOR(Softmax, IDEEPFallbackOp<SoftmaxOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
LabelCrossEntropy,
IDEEPFallbackOp<LabelCrossEntropyOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
AveragedLoss,
IDEEPFallbackOp<AveragedLoss<float, CPUContext>, SkipIndices<0>>);
REGISTER_IDEEP_OPERATOR(Flatten, IDEEPFallbackOp<FlattenOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(ResizeLike, IDEEPFallbackOp<ResizeLikeOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(Slice, IDEEPFallbackOp<SliceOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(Clip, IDEEPFallbackOp<ClipOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
ScatterAssign,
IDEEPFallbackOp<ScatterAssignOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
Cast,
IDEEPFallbackOp<CastOp<CPUContext>>);
// filter operators
REGISTER_IDEEP_OPERATOR(
XavierFill,
IDEEPFallbackOp<XavierFillOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
ConstantFill,
IDEEPFallbackOp<ConstantFillOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
GaussianFill,
IDEEPFallbackOp<GaussianFillOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
MSRAFill,
IDEEPFallbackOp<MSRAFillOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
GivenTensorFill,
IDEEPFallbackOp<GivenTensorFillOp<float, CPUContext>>);
// Tensor types not supported by the FillOps below
REGISTER_IDEEP_OPERATOR(
GivenTensorDoubleFill,
IDEEPFallbackOp<GivenTensorFillOp<double, CPUContext>, SkipIndices<0>>);
REGISTER_IDEEP_OPERATOR(
GivenTensorBoolFill,
IDEEPFallbackOp<GivenTensorFillOp<bool, CPUContext>, SkipIndices<0>>);
REGISTER_IDEEP_OPERATOR(
GivenTensorIntFill,
IDEEPFallbackOp<GivenTensorFillOp<int, CPUContext>, SkipIndices<0>>);
REGISTER_IDEEP_OPERATOR(
GivenTensorInt64Fill,
IDEEPFallbackOp<GivenTensorFillOp<int64_t, CPUContext>, SkipIndices<0>>);
REGISTER_IDEEP_OPERATOR(
GivenTensorStringFill,
IDEEPFallbackOp<GivenTensorFillOp<std::string, CPUContext>, SkipIndices<0>>);
REGISTER_IDEEP_OPERATOR(Load, IDEEPFallbackOp<LoadOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(Save, IDEEPFallbackOp<SaveOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
RMACRegions,
IDEEPFallbackOp<RMACRegionsOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(RoIPool, IDEEPFallbackOp<RoIPoolOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
RoIAlign,
IDEEPFallbackOp<RoIAlignOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
RoIAlignRotated,
IDEEPFallbackOp<RoIAlignRotatedOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
GenerateProposals,
IDEEPFallbackOp<GenerateProposalsOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
GenerateProposalsCPP,
IDEEPFallbackOp<GenerateProposalsOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
CollectAndDistributeFpnRpnProposals,
IDEEPFallbackOp<CollectAndDistributeFpnRpnProposalsOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
BoxWithNMSLimit,
IDEEPFallbackOp<BoxWithNMSLimitOp<CPUContext>, SkipIndices<0,1,2>>);
REGISTER_IDEEP_OPERATOR(
BBoxTransform,
IDEEPFallbackOp<BBoxTransformOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
AffineChannel,
IDEEPFallbackOp<AffineChannelOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
StopGradient,
IDEEPFallbackOp<StopGradientOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
PadImage,
IDEEPFallbackOp<PadImageOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
PRelu,
IDEEPFallbackOp<PReluOp<float, CPUContext>>);
// ctc decoder operators
REGISTER_IDEEP_OPERATOR(
CTCGreedyDecoder,
IDEEPFallbackOp<CTCGreedyDecoderOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
CTCBeamSearchDecoder,
IDEEPFallbackOp<CTCBeamSearchDecoderOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
AveragedLossGradient,
IDEEPFallbackOp<AveragedLossGradient<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
LabelCrossEntropyGradient,
IDEEPFallbackOp<LabelCrossEntropyGradientOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
SoftmaxGradient,
IDEEPFallbackOp<SoftmaxGradientOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
Iter,
IDEEPFallbackOp<IterOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
LearningRate,
IDEEPFallbackOp<LearningRateOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
Abs,
IDEEPFallbackOp<UnaryElementwiseOp<
TensorTypes<float>, CPUContext, AbsFunctor<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
Atan,
IDEEPFallbackOp<UnaryElementwiseOp<
TensorTypes<float>, CPUContext, AtanFunctor<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
Sqrt,
IDEEPFallbackOp<UnaryElementwiseOp<
TensorTypes<float>, CPUContext, SqrtFunctor<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
Sign,
IDEEPFallbackOp<UnaryElementwiseOp<
TensorTypes<float>,
CPUContext,
SignFunctor<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
Div,
IDEEPFallbackOp<BinaryElementwiseOp<
NumericTypes, CPUContext, DivFunctor<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
Mul,
IDEEPFallbackOp<
BinaryElementwiseOp<NumericTypes, CPUContext, MulFunctor<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
Sub,
IDEEPFallbackOp<BinaryElementwiseOp<
NumericTypes, CPUContext, SubFunctor<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
Tanh,
IDEEPFallbackOp<UnaryElementwiseOp<
TensorTypes<float>,
CPUContext,
TanhFunctor<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
L1Distance,
IDEEPFallbackOp<L1DistanceOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(Scale, IDEEPFallbackOp<ScaleOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
Accuracy,
IDEEPFallbackOp<AccuracyOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
AddGradient,
IDEEPFallbackOp<BinaryElementwiseGradientOp<
NumericTypes,
CPUContext,
AddFunctor<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
TanhGradient,
IDEEPFallbackOp<BinaryElementwiseOp<
TensorTypes<float>,
CPUContext,
TanhGradientFunctor<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
MulGradient,
IDEEPFallbackOp<BinaryElementwiseGradientOp<
NumericTypes,
CPUContext,
MulFunctor<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(TensorProtosDBInput, IDEEPFallbackOp<TensorProtosDBInput<CPUContext>>);
REGISTER_IDEEP_OPERATOR(CloseBlobsQueue, IDEEPFallbackOp<CloseBlobsQueueOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
SoftmaxWithLoss,
IDEEPFallbackOp<SoftmaxWithLossOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
SoftmaxWithLossGradient,
IDEEPFallbackOp<SoftmaxWithLossGradientOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
Expand,
IDEEPFallbackOp<ExpandOp<
TensorTypes<std::int32_t, std::int64_t, float, double>,
CPUContext>>);
REGISTER_IDEEP_OPERATOR(Gather, IDEEPFallbackOp<GatherOp<CPUContext>>);
REGISTER_IDEEP_OPERATOR(
Normalize,
IDEEPFallbackOp<NormalizeOp<float, CPUContext>>);
REGISTER_IDEEP_OPERATOR(
ReduceL2,
IDEEPFallbackOp<
ReduceOp<TensorTypes<float>, CPUContext, L2Reducer<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
ReduceSum,
IDEEPFallbackOp<ReduceOp<
TensorTypes<std::int32_t, std::int64_t, float, double>,
CPUContext,
SumReducer<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
ReduceMean,
IDEEPFallbackOp<ReduceOp<
TensorTypes<float>, CPUContext, MeanReducer<CPUContext>>>);
REGISTER_IDEEP_OPERATOR(
BatchMatMul,
IDEEPFallbackOp<BatchMatMulOp<CPUContext>>);
} // namespace caffe2


@ -1,190 +0,0 @@
#pragma once
#include <caffe2/core/common.h>
#include <caffe2/core/context.h>
#include <caffe2/core/operator.h>
#include <caffe2/ideep/ideep_utils.h>
#include <caffe2/proto/caffe2_pb.h>
namespace caffe2 {
/**
* @brief A templated class to allow one to wrap a CPU operator as an IDEEP
* operator.
*
* This class can be used when one does not have the IDEEP implementation ready
* yet for an operator. Essentially, what this op does is to automatically
* deal with data copy for you. Plausibly, this causes a lot of overhead and
* is not optimal, so you should use this operator mostly for quick prototyping
* purposes.
*
* All the input and output of the original operator should be TensorCPU.
*
* Example usage: if you have a class MyMagicOp that is CPU based, and you use
* the registration code
* REGISTER_CPU_OPERATOR(MyMagic, MyMagicOp);
* to register the CPU side, you can create its corresponding IDEEP operator
* (with performance hits of course) via
* REGISTER_IDEEP_OPERATOR(MyMagic,
* IDEEPFallbackOp<MyMagicOp>);
*
* Advanced usage: if you want to have some specific outputs never copied, you
* can use the SkipOutputCopy template argument to do that. For example, if
* MyMagic produces two outputs and the first output is always going to live on
* the CPU, you can do
* REGISTER_IDEEP_OPERATOR(MyMagic,
* IDEEPFallbackOp<MyMagicOp, SkipIndices<0>>);
*/
template <class CPUOp, typename SkipOutputCopy = SkipIndices<>>
class IDEEPFallbackOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPFallbackOp(const OperatorDef& def, Workspace* ws)
: IDEEPOperator(def, ws) {
CAFFE_ENFORCE_EQ(def.device_option().device_type(), PROTO_IDEEP);
base_def_.CopyFrom(def);
// base_def_ runs on CPU, so we will set its device option to CPU.
// Copy to allow random_seed to be correctly propagated.
base_def_.mutable_device_option()->CopyFrom(def.device_option());
base_def_.mutable_device_option()->set_device_type(PROTO_CPU);
// Create output blobs in parent workspace,
// then forward output blobs to local workspace.
std::unordered_map<string, string> forwarded_output_blobs;
for (const auto i : c10::irange(base_def_.output_size())) {
// For the in-place case, the input/output tensor for local_ws must be
// re-created instead of being forwarded from the current workspace.
string parent_name(base_def_.output(i));
if (!SkipOutputCopy::Contains(i)) {
parent_name += "_cpu_output_blob_" + base_def_.type();
}
local_output_blobs_.push_back(ws->CreateBlob(parent_name));
TORCH_CHECK_NOTNULL(local_output_blobs_.back());
forwarded_output_blobs[base_def_.output(i)] = parent_name;
output_inplace_.push_back(false);
for (const string &input_name : base_def_.input()) {
if (input_name == base_def_.output(i)) {
output_inplace_[i] = true;
break;
}
}
}
local_ws_.reset(new Workspace(ws, forwarded_output_blobs));
// Set up the symbols for the local workspace.
for (const string& name : base_def_.input()) {
local_input_blobs_.push_back(local_ws_->CreateBlob(name));
TORCH_CHECK_NOTNULL(local_input_blobs_.back());
}
input_share_.resize(local_input_blobs_.size(), false);
base_op_.reset(new CPUOp(base_def_, local_ws_.get()));
}
bool RunOnDevice() override {
for (const auto i : c10::irange(InputSize())) {
if (InputIsType<itensor>(i)
&& (Input(i).has_scale()
|| Input(i).get_data_type() == idtype::f32)) {
auto& input = Input(i);
if (input_share_[i]) {
local_input_blobs_[i]->Reset();
input_share_[i] = false;
}
auto dtensor = BlobGetMutableTensor(local_input_blobs_[i], CPU);
dtensor->Resize(input.get_dims());
// When falling back from INT8, the public format of the original input is
// NHWC, while the required format is NCHW, so reorder the data to NCHW.
if (input.get_desc().is_nhwc()) {
itensor temp_ten ({input.get_dims(), idtype::f32, iformat::nchw},
dtensor->template mutable_data<float>());
temp_ten.feed_from(input);
} else if (!input.need_reorder()) {
CAFFE_ENFORCE(!input.has_scale(),
"Incorrect invocation of get_data_handle");
dtensor->ShareExternalPointer(
static_cast<float*>(input.get_data_handle()));
} else {
input.to_public(dtensor->template mutable_data<float>());
}
} else {
VLOG(1) << "Input " << i << " is not ideep::tensor. Skipping copy.";
if (OperatorBase::Inputs()[i]->GetRaw() != local_input_blobs_[i]->GetRaw()) {
// Note(jiayq): This removes a const but conceptually
// local_input_blobs will only be used as const blob input for the
// base op so we are still fine.
local_input_blobs_[i]->ShareExternal(
const_cast<void *>(OperatorBase::Inputs()[i]->GetRaw()),
OperatorBase::Inputs()[i]->meta());
}
input_share_[i] = true;
}
}
// Some CPU ops that inherit directly from OperatorBase, such as
// 'PrefetchOperator', might need this default input argument '0'.
if (!base_op_->Run(0)) {
LOG(ERROR) << "Base op run failed in IDEEPFallbackOp. Def: "
<< ProtoDebugString(this->debug_def());
return false;
}
for (const auto i : c10::irange(OutputSize())) {
if (SkipOutputCopy::Contains(i)) {
VLOG(1) << "Copy output: index " << i << " skipped.";
continue;
}
CAFFE_ENFORCE(
BlobIsTensorType(*local_output_blobs_[i], CPU),
"IDEEP fallback op currently does not support non-TensorCPU "
"output type who needs copying.");
const auto& src = local_output_blobs_[i]->template Get<TensorCPU>();
auto src_dims = src.sizes().vec();
if (src.template IsType<float>() && src.dim() != 0 && base_op_->type() != "Python") {
Blob* dst = OperatorBase::OutputBlob(i);
// The output tensor must be an ideep tensor with a public format.
// If an ideep tensor with a non-public format is reused, its buffer
// will be interpreted incorrectly.
if (!dst->template IsType<itensor>() ||
!dst->template Get<itensor>().is_public_format()) {
dst->Reset(new itensor());
}
itensor::dims dst_dims (src_dims.begin(), src_dims.end());
auto dtensor = dst->template GetMutable<itensor>();
if (dtensor->get_dims() != dst_dims) {
dtensor->resize(dst_dims, idtype::f32);
}
if (output_inplace_[i]) {
dtensor->feed_from(dst_dims, idtype::f32,
const_cast<void*>(src.raw_data()));
} else {
CAFFE_ENFORCE(!dtensor->has_scale(),
"Incorrect invocation of set_data_handle");
dtensor->set_data_handle(const_cast<void *>(src.raw_data()));
}
} else {
VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor";
Blob* dst = OperatorBase::OutputBlob(i);
if (output_inplace_[i]) {
auto dtensor = BlobGetMutableTensor(dst, CPU);
dtensor->CopyFrom(src);
} else {
dst->Reset(new Tensor(CPU));
BlobSetTensor(dst, src.Alias());
}
}
}
return true;
}
protected:
vector<Blob*> local_input_blobs_;
vector<Blob*> local_output_blobs_;
vector<bool> output_inplace_;
vector<bool> input_share_;
std::unique_ptr<CPUOp> base_op_;
std::unique_ptr<Workspace> local_ws_;
OperatorDef base_def_;
};
} // namespace caffe2
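The fallback wrapper above boils down to: copy or share each input into a CPU blob, run the wrapped CPU operator in a local workspace, and copy or share the outputs back. The toy program below is an editor-added illustration of that pattern; the type aliases and function names are stand-ins, not Caffe2 APIs, and the real op additionally handles in-place outputs, NHWC reorders, and zero-copy sharing.

#include <cstdio>
#include <functional>
#include <vector>

using DeviceTensor = std::vector<float>;  // stand-in for an ideep::tensor
using CpuTensor = std::vector<float>;     // stand-in for a caffe2::TensorCPU

CpuTensor to_cpu(const DeviceTensor& t) { return t; }      // copy/convert in
DeviceTensor to_device(const CpuTensor& t) { return t; }   // copy/convert out

// Run a CPU routine on device data by converting on the way in and out.
DeviceTensor run_with_cpu_fallback(
    const DeviceTensor& input,
    const std::function<CpuTensor(const CpuTensor&)>& cpu_op) {
  const CpuTensor cpu_in = to_cpu(input);
  const CpuTensor cpu_out = cpu_op(cpu_in);
  return to_device(cpu_out);
}

int main() {
  const DeviceTensor x{1.0f, -2.0f, 3.0f};
  const DeviceTensor y = run_with_cpu_fallback(x, [](const CpuTensor& in) {
    CpuTensor out(in.size());
    for (size_t i = 0; i < in.size(); ++i) out[i] = in[i] * 2.0f;  // toy CPU op
    return out;
  });
  std::printf("%g %g %g\n", y[0], y[1], y[2]);
  return 0;
}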


@ -1,70 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
class IDEEPNHWC2NCHWOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_SIMPLE_IDEEP_CTOR_DTOR(IDEEPNHWC2NCHWOp);
USE_IDEEP_OPERATOR_FUNCTIONS();
bool RunOnDevice() override {
const auto& X = Input(0);
CAFFE_ENFORCE_EQ(X.ndims(), 4);
CAFFE_ENFORCE(X.get_desc().is_nhwc());
auto *Y = Output(OUTPUT);
CAFFE_ENFORCE(Y != &X);
// NOTE: NHWC changes the shape at the framework level, but not in MKL-DNN.
// Thus, for an iDEEP tensor, the NCHW and NHWC shapes are identical.
Y->init({X.get_dims(), X.get_data_type(), iformat::nchw});
Y->feed_from(X);
// NOTE: This op is only used on the quantization path. The scale is set
// to distinguish it from fp32-path activations (which always return NCHW
// format even when the ideep tensor is NHWC) when converting to numpy memory.
Y->set_scale({1.0});
return true;
}
private:
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT);
};
class IDEEPNCHW2NHWCOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_SIMPLE_IDEEP_CTOR_DTOR(IDEEPNCHW2NHWCOp);
USE_IDEEP_OPERATOR_FUNCTIONS();
bool RunOnDevice() override {
const auto& X = Input(0);
CAFFE_ENFORCE_EQ(X.ndims(), 4);
CAFFE_ENFORCE(X.get_desc().is_nchw());
auto *Y = Output(OUTPUT);
CAFFE_ENFORCE(Y != &X);
// NOTE: NHWC changes the shape at the framework level, but not in MKL-DNN.
// Thus, for an iDEEP tensor, the NCHW and NHWC shapes are identical.
Y->init({X.get_dims(), X.get_data_type(), iformat::nhwc});
Y->feed_from(X);
// NOTE: This op is only used on the quantization path. The scale is set
// to distinguish it from fp32-path activations (which always return NCHW
// format even when the ideep tensor is NHWC) when converting to numpy memory.
Y->set_scale({1.0});
return true;
}
private:
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT);
};
REGISTER_IDEEP_OPERATOR(NHWC2NCHW, IDEEPNHWC2NCHWOp);
REGISTER_IDEEP_OPERATOR(NCHW2NHWC, IDEEPNCHW2NHWCOp);
} // namespace


@ -1,120 +0,0 @@
#include <caffe2/ideep/operators/conv_pool_base_op.h>
using namespace caffe2;
namespace {
class IDEEPPoolOp final : public IDEEPConvPoolOpBase {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
IDEEPPoolOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPConvPoolOpBase(operator_def, ws) {
CAFFE_ENFORCE(
(dilation_h() == 1) && (dilation_w() == 1),
"Pooling op does not support dilation right now.");
if (!global_pooling_) {
CAFFE_ENFORCE(
pad_t() < kernel_h() && pad_b() < kernel_h() &&
pad_l() < kernel_w() && pad_r() < kernel_w(),
"Pad should be smaller than kernel.");
}
bool training_mode = OperatorBase::GetSingleArgument<int>("training_mode", 1);
pk_ = training_mode ? iprop::forward_training : iprop::forward_inference;
// Figure out the pooling descriptor.
if (operator_def.type().substr(0, 7) == "MaxPool") {
algo_ = ialgo::pooling_max;
} else if (operator_def.type().substr(0, 11) == "AveragePool") {
algo_ = ialgo::pooling_avg_exclude_padding;
} else {
LOG(FATAL) << "Unsupported pooling method: " << operator_def.type();
}
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPPoolOp() override {}
bool RunOnDeviceWithOrderNCHW() override {
auto& X = Input(INPUT);
auto* Y = Output(OUTPUT);
auto Y_dims = CalcOutputDims(X, X.get_dim(1));
if (cached_X_descriptor_ != X.get_descriptor()) {
cached_X_descriptor_ = X.dup_descriptor();
}
ideep::pooling_forward::compute(X, Y_dims, *Y,
{stride_.begin(), stride_.end()},
{kernel_.begin(), kernel_.end()},
pad_tl(), pad_br(), algo_, pk_);
return true;
}
private:
iprop pk_;
ialgo algo_;
itensor::descriptor cached_X_descriptor_;
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT);
};
class IDEEPPoolGradientOp final : public IDEEPConvPoolOpBase {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
IDEEPPoolGradientOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPConvPoolOpBase(operator_def, ws) {
CAFFE_ENFORCE(
(dilation_h() == 1) && (dilation_w() == 1),
"Pooling op does not support dilation right now.");
if (!global_pooling_) {
CAFFE_ENFORCE(
pad_t() < kernel_h() && pad_b() < kernel_h() &&
pad_l() < kernel_w() && pad_r() < kernel_w(),
"Pad should be smaller than kernel.");
}
// Figure out the pooling descriptor.
if (operator_def.type().substr(0, 15) == "MaxPoolGradient") {
algo_ = ialgo::pooling_max;
} else if (operator_def.type().substr(0, 19) == "AveragePoolGradient") {
algo_ = ialgo::pooling_avg_exclude_padding;
} else {
LOG(FATAL) << "Unsupported pooling method: " << operator_def.type();
}
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPPoolGradientOp() override {}
bool RunOnDeviceWithOrderNCHW() override {
const auto& X = Input(INPUT);
const auto& Y = Input(OUTPUT);
const auto& dY = Input(OUTPUT_GRAD);
auto* dX = Output(INPUT_GRAD);
ideep::pooling_backward::compute(dY, Y, X, *dX,
{stride_.begin(), stride_.end()},
{kernel_.begin(), kernel_.end()},
pad_tl(), pad_br(), algo_);
return true;
}
private:
ialgo algo_;
INPUT_TAGS(INPUT, OUTPUT, OUTPUT_GRAD);
OUTPUT_TAGS(INPUT_GRAD);
};
REGISTER_IDEEP_OPERATOR(MaxPool, IDEEPPoolOp);
REGISTER_IDEEP_OPERATOR(MaxPoolGradient, IDEEPPoolGradientOp);
REGISTER_IDEEP_OPERATOR(AveragePool, IDEEPPoolOp);
REGISTER_IDEEP_OPERATOR(AveragePoolGradient, IDEEPPoolGradientOp);
} // namespace


@ -1,77 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
template <bool ReluFused>
class IDEEPInt8SumReluOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPInt8SumReluOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
scale_(this->template GetSingleArgument<float>("Y_scale", 1.0)),
zero_point_(
this->template GetSingleArgument<int32_t>("Y_zero_point", 0)) {
if (ReluFused || zero_point_ == 0) {
Y_data_type_ = idtype::u8;
CAFFE_ENFORCE_EQ(zero_point_, 0, "Wrong zero point");
} else {
Y_data_type_ = idtype::s8;
CAFFE_ENFORCE_EQ(zero_point_, 128, "Wrong zero point");
}
Y_scales_ = ConvertScales({scale_});
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPInt8SumReluOp() override {}
bool RunOnDevice() override {
itensor temp_ten;
itensor::dims input_dims;
vector<itensor> inputs_itensor;
CAFFE_ENFORCE_GT(InputSize(), 1, "Wrong input size (must be > 1)");
for (int i = 0; i < InputSize(); ++i) {
CAFFE_ENFORCE(OperatorBase::InputBlob(i).template IsType<itensor>());
auto& Xi = Input(i);
if (input_dims.empty())
input_dims = Xi.get_dims();
CAFFE_ENFORCE(input_dims == Xi.get_dims());
inputs_itensor.emplace_back(
Xi.get_data_type() != idtype::f32 ? Xi.dequantize() : Xi);
}
temp_ten.init({input_dims, idtype::f32});
const vector<float> scales(InputSize(), 1.0);
ideep::sum::compute(scales, inputs_itensor, temp_ten);
if (ReluFused) {
ideep::eltwise_forward::compute(temp_ten, temp_ten);
}
auto* Y = Output(OUTPUT);
Y->init({temp_ten.get_dims(), Y_data_type_, iformat::nhwc});
Y->set_scale(Y_scales_);
Y->feed_from(temp_ten);
return true;
}
private:
float scale_;
int32_t zero_point_;
iscale Y_scales_;
idtype Y_data_type_;
INPUT_TAGS(INPUT0);
OUTPUT_TAGS(OUTPUT);
};
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8Sum, DNNLOWP, IDEEPInt8SumReluOp<false>);
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8Add, DNNLOWP, IDEEPInt8SumReluOp<false>);
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8SumRelu, DNNLOWP, IDEEPInt8SumReluOp<true>);
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8AddRelu, DNNLOWP, IDEEPInt8SumReluOp<true>);
} // namespace
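A minimal standalone sketch of the arithmetic this op delegates to ideep: each input is dequantized, the sum is accumulated in f32, ReLU is optionally fused, and the result is requantized with Y_scale. The helper below assumes the usual affine quantization with a zero point of 0 (the u8 case) and is illustrative only, not part of the codebase.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint8_t> SumReluRequantize(const std::vector<float>& a,
                                       const std::vector<float>& b,
                                       float y_scale, bool relu_fused) {
  std::vector<uint8_t> out(a.size());
  for (std::size_t i = 0; i < a.size(); ++i) {
    float s = a[i] + b[i];              // f32 accumulation of dequantized inputs
    if (relu_fused) {
      s = std::max(s, 0.0f);            // fused ReLU keeps the range non-negative
    }
    long q = std::lround(s / y_scale);  // assumed affine requantization, zero point 0
    out[i] = static_cast<uint8_t>(std::min(255L, std::max(0L, q)));
  }
  return out;
}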


@ -1,258 +0,0 @@
#include <caffe2/ideep/operators/conv_pool_base_op.h>
using namespace caffe2;
namespace {
class IDEEPInt8ConvOp : public IDEEPConvPoolOpBase {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
IDEEPInt8ConvOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPConvPoolOpBase(operator_def, ws),
scale_(this->template GetSingleArgument<float>("Y_scale", 1.0)),
zero_point_(
this->template GetSingleArgument<int32_t>("Y_zero_point", 0)) {
OPERATOR_NEEDS_FEATURE(pad_l() == pad_r() && pad_t() == pad_b(),
"Uneven padding not supported.");
fusion_type_ = FUSION_UNKNOWN;
last_input_ = BIAS_OR_INPUT_S;
algo_ = ialgo::convolution_direct;
auto conv_algorithm = OperatorBase::GetSingleArgument<int>(
"conv_algorithm", CONV_ALGORITHM_AUTO);
if (conv_algorithm == CONV_ALGORITHM_WINOGRAD) {
algo_ = ialgo::convolution_winograd;
}
CAFFE_ENFORCE(zero_point_ == 128 || zero_point_ == 0);
Y_scales_ = ConvertScales({scale_});
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPInt8ConvOp() override {}
bool RunOnDeviceWithOrderNCHW() override {
const auto &X = Input(INPUT_X);
const auto &filter = Input(FILTER);
auto *Y = Output(OUTPUT);
CAFFE_ENFORCE(X.has_scale());
CAFFE_ENFORCE(4 == X.ndims() && 4 == filter.ndims());
CAFFE_ENFORCE(X.get_data_type() == idtype::s8
|| X.get_data_type() == idtype::u8);
CAFFE_ENFORCE(filter.get_dim(2) == kernel_h());
CAFFE_ENFORCE(filter.get_dim(3) == kernel_w());
CAFFE_ENFORCE(
X.get_dim(1) == filter.get_dim(1) * group_,
"Convolution op: input channels does not match: # of input channels ",
X.get_dim(1), " is not equal to kernel channels * group:",
filter.get_dim(1), "*", group_);
bool input_changed = (cached_X_descriptor_ != X.get_descriptor());
if (input_changed) {
cached_X_descriptor_ = X.dup_descriptor();
}
bool weights_changed = (cached_weights_descriptor_ != filter.get_descriptor());
if (weights_changed) {
cached_weights_descriptor_ = filter.dup_descriptor();
CAFFE_ENFORCE(filter.get_data_type() == idtype::s8 && filter.has_scale());
auto X_dt = X.get_data_type();
lowp_kind_ = ilowp_kind::LOWP_U8S8;
if (X_dt == idtype::s8) {
lowp_kind_ = ilowp_kind::LOWP_S8S8;
}
auto expected_descriptor =
ideep::convolution_forward::expected_weights_desc(
filter.get_dims(),
idtype::s8,
{stride_.begin(), stride_.end()},
pad_tl(),
pad_br(),
{dilation_.begin(), dilation_.end()},
group_,
algo_,
iprop::forward_inference,
X_dt, X.get_dims());
if (filter.get_desc() != expected_descriptor) {
filter_.init(expected_descriptor);
filter_.set_scale(filter.get_scale());
filter_.feed_from(filter);
} else {
filter_ = filter;
}
if (InputSize() > last_input_) {
// NOTE: If the bias is shared with other operators in this module,
// its existing scale may not match this operator's input and filter
// scales. Thus, we requantize it here with the current input and
// filter scales (see the standalone sketch after this file).
auto bias = Input(BIAS_OR_INPUT_S);
bias_.init({bias.get_dims(), idtype::s32});
iscale bias_scales (filter_.get_scale());
for (auto &scale : bias_scales) { scale *= X.get_scale()[0]; }
bias_.set_scale(bias_scales);
bias_.feed_from(bias);
}
}
bool with_bias = InputSize() > last_input_;
if (input_changed || weights_changed) {
auto Y_dims = CalcOutputDims(X, filter.get_dim(0));
if (with_bias) {
ideep::convolution_forward::prepare(
conv_param,
X,
filter_,
bias_,
Y_dims,
*Y,
{stride_.begin(), stride_.end()},
{dilation_.begin(), dilation_.end()},
pad_tl(),
pad_br(),
group_,
iscale(),
iscale(),
Y_scales_,
attr_,
algo_,
iprop::forward_inference,
lowp_kind_);
} else {
ideep::convolution_forward::prepare(
conv_param,
X,
filter_,
Y_dims,
*Y,
{stride_.begin(), stride_.end()},
{dilation_.begin(), dilation_.end()},
pad_tl(),
pad_br(),
group_,
iscale(),
iscale(),
Y_scales_,
attr_,
algo_,
iprop::forward_inference,
lowp_kind_);
}
}
if (with_bias) {
ideep::convolution_forward::compute(conv_param, X, filter_, bias_, *Y);
} else {
ideep::convolution_forward::compute(conv_param, X, filter_, *Y);
}
if (fusion_type_ != FUSION_CONV_RELU && fusion_type_ != FUSION_UNKNOWN) {
CAFFE_ENFORCE(
Y == &(Input(InputSize() - 1)),
"Convolution fusion op: InPlace is enforced for sum fusion.");
}
return true;
}
protected:
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
iattr attr_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
ialgo algo_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
float scale_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
int last_input_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
int32_t zero_point_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
ilowp_kind lowp_kind_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
FusionType fusion_type_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
itensor filter_, bias_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
iscale Y_scales_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
itensor::descriptor cached_X_descriptor_, cached_weights_descriptor_;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
ideep::convolution_forward_params conv_param;
INPUT_TAGS(INPUT_X, FILTER, BIAS_OR_INPUT_S, INPUT_S);
OUTPUT_TAGS(OUTPUT);
};
class IDEEPInt8ConvReluOp final : public IDEEPInt8ConvOp {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
IDEEPInt8ConvReluOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPInt8ConvOp(operator_def, ws) {
CAFFE_ENFORCE(zero_point_ == 0);
last_input_ = BIAS_OR_INPUT_S;
attr_ = iattr::fuse_relu();
fusion_type_ = FUSION_CONV_RELU;
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPInt8ConvReluOp() override {}
};
class IDEEPInt8ConvSumOp final : public IDEEPInt8ConvOp {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
IDEEPInt8ConvSumOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPInt8ConvOp(operator_def, ws) {
last_input_ = INPUT_S;
attr_ = iattr::fuse_sum();
fusion_type_ = FUSION_CONV_SUM;
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPInt8ConvSumOp() override {}
};
class IDEEPInt8ConvSumReluOp final : public IDEEPInt8ConvOp {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
IDEEPInt8ConvSumReluOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPInt8ConvOp(operator_def, ws) {
last_input_ = INPUT_S;
attr_ = iattr::residual();
fusion_type_ = FUSION_CONV_SUM_RELU;
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPInt8ConvSumReluOp() override {}
};
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8Conv, DNNLOWP, IDEEPInt8ConvOp);
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8ConvRelu, DNNLOWP, IDEEPInt8ConvReluOp);
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8ConvSum, DNNLOWP, IDEEPInt8ConvSumOp);
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8ConvSumRelu, DNNLOWP, IDEEPInt8ConvSumReluOp);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,clang-diagnostic-unused-function)
OPERATOR_SCHEMA(Int8ConvSum)
.NumInputs(2, 4)
.NumOutputs(1)
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForConv)
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
ConvPoolOpBase<CPUContext>::CostInferenceForConv))
.AllowInplace({{2, 0}, {3, 0}});
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,clang-diagnostic-unused-function)
OPERATOR_SCHEMA(Int8ConvSumRelu)
.NumInputs(2, 4)
.NumOutputs(1)
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForConv)
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
ConvPoolOpBase<CPUContext>::CostInferenceForConv))
.AllowInplace({{2, 0}, {3, 0}});
} // namespace
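As a companion to the bias note inside IDEEPInt8ConvOp above, the following standalone sketch spells out the scale arithmetic performed before the bias is fed into an s32 tensor. The helper name and the plain std::vector stand in for ideep::scale_t and are illustrative only, not part of the codebase.
#include <vector>

// Per-channel bias scale = per-channel filter scale * input scale,
// mirroring the loop that builds bias_scales before bias_.feed_from(bias).
std::vector<float> RequantizedBiasScales(const std::vector<float>& filter_scales,
                                         float input_scale) {
  std::vector<float> bias_scales(filter_scales);
  for (auto& s : bias_scales) {
    s *= input_scale;
  }
  return bias_scales;
}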


@ -1,43 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
class IDEEPInt8DequantizeOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPInt8DequantizeOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws) {
if (HasArgument("output_order")) {
Y_fmt_ = static_cast<iformat>(
this->template GetSingleArgument<int>("output_order",
static_cast<int>(iformat::nchw)));
}
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPInt8DequantizeOp() override {}
bool RunOnDevice() override {
const auto& X = Input(0);
auto* Y = Output(0);
if (Y_fmt_ != iformat::undef) {
Y->init(X.get_desc().to_type(idtype::f32).to_format(Y_fmt_));
} else {
Y->init(X.get_desc().to_type(idtype::f32));
}
Y->feed_from(X);
return true;
}
private:
iformat Y_fmt_ {iformat::undef};
};
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8Dequantize, DNNLOWP, IDEEPInt8DequantizeOp);
} // namespace


@ -1,96 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
USE_IDEEP_DEF_ALIASES();
class IDEEPInt8FullyConnectedOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPInt8FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
: IDEEPOperator(operator_def, ws),
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)),
axis_w_(OperatorBase::GetSingleArgument<int32_t>("axis_w", 1)),
scale_(this->template GetSingleArgument<float>("Y_scale", 1.0)),
zero_point_(
this->template GetSingleArgument<int32_t>("Y_zero_point", 0)) {
CAFFE_ENFORCE(zero_point_ == 128 || zero_point_ == 0);
if (zero_point_ == 0) {
Y_data_type_ = idtype::u8;
} else {
Y_data_type_ = idtype::s8;
}
Y_scales_ = ConvertScales({scale_});
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPInt8FullyConnectedOp() override {}
bool RunOnDevice() override {
const auto& X = Input(INPUT);
const auto& filter = Input(FILTER);
auto* Y = Output(OUTPUT);
itensor X_in = X;
auto X_dims = CanonicalDims(X_in.get_dims(), axis_);
if (X_in.get_dims() != X_dims) {
X_in.reshape(X_dims);
}
if (cached_X_descriptor_ != X.get_descriptor()) {
cached_X_descriptor_ = X.dup_descriptor();
Y_.init({{X.get_dim(0), filter.get_dim(0)}, idtype::f32});
}
if (cached_weights_descriptor_ != filter.get_descriptor()) {
cached_weights_descriptor_ = filter.dup_descriptor();
CAFFE_ENFORCE(filter.get_data_type() == idtype::s8 && filter.has_scale());
// INT8 FC is not supported by iDEEP so far, so fall back to fp32 compute on plain (public) tensors.
filter_ = filter.to_public();
auto filter_dims = CanonicalDims(filter_.get_dims(), axis_w_);
if (filter_.get_dims() != filter_dims) {
filter_.reshape(filter_dims);
}
if (InputSize() > BIAS) {
bias_ = Input(BIAS).to_public();
}
Y_.init({{X.get_dim(0), filter.get_dim(0)}, idtype::f32});
}
X_in = X_in.to_public();
if (InputSize() > BIAS) {
ideep::inner_product_forward::compute(
X_in, filter_, bias_, Y_);
} else {
ideep::inner_product_forward::compute(X_in, filter_, Y_);
}
Y->init({Y_.get_dims(), Y_data_type_});
Y->set_scale(Y_scales_);
Y->feed_from(Y_);
return true;
}
private:
size_t axis_{1};
size_t axis_w_{1};
float scale_;
int32_t zero_point_;
idtype Y_data_type_;
itensor filter_, bias_, Y_;
iscale Y_scales_;
itensor::descriptor cached_X_descriptor_, cached_weights_descriptor_;
INPUT_TAGS(INPUT, FILTER, BIAS);
OUTPUT_TAGS(OUTPUT);
};
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8FC, DNNLOWP, IDEEPInt8FullyConnectedOp);
} // namespace


@ -1,149 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
class IDEEPInt8GivenTensorFillOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPInt8GivenTensorFillOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
zero_point_(
this->template GetSingleArgument<int32_t>("Y_zero_point", 0)),
shape_(this->template GetRepeatedArgument<itensor::dim>("shape")) {
CAFFE_ENFORCE(shape_.size() == 4 || shape_.size() == 2 || shape_.size() == 1);
CAFFE_ENFORCE(zero_point_ == 0 || zero_point_ == 128,
"Not support zero point");
if (HasArgument("Y_scales")) {
scales_ = this->template GetRepeatedArgument<float>("Y_scales");
} else {
auto scale = (this->template GetSingleArgument<float>("Y_scale", 1.0));
scales_ = {scale};
}
if (shape_.size() == 4) {
fmt_ = iformat::nhwc;
auto C = shape_[3];
shape_[3] = shape_[2];
shape_[2] = shape_[1];
shape_[1] = C;
} else if (shape_.size() == 2) {
fmt_ = iformat::nc;
} else {
fmt_ = iformat::x;
}
auto source_values = this->template GetSingleArgument<string>("values", "");
auto src_size = source_values.size();
values_.Resize(src_size);
uint8_t* values_data = values_.template mutable_data<uint8_t>();
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int i = 0; i < src_size; i++) {
values_data[i] = static_cast<uint8_t>(source_values[i]);
}
}
bool RunOnDevice() override {
auto* output = Output(OUTPUT);
auto data_type = zero_point_ == 0 ? idtype::u8 : idtype::s8;
output->init({shape_, data_type});
TORCH_DCHECK_EQ(output->get_nelems(), values_.numel())
<< "output size: " << output->get_nelems()
<< " given size: " << values_.numel();
if (output->get_nelems() > 0) {
itensor temp_ten;
temp_ten.init({shape_, data_type, fmt_});
auto* data_u8 = static_cast<uint8_t*>(temp_ten.get_data_handle());
const auto* values_data = values_.template data<uint8_t>();
context_.template CopySameDevice<uint8_t>(
temp_ten.get_nelems(), values_data, data_u8);
// Shift quantized data to s8 per zero point
if (zero_point_ == 128) {
auto* data_s8 = static_cast<int8_t*>(temp_ten.get_data_handle());
auto nelems = temp_ten.get_nelems();
for (int i = 0; i < nelems; i++) {
data_s8[i] = data_s8[i] - zero_point_;
}
}
output->feed_from(temp_ten);
}
output->set_scale(ConvertScales(scales_));
return true;
}
private:
iscale scales_;
int32_t zero_point_;
iformat fmt_;
itensor::dims shape_;
Tensor values_{CPU};
OUTPUT_TAGS(OUTPUT);
};
class IDEEPInt8GivenIntTensorFillOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPInt8GivenIntTensorFillOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
zero_point_(
this->template GetSingleArgument<int32_t>("Y_zero_point", 0)),
shape_(this->template GetRepeatedArgument<itensor::dim>("shape")) {
CAFFE_ENFORCE(zero_point_ == 0, "Not support zero point");
if (HasArgument("Y_scales")) {
scales_ = this->template GetRepeatedArgument<float>("Y_scales");
} else {
auto scale = (this->template GetSingleArgument<float>("Y_scale", 1.0));
scales_ = {scale};
}
auto source_values = this->template GetRepeatedArgument<int32_t>("values");
auto src_size = source_values.size();
values_.Resize(src_size);
auto* values_data = values_.template mutable_data<int32_t>();
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int i = 0; i < src_size; i++) {
values_data[i] = static_cast<int32_t>(source_values[i]);
}
}
bool RunOnDevice() override {
auto* output = Output(OUTPUT);
output->init({shape_, idtype::s32});
output->set_scale(ConvertScales(scales_));
TORCH_DCHECK_EQ(output->get_nelems(), values_.numel())
<< "output size: " << output->get_nelems()
<< " given size: " << values_.numel();
if (output->get_nelems() > 0) {
auto* data = static_cast<int32_t*>(output->get_data_handle());
const int32_t* values_data = values_.template data<int32_t>();
context_.template CopySameDevice<int32_t>(
output->get_nelems(), values_data, data);
}
return true;
}
private:
iscale scales_;
int32_t zero_point_;
itensor::dims shape_;
Tensor values_{CPU};
OUTPUT_TAGS(OUTPUT);
};
REGISTER_IDEEP_OPERATOR(Int8GivenTensorFill, IDEEPInt8GivenTensorFillOp);
REGISTER_IDEEP_OPERATOR(Int8GivenIntTensorFill, IDEEPInt8GivenIntTensorFillOp);
} // namespace
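The zero-point handling above re-centers serialized u8 values into s8 when Y_zero_point is 128. A tiny standalone sketch of that shift, with a usage check (illustrative only; the real op performs it in place on the tensor buffer):
#include <cassert>
#include <cstdint>

// Serialized u8 value -> stored s8 value when the zero point is 128.
int8_t ShiftU8ToS8(uint8_t v, int32_t zero_point) {
  return static_cast<int8_t>(static_cast<int32_t>(v) - zero_point);
}

int main() {
  assert(ShiftU8ToS8(200, 128) == 72);   // u8 200 becomes s8 72
  assert(ShiftU8ToS8(0, 128) == -128);   // u8 0 becomes the s8 minimum
  return 0;
}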


@ -1,65 +0,0 @@
#include <caffe2/ideep/operators/conv_pool_base_op.h>
using namespace caffe2;
namespace {
class IDEEPInt8PoolOp final : public IDEEPConvPoolOpBase {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
IDEEPInt8PoolOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPConvPoolOpBase(operator_def, ws) {
CAFFE_ENFORCE(
(dilation_h() == 1) && (dilation_w() == 1),
"Pooling op does not support dilation right now.");
if (!global_pooling_) {
CAFFE_ENFORCE(
pad_t() < kernel_h() && pad_b() < kernel_h() &&
pad_l() < kernel_w() && pad_r() < kernel_w(),
"Pad should be smaller than kernel.");
}
// Figure out the pooling descriptor.
if (operator_def.type().substr(0, 11) == "Int8MaxPool") {
algo_ = ialgo::pooling_max;
} else if (operator_def.type().substr(0, 15) == "Int8AveragePool") {
algo_ = ialgo::pooling_avg_exclude_padding;
} else {
LOG(FATAL) << "Unsupported pooling method: " << operator_def.type();
}
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPInt8PoolOp() override {}
bool RunOnDeviceWithOrderNCHW() override {
auto& X = Input(INPUT);
auto* Y = Output(OUTPUT);
auto Y_dims = CalcOutputDims(X, X.get_dim(1));
if (cached_X_descriptor_ != X.get_descriptor()) {
cached_X_descriptor_ = X.dup_descriptor();
}
ideep::pooling_forward::compute(X, Y_dims, *Y,
{stride_.begin(), stride_.end()},
{kernel_.begin(), kernel_.end()},
pad_tl(), pad_br(), algo_,
iprop::forward_inference);
return true;
}
private:
ialgo algo_;
itensor::descriptor cached_X_descriptor_;
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT);
};
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8MaxPool, DNNLOWP, IDEEPInt8PoolOp);
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8AveragePool, DNNLOWP, IDEEPInt8PoolOp);
} // namespace


@ -1,61 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
class IDEEPInt8QuantizeOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPInt8QuantizeOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
scale_(this->template GetSingleArgument<float>("Y_scale", 1.0)),
zero_point_(
this->template GetSingleArgument<int32_t>("Y_zero_point", 0)) {
if (HasArgument("output_order")) {
Y_fmt_ = static_cast<iformat>(
this->template GetSingleArgument<int>("output_order",
static_cast<int>(iformat::nchw)));
}
CAFFE_ENFORCE(zero_point_ == 0 || zero_point_ == 128,
"Not support this zero point");
Y_data_type_ = zero_point_ == 0 ? idtype::u8 : idtype::s8;
Y_scales_ = ConvertScales({scale_});
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPInt8QuantizeOp() override {}
bool RunOnDevice() override {
const auto& X = Input(0);
CAFFE_ENFORCE(X.get_data_type() == idtype::f32, "Unsupported data type");
auto* Y = Output(0);
if (Y_fmt_ != iformat::undef) {
Y->init(X.get_desc().to_type(Y_data_type_).to_format(Y_fmt_));
} else {
Y->init(X.get_desc().to_type(Y_data_type_));
}
Y->set_scale(Y_scales_);
Y->feed_from(X);
return true;
}
private:
float scale_;
int32_t zero_point_;
iscale Y_scales_;
idtype Y_data_type_;
iformat Y_fmt_ {iformat::undef};
INPUT_TAGS(INPUT0);
OUTPUT_TAGS(OUTPUT);
};
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8Quantize, DNNLOWP, IDEEPInt8QuantizeOp);
} // namespace


@ -1,43 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
class IDEEPInt8ReluOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPInt8ReluOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws), alpha_(0.0) {
// Figure out the Relu descriptor.
if (operator_def.type().substr(0, 8) == "Int8Relu") {
alpha_ = 0.0;
} else {
LOG(FATAL) << "Unsupported Relu method: " << operator_def.type();
}
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPInt8ReluOp() override {}
bool RunOnDevice() override {
const auto& X = Input(INPUT);
auto* Y = Output(OUTPUT);
ideep::eltwise_forward::compute(
X, *Y, ialgo::eltwise_relu, iprop::forward_inference, alpha_);
return true;
}
private:
float alpha_;
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT);
};
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8Relu, DNNLOWP, IDEEPInt8ReluOp);
} // namespace


@ -1,74 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
#include <caffe2/queue/blobs_queue.h>
using namespace caffe2;
namespace {
class IDEEPCreateBlobsQueueOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPCreateBlobsQueueOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
ws_(ws),
name(operator_def.output().Get(0)) {}
bool RunOnDevice() override {
const auto capacity = GetSingleArgument("capacity", 1);
const auto numBlobs = GetSingleArgument("num_blobs", 1);
const auto enforceUniqueName =
GetSingleArgument("enforce_unique_name", false);
const auto fieldNames =
OperatorBase::template GetRepeatedArgument<std::string>("field_names");
CAFFE_ENFORCE_EQ(this->OutputSize(), 1);
auto queuePtr = OperatorBase::Outputs()[0]
->template GetMutable<std::shared_ptr<BlobsQueue>>();
CAFFE_ENFORCE(queuePtr);
*queuePtr = std::make_shared<BlobsQueue>(
ws_, name, capacity, numBlobs, enforceUniqueName, fieldNames);
return true;
}
private:
Workspace* ws_{nullptr};
const std::string name;
};
class IDEEPSafeEnqueueBlobsOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPSafeEnqueueBlobsOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws) {}
bool RunOnDevice() override {
auto queue =
OperatorBase::Inputs()[0]->template Get<std::shared_ptr<BlobsQueue>>();
CAFFE_ENFORCE(queue);
auto size = queue->getNumBlobs();
CAFFE_ENFORCE(
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
OutputSize() == size + 1,
"Expected " + caffe2::to_string(size + 1) + ", " +
" got: " + caffe2::to_string(size));
bool status = queue->blockingWrite(OperatorBase::Outputs());
auto st = OperatorBase::Output<TensorCPU>(1, CPU);
st->Resize();
auto stat = st->template mutable_data<bool>();
stat[0] = !status;
return true;
}
};
REGISTER_IDEEP_OPERATOR(CreateBlobsQueue, IDEEPCreateBlobsQueueOp);
SHOULD_NOT_DO_GRADIENT(IDEEPCreateBlobsQueueOp);
REGISTER_IDEEP_OPERATOR(SafeEnqueueBlobs, IDEEPSafeEnqueueBlobsOp);
SHOULD_NOT_DO_GRADIENT(IDEEPSafeEnqueueBlobsOp);
} // namespace


@ -1,91 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
class IDEEPReluOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPReluOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws), alpha_(0.0) {
// Figure out the Relu descriptor.
if (operator_def.type().substr(0, 4) == "Relu") {
alpha_ = 0.0;
} else if (operator_def.type().substr(0, 9) == "LeakyRelu") {
if (HasArgument("alpha")) {
alpha_ = static_cast<float>(
OperatorBase::GetSingleArgument<float>("alpha", 0.01));
}
} else {
LOG(FATAL) << "Unsupported Relu method: " << operator_def.type();
}
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPReluOp() override {}
bool RunOnDevice() override {
const auto& X = Input(INPUT);
auto* Y = Output(OUTPUT);
ideep::eltwise_forward::compute(
X, *Y, ialgo::eltwise_relu, iprop::forward_training, alpha_);
return true;
}
private:
float alpha_;
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT);
};
class IDEEPReluGradientOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPReluGradientOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws), alpha_(0.0) {
// Figure out the Relu descriptor.
if (operator_def.type().substr(0, 12) == "ReluGradient") {
alpha_ = 0.0;
} else if (operator_def.type().substr(0, 17) == "LeakyReluGradient") {
if (HasArgument("alpha")) {
alpha_ = static_cast<float>(
OperatorBase::GetSingleArgument<float>("alpha", 0.01));
}
} else {
LOG(FATAL) << "Unsupported Relu method: " << operator_def.type();
}
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPReluGradientOp() override {}
bool RunOnDevice() override {
const auto& Y = Input(OUTPUT);
const auto& dY = Input(OUTPUT_GRAD);
auto* dX = Output(INPUT_GRAD);
ideep::eltwise_backward::compute(Y, dY, *dX, ialgo::eltwise_relu, alpha_);
return true;
}
private:
float alpha_;
INPUT_TAGS(OUTPUT, OUTPUT_GRAD);
OUTPUT_TAGS(INPUT_GRAD);
};
REGISTER_IDEEP_OPERATOR(Relu, IDEEPReluOp);
REGISTER_IDEEP_OPERATOR(ReluGradient, IDEEPReluGradientOp);
REGISTER_IDEEP_OPERATOR(LeakyRelu, IDEEPReluOp);
REGISTER_IDEEP_OPERATOR(LeakyReluGradient, IDEEPReluGradientOp);
} // namespace


@ -1,134 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
// Takes a shape and data tensor and reshapes it
class IDEEPReshapeOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPReshapeOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
new_shape_(OperatorBase::GetRepeatedArgument<itensor::dim>("shape")) {}
bool RunOnDevice() override {
ideep::tensor::dims actual_new_shape = new_shape_;
if (InputSize() == 2) {
CAFFE_ENFORCE(
!OperatorBase::HasArgument("shape"),
"New shape is specified by the input blob, do not pass in "
"the argument `shape`.");
// shape info live on CPU
auto& shape = OperatorBase::Input<TensorCPU>(1, CPU);
CAFFE_ENFORCE(shape.ndim() == 1, "Shape should be 1-D");
actual_new_shape.reserve(shape.size());
if (shape.template IsType<int>()) {
const int* shape_data = shape.template data<int>();
actual_new_shape.assign(shape_data, shape_data + shape.size());
} else if (shape.template IsType<int64_t>()) {
const int64_t* shape_data = shape.template data<int64_t>();
for (int i = 0; i < shape.size(); ++i) {
actual_new_shape.push_back(static_cast<int>(shape_data[i]));
}
} else {
CAFFE_THROW(
"IDEEP reshape only supports shape data in int32_t or int64_t");
}
} else {
CAFFE_ENFORCE(
OperatorBase::HasArgument("shape"), "Argument `shape` is missing.");
}
auto& input = Input(0);
// Copy over the dimensions for those that are specified zero.
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int i = 0; i < actual_new_shape.size() && i < input.ndims(); ++i) {
if (actual_new_shape[i] == 0) {
actual_new_shape[i] = input.get_dim(i);
}
}
// Checks if the new shape is valid and fills in the missing dimension
// specified by -1.
// NOTE: At most one dimension can be -1.
auto total_size = input.get_nelems();
int size = 1;
int unknown_idx = -1;
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int i = 0; i < actual_new_shape.size(); ++i) {
const auto dim = actual_new_shape[i];
if (dim == -1) {
CAFFE_ENFORCE(
unknown_idx == -1,
"Argument `shape` has more than one missing dimension.");
unknown_idx = i;
} else {
size *= dim;
}
}
if (size == 0 && total_size != 0) {
CAFFE_THROW(
"Can not reshape a non-zero size (",
total_size,
") tensor to zero size.");
}
if (unknown_idx != -1) {
CAFFE_ENFORCE_NE(
size,
0,
"New shape at dim ",
unknown_idx,
" can not be inferred since new size is zero.");
CAFFE_ENFORCE(
total_size % size == 0,
"Argument `shape` does not agree with the input data.",
" (",
total_size,
" vs ",
size,
")");
actual_new_shape[unknown_idx] = total_size / size;
} else {
CAFFE_ENFORCE_EQ(
total_size,
size,
"Argument `shape` does not agree with the input data.",
" (",
total_size,
" != ",
size,
")");
}
// Write the original shape to the second output.
// shape info live on CPU
TensorCPU* old_shape = OperatorBase::Output<TensorCPU>(1, CPU);
old_shape->Resize(input.ndims());
int* old_shape_data = old_shape->template mutable_data<int>();
for (int i = 0; i < input.ndims(); ++i) {
old_shape_data[i] = input.get_dim(i);
}
auto* output = Output(0);
if (output != &input) {
// If we are not doing in-place computation, a copy is needed.
output->reinit_like(input);
ideep::direct_copy::compute(input, *output);
}
output->reshape(actual_new_shape);
return true;
}
private:
ideep::tensor::dims new_shape_;
};
REGISTER_IDEEP_OPERATOR(Reshape, IDEEPReshapeOp);
} // namespace
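A standalone sketch of the shape-inference rule implemented above: a 0 keeps the corresponding input dimension and a single -1 is inferred from the remaining element count. The helper is illustrative only and omits most of the error checks the operator enforces.
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<int64_t> InferReshape(const std::vector<int64_t>& in_dims,
                                  std::vector<int64_t> new_shape) {
  int64_t total = 1;
  for (auto d : in_dims) total *= d;
  int64_t known = 1;
  int unknown_idx = -1;
  for (std::size_t i = 0; i < new_shape.size(); ++i) {
    if (new_shape[i] == 0 && i < in_dims.size()) new_shape[i] = in_dims[i];
    if (new_shape[i] == -1) {
      if (unknown_idx != -1) throw std::runtime_error("more than one -1 in shape");
      unknown_idx = static_cast<int>(i);
    } else {
      known *= new_shape[i];
    }
  }
  if (unknown_idx != -1) new_shape[unknown_idx] = total / known;
  return new_shape;  // e.g. in_dims {2, 3, 4} with shape {0, -1} -> {2, 12}
}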


@ -1,70 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
// RecordShapeOp records the shape of the input tensor in a vector of int. You
// usually don't need this operator explicitly; it is mostly used in the
// autodiff process.
class IDEEPShapeOp : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPShapeOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
axes_(OperatorBase::GetRepeatedArgument<int>("axes")) {}
bool RunOnDevice() override {
int numDims = 0;
int numAxes = axes_.size();
vector<int64_t> dims;
const char* data_dims = nullptr;
auto* output = OperatorBase::Output<Tensor>(OUTPUT, CPU);
if (OperatorBase::InputBlob(DATA).template IsType<itensor>()) {
auto& data = Input(DATA);
numDims = data.ndims();
auto idims = data.get_dims();
dims.assign(idims.begin(), idims.end());
data_dims = reinterpret_cast<const char*>(dims.data());
} else {
auto& data = OperatorBase::Input<Tensor>(DATA, CPU);
numDims = data.dim();
data_dims = reinterpret_cast<const char*>(data.sizes().data());
}
if (numAxes == 0) {
output->Resize(numDims);
int64_t* output_data = output->template mutable_data<int64_t>();
context_.CopyBytesSameDevice(
numDims * sizeof(int64_t), data_dims, output_data);
return true;
}
output->Resize(numAxes);
auto out = reinterpret_cast<char*>(output->template mutable_data<int64_t>());
for (int i = 0; i < numAxes; i++) {
auto axis = axes_[i];
CAFFE_ENFORCE_LT(axis, numDims, "Axis out of range");
CAFFE_ENFORCE_GE(axis, 0, "Each axis should be non-negative");
context_.CopyBytesSameDevice(
sizeof(int64_t), data_dims + axis * sizeof(int64_t), out);
out += sizeof(int64_t);
}
return true;
}
private:
vector<int> axes_;
INPUT_TAGS(DATA);
OUTPUT_TAGS(OUTPUT);
};
REGISTER_IDEEP_OPERATOR(Shape, IDEEPShapeOp);
} // namespace


@ -1,64 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
class IDEEPSigmoidOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPSigmoidOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws) {
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPSigmoidOp() override {}
bool RunOnDevice() override {
const auto& X = Input(INPUT);
auto* Y = Output(OUTPUT);
ideep::eltwise_forward::compute(
X, *Y, ialgo::eltwise_logistic, iprop::forward_training);
return true;
}
private:
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT);
};
class IDEEPSigmoidGradientOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPSigmoidGradientOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws) {
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPSigmoidGradientOp() override {}
bool RunOnDevice() override {
const auto& Y = Input(OUTPUT);
const auto& dY = Input(OUTPUT_GRAD);
auto* dX = Output(INPUT_GRAD);
ideep::eltwise_backward::compute(Y, dY, *dX, ialgo::eltwise_logistic);
return true;
}
private:
INPUT_TAGS(OUTPUT, OUTPUT_GRAD);
OUTPUT_TAGS(INPUT_GRAD);
};
REGISTER_IDEEP_OPERATOR(Sigmoid, IDEEPSigmoidOp);
REGISTER_IDEEP_OPERATOR(SigmoidGradient, IDEEPSigmoidGradientOp);
} // namespace


@ -1,110 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
class IDEEPSpatialBNOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPSpatialBNOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
is_test_(OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)),
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
momentum_(OperatorBase::GetSingleArgument<float>("momentum", 0.9)) {
CAFFE_ENFORCE(
(is_test_ && OutputSize() > OUTPUT)
|| (!is_test_ && OutputSize() > SAVED_VAR));
CAFFE_ENFORCE_GT(epsilon_, 0);
CAFFE_ENFORCE_GE(momentum_, 0);
CAFFE_ENFORCE_LE(momentum_, 1);
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPSpatialBNOp() override {}
bool RunOnDevice() override {
const auto& X = Input(INPUT);
const auto& scale = Input(SCALE);
const auto& bias = Input(BIAS);
auto* Y = Output(OUTPUT);
TORCH_DCHECK_EQ(scale.ndims(), 1);
TORCH_DCHECK_EQ(bias.ndims(), 1);
TORCH_DCHECK_EQ(scale.get_dim(0), X.get_dim(1));
TORCH_DCHECK_EQ(bias.get_dim(0), X.get_dim(1));
if (is_test_) {
const auto& est_mean = Input(EST_MEAN);
const auto& est_var = Input(EST_VAR);
auto X_ = X.get_data_type() != idtype::f32 ? X.dequantize() : X;
ideep::batch_normalization_forward_inference::compute(
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
X_, est_mean, est_var, scale, bias, *Y, epsilon_);
} else {
auto* saved_mean = Output(SAVED_MEAN);
auto* saved_var = Output(SAVED_VAR);
auto* running_mean = Output(RUNNING_MEAN);
auto* running_var = Output(RUNNING_VAR);
ideep::batch_normalization_forward_training::compute(
X, scale, bias, *Y, *saved_mean, *saved_var,
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
*running_mean, *running_var, momentum_, epsilon_);
}
return true;
}
private:
bool is_test_;
double epsilon_;
double momentum_;
INPUT_TAGS(INPUT, SCALE, BIAS, EST_MEAN, EST_VAR);
OUTPUT_TAGS(OUTPUT, RUNNING_MEAN, RUNNING_VAR, SAVED_MEAN, SAVED_VAR);
};
class IDEEPSpatialBNGradientOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPSpatialBNGradientOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)) {
CAFFE_ENFORCE(InputSize() > SAVED_VAR);
CAFFE_ENFORCE(OutputSize() > BIAS_GRAD);
}
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPSpatialBNGradientOp() override {}
bool RunOnDevice() override {
const auto& X = Input(INPUT);
const auto& scale = Input(SCALE);
const auto& dY = Input(OUTPUT_GRAD);
const auto& saved_mean = Input(SAVED_MEAN);
const auto& saved_var = Input(SAVED_VAR);
auto* dX = Output(INPUT_GRAD);
auto* dscale = Output(SCALE_GRAD);
auto* dbias = Output(BIAS_GRAD);
ideep::batch_normalization_backward::compute(
X, saved_mean, saved_var, dY, scale,
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
*dX, *dscale, *dbias, epsilon_);
return true;
}
private:
double epsilon_;
INPUT_TAGS(INPUT, SCALE, OUTPUT_GRAD, SAVED_MEAN, SAVED_VAR);
OUTPUT_TAGS(INPUT_GRAD, SCALE_GRAD, BIAS_GRAD);
};
REGISTER_IDEEP_OPERATOR(SpatialBN, IDEEPSpatialBNOp);
REGISTER_IDEEP_OPERATOR(SpatialBNGradient, IDEEPSpatialBNGradientOp);
} // namespace
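For reference, the is_test_ branch above applies the standard per-channel batch-normalization inference transform; a one-element standalone sketch of that formula (illustrative, not the ideep implementation):
#include <cmath>

// y = scale[c] * (x - est_mean[c]) / sqrt(est_var[c] + epsilon) + bias[c]
float BatchNormInferenceOne(float x, float mean, float var,
                            float scale, float bias, float epsilon) {
  return scale * (x - mean) / std::sqrt(var + epsilon) + bias;
}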


@ -1,36 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
using namespace caffe2;
namespace {
class IDEEPTransposeOp final : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPTransposeOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws),
axes_(this->template GetRepeatedArgument<int>("axes")){ }
// NOLINTNEXTLINE(modernize-use-equals-default)
~IDEEPTransposeOp() override {}
bool RunOnDevice() override {
const auto& X = Input(INPUT);
auto* Y = Output(OUTPUT);
Y->transpose_from(X.to_public(nullptr, X.get_data_type()), axes_);
return true;
}
private:
std::vector<int> axes_;
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT);
};
REGISTER_IDEEP_OPERATOR(Transpose, IDEEPTransposeOp);
} // namespace


@ -1,133 +0,0 @@
#include "caffe2/operators/utility_ops.h"
#include "caffe2/core/operator.h"
#include "caffe2/ideep/ideep_utils.h"
using namespace caffe2;
namespace {
class CopyCPUToIDEEPOp final : public IDEEPOperator {
public:
USE_SIMPLE_IDEEP_CTOR_DTOR(CopyCPUToIDEEPOp);
USE_IDEEP_DEF_ALIASES();
bool RunOnDevice() override {
const auto& X = OperatorBase::Input<Tensor>(0, CPU);
auto* Y = OperatorBase::OutputBlob(0);
itensor::dims src_dims(X.sizes().begin(), X.sizes().end());
if (!(Y->template IsType<itensor>() &&
Y->Get<itensor>().get_data_type() == itensor::data_type::f32) ||
Y->Get<itensor>().get_dims() != src_dims) {
Y->Reset(new itensor());
Y->GetMutable<itensor>()->resize(src_dims, itensor::data_type::f32);
}
Y->GetMutable<itensor>()->feed_from(
src_dims, itensor::data_type::f32, X.raw_data());
return true;
}
};
class IDEEPCopyOp final : public IDEEPOperator {
public:
USE_SIMPLE_IDEEP_CTOR_DTOR(IDEEPCopyOp);
USE_IDEEP_DEF_ALIASES();
bool RunOnDevice() override {
const auto& X = OperatorBase::Input<itensor>(0);
auto* Y = Output(0);
if (X != *Y) {
Y->reinit_like(X);
ideep::direct_copy::compute(X, *Y);
}
return true;
}
};
class CopyIDEEPToCPUOp final : public IDEEPOperator {
public:
USE_SIMPLE_IDEEP_CTOR_DTOR(CopyIDEEPToCPUOp);
USE_IDEEP_DEF_ALIASES();
bool RunOnDevice() override {
const auto& input_blob = OperatorBase::InputBlob(0);
if (BlobIsTensorType(input_blob, CPU)) {
VLOG(2) << "Directing sharing of TensorCPU";
const auto& X = OperatorBase::Input<Tensor>(0, CPU);
OutputTensorCopyFrom(0, at::device(CPU), X);
} else {
const auto& X = OperatorBase::Input<itensor>(0);
if (X.get_data_type() == itensor::data_type::f32) {
std::vector<int64_t> dims;
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int i = 0; i < X.get_dims().size(); ++i) {
dims.push_back(X.get_dims()[i]);
}
auto* Y =
OperatorBase::OutputTensor(0, dims, at::dtype<float>().device(CPU));
itensor temp_ten(
X.get_desc().to_default_format(),
Y->template mutable_data<float>());
X.reorder_to(temp_ten);
} else {
CAFFE_THROW("Unsupported ideep type: ",
static_cast<int>(X.get_data_type()));
}
}
return true;
}
};
class IDEEPWeightedSumOp : public IDEEPOperator {
public:
USE_IDEEP_DEF_ALIASES();
USE_IDEEP_OPERATOR_FUNCTIONS();
IDEEPWeightedSumOp(const OperatorDef& operator_def, Workspace* ws)
: IDEEPOperator(operator_def, ws) {}
bool RunOnDevice() override {
CAFFE_ENFORCE_EQ(InputSize() % 2, 0);
auto ndims = Input(0).ndims();
auto nelems = Input(0).get_nelems();
auto w_nelems = Input(1).get_nelems();
CAFFE_ENFORCE_GT(nelems, 0);
CAFFE_ENFORCE_EQ(w_nelems, 1);
auto* output = Output(0);
std::vector<float> scales;
scales.reserve(InputSize() / 2);
std::vector<itensor> inputs;
inputs.reserve(InputSize() / 2);
for (int i = 0; i < InputSize(); i += 2) {
auto& X = Input(i);
CAFFE_ENFORCE(X.ndims() == ndims);
CAFFE_ENFORCE(X.get_nelems() == nelems);
CAFFE_ENFORCE(Input(i + 1).get_nelems() == w_nelems);
inputs.push_back(X);
auto scale = static_cast<float *>(Input(i + 1).get_data_handle());
scales.push_back(scale[0]);
}
ideep::sum::compute(scales, inputs, *output);
return true;
}
};
REGISTER_IDEEP_OPERATOR(CopyCPUToIDEEP, CopyCPUToIDEEPOp);
REGISTER_IDEEP_OPERATOR(CopyIDEEPToCPU, CopyIDEEPToCPUOp);
REGISTER_IDEEP_OPERATOR(Copy, IDEEPCopyOp);
REGISTER_IDEEP_OPERATOR(WeightedSum, IDEEPWeightedSumOp);
// NOLINTNEXTLINE(clang-diagnostic-unused-function,cppcoreguidelines-avoid-non-const-global-variables)
OPERATOR_SCHEMA(CopyCPUToIDEEP)
.NumInputs(1)
.NumOutputs(1)
.Input(0, "cpu_blob", "The input TensorCPU to copy")
.Output(0, "ideep_blob", "The output IDEEP tensort to copy to");
// NOLINTNEXTLINE(clang-diagnostic-unused-function,cppcoreguidelines-avoid-non-const-global-variables)
OPERATOR_SCHEMA(CopyIDEEPToCPU)
.NumInputs(1)
.NumOutputs(1)
.Input(0, "ideep_blob", "The input IDEEP tensort to copy")
.Output(0, "cpu_blob", "The output TensorCPU to copy to");
} // namespace
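The ideep::sum::compute call in IDEEPWeightedSumOp above receives one scalar weight per input tensor; a standalone sketch of the resulting arithmetic (illustrative helper, not part of the codebase):
#include <cstddef>
#include <vector>

// Output[j] = sum over i of weight_i * input_i[j], matching the
// (tensor, weight) pairing of the operator's inputs.
std::vector<float> WeightedSum(const std::vector<std::vector<float>>& inputs,
                               const std::vector<float>& weights) {
  std::vector<float> out(inputs.empty() ? 0 : inputs[0].size(), 0.0f);
  for (std::size_t i = 0; i < inputs.size(); ++i) {
    for (std::size_t j = 0; j < out.size(); ++j) {
      out[j] += weights[i] * inputs[i][j];
    }
  }
  return out;
}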


@ -1,171 +0,0 @@
#pragma once
#include <cstdlib>
#include <ctime>
#include <random>
#include <caffe2/core/context.h>
namespace caffe2 {
class IDEEPContext final : public BaseContext {
public:
typedef std::mt19937 rand_gen_type;
IDEEPContext() : random_seed_(RandomNumberSeed()) {}
explicit IDEEPContext(const DeviceOption& option)
: random_seed_(
option.has_random_seed() ? option.random_seed()
: RandomNumberSeed()) {
CAFFE_ENFORCE_EQ(option.device_type(), PROTO_IDEEP);
}
explicit IDEEPContext(const at::Device& device)
: IDEEPContext(DeviceToOption(device)) {}
~IDEEPContext() noexcept override {}
inline void SwitchToDevice(int64_t /*stream_id*/) override {}
using BaseContext::SwitchToDevice;
inline void WaitEvent(const Event& ev) override {
ev.Wait(IDEEP, this);
}
inline void Record(Event* ev, const char* err_msg = nullptr) const override {
CAFFE_ENFORCE(ev, "Event must not be null.");
ev->Record(IDEEP, this, err_msg);
}
inline void FinishDeviceComputation() override {}
inline rand_gen_type& RandGenerator() {
if (!random_generator_.get()) {
random_generator_.reset(new rand_gen_type(random_seed_));
}
return *random_generator_.get();
}
inline static at::DataPtr New(size_t nbytes) {
return GetAllocator(CPU)->allocate(nbytes);
}
void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
if (nbytes == 0) {
return;
}
CAFFE_ENFORCE(src);
CAFFE_ENFORCE(dst);
memcpy(dst, src, nbytes);
}
void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
CopyBytesSameDevice(nbytes, src, dst);
}
void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
CopyBytesSameDevice(nbytes, src, dst);
}
bool SupportsNonFundamentalTypes() const override {
// IDEEP meta copy is OK
return true;
}
// Two copy functions that deal with cross-device copies.
template <class SrcContext, class DstContext>
inline void CopyBytes(size_t nbytes, const void* src, void* dst);
template <typename T, class SrcContext, class DstContext>
inline void Copy(size_t n, const T* src, T* dst) {
if (c10::guts::is_fundamental<T>::value) {
CopyBytes<SrcContext, DstContext>(
n * sizeof(T),
static_cast<const void*>(src),
static_cast<void*>(dst));
} else {
for (const auto i : c10::irange(n)) {
dst[i] = src[i];
}
}
}
template <class SrcContext, class DstContext>
inline void
CopyItems(const TypeMeta meta, size_t n, const void* src, void* dst) {
if (meta.copy()) {
meta.copy()(src, dst, n);
} else {
CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
}
}
static bool HasAsyncPartDefault() {
return false;
}
static bool SupportsAsyncScheduling() {
return false;
}
static bool IsStreamFree(const DeviceOption& /* unused */, int /* unused */) {
return true;
}
at::Device device() const override {
return at::Device(IDEEP);
}
DeviceType device_type() const override {
return IDEEP;
}
static constexpr DeviceType GetDeviceType() {
return IDEEP;
}
protected:
// TODO(jiayq): instead of hard-coding a generator, make it more flexible.
int random_seed_{1701};
std::unique_ptr<rand_gen_type> random_generator_;
};
template <>
inline void IDEEPContext::CopyBytes<IDEEPContext, IDEEPContext>(
size_t nbytes,
const void* src,
void* dst) {
if (nbytes == 0) {
return;
}
CAFFE_ENFORCE(src);
CAFFE_ENFORCE(dst);
memcpy(dst, src, nbytes);
}
template <>
inline void IDEEPContext::CopyBytes<CPUContext, IDEEPContext>(
size_t nbytes,
const void* src,
void* dst) {
if (nbytes == 0) {
return;
}
CAFFE_ENFORCE(src);
CAFFE_ENFORCE(dst);
memcpy(dst, src, nbytes);
}
template <>
inline void IDEEPContext::CopyBytes<IDEEPContext, CPUContext>(
size_t nbytes,
const void* src,
void* dst) {
if (nbytes == 0) {
return;
}
CAFFE_ENFORCE(src);
CAFFE_ENFORCE(dst);
memcpy(dst, src, nbytes);
}
} // namespace caffe2
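The Copy template above chooses between a raw memcpy and element-wise assignment depending on whether the element type is fundamental; a standalone sketch of that dispatch (illustrative only):
#include <cstddef>
#include <cstring>
#include <type_traits>

// memcpy for fundamental element types, per-element assignment otherwise,
// so element types with nontrivial copy semantics stay valid.
template <typename T>
void CopyElements(std::size_t n, const T* src, T* dst) {
  if (std::is_fundamental<T>::value) {
    std::memcpy(dst, src, n * sizeof(T));
  } else {
    for (std::size_t i = 0; i < n; ++i) {
      dst[i] = src[i];
    }
  }
}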


@ -1,150 +0,0 @@
#pragma once
#include <ideep.hpp>
#include <caffe2/core/operator.h>
#include <caffe2/proto/caffe2_pb.h>
namespace caffe2 {
C10_DECLARE_REGISTRY(
IDEEPOperatorRegistry,
OperatorBase,
const OperatorDef&,
Workspace*);
#define REGISTER_IDEEP_OPERATOR_CREATOR(key, ...) \
C10_REGISTER_CREATOR(IDEEPOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_IDEEP_OPERATOR(name, ...) \
C10_REGISTER_CLASS(IDEEPOperatorRegistry, name, __VA_ARGS__)
#define REGISTER_IDEEP_OPERATOR_WITH_ENGINE(name, engine, ...) \
C10_REGISTER_CLASS(IDEEPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__)
#define REGISTER_IDEEP_OPERATOR_STR(str_name, ...) \
C10_REGISTER_TYPED_CLASS(IDEEPOperatorRegistry, str_name, __VA_ARGS__)
#define REGISTER_IDEEP_COMPARE_OPERATOR(Op) \
REGISTER_IDEEP_OPERATOR( \
Op, \
IDEEPFallbackOp<BinaryElementwiseOp< \
TensorTypes<bool, int32_t, int64_t, float, double>, \
CPUContext, \
Op##Functor<CPUContext>, \
FixedType<bool>>>)
// IDEEPOperator is the base scaffolding of the operators that uses IDEEP. It
// provides a few operators that are useful to IDEEP specific implementations.
class IDEEPOperator : public OperatorBase {
public:
explicit IDEEPOperator(const OperatorDef& operator_def, Workspace* ws)
: OperatorBase(operator_def, ws),
context_(operator_def.device_option()),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
}
~IDEEPOperator() override {}
inline const ideep::tensor& Input(int index) {
return OperatorBase::template Input<ideep::tensor>(index);
}
inline ideep::tensor* Output(int index) {
return OperatorBase::template Output<ideep::tensor>(index);
}
// The run function of Operator switches to the device, and then carries out
// the actual computation with RunOnDevice(). You should implement RunOnDevice
// instead of Run().
bool Run(int /* unused */ /*stream_id*/) final {
// Since IDEEP does not need to do SwitchToDevice and
// FinishDeviceComputation, this is always just a re-route to RunOnDevice().
try {
StartAllObservers();
bool result = RunOnDevice();
StopAllObservers();
return result;
} catch (EnforceNotMet& err) {
TORCH_RETHROW(err, getErrorMsg());
} catch (ideep::error& e) {
LOG(ERROR) << "IDEEP error:" << e.message;
throw;
}
}
// Waits for a previous event. Note that to properly wait and run
// asynchronously, WaitEvent, RunAsync and Record should all be executed
// on the same CPU thread.
void WaitEvent(const Event& ev, int /* unused */) final {
context_.WaitEvent(ev);
}
void WaitEvents(const std::vector<const Event*>& events, int /* unused */)
final {
for (const auto& ev : events) {
context_.WaitEvent(*ev);
}
}
void RecordEvent(const char* err_msg = nullptr) final {
if (event_) {
context_.Record(event_.get(), err_msg);
}
}
virtual bool RunOnDevice() = 0;
protected:
std::string getErrorMsg() {
if (has_debug_def()) {
return "Error from operator: " + ProtoDebugString(debug_def());
} else {
return "Error from operator: no op def";
}
}
IDEEPContext context_;
StorageOrder order_;
};
#define USE_IDEEP_OPERATOR_FUNCTIONS() \
USE_OPERATOR_BASE_FUNCTIONS; \
/* using override */ using IDEEPOperator::Input; \
/* using override */ using IDEEPOperator::Output; \
/* using override */ using IDEEPOperator::order_; \
/* using override */ using IDEEPOperator::context_;
#define USE_SIMPLE_IDEEP_CTOR_DTOR(name) \
name(const OperatorDef& operator_def, Workspace* ws) \
: IDEEPOperator(operator_def, ws) {} \
~name() override {}
// Convert zero_point scales to min_max scales
// NOTE:
// The scales in operator is saved in FBGEMM format,
// while FBGEMM scales are the reciprocals of MKL-DNN scales.
// This function is provided to convert scales from FBGEMM to MKL-DNN
inline ideep::scale_t ConvertScales(
const std::vector<float> scales_z) {
ideep::scale_t scales (scales_z);
for (auto it = scales.begin(); it != scales.end(); it++) {
*it = 1.0f / *it;
}
return scales;
}
inline ideep::tensor::dims CanonicalDims(
ideep::tensor::dims adims, int32_t axis) {
CAFFE_ENFORCE(axis < (int32_t)adims.size(), "Invalid axis!");
CAFFE_ENFORCE(axis > (int32_t)-adims.size(), "Invalid axis!");
if (adims.size() == 2 || axis == 1)
return adims;
if (axis < 0) {
axis += (int32_t)adims.size();
}
auto dim0 = std::accumulate(adims.begin(), adims.begin() + axis, 1,
std::multiplies<ideep::tensor::dim_t>());
auto dim1 = std::accumulate(adims.begin() + axis, adims.end(), 1,
std::multiplies<ideep::tensor::dim_t>());
return ideep::tensor::dims({dim0, dim1});
}
} // namespace caffe2
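Two small standalone illustrations of the helpers defined above (the names and the plain std::vector are illustrative only): ConvertScales turns FBGEMM scales into their MKL-DNN reciprocals, and CanonicalDims collapses an N-D shape into a 2-D one around an axis (for axes other than 1; 2-D shapes and axis 1 pass through unchanged).
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// FBGEMM stores the quantization scale directly; MKL-DNN expects 1/scale.
std::vector<float> FbgemmToMkldnnScales(const std::vector<float>& fbgemm_scales) {
  std::vector<float> out(fbgemm_scales);
  for (auto& s : out) {
    s = 1.0f / s;  // e.g. a Y_scale of 0.05 becomes an MKL-DNN scale of 20
  }
  return out;
}

// Collapse dims before `axis` into the row count and the rest into the
// column count, e.g. {8, 3, 4, 4} with axis 2 -> {24, 16}.
std::vector<int64_t> FlattenAtAxis(const std::vector<int64_t>& dims, int axis) {
  int64_t rows = std::accumulate(dims.begin(), dims.begin() + axis,
                                 int64_t{1}, std::multiplies<int64_t>());
  int64_t cols = std::accumulate(dims.begin() + axis, dims.end(),
                                 int64_t{1}, std::multiplies<int64_t>());
  return {rows, cols};
}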


@ -1,63 +0,0 @@
#include <caffe2/core/event_cpu.h>
#include <caffe2/core/operator.h>
#include <caffe2/proto/caffe2_pb.h>
#include <ideep/tensor.hpp>
#include "ideep_context.h"
namespace at {
REGISTER_CONTEXT(DeviceType::IDEEP, caffe2::IDEEPContext);
namespace {
void CopyBytesWrapper(
size_t nbytes,
const void* src,
Device src_device,
void* dst,
Device dst_device) {
if (nbytes == 0) {
return;
}
CAFFE_ENFORCE(src);
CAFFE_ENFORCE(dst);
memcpy(dst, src, nbytes);
}
} // namespace
REGISTER_COPY_BYTES_FUNCTION(
DeviceType::IDEEP,
DeviceType::CPU,
CopyBytesWrapper);
REGISTER_COPY_BYTES_FUNCTION(
DeviceType::CPU,
DeviceType::IDEEP,
CopyBytesWrapper);
REGISTER_COPY_BYTES_FUNCTION(
DeviceType::IDEEP,
DeviceType::IDEEP,
CopyBytesWrapper);
} // namespace at
namespace caffe2 {
CAFFE_KNOWN_TYPE(ideep::tensor);
C10_DEFINE_REGISTRY(
IDEEPOperatorRegistry,
OperatorBase,
const OperatorDef&,
Workspace*);
CAFFE_REGISTER_DEVICE_TYPE(DeviceType::IDEEP, IDEEPOperatorRegistry);
REGISTER_EVENT_CREATE_FUNCTION(IDEEP, EventCreateCPU);
REGISTER_EVENT_RECORD_FUNCTION(IDEEP, EventRecordCPU);
REGISTER_EVENT_WAIT_FUNCTION(IDEEP, IDEEP, EventWaitCPUCPU);
REGISTER_EVENT_WAIT_FUNCTION(IDEEP, CPU, EventWaitCPUCPU);
REGISTER_EVENT_WAIT_FUNCTION(CPU, IDEEP, EventWaitCPUCPU);
REGISTER_EVENT_FINISH_FUNCTION(IDEEP, EventFinishCPU);
REGISTER_EVENT_QUERY_FUNCTION(IDEEP, EventQueryCPU);
REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(IDEEP, EventErrorMessageCPU);
REGISTER_EVENT_SET_FINISHED_FUNCTION(IDEEP, EventSetFinishedCPU);
REGISTER_EVENT_RESET_FUNCTION(IDEEP, EventResetCPU);
} // namespace caffe2


@ -1,11 +0,0 @@
add_subdirectory(contrib)
# CPU source, test sources, binary sources
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
# GPU source, test sources, binary sources
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)


@ -1,15 +0,0 @@
add_subdirectory(ios)
if(USE_NNAPI)
add_subdirectory(nnapi)
endif()
# CPU source, test sources, binary sources
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
# GPU source, test sources, binary sources
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)


@ -1,17 +0,0 @@
# TODO: figure out conflict between contrib/nnpack/nnpack_ops.cc and mobile_nnpack.cc
if(IOS)
# Basic ios srcs.
set(Caffe2_CONTRIB_IOS_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/ios_caffe.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/ios_caffe_predictor.cc"
# "${CMAKE_CURRENT_SOURCE_DIR}/mobile_nnpack.cc"
)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_IOS_SRC})
if(USE_METAL)
# metal/mpscnn files
add_subdirectory(mpscnn)
endif()
endif()
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)


@ -1,52 +0,0 @@
#include "ios_caffe.h"
#include "caffe2/core/tensor.h"
#include "caffe2/mobile/contrib/ios/ios_caffe_predictor.h"
#include "caffe2/predictor/predictor.h"
Caffe2IOSPredictor* MakeCaffe2Predictor(const std::string& init_net_str,
const std::string& predict_net_str,
bool disableMultithreadProcessing,
bool allowMetalOperators,
std::string& errorMessage) {
caffe2::NetDef init_net, predict_net;
init_net.ParseFromString(init_net_str);
predict_net.ParseFromString(predict_net_str);
Caffe2IOSPredictor* predictor = NULL;
try {
predictor = Caffe2IOSPredictor::NewCaffe2IOSPredictor(
init_net, predict_net, disableMultithreadProcessing, allowMetalOperators);
} catch (const std::exception& e) {
std::string error = e.what();
errorMessage.swap(error);
return NULL;
}
return predictor;
}
void GenerateStylizedImage(std::vector<float>& originalImage,
const std::string& init_net_str,
const std::string& predict_net_str,
int height,
int width,
std::vector<float>& dataOut) {
caffe2::NetDef init_net, predict_net;
init_net.ParseFromString(init_net_str);
predict_net.ParseFromString(predict_net_str);
caffe2::Predictor p(init_net, predict_net);
std::vector<int> dims({1, 3, height, width});
caffe2::Tensor input(caffe2::CPU);
input.Resize(dims);
input.ShareExternalPointer(originalImage.data());
caffe2::Predictor::TensorList input_vec;
input_vec.emplace_back(std::move(input));
caffe2::Predictor::TensorList output_vec;
p(input_vec, &output_vec);
assert(output_vec.size() == 1);
caffe2::TensorCPU* output = &output_vec.front();
// output is our styled image
float* outputArray = output->mutable_data<float>();
dataOut.assign(outputArray, outputArray + output->size());
}


@ -1,25 +0,0 @@
#ifdef __cplusplus
#include <string>
#include <vector>
#include "caffe2/mobile/contrib/ios/ios_caffe_defines.h"
#include "caffe2/mobile/contrib/ios/ios_caffe_predictor.h"
#include "caffe2/predictor/predictor.h"
extern "C" {
IOS_CAFFE_EXPORT Caffe2IOSPredictor* MakeCaffe2Predictor(const std::string& init_net_str,
const std::string& predict_net_str,
bool disableMultithreadProcessing,
bool allowMetalOperators,
std::string& errorMessage);
IOS_CAFFE_EXPORT void GenerateStylizedImage(std::vector<float>& originalImage,
const std::string& init_net_str,
const std::string& predict_net_str,
int height,
int width,
std::vector<float>& dataOut);
}
#endif


@ -1,2 +0,0 @@
#define IOS_CAFFE_EXPORT __attribute__((visibility("default")))


@ -1,68 +0,0 @@
#include "caffe2/mobile/contrib/ios/ios_caffe_predictor.h"
#include "caffe2/core/flags.h"
#include "caffe2/core/tensor.h"
#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
#include "caffe2/mobile/contrib/ios/mpscnn/mpscnn.h"
#endif
C10_DECLARE_bool(caffe2_force_shared_col_buffer);
Caffe2IOSPredictor* Caffe2IOSPredictor::NewCaffe2IOSPredictor(const caffe2::NetDef& init_net,
const caffe2::NetDef& predict_net,
bool disableMultithreadProcessing,
bool allowMetalOperators) {
caffe2::NetDef metal_predict_net;
bool usingMetalOperators = false;
#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
if (allowMetalOperators) {
caffe2::dumpDef(predict_net);
if (caffe2::tryConvertToMPSCNN(init_net, predict_net, &metal_predict_net)) {
LOG(INFO) << "Successfully converted to MPSCNN";
caffe2::dumpDef(metal_predict_net);
usingMetalOperators = true;
} else {
LOG(ERROR) << "Failed converting model to MPSCNN";
}
}
#endif
return new Caffe2IOSPredictor(init_net,
usingMetalOperators ? metal_predict_net : predict_net,
disableMultithreadProcessing,
usingMetalOperators);
}
Caffe2IOSPredictor::Caffe2IOSPredictor(const caffe2::NetDef& init_net,
const caffe2::NetDef& predict_net,
bool disableMultithreadProcessing,
bool usingMetalOperators)
: usingMetalOperators(usingMetalOperators), predictor_(init_net, predict_net) {
#ifdef C10_MOBILE
if (disableMultithreadProcessing) {
caffe2::ThreadPool* threadpool = predictor_.ws()->GetThreadPool();
if (threadpool != nullptr) {
threadpool->setMinWorkSize(std::numeric_limits<size_t>::max());
}
}
#endif
}
void Caffe2IOSPredictor::run(const Tensor& inData, Tensor& outData, std::string& errorMessage) {
FLAGS_caffe2_force_shared_col_buffer = true;
caffe2::Tensor input = caffe2::empty(inData.dims, at::dtype<uint8_t>().device(caffe2::CPU));
input.ShareExternalPointer(inData.data);
caffe2::Predictor::TensorList input_vec;
input_vec.emplace_back(std::move(input));
caffe2::Predictor::TensorList output_vec;
try {
predictor_(input_vec, &output_vec);
} catch (const std::exception& e) {
std::string error = e.what();
errorMessage.swap(error);
return;
}
caffe2::Tensor* output = &output_vec.front();
outData.data = output->mutable_data<uint8_t>();
outData.dims = output->sizes().vec();
}


@ -1,36 +0,0 @@
#pragma once
#include <string>
#include "caffe2/core/net.h"
#include "caffe2/mobile/contrib/ios/ios_caffe_defines.h"
#include "caffe2/predictor/predictor.h"
struct Tensor {
std::vector<int64_t> dims;
uint8_t* data;
};
class IOS_CAFFE_EXPORT Caffe2IOSPredictor final {
public:
/**
@allowMetalOperators Allow converting eligible operators to Metal GPU framework accelerated
operators. Setting this flag to true doesn't guarantee the predictor will be using Metal
operators; client code must check the usingMetalOperators flag to determine whether the
predictor is actually using them.
*/
static Caffe2IOSPredictor* NewCaffe2IOSPredictor(const caffe2::NetDef& init_net,
const caffe2::NetDef& predict_net,
bool disableMultithreadProcessing,
bool allowMetalOperators);
void run(const Tensor& inData, Tensor& outData, std::string& errorMessage);
~Caffe2IOSPredictor(){};
const bool usingMetalOperators;
private:
Caffe2IOSPredictor(const caffe2::NetDef& init_net,
const caffe2::NetDef& predict_net,
bool disableMultithreadProcessing,
bool usingMetalOperators);
caffe2::Predictor predictor_;
};


@ -1,7 +0,0 @@
if(USE_METAL)
file(GLOB_RECURSE tmp *.mm *.cc)
# exclude test files
file(GLOB_RECURSE test_files *_test.cc)
exclude(tmp "${tmp}" ${test_files})
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
endif()

File diff suppressed because it is too large.


@ -1,23 +0,0 @@
#pragma once
#include "caffe2/core/net.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
static constexpr const char* kMPSCNNReadCountArg = "__mpscnn_read_count__";
static constexpr const char* kMPSCNNOutputIsTempImageArg = "__mpscnn_output_is_temp_img__";
static constexpr const int kMetalMaxTextureArrLength = 2048;
// We currently only try to convert a fixed set of operators that handle a subset of a full
// CNN. We also only run the conversion when MPSCNN is available and provides a speedup.
// On failure, returns false. On success, returns true, and sets the MPSCNN net in the output
// parameter.
bool tryConvertToMPSCNN(const NetDef& initNet, const NetDef& predictNet, NetDef* mpscnnPredictNet);
// Exposed for testing.
NetDef annotateDefWithReadCounts(const NetDef& net);
NetDef rewriteForMetal(const NetDef& net);
NetDef runMPSCNNFusion(const NetDef& net);
void dumpDef(const NetDef& d);
void mpscnnRecordExecutionFinish();
} // namespace caffe2

File diff suppressed because it is too large.

Some files were not shown because too many files have changed in this diff.