Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-02 14:34:54 +08:00)
Compare commits
82 Commits
mlazos/bac...mlazos/mor
| SHA1 |
|---|
| 26004dc2e5 |
| 7fb495730a |
| 595c67e8ea |
| a5c93a6899 |
| 4457cd9a30 |
| 31946c10d0 |
| ee804d256b |
| 45628e3b66 |
| b08072f645 |
| c61bfd24c1 |
| 538877d204 |
| aca0807101 |
| 9e85d3d830 |
| 4dad988822 |
| 0e853327cb |
| c520929c83 |
| 59f2e716cc |
| 10c17b13d7 |
| bef7d650c4 |
| ff98731803 |
| f25c7c9699 |
| f42ea14c3f |
| d7fe3c4123 |
| 4996a3fda3 |
| 53a64e446f |
| 5f58cf65d1 |
| cc4da72b47 |
| 31372fa842 |
| 946b96fd54 |
| f87fbfdb01 |
| d81db9c1df |
| 2ed17e0b1e |
| 013722bcb8 |
| fcbf2b61e6 |
| 8be4104cf3 |
| d14d6127f6 |
| 2ad794550a |
| d19d932183 |
| 978b572652 |
| c9a258e474 |
| fd816bf630 |
| 3267814d53 |
| 13545fe68a |
| 23e71ffd82 |
| 350a3ed82f |
| 477612c0f6 |
| 52fad83335 |
| 37d2ecd123 |
| e43d656921 |
| a08be4b705 |
| 6a8b1da18d |
| eaaf0f3299 |
| 20271f0a3b |
| ae5e2ab92e |
| 0d4fdb0bb7 |
| 966ebd2e24 |
| ee00349780 |
| a7575e8bd5 |
| 7c00635125 |
| 1ecea513b6 |
| 6fd745255e |
| 74a0ef8f8c |
| ed8a560845 |
| 85447c41e3 |
| 0e419b9146 |
| 98821b3d92 |
| ae20f15941 |
| 6ea226b99c |
| 8fb3ff2a4e |
| 26b942c4fc |
| 257d40ba2e |
| 3ccf107f01 |
| 0241ed9331 |
| 7e86a7c015 |
| b8a706a321 |
| 4e29e80bf0 |
| b9588101c4 |
| c337395cdb |
| d83ab88f81 |
| 96c8447001 |
| da2f4bbc33 |
| e5766f02d0 |
@@ -310,3 +310,5 @@ lxml==5.0.0.
#Description: This is a requirement of unittest-xml-reporting

# Python-3.9 binaries

PyGithub==2.3.0

1 .github/actionlint.yaml (vendored)
@@ -21,6 +21,7 @@ self-hosted-runner:
- linux.rocm.gpu
- macos-m1-stable
- macos-m1-13
- macos-m1-14
- macos-12-xl
- macos-12
- macos12.3-m1
99 .github/scripts/get_workflow_type.py (vendored, new file)
@@ -0,0 +1,99 @@
import json
from argparse import ArgumentParser
from typing import Any

from github import Auth, Github
from github.Issue import Issue


WORKFLOW_TYPE_LABEL = "label"
WORKFLOW_TYPE_RG = "rg"
WORKFLOW_TYPE_BOTH = "both"


def parse_args() -> Any:
    parser = ArgumentParser("Get dynamic rollout settings")
    parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
    parser.add_argument(
        "--github-repo",
        type=str,
        required=False,
        default="pytorch/test-infra",
        help="GitHub repo to get the issue",
    )
    parser.add_argument(
        "--github-issue", type=int, required=True, help="GitHub issue number"
    )
    parser.add_argument(
        "--github-user", type=str, required=True, help="GitHub username"
    )
    parser.add_argument(
        "--github-branch", type=str, required=True, help="Current GitHub branch"
    )

    return parser.parse_args()


def get_gh_client(github_token: str) -> Github:
    auth = Auth.Token(github_token)
    return Github(auth=auth)


def get_issue(gh: Github, repo: str, issue_num: int) -> Issue:
    repo = gh.get_repo(repo)
    return repo.get_issue(number=issue_num)


def is_exception_branch(branch: str) -> bool:
    return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}


def get_workflow_type(issue: Issue, username: str) -> str:
    user_list = issue.get_comments()[0].body.split("\r\n")
    try:
        run_option = issue.get_comments()[1].body.split("\r\n")[0]
    except Exception as e:
        run_option = "single"

    if user_list[0] == "!":
        # Use old runners for everyone
        return WORKFLOW_TYPE_LABEL
    elif user_list[1] == "*":
        if run_option == WORKFLOW_TYPE_BOTH:
            # Use ARC runners and old runners for everyone
            return WORKFLOW_TYPE_BOTH
        else:
            # Use only ARC runners for everyone
            return WORKFLOW_TYPE_RG
    elif username in user_list:
        if run_option == WORKFLOW_TYPE_BOTH:
            # Use ARC runners and old runners for a specific user
            return WORKFLOW_TYPE_BOTH
        else:
            # Use only ARC runners for a specific user
            return WORKFLOW_TYPE_RG
    else:
        # Use old runners by default
        return WORKFLOW_TYPE_LABEL


def main() -> None:
    args = parse_args()

    if is_exception_branch(args.github_branch):
        output = {"workflow_type": WORKFLOW_TYPE_LABEL}
    else:
        try:
            gh = get_gh_client(args.github_token)
            issue = get_issue(gh, args.github_repo, args.github_issue)

            output = {"workflow_type": get_workflow_type(issue, args.github_user)}
        except Exception as e:
            output = {"workflow_type": WORKFLOW_TYPE_LABEL}

    json_output = json.dumps(output)
    print(json_output)


if __name__ == "__main__":
    main()
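To make the rollout rules above easier to scan, here is a minimal, self-contained Python sketch of the same decision table with the GitHub plumbing stripped out. The comment format it assumes (a user list in the rollout issue's first comment, an optional run option in the second) is inferred from the script; the helper name and example values are illustrative only.

```python
# Hypothetical re-implementation of get_workflow_type()'s decision table, for
# illustration only; it takes plain lists instead of GitHub issue comments.
def decide_workflow_type(user_list, run_option, username):
    if user_list and user_list[0] == "!":
        return "label"  # rollout disabled: old runners for everyone
    if len(user_list) > 1 and user_list[1] == "*":
        # everyone is opted in
        return "both" if run_option == "both" else "rg"
    if username in user_list:
        # this user is opted in
        return "both" if run_option == "both" else "rg"
    return "label"  # default: old runners


print(decide_workflow_type(["header", "alice", "bob"], "single", "alice"))  # rg
print(decide_workflow_type(["header", "alice", "bob"], "single", "carol"))  # label
print(decide_workflow_type(["!"], "single", "alice"))                       # label
```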
13 .github/workflows/_mac-test.yml (vendored)
@@ -24,11 +24,6 @@ on:
default: "3.8"
description: |
The python version to be used. Will be 3.8 by default
arch:
required: true
type: string
description: |
Contains the architecture to run the tests with
timeout-minutes:
required: false
type: number
@@ -44,7 +39,7 @@ jobs:
# Also ensure that we always run with the right architecture
defaults:
run:
shell: arch -arch ${{ inputs.arch }} bash -e -l {0}
shell: bash -e -l {0}
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
@@ -133,12 +128,6 @@ jobs:
test-matrix: ${{ inputs.test-matrix }}
job-name: ${{ steps.get-job-id.outputs.job-name }}

- name: Pre-process arm64 wheels
if: inputs.build-environment == 'macos-12-py3-arm64'
run: |
# As wheels are cross-compiled they are reported as x86_64 ones
ORIG_WHLNAME=$(ls -1 dist/*.whl); ARM_WHLNAME=${ORIG_WHLNAME/x86_64/arm64}; mv "${ORIG_WHLNAME}" "${ARM_WHLNAME}"

- name: Set Test step time
id: test-timeout
shell: bash
58 .github/workflows/_runner-determinator.yml (vendored, new file)
@@ -0,0 +1,58 @@
name: Check whether the workflow owner can use ARC runners

on:
  workflow_call:
    inputs:
      user_name:
        required: true
        type: string
        description: The name of the workflow owner.
      curr_branch:
        required: true
        type: string
        description: Current branch.
      issue_number:
        required: false
        type: string
        default: "5132"

    outputs:
      workflow-type:
        description: Type of runners to use
        value: ${{ jobs.runner-determinator.outputs.workflow-type }}

jobs:
  runner-determinator:
    runs-on: linux.4xlarge
    outputs:
      workflow-type: ${{ steps.set-condition.outputs.workflow-type }}
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      ISSUE_NUMBER: ${{ inputs.issue_number }}
      USERNAME: ${{ inputs.user_name }}
    steps:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          fetch-depth: 1
          submodules: true

      - name: Install dependencies
        run: python3 -m pip install urllib3==1.26.18 PyGithub==2.3.0

      - name: Get the workflow type for the current user
        id: set-condition
        run: |
          curr_branch="${{ inputs.curr_branch }}"
          echo "Current branch is '$curr_branch'"

          output="$(python3 .github/scripts/get_workflow_type.py \
            --github-token "$GITHUB_TOKEN" \
            --github-issue "$ISSUE_NUMBER" \
            --github-branch "$curr_branch" \
            --github-user "$USERNAME")"

          echo "Output: '${output}'"

          WORKFLOW_TYPE=$(echo "${output}" | jq -r '.workflow_type')
          echo "workflow-type=$WORKFLOW_TYPE" >> "$GITHUB_OUTPUT"
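The final step above pipes the script's JSON through jq to extract workflow_type. For anyone running the script outside of Actions, the equivalent parsing in Python is a one-liner; the invocation below is a hypothetical local example, with placeholder token and user values.

```python
import json
import subprocess

# Hypothetical local run of the rollout script; flags match the argparse
# definition above, values are placeholders.
raw = subprocess.check_output(
    [
        "python3", ".github/scripts/get_workflow_type.py",
        "--github-token", "<token>",
        "--github-issue", "5132",
        "--github-user", "someuser",
        "--github-branch", "gh/someuser/feature",
    ],
    text=True,
)
workflow_type = json.loads(raw)["workflow_type"]  # same as: jq -r '.workflow_type'
print(workflow_type)
```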
7 .github/workflows/docker-release.yml (vendored)
@@ -127,15 +127,10 @@ jobs:
run: |
make -f docker.Makefile "${BUILD_IMAGE_TYPE}-image"
- name: Push nightly tags
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' }}
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.build_platforms == 'linux/amd64' }}
run: |
PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime"
CUDA_SUFFIX="-cu${CUDA_VERSION}"
if [[ ${CUDA_VERSION_SHORT} == "cpu" ]]; then
PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-runtime"
CUDA_SUFFIX=""
fi

PYTORCH_NIGHTLY_COMMIT=$(docker run ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \
python -c 'import torch; print(torch.version.git_version[:7],end="")')
4 .github/workflows/lint.yml (vendored)
@@ -230,11 +230,11 @@ jobs:
with:
submodules: false
fetch-depth: 1
- name: Setup Python 3.5
- name: Setup Python 3.6
if: matrix.test_type == 'older_python_version'
uses: actions/setup-python@v4
with:
python-version: '3.5'
python-version: '3.6'
architecture: x64
check-latest: false
cache: pip

24 .github/workflows/mac-mps.yml (vendored)
@@ -13,33 +13,29 @@ concurrency:
permissions: read-all

jobs:
macos-12-py3-arm64-build:
name: macos-12-py3-arm64
macos-13-py3-arm64-build:
name: macos-13-py3-arm64
uses: ./.github/workflows/_mac-build.yml
with:
sync-tag: macos-12-py3-arm64-build
build-environment: macos-12-py3-arm64
sync-tag: macos-py3-arm64-build
build-environment: macos-13-py3-arm64
runner-type: macos-m1-stable
build-generates-artifacts: true
# To match the one pre-installed in the m1 runners
python-version: 3.9.12
# We need to set the environment file here instead of trying to detect it automatically because
# MacOS arm64 is cross-compiled from x86-64. Specifically, it means that arm64 conda environment
# is needed when building PyTorch MacOS arm64 from x86-64
environment-file: .github/requirements/conda-env-macOS-ARM64
test-matrix: |
{ include: [
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-14" },
]}

macos-12-py3-arm64-mps-test:
name: macos-12-py3-arm64-mps
macos-py3-arm64-mps-test:
name: macos-py3-arm64-mps
uses: ./.github/workflows/_mac-test-mps.yml
needs: macos-12-py3-arm64-build
needs: macos-13-py3-arm64-build
with:
sync-tag: macos-12-py3-arm64-mps-test
build-environment: macos-12-py3-arm64
sync-tag: macos-py3-arm64-mps-test
build-environment: macos-13-py3-arm64
# Same as the build job
python-version: 3.9.12
test-matrix: ${{ needs.macos-12-py3-arm64-build.outputs.test-matrix }}
test-matrix: ${{ needs.macos-13-py3-arm64-build.outputs.test-matrix }}
49 .github/workflows/trunk.yml (vendored)
@@ -34,18 +34,6 @@ jobs:
id-token: write
contents: read

# Build PyTorch with BUILD_CAFFE2=ON
caffe2-linux-jammy-py3_8-gcc11-build:
name: caffe2-linux-jammy-py3.8-gcc11
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: caffe2-linux-jammy-py3.8-gcc11
docker-image-name: pytorch-linux-jammy-py3.8-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}

linux-focal-cuda12_1-py3_10-gcc9-build:
name: linux-focal-cuda12.1-py3.10-gcc9
uses: ./.github/workflows/_linux-build.yml
@@ -106,20 +94,16 @@ jobs:
{ config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
]}

macos-12-py3-arm64-build:
name: macos-12-py3-arm64
macos-13-py3-arm64-build:
name: macos-13-py3-arm64
uses: ./.github/workflows/_mac-build.yml
with:
sync-tag: macos-12-py3-arm64-build
build-environment: macos-12-py3-arm64
sync-tag: macos-py3-arm64-build
build-environment: macos-13-py3-arm64
runner-type: macos-m1-stable
build-generates-artifacts: true
# To match the one pre-installed in the m1 runners
python-version: 3.9.12
# We need to set the environment file here instead of trying to detect it automatically because
# MacOS arm64 is cross-compiled from x86-64. Specifically, it means that arm64 conda environment
# is needed when building PyTorch MacOS arm64 from x86-64
environment-file: .github/requirements/conda-env-macOS-ARM64
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" },
@@ -127,33 +111,34 @@ jobs:
{ config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" },
]}

macos-12-py3-arm64-mps-test:
name: macos-12-py3-arm64-mps
macos-py3-arm64-mps-test:
name: macos-py3-arm64-mps
uses: ./.github/workflows/_mac-test-mps.yml
needs: macos-12-py3-arm64-build
if: needs.macos-12-py3-arm64-build.outputs.build-outcome == 'success'
needs: macos-13-py3-arm64-build
if: needs.macos-13-py3-arm64-build.outputs.build-outcome == 'success'
with:
sync-tag: macos-12-py3-arm64-mps-test
build-environment: macos-12-py3-arm64
sync-tag: macos-py3-arm64-mps-test
build-environment: macos-13-py3-arm64
# Same as the build job
python-version: 3.9.12
test-matrix: |
{ include: [
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },

]}

macos-12-py3-arm64-test:
name: macos-12-py3-arm64
macos-13-py3-arm64-test:
name: macos-13-py3-arm64
uses: ./.github/workflows/_mac-test.yml
needs:
- macos-12-py3-arm64-build
- macos-13-py3-arm64-build
- target-determination
with:
build-environment: macos-12-py3-arm64
build-environment: macos-13-py3-arm64
# Same as the build job
python-version: 3.9.12
test-matrix: ${{ needs.macos-12-py3-arm64-build.outputs.test-matrix }}
arch: arm64
test-matrix: ${{ needs.macos-13-py3-arm64-build.outputs.test-matrix }}

win-vs2019-cpu-py3-build:
name: win-vs2019-cpu-py3
32 .github/workflows/upload-test-stats.yml (vendored)
@@ -49,22 +49,6 @@ jobs:
- run: |
pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12

- name: Upload test stats
env:
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
WORKFLOW_RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
WORKFLOW_URL: ${{ github.event.workflow_run.html_url }}
HEAD_REPOSITORY: ${{ github.event.workflow_run.head_repository.full_name }}
HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
run: |
echo "${WORKFLOW_URL}"
python3 -m tools.stats.upload_test_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --head-branch "${HEAD_BRANCH}" --head-repository "${HEAD_REPOSITORY}"
python3 -m tools.stats.upload_sccache_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}"

- name: Upload test artifacts
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
@@ -81,6 +65,22 @@ jobs:
# anything on GitHub to upload. The command should return right away
python3 -m tools.stats.upload_artifacts --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}"

- name: Upload test stats
env:
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
WORKFLOW_RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
WORKFLOW_URL: ${{ github.event.workflow_run.html_url }}
HEAD_REPOSITORY: ${{ github.event.workflow_run.head_repository.full_name }}
HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
run: |
echo "${WORKFLOW_URL}"
python3 -m tools.stats.upload_test_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --head-branch "${HEAD_BRANCH}" --head-repository "${HEAD_REPOSITORY}"
python3 -m tools.stats.upload_sccache_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}"

- name: Analyze disabled tests rerun
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}

@@ -1114,8 +1114,6 @@ exclude_patterns = [
'test/test_optim.py',
'test/test_out_dtype_op.py',
'test/test_overrides.py',
'test/test_package.py',
'test/test_per_overload_api.py',
'test/test_prims.py',
'test/test_proxy_tensor.py',
'test/test_pruning_op.py',

@@ -65,8 +65,9 @@ ARG CUDA_VERSION=12.1
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch-nightly
# Automatically set by buildx
# Note conda needs to be pinned to 23.5.2 see: https://github.com/pytorch/pytorch/issues/106470
RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=23.5.2
RUN /opt/conda/bin/conda update -y -n base -c defaults conda
RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION}

ARG TARGETPLATFORM

# On arm64 we can only install wheel packages.

@@ -65,8 +65,8 @@ Following is the release cadence for year 2023/2024. All dates below are tentative
| --- | --- | --- | --- | --- |
| 2.1 | Aug 2023 | Oct 2023 | Nov 2023 | Dec 2023 |
| 2.2 | Dec 2023 | Jan 2024 | Feb 2024 | Mar 2024 |
| 2.3 | Mar 2024 | Apr 2024 | May 2024 | Jun 2024 |
| 2.4 | May 2024 | Jul 2024 | Aug 2024 | Sep 2024 |
| 2.3 | Mar 2024 | Apr 2024 | Jun 2024 | Not planned |
| 2.4 | Jun 2024 | Jul 2024 | Aug 2024 | Sep 2024 |
| 2.5 | Aug 2024 | Oct 2024 | Nov 2024 | Dec 2024 |

## General Overview
@@ -97,7 +97,16 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
/*resizable=*/true
),
base_(base)
{
// SparseTensorImpl has no storage, so we cannot query its nbytes.
// (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse)
// Same for XLA
if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) {
original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
} else {
original_storage_size_ = -1;
}
curr_storage_size_ = original_storage_size_;
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
}

@@ -105,6 +105,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
frozen_ = true;
}

c10::SymInt get_storage_size(bool before) {
if (before) {
return original_storage_size_;
} else {
return curr_storage_size_;
}
}

~FunctionalStorageImpl() override = default;

void mark_mutation() {
@@ -132,6 +140,15 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
return mutation_counter_ <= mutation_counter_hidden_from_autograd_;
}

void mark_inductor_storage_resize(c10::SymInt new_size) {
inductor_storage_resized_ = true;
curr_storage_size_ = new_size;
}

bool was_inductor_storage_resized() {
return inductor_storage_resized_;
}

private:
// NB: base_ should always point to a tensor BELOW the current
// functionalization layer. This is mainly to avoid reference cycles. e.g.
@@ -172,6 +189,13 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
uint64_t mutation_counter_during_no_grad_or_inference_mode_ = 0;
uint64_t mutation_counter_ = 0;
uint64_t mutation_counter_hidden_from_autograd_ = 0;

// Used to tell if:
// (1) There were any storage resizes on a graph input
// (2) The original/curr storage size tell us if these resizes result in a nop
bool inductor_storage_resized_ = false;
c10::SymInt original_storage_size_;
c10::SymInt curr_storage_size_;
};

} // namespace at::functionalization
@@ -276,6 +276,32 @@ void FunctionalTensorWrapper::set__impl(const FunctionalTensorWrapper* other) {
set_sizes_and_strides(sizes_, strides_, storage_offset_);
}

void FunctionalTensorWrapper::storage_resize_(c10::SymInt new_size) {
auto curr_storage_size = value_.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
// storage resizing is severely limited: we only support resizing either to zero, or from zero bytes.
TORCH_CHECK(new_size == 0 || curr_storage_size == 0, "new_size: ", new_size, ". curr_storage_size: ", curr_storage_size);
// The "functionalization rule" for storage resizing is a giant no-op, mainly because we don't want
// resize_() calls to actually emit any ops in the functional graph.
// How does it work?
// Resizing up (old size == 0):
// We do nothing in this case.
// The expectation is that for the user code to be valid, the next op that should run against the current tensor "x"
// will be a x.copy_(y) (or similar), that will fully overwrite the data of x.
// If there are any outstanding aliases of x, we expect them not to be used until after the copy_() call
// (otherwise the eager code would be invalid),
// and therefore functionalization will regenerate the aliases off of the result of `x.copy(y)`.
// Resizing down (new size == 0):
// We also do nothing in this case. The assumption is that after resizing a tensor down,
// it is fully unused in the program (unless it is later resized back up first, has data copied in)
// Although it might be saved for backward, which happens in FSDP.
// The expected pattern is that the param will then be resized back up from zero in the backward.

// Mark the tensor as having its storage resized.
// This is so we can detect it for inputs in AOTAutograd and error / emit
// an input mutation resize_() appropriately
functional_storage_impl()->mark_inductor_storage_resize(new_size);
}

void FunctionalTensorWrapper::maybe_replace_storage(const Tensor& other) {
// Note [resize_() in functionalization pass]
// resize_() is a special operator in functionalization because it can reallocate its underlying storage.
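For readers who have not seen the FSDP pattern the comment above refers to, here is a small eager-mode sketch of resize-to-zero followed by resize-back-up plus copy_(). It uses the public untyped_storage() API and does not go through torch.compile or functionalization; it only illustrates the storage lifecycle that storage_resize_() assumes.

```python
import torch

# Eager-mode sketch of the resize-down / resize-up-then-copy_ pattern described above.
param = torch.randn(4, 4)
nbytes = param.untyped_storage().nbytes()

# "Resize down": free the storage; sizes/strides are kept but there is no data.
param.untyped_storage().resize_(0)
assert param.untyped_storage().nbytes() == 0

# "Resize up", then immediately overwrite everything with copy_(), as the rule assumes.
param.untyped_storage().resize_(nbytes)
param.copy_(torch.ones(4, 4))
print(param.sum())  # tensor(16.)
```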
@@ -141,6 +141,9 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
// Custom implementation of self.set_(src)
void set__impl(const FunctionalTensorWrapper* other);

// Custom implementation of resize_storage_bytes_(self, new_size)
void storage_resize_(c10::SymInt new_size);

// Returns whether the current tensor's data was ever mutated
bool has_data_mutation();
//
@@ -150,6 +153,16 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
return was_storage_changed_;
}

c10::SymInt get_storage_size(bool before) {
return functional_storage_impl()->get_storage_size(before);
}

// Returns whether the FunctionalTensor experienced an
// untyped_storage().resize_() call
bool was_inductor_storage_resized() {
return functional_storage_impl()->was_inductor_storage_resized();
}

// The functionalization pass can be used to remove mutations.
// It does so by replacing any mutation op with its corresponding
// out-of-place op, followed by a call to replace_(). e.g:

@@ -335,6 +335,9 @@ static at::Tensor& set__functionalize(at::Tensor& self, const at::Tensor& src) {
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(src));
auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
auto src_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(src);
// See Note [Ordering of resize_() and set_()]
TORCH_CHECK(!self_impl->was_inductor_storage_resized(),
"storage_resize_() followed by set_() in torch.compile is not supported today");
self_impl->set__impl(src_impl);
return self;
}
@@ -57,6 +57,8 @@ SparseCsrTensorImpl::SparseCsrTensorImpl(
TORCH_INTERNAL_ASSERT(((key_set.has(DispatchKey::SparseCsrCPU) && device().type() == kCPU)
|| (key_set.has(DispatchKey::SparseCsrCUDA) && device().type() == kCUDA)
|| (key_set.has(DispatchKey::SparseCsrMeta) && device().type() == kMeta)
|| (key_set.has(DispatchKey::SparseCsrCPU) && device().type() == kMeta) // fake tensor
|| (key_set.has(DispatchKey::SparseCsrCUDA) && device().type() == kMeta) // fake tensor
|| (key_set.has(DispatchKey::SparseCsrPrivateUse1) && device().type() == kPrivateUse1)),
"Inconsistent key_set (=", key_set, ") and device (=", device(), ")");

@@ -2,6 +2,7 @@

#include <ATen/Tensor.h>
#include <c10/core/TensorImpl.h>
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/Exception.h>
namespace at {

@@ -107,6 +108,39 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
}
}

template <typename VariableVersion>
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach_core(
VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const {
const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len();
c10::impl::PyInterpreter&& interpreter = nullptr;
if (mode_stack_len > 0 &&
!c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) {
const auto& cur_torch_dispatch_mode_state =
c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1);
interpreter = cur_torch_dispatch_mode_state->pyinterpreter();
} else if (
key_set_.has(DispatchKey::Python) &&
!c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) {
interpreter = pyobj_slot_.load_pyobj_interpreter();
} else {
// otherwise just copy the SparseTensorImpl and not the PyObject.
auto impl = c10::make_intrusive<SparseCsrTensorImpl>(
key_set(), device(), layout_impl(), dtype());
copy_tensor_metadata(
/*src_sparse_impl=*/this,
/*dest_sparse_impl=*/impl.get(),
/*version_counter=*/version_counter,
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
}
auto r = interpreter->detach(this);
r->set_version_counter(std::forward<VariableVersion>(version_counter));
r->set_allow_tensor_metadata_change(allow_tensor_metadata_change);
return r;
}

/**
* Return a TensorImpl that is a shallow-copy of this TensorImpl.
*
@@ -116,15 +150,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
const c10::VariableVersion& version_counter,
bool allow_tensor_metadata_change) const override {
auto impl = c10::make_intrusive<SparseCsrTensorImpl>(
key_set(), device(), layout_impl(), dtype());
copy_tensor_metadata(
/*src_sparse_impl=*/this,
/*dest_sparse_impl=*/impl.get(),
/*version_counter=*/version_counter,
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
return shallow_copy_and_detach_core(
version_counter, allow_tensor_metadata_change);
}

/**
@@ -136,15 +163,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
c10::VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const override {
auto impl = c10::make_intrusive<SparseCsrTensorImpl>(
key_set(), device(), layout_impl(), dtype());
copy_tensor_metadata(
/*src_sparse_impl=*/this,
/*dest_sparse_impl=*/impl.get(),
/*version_counter=*/std::move(version_counter),
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
return shallow_copy_and_detach_core(
std::move(version_counter), allow_tensor_metadata_change);
}

private:

@@ -2,6 +2,7 @@

#include <ATen/Tensor.h>
#include <c10/core/TensorImpl.h>
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>

@@ -306,6 +307,38 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
const Tensor& indices,
const Tensor& values);

template <typename VariableVersion>
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach_core(
VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const {
const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len();
c10::impl::PyInterpreter&& interpreter = nullptr;
if (mode_stack_len > 0 &&
!c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) {
const auto& cur_torch_dispatch_mode_state =
c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1);
interpreter = cur_torch_dispatch_mode_state->pyinterpreter();
} else if (
key_set_.has(DispatchKey::Python) &&
!c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) {
interpreter = pyobj_slot_.load_pyobj_interpreter();
} else {
// otherwise just copy the SparseTensorImpl and not the PyObject.
auto impl = c10::make_intrusive<SparseTensorImpl>(key_set(), dtype());
copy_tensor_metadata(
/*src_sparse_impl=*/this,
/*dest_sparse_impl=*/impl.get(),
/*version_counter=*/version_counter,
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
}
auto r = interpreter->detach(this);
r->set_version_counter(std::forward<VariableVersion>(version_counter));
r->set_allow_tensor_metadata_change(allow_tensor_metadata_change);
return r;
}

/**
* Return a TensorImpl that is a shallow-copy of this TensorImpl.
*
@@ -315,14 +348,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
const c10::VariableVersion& version_counter,
bool allow_tensor_metadata_change) const override {
auto impl = c10::make_intrusive<SparseTensorImpl>(key_set(), dtype());
copy_tensor_metadata(
/*src_sparse_impl=*/this,
/*dest_sparse_impl=*/impl.get(),
/*version_counter=*/version_counter,
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
return shallow_copy_and_detach_core(
version_counter, allow_tensor_metadata_change);
}

/**
@@ -334,14 +361,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
c10::VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const override {
auto impl = c10::make_intrusive<SparseTensorImpl>(key_set(), dtype());
copy_tensor_metadata(
/*src_sparse_impl=*/this,
/*dest_sparse_impl=*/impl.get(),
/*version_counter=*/std::move(version_counter),
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
return shallow_copy_and_detach_core(
std::move(version_counter), allow_tensor_metadata_change);
}

/**
@@ -298,19 +298,18 @@ class Vectorized<c10::Half> {
} else if (count == (size() >> 1)) {
Vectorized<c10::Half> res;
res.values.val[0] = vld1q_f16(reinterpret_cast<const float16_t*>(ptr));
res.values.val[1] = vdupq_n_f16(0);
std::memset(&res.values.val[1], 0, sizeof(res.values.val[1]));
return res;
} else {
__at_align__ float16_t tmp_values[size()];
for (const auto i : c10::irange(size())) {
tmp_values[i] = 0;
}
std::memcpy(
tmp_values,
reinterpret_cast<const float16_t*>(ptr),
count * sizeof(float16_t));
return vld1q_f16_x2(reinterpret_cast<const float16_t*>(tmp_values));
}
__at_align__ float16_t tmp_values[size()];
for (const auto i : c10::irange(size())) {
tmp_values[i] = 0;
}
std::memcpy(
tmp_values,
reinterpret_cast<const float16_t*>(ptr),
count * sizeof(float16_t));
return vld1q_f16_x2(reinterpret_cast<const float16_t*>(tmp_values));
}
void store(void* ptr, int64_t count = size()) const {
if (count == size()) {

@@ -213,12 +213,36 @@ struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplI
hip_stream.synchronize();
}

void synchronizeEvent(void* event) const override {
if (!event)
return;
hipEvent_t hip_event = static_cast<hipEvent_t>(event);
C10_HIP_CHECK(hipEventSynchronize(hip_event));
}

void recordDataPtrOnStream(
const c10::DataPtr& data_ptr,
const Stream& stream) const override {
HIPStreamMasqueradingAsCUDA hip_stream{stream};
HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA(data_ptr, hip_stream);
}

double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
const override {
TORCH_CHECK(
event1 && event2,
"Both events must be recorded before calculating elapsed time.");
int orig_device;
C10_HIP_CHECK(hipGetDevice(&orig_device));
C10_HIP_CHECK(hipSetDevice(device_index));
hipEvent_t hip_event1 = static_cast<hipEvent_t>(event1);
hipEvent_t hip_event2 = static_cast<hipEvent_t>(event2);
float time_ms = 0;
// raise hipErrorNotReady if either event is recorded but not yet completed
C10_HIP_CHECK(hipEventElapsedTime(&time_ms, hip_event1, hip_event2));
C10_HIP_CHECK(hipSetDevice(orig_device));
return static_cast<double>(time_ms);
}
};

// All of the guards which have HIPGuardImpl burned in need to also have
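The synchronizeEvent and elapsedTime overrides added above are what back PyTorch's Python-level event timing on ROCm builds, where torch.cuda.Event calls are routed to the hip* functions shown. A small sketch, guarded so it only runs when a GPU is present:

```python
import torch

# Event timing sketch; on ROCm, synchronize()/elapsed_time() map onto
# hipEventSynchronize/hipEventElapsedTime via the overrides above.
if torch.cuda.is_available():
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    x = torch.randn(1024, 1024, device="cuda")
    start.record()
    y = x @ x
    end.record()
    end.synchronize()
    print(start.elapsed_time(end))  # elapsed milliseconds
```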
@@ -1,5 +1,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Copy.h>
#include <ATen/native/Copy.h>

#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
@@ -25,8 +26,12 @@
#include <ATen/ops/_copy_from.h>
#include <ATen/ops/_propagate_xla_data.h>
#include <ATen/ops/_propagate_xla_data_native.h>
#include <ATen/ops/copy.h>
#include <ATen/ops/copy_native.h>
#include <ATen/ops/_foreach_copy.h>
#include <ATen/ops/_foreach_copy_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_strided.h>
#include <ATen/ops/expand_copy.h>
#endif

@@ -303,15 +308,45 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
return self;
}

Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) {
// copy() is the "functional" form of copy_(). It exists so we can properly functionalize copy_(), but:
// (1) It isn't exposed to the frontend (no python bindings)
// (2) It isn't exposed to the backend (it's a composite, that decomposes into to() and expand_as() calls.
Tensor copy_meta(const Tensor& self, const Tensor& src, bool non_blocking) {
// Must directly use self(), so we can dispatch properly if self is a subclass
auto r = clone_preserve_strides(self);
r.copy_(src, non_blocking);
return r;
}

Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) {
at::Tensor r;
// copy() is the "functional" form of copy_(). It exists so we can properly functionalize copy_(), but:
// (1) It isn't exposed to the frontend (no python bindings)
// (2) It isn't exposed to the backend (it's a composite, that decomposes into to() and expand_as() calls.
auto self_storage = self.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl();
// If self has no real storage, we can't actually clone it.
// Instead, generate an empty tensor with the right sizes/strides, since we should be able to assume
// that copy_() will fully overwrite all data with that of src
if (self_storage->nbytes() == 0) {
r = at::empty_strided(self.sizes(), self.strides());
} else {
r = clone_preserve_strides(self);
}
r.copy_(src, non_blocking);
return r;
}

::std::vector<at::Tensor> _foreach_copy(at::TensorList self, at::TensorList src, bool non_blocking) {
std::vector<at::Tensor> outs;
outs.reserve(self.size());
// This is a very slow implementation, but needs to directly call the copy() kernel above to handle
// when self has zero storage.
// This kernel should never really be run, except with debugging using compile(backend="aot_eager")
for (const auto i : c10::irange(src.size())) {
auto curr_src = src[i];
auto curr_self = self[i];
outs.push_back(at::copy(curr_self, curr_src, non_blocking));
}
return outs;
}

Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) {
auto maybe_outnames = namedinference::compute_broadcast_outnames(self, src);
{
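The functional _foreach_copy added above mirrors the existing in-place torch._foreach_copy_, which is the variant actually reachable from Python (the functional form has no Python bindings, per the comment). A short sketch of the in-place op; it is a private API, so treat this as illustrative rather than a stability guarantee:

```python
import torch

# In-place foreach copy across a list of destination/source tensor pairs.
dests = [torch.zeros(3), torch.zeros(2, 2)]
srcs = [torch.arange(3.0), torch.ones(2, 2)]
torch._foreach_copy_(dests, srcs)
print(dests[0])  # tensor([0., 1., 2.])
```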
@@ -178,6 +178,12 @@ inline bool _check_tensors_do_type_promotion_with_scalars(
// - All tensors must be non-overlapping and dense
// - Resulting tensor must have the same dtype as the input one

// [note: what's ``does_op_promote_integer_inputs_to_float=true``?]
// ``does_op_promote_integer_inputs_to_float=true`` means that the result of
// the op will be float even if inputs are integer or boolean, which
// the fast path currently does not support. In short, this flag, when
// turned on, gatekeeps the op from going down the fastpath.

// Please, make sure to call check_foreach_api_restrictions before calling this
// method. There is a set of preconditions that have to be satisfied.
inline bool check_fast_path_restrictions(
@@ -231,6 +237,7 @@ inline std::vector<c10::Scalar> convert_tensor_to_scalar_list(
return scalarList;
}

// see: [note: what's ``does_op_promote_integer_inputs_to_float=true``?]
inline bool can_use_fast_route(
ArrayRef<TensorList> tensorLists,
ArrayRef<Scalar> scalarList = {},
@@ -239,6 +246,7 @@ inline bool can_use_fast_route(
tensorLists, scalarList, does_op_promote_integer_inputs_to_float);
}

// see: [note: what's ``does_op_promote_integer_inputs_to_float=true``?]
inline bool can_use_fast_route(
TensorList tensors1,
TensorList tensors2,
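This flag is why several foreach ops below switch division_op to true: when an op promotes integer or bool inputs to float, the result dtype no longer matches the input dtype, so the multi-tensor fast path has to be skipped. A quick Python illustration of the promotion itself (the foreach call uses a private API and is shown only as an example):

```python
import torch

# True division promotes integer inputs to float, which is exactly the case
# does_op_promote_integer_inputs_to_float=true is meant to route to the slow path.
t = torch.tensor([1, 2, 3])
print(torch.div(t, 2).dtype)              # torch.float32 -- result dtype != input dtype

out = torch._foreach_div([t, t + 3], 2)   # private API, illustrative only
print([x.dtype for x in out])             # [torch.float32, torch.float32]
```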
@@ -230,16 +230,18 @@ FOREACH_BINARY_OP_LIST(
div,
std::divides,
/*division_op*/ true);
// NOTE(crcrpar): `all_types_half_bfloat16` does not cover bool, so temporarily
// set `division_op` to true.
FOREACH_BINARY_OP_LIST(
all_types_half_bfloat16,
clamp_max,
minimum,
/*division_op*/ false);
/*division_op*/ true);
FOREACH_BINARY_OP_LIST(
all_types_half_bfloat16,
clamp_min,
maximum,
/*division_op*/ false);
/*division_op*/ true);
// NOTE(crcrpar): [Why is foreach_pow's division_op=true?]
// To push integer inputs to slow path. This is because with integer type inputs
// the fast path behaves differently from the slow one. Need to investigate

@@ -239,7 +239,9 @@ std::vector<Tensor> foreach_tensor_sub_scalar_kernel_cuda(
});
}

FOREACH_BINARY_OP_SCALAR(all_types_half_bfloat16, clamp_max, minimum, false);
FOREACH_BINARY_OP_SCALAR(all_types_half_bfloat16, clamp_min, maximum, false);
// NOTE(crcrpar): `all_types_half_bfloat16` does not cover bool, so temporarily
// set `division_op` to true.
FOREACH_BINARY_OP_SCALAR(all_types_half_bfloat16, clamp_max, minimum, true);
FOREACH_BINARY_OP_SCALAR(all_types_half_bfloat16, clamp_min, maximum, true);

} // namespace at::native

@@ -235,15 +235,7 @@ std::vector<Tensor> foreach_tensor_sub_scalarlist_kernel_cuda(
});
}

FOREACH_BINARY_OP_SCALARLIST(
all_types_half_bfloat16,
clamp_max,
minimum,
false);
FOREACH_BINARY_OP_SCALARLIST(
all_types_half_bfloat16,
clamp_min,
maximum,
false);
FOREACH_BINARY_OP_SCALARLIST(all_types_half_bfloat16, clamp_max, minimum, true);
FOREACH_BINARY_OP_SCALARLIST(all_types_half_bfloat16, clamp_min, maximum, true);

} // namespace at::native

@@ -27,7 +27,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
TensorList tensors2,
TensorList tensors3) {
check_foreach_api_restrictions(tensors1, tensors2, tensors3);
if (!can_use_fast_route({tensors1, tensors2, tensors3})) {
if (!can_use_fast_route({tensors1, tensors2, tensors3}, {}, true)) {
return foreach_tensor_ternary_lerp_slow(tensors1, tensors2, tensors3);
}

@@ -64,7 +64,7 @@ void foreach_tensor_lerp_ternary_cuda_(
TensorList tensors2,
TensorList tensors3) {
check_foreach_api_restrictions(tensors1, tensors2, tensors3);
if (!can_use_fast_route({tensors1, tensors2, tensors3})) {
if (!can_use_fast_route({tensors1, tensors2, tensors3}, {}, true)) {
return foreach_tensor_ternary_lerp_slow_(tensors1, tensors2, tensors3);
}

@@ -94,7 +94,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
TensorList tensors2,
const Scalar& weight) {
check_foreach_api_restrictions(tensors1, tensors2);
if (!can_use_fast_route({tensors1, tensors2})) {
if (!can_use_fast_route({tensors1, tensors2}, {}, true)) {
return foreach_tensor_lerp_list_kernel_slow(tensors1, tensors2, weight);
}

@@ -132,7 +132,7 @@ void foreach_tensor_lerp_list_cuda_(
TensorList tensors2,
const Scalar& weight) {
check_foreach_api_restrictions(tensors1, tensors2);
if (!can_use_fast_route({tensors1, tensors2})) {
if (!can_use_fast_route({tensors1, tensors2}, {}, true)) {
return foreach_tensor_lerp_list_kernel_slow_(tensors1, tensors2, weight);
}
@@ -1750,6 +1750,7 @@
- func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
  variants: function
  dispatch:
    Meta: copy_meta
    CompositeExplicitAutogradNonFunctional: copy
  tags: core

@@ -11357,7 +11358,13 @@
  dispatch:
    CPU: foreach_tensor_copy_list_kernel_slow_
    CUDA: foreach_tensor_copy_list_kernel_cuda_
  autogen: _foreach_copy, _foreach_copy.out
  autogen: _foreach_copy.out

- func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out
  device_check: NoCheck
  variants: function
  dispatch:
    CompositeExplicitAutograd: _foreach_copy

- func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
  dispatch:
@@ -485,7 +485,7 @@ static at::Tensor _quantized_convolution_onednn(
torch::List<int64_t> dilation,
bool transposed,
int64_t groups,
double inv_output_scale,
double output_scale,
int64_t output_zero_point,
c10::optional<at::Tensor> accum=c10::nullopt, // accum to fused with conv add
double accum_scale=1.0,

@@ -1397,7 +1397,7 @@ static at::Tensor _quantized_convolution_onednn(
torch::List<int64_t> dilation,
bool transposed,
int64_t groups,
double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant
double output_scale,
int64_t output_zero_point,
c10::optional<at::Tensor> accum, // accum to fused with conv add
double accum_scale,
@@ -1420,10 +1420,10 @@ static at::Tensor _quantized_convolution_onednn(
bool bfloat16_output = output_dtype.has_value() && (output_dtype.value() == c10::kBFloat16);
if (fp32_output || bfloat16_output) {
// When fp32 or bf16 output, oneDNN expects op_attr doesn't set_scales and set_zero_points.
// So, we will use default inv_output_scale as 1.0 and output_zero_point as 0, since
// when inv_output_scale is 1.0, we will skip invoking of op_attr.set_scales in ideep;
// So, we will use default output_scale as 1.0 and output_zero_point as 0, since
// when output_scale is 1.0, we will skip invoking of op_attr.set_scales in ideep;
// when output_zero_point is 0, we will skip invoking of op_attr.set_zero_points in ideep.
TORCH_CHECK(inv_output_scale == 1.0, " (ONEDNN): fp32 or bf16 output, inv_output_scale must be 1.0.");
TORCH_CHECK(output_scale == 1.0, " (ONEDNN): fp32 or bf16 output, output_scale must be 1.0.");
TORCH_CHECK(output_zero_point == 0, " (ONEDNN): fp32 or bf16 output, output_zero_point must be 0");
}

@@ -1634,7 +1634,7 @@ static at::Tensor _quantized_convolution_onednn(
int oc_per_group = packed_weight.get_dim(0) / groups;
int wei_scale_mask = ideep::utils::conv_weight_scale_mask(weight_scales.numel(), oc_per_group, groups, false);
op_attr.set_scales_mask(DNNL_ARG_WEIGHTS, wei_scale_mask);
if (inv_output_scale != 1.0f) {
if (output_scale != 1.0f) {
op_attr.set_scales_mask(DNNL_ARG_DST, 0);
}
if (output_zero_point != 0) {
@@ -1671,13 +1671,13 @@ static at::Tensor _quantized_convolution_onednn(
}
tensor src_scales_t = tensor(ideep::scale_t(1, act_scale));
tensor wei_scales_t = tensor(weights_scales);
tensor dst_scales_t = tensor(ideep::scale_t(1, 1.0/inv_output_scale));
tensor dst_scales_t = tensor(ideep::scale_t(1, output_scale));
tensor src_zp_t = tensor(ideep::zero_point_t(1, act_zero_point));
tensor dst_zp_t = tensor(ideep::zero_point_t(1, output_zero_point));
if (act_scale != 1.0f) {
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales_t});
}
if (inv_output_scale != 1.0f) {
if (output_scale != 1.0f) {
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales_t});
}
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_scales_t});
@@ -1697,7 +1697,7 @@ static at::Tensor _quantized_convolution_onednn(
const ideep::scale_t accum_ideep_scale = ideep::scale_t(1, 1.0/accum_scale);
const ideep::zero_point_t accum_ideep_zero_points = ideep::zero_point_t(1, accum_zero_point);
// Set the dst scale and zero point with the value of accum.
// The true scale and zero point is stored in ideep::scale_t(scale_size, inv_output_scale) and dst_zero_points.
// The true scale and zero point is stored in ideep::scale_t(scale_size, output_scale) and dst_zero_points.
dst.set_scale(accum_ideep_scale);
dst.set_zero_point(accum_ideep_zero_points);
}
@@ -1707,7 +1707,7 @@ static at::Tensor _quantized_convolution_onednn(
ideep::convolution_forward::prepare(
params, src, packed_weight, expected_bias, dst_dims, dst,
stride.vec(), dilation.vec(), padding.vec(), padding.vec(), groups,
src_scales, weights_scales, ideep::scale_t(1, inv_output_scale),
src_scales, weights_scales, ideep::scale_t(1, 1.0f / output_scale),
src_zero_points, dst_zero_points,
op_attr, dnnl::algorithm::convolution_direct,
dnnl::prop_kind::forward_inference,
@@ -1872,7 +1872,7 @@ class QConvoneDNN final {
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant
double output_scale,
int64_t output_zero_point,
c10::optional<c10::ScalarType> output_dtype,
c10::string_view attr,
@@ -1900,7 +1900,7 @@ class QConvoneDNN final {
act, act_scale, act_zero_point,
weight, weight_scales, weight_zero_points,
bias, stride, padding, dilation, /*transposed*/false,
groups, inv_output_scale, output_zero_point,
groups, output_scale, output_zero_point,
/*accum*/c10::nullopt, /*accum_scale*/0.0, /*accum_zero_point*/0,
/*output_dtype*/output_dtype, /*binary_attr*/c10::nullopt, /*binary_alpha*/c10::nullopt,
/*unary_attr*/attr, /*unary_scalars*/scalars, /*unary_algorithm*/algorithm
@@ -1924,7 +1924,7 @@ class QConvoneDNN final {
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant
double output_scale,
int64_t output_zero_point,
c10::optional<c10::ScalarType> output_dtype,
c10::string_view binary_attr,
@@ -1952,7 +1952,7 @@ class QConvoneDNN final {
act, act_scale, act_zero_point,
weight, weight_scales, weight_zero_points,
bias, stride, padding, dilation, /*transposed*/false,
groups, inv_output_scale, output_zero_point,
groups, output_scale, output_zero_point,
accum, accum_scale, accum_zero_point,
/*output_dtype*/output_dtype, binary_attr, alpha,
unary_attr, unary_scalars, unary_algorithm
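The qconv hunks above rename inv_output_scale to output_scale; the old argument carried the reciprocal of the fake-quant scale, so the only arithmetic change is dividing where the code used to multiply (hence the new ideep::scale_t(1, 1.0f / output_scale)). A tiny numeric sketch with made-up values:

```python
# Illustrative requantization arithmetic behind the rename (values are made up).
output_scale = 0.05                       # fake-quant scale
inv_output_scale = 1.0 / output_scale     # what the old argument carried (20.0)

y_fp = 1.37                               # some accumulated floating-point value
q = round(y_fp / output_scale)            # quantize with output_scale ...
assert q == round(y_fp * inv_output_scale)  # ... same as multiplying by its reciprocal
print(q)  # 27
```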
@@ -931,7 +931,6 @@ static at::Tensor linear_int8_with_onednn_weight(
c10::string_view& unary_post_op_algorithm) {
using ideep::tensor;
const int64_t dim = input.dim();
output_scale = 1.0f / output_scale;
TORCH_CHECK(input.scalar_type() == c10::ScalarType::Byte,
"qlinear with mkldnn tensor: data type of input should be uint8 (unsigned char).");
TORCH_CHECK(onednn_weight.scalar_type() == c10::ScalarType::Char,

@@ -10,6 +10,7 @@ extern template int register_conv_params<3>();
int register_embedding_params();

TORCH_LIBRARY(quantized, m) {
m.set_python_module("caffe2.torch.fb.model_transform.splitting.split_dispatcher");
register_linear_params();
register_conv_params<2>();
register_conv_params<3>();
@@ -257,12 +258,12 @@ TORCH_LIBRARY(onednn, m) {
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv_prepack(Tensor weight, Tensor w_scales, float x_scale, int x_zp, int[] stride, int[] padding, int[] dilation, int groups, int[]? x_shape=None) -> Tensor"));

// Conv1D/2D/3D with unary postop
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv1d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv3d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv1d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv3d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));

// Conv2D with binary postop
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise.binary(Tensor qx, float x_scale, int x_zero_point, Tensor qaccum, float accum_scale, int accum_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor"));
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise.binary(Tensor qx, float x_scale, int x_zero_point, Tensor qaccum, float accum_scale, int accum_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor"));

// Linear prepack
m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_prepack(Tensor weight, int[]? x_shape) -> Tensor"));
@@ -301,18 +301,55 @@ bool check_all_tensors_on_device(sdp_params const& params, bool debug) {
}

bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) {
  const auto num_heads{params.query.sym_size(1)},
      query_lengths{params.query.sym_size(2)},
      head_dim{params.query.sym_size(3)};
  const bool ok = query_lengths % 64 == 0 && head_dim % 64 == 0;
  if (!ok) {
  const auto s_q = params.query.sym_size(2);
  const auto s_k = params.key.sym_size(2);
  const auto head_dim = params.query.sym_size(3);
  long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
  if (cudnn_version >= 90000) {
    if (head_dim % 8 != 0 || head_dim > 256) {
      if (debug) {
        TORCH_WARN("head_dim should be a multiple of 8 and no more than 256");
      }
      return false;
    }
  } else {
    if (head_dim % 8 != 0 || head_dim > 128) {
      if (debug) {
        TORCH_WARN("head_dim should be a multiple of 8 and no more than 128");
      }
      return false;
    }
  }
  if (cudnn_version < 8903) {
    if (debug) {
      TORCH_WARN(
          "CuDNN requires sequence length and head dim to be divisible by 64. Got sequence length: ",
          query_lengths,
          ", head dim: ",
          head_dim,
          ".");
      TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher");
    }
    return false;
  }
  if (params.dropout != 0.0 && cudnn_version < 8906) {
    if (debug) {
      TORCH_WARN("Dropout reference is only supported on 8.9.6 onwards.");
    }
    return false;
  }
  if (cudnn_version < 90000) {
    if (s_q < 64) {
      if (debug) {
        TORCH_WARN("s_q less than 64 is not supported before cudnn 9.0.0");
      }
      return false;
    }
    if ((s_q % 64 != 0 || s_k % 64 != 0) && params.dropout != 0.0) {
      if (debug) {
        TORCH_WARN(
            "s_q not a multiple of 64 with padding/dropout is not supported with cudnn version 9.0.0");
      }
      return false;
    }
  }
  if (s_k % 64 != 0 && cudnn_version < 8906) {
    if (debug) {
      TORCH_WARN("not-multiple-of-64 seq_kv is not supported below 8.9.6");
    }
    return false;
  }
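Note on the version-dependent constraints above: a minimal Python sketch of the same checks, useful for reasoning about which shapes the cuDNN SDPA path will accept. The (batch, num_heads, seq_len, head_dim) layout and the integer cuDNN version encoding (e.g. 90000 for 9.0.0) mirror the C++ code; everything else here is illustrative.

    def cudnn_shapes_ok(s_q, s_k, head_dim, dropout, cudnn_version):
        # head_dim must be a multiple of 8; the cap depends on the cuDNN major version.
        max_head_dim = 256 if cudnn_version >= 90000 else 128
        if head_dim % 8 != 0 or head_dim > max_head_dim:
            return False
        if cudnn_version < 8903:
            return False  # SDPA fprop requires cuDNN 8.9.3 or higher
        if dropout != 0.0 and cudnn_version < 8906:
            return False  # dropout reference only supported on 8.9.6 onwards
        if cudnn_version < 90000:
            if s_q < 64:
                return False
            if (s_q % 64 != 0 or s_k % 64 != 0) and dropout != 0.0:
                return False
        if s_k % 64 != 0 and cudnn_version < 8906:
            return False
        return True

    # Example: 128-token sequences with head_dim 64 pass on cuDNN 8.9.7.
    assert cudnn_shapes_ok(128, 128, 64, 0.0, 8907)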
@@ -326,24 +363,64 @@ bool check_cudnn_layout(sdp_params const& params, bool debug) {
  const int64_t s_k = params.key.size(2);
  const int64_t s_v = params.value.size(2);
  // corresponds to cuDNN's "packed QKV" layout
  const bool query_layout_ok = (params.query.stride(0) == s_q * 3 * h * d) &&
  const bool packed_query_layout_ok = (params.query.stride(0) == s_q * 3 * h * d) &&
      (params.query.stride(1) == d) &&
      (params.query.stride(2) == 3 * h * d) &&
      (params.query.stride(3) == 1);
  const bool key_layout_ok = (params.key.stride(0) == s_k * 3 * h * d) &&
  const bool packed_key_layout_ok = (params.key.stride(0) == s_k * 3 * h * d) &&
      (params.key.stride(1) == d) &&
      (params.key.stride(2) == 3 * h * d) &&
      (params.key.stride(3) == 1);
  const bool value_layout_ok = (params.value.stride(0) == s_v * 3 * h * d) &&
  const bool packed_value_layout_ok = (params.value.stride(0) == s_v * 3 * h * d) &&
      (params.value.stride(1) == d) &&
      (params.value.stride(2) == 3 * h * d) &&
      (params.value.stride(3) == 1);
  if (debug) {
    if (!query_layout_ok) { TORCH_WARN("Query tensor was not in cuDNN-supported packed QKV layout", params.query.strides()); }
    if (!key_layout_ok) { TORCH_WARN("Key tensor was not in cuDNN-supported packed QKV layout"); }
    if (!value_layout_ok) { TORCH_WARN("Value tensor was not in cuDNN-supported packed QKV layout"); }

  const bool packed_layout_ok = packed_query_layout_ok && packed_key_layout_ok && packed_value_layout_ok;

  const bool query_layout_ok = (params.query.stride(0) == s_q * h * d) &&
      (params.query.stride(1) == d) &&
      (params.query.stride(2) == h * d) &&
      (params.query.stride(3) == 1);
  const bool key_layout_ok = (params.key.stride(0) == s_k * h * d) &&
      (params.key.stride(1) == d) &&
      (params.key.stride(2) == h * d) &&
      (params.key.stride(3) == 1);
  const bool value_layout_ok = (params.value.stride(0) == s_v * h * d) &&
      (params.value.stride(1) == d) &&
      (params.value.stride(2) == h * d) &&
      (params.value.stride(3) == 1);

  const bool layout_ok = query_layout_ok && key_layout_ok && value_layout_ok;

  if (!packed_value_layout_ok && !layout_ok) {
    if (debug) {
      if (!packed_layout_ok) {
        if (!packed_query_layout_ok) {
          TORCH_WARN("Query tensor was not in cuDNN-supported packed QKV layout", params.query.strides());
        }
        if (!packed_key_layout_ok) {
          TORCH_WARN("Key tensor was not in cuDNN-supported packed QKV layout", params.key.strides());
        }
        if (!packed_value_layout_ok) {
          TORCH_WARN("Value tensor was not in cuDNN-supported packed QKV layout", params.value.strides());
        }
      }
      if (!layout_ok) {
        if (!query_layout_ok) {
          TORCH_WARN("Query tensor was not in cuDNN-supported unpacked QKV layout", params.query.strides());
        }
        if (!key_layout_ok) {
          TORCH_WARN("Key tensor was not in cuDNN-supported unpacked QKV layout", params.key.strides());
        }
        if (!value_layout_ok) {
          TORCH_WARN("Value tensor was not in cuDNN-supported unpacked QKV layout", params.value.strides());
        }
      }
    }
    return false;
  }
  return query_layout_ok && key_layout_ok && value_layout_ok;
  return true;
}

bool check_cudnn_hardware_support(sdp_params const& params, bool debug) {
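The stride patterns being checked correspond to the two memory layouts cuDNN accepts: q/k/v sliced out of one packed QKV projection buffer, and independently allocated (B, H, S, D) tensors. A small sketch in plain PyTorch (names and sizes illustrative) showing how those strides arise:

    import torch

    B, S, H, D = 2, 128, 8, 64

    # Packed layout: q/k/v are views into one (B, S, 3, H, D) projection buffer,
    # then transposed to (B, H, S, D). Note stride(0) == S*3*H*D and stride(2) == 3*H*D.
    qkv = torch.randn(B, S, 3, H, D)
    q_packed = qkv[:, :, 0].transpose(1, 2)
    assert q_packed.stride() == (S * 3 * H * D, D, 3 * H * D, 1)

    # Unpacked layout: a standalone (B, S, H, D) tensor transposed to (B, H, S, D).
    # Note stride(0) == S*H*D and stride(2) == H*D.
    q_unpacked = torch.randn(B, S, H, D).transpose(1, 2)
    assert q_unpacked.stride() == (S * H * D, D, H * D, 1)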
@@ -434,14 +511,14 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
  constexpr auto general_constraints =
      array_of<bool (*)(sdp_params const&, bool)>(
          check_runtime_enabled_cudnn,
          check_cudnn_hardware_support);
          // check_all_tensors_on_device,
          // check_cudnn_tensor_shapes,
          // check_cudnn_layout,
          check_cudnn_hardware_support,
          check_all_tensors_on_device,
          check_cudnn_tensor_shapes,
          check_cudnn_layout,
          // check_is_causal,
          // check_for_nested_inputs,
          // check_cudnn_requires_grad,
          // check_dtypes_low_precision
          check_for_nested_inputs,
          check_cudnn_requires_grad,
          check_dtypes_low_precision);
  for (auto& constraint : general_constraints) {
    if (!constraint(params, debug)) {
      return false;

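With the constraint list re-enabled, the cuDNN backend is only selected when every check passes. A hedged usage sketch; the torch.nn.attention API names below are assumed from recent PyTorch releases, not taken from this diff:

    import torch
    from torch.nn.attention import SDPBackend, sdpa_kernel

    q = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
    k = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
    v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)

    # Restrict dispatch to the cuDNN backend; if the shape/layout/dtype checks
    # above fail, SDPA should error out rather than silently use another backend.
    with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):
        out = torch.nn.functional.scaled_dot_product_attention(q, k, v)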
@@ -2005,33 +2005,6 @@ def get_dynamo_stats():
    )


def maybe_fresh_cache(fn, is_cold_start):
    def inner(*args, **kwargs):
        cache_minder = contextlib.nullcontext()
        if is_cold_start:
            cache_entries = {}
            cache_minder = fresh_inductor_cache(cache_entries)

        try:
            with cache_minder:
                return fn(*args, **kwargs)
        finally:
            dump_cache = False
            if dump_cache and is_cold_start:
                output_csv(
                    output_filename[:-4] + "_triton_cache.csv",
                    ["dev", "name", "batch_size", "triton_cache"],
                    [
                        current_device,
                        current_name,
                        current_batch_size,
                        cache_entries,
                    ],
                )

    return inner


@contextmanager
def maybe_init_distributed(should_init_distributed, rank, world_size, port="6789"):
    try:
@@ -3297,12 +3270,6 @@ def parse_args(args=None):
        action="store_true",
        help="print dataframe result used for calculating accuracy",
    )
    parser.add_argument(
        "--cold-start-latency",
        "--cold_start_latency",
        action="store_true",
        help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
    )
    parser.add_argument(
        "--disable-cudagraphs",
        action="store_true",
@@ -3415,6 +3382,19 @@ def parse_args(args=None):
        help="Enables Memory Snapshot tool for memory deep dives: https://pytorch.org/blog/understanding-gpu-memory-1/",
    )

    group_latency = parser.add_mutually_exclusive_group()
    group_latency.add_argument(
        "--cold-start-latency",
        "--cold_start_latency",
        action="store_true",
        help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
    )
    group_latency.add_argument(
        "--warm-start-latency",
        action="store_true",
        help="Run model(s) twice and preseve caches in between to enable a 'warm start' on the 2nd run",
    )

    group_fuser = parser.add_mutually_exclusive_group()
    # --nvfuser is now the default, keep the option to not break scripts
    group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS)
@@ -3571,9 +3551,17 @@ def process_entry(rank, runner, original_dir, args):
        world_size=args.world_size,
        port=args.distributed_master_port,
    ):
        return maybe_fresh_cache(
            run, (args.cold_start_latency and args.only) or args.ci
        )(runner, args, original_dir)
        return run(runner, args, original_dir)


def maybe_fresh_cache(args):
    cache_dir_assigned = "TORCHINDUCTOR_CACHE_DIR" in os.environ
    if not cache_dir_assigned and (
        args.cold_start_latency or args.warm_start_latency or args.ci
    ):
        return fresh_inductor_cache()
    else:
        return contextlib.nullcontext()


def main(runner, original_dir=None, args=None):
@@ -3598,23 +3586,39 @@ def main(runner, original_dir=None, args=None):
            f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?"
        )

    args.init_distributed = args.only and args.multiprocess
    if args.init_distributed:
        # NB: Do NOT query device count before CUDA initialization; we're
        # going to overwrite CUDA_VISIBLE_DEVICES and this will result in
        # https://github.com/pytorch/pytorch/issues/107300
        device_count = torch.cuda.device_count()
        if device_count <= 1:
            log.warning(
                "The use multiprocess flag is set but there are <= 1 devices available."
    with maybe_fresh_cache(args):
        args.init_distributed = args.only and args.multiprocess
        if args.init_distributed:
            # NB: Do NOT query device count before CUDA initialization; we're
            # going to overwrite CUDA_VISIBLE_DEVICES and this will result in
            # https://github.com/pytorch/pytorch/issues/107300
            device_count = torch.cuda.device_count()
            if device_count <= 1:
                log.warning(
                    "The use multiprocess flag is set but there are <= 1 devices available."
                )
            # multiprocess path
            args.world_size = device_count
            mp.spawn(
                process_entry, args=(runner, original_dir, args), nprocs=device_count
            )
        # multiprocess path
        args.world_size = device_count
        mp.spawn(process_entry, args=(runner, original_dir, args), nprocs=device_count)
    else:
        # single process path just uses the main process
        args.world_size = 1
        process_entry(0, runner, original_dir, args)
        elif args.only and args.warm_start_latency:
            # Warm start mode. Enable FX graph caching and perform back-to-back runs in
            # separate processes (but ensure the inductor cache is preserved across runs).
            env = os.environ.copy()
            env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
            cmd = [sys.executable] + sys.argv
            cmd.remove("--warm-start-latency")

            print(f"Executing cold-start run for {args.only}")
            subprocess.check_call(cmd, timeout=args.timeout, env=env)

            print(f"Executing warm-start run for {args.only}")
            subprocess.check_call(cmd, timeout=args.timeout, env=env)
        else:
            # single process path just uses the main process
            args.world_size = 1
            process_entry(0, runner, original_dir, args)


def write_csv_when_exception(args, name: str, status: str, device=None):

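The warm-start path simply re-executes the benchmark command twice in subprocesses with FX graph caching forced on, so the second run reuses the inductor cache populated by the first. A condensed sketch of that driver logic; the command line below is illustrative, the real driver reuses sys.argv with --warm-start-latency removed:

    import os
    import subprocess
    import sys

    # Illustrative invocation of the dynamo benchmark runner.
    cmd = [sys.executable, "benchmarks/dynamo/torchbench.py", "--performance", "--only", "resnet50"]

    env = os.environ.copy()
    env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"  # persist compiled FX graphs across runs

    subprocess.check_call(cmd, env=env)  # cold-start run populates the cache
    subprocess.check_call(cmd, env=env)  # warm-start run measures the cache-hit compile path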
@@ -173,6 +173,19 @@ def _load_model(x: Experiment, device="cuda", precision=torch.bfloat16):
    return model.eval()


def _get_model_size(model):
    model_size = 0
    for name, child in model.named_children():
        if not isinstance(child, torch.nn.Embedding):
            model_size += sum(
                [
                    p.numel() * p.dtype.itemsize
                    for p in itertools.chain(child.parameters(), child.buffers())
                ]
            )
    return model_size


def run_experiment(
    x: Experiment,
    num_samples: int = 5,

@@ -193,10 +206,7 @@ def run_experiment(
    prompt_length = prompt.size(0)

    torch.manual_seed(1234)
    model_size = sum(
        p.numel() * p.dtype.itemsize
        for p in itertools.chain(model.parameters(), model.buffers())
    )
    model_size = _get_model_size(model)

    aggregate_metrics = {"tokens_per_sec": []}
    start = -1

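The new helper excludes embedding tables from the reported model size, which changes the bandwidth-style metrics derived from it for gpt-fast models. A small self-contained usage sketch with a toy module (illustrative, not from the benchmark); the helper body restates the logic shown above:

    import itertools
    import torch

    def _get_model_size(model):
        # Count bytes of parameters and buffers, skipping nn.Embedding children.
        model_size = 0
        for name, child in model.named_children():
            if not isinstance(child, torch.nn.Embedding):
                model_size += sum(
                    p.numel() * p.dtype.itemsize
                    for p in itertools.chain(child.parameters(), child.buffers())
                )
        return model_size

    toy = torch.nn.Sequential(
        torch.nn.Embedding(1000, 64),  # excluded from the size
        torch.nn.Linear(64, 64),       # 64*64 weights + 64 bias, fp32
    )
    print(_get_model_size(toy))  # (64*64 + 64) * 4 bytes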
@@ -37,24 +37,12 @@ endif()
if(USE_CUDA)
  caffe2_binary_target("inspect_gpu.cc")
  target_link_libraries(inspect_gpu ${CUDA_LIBRARIES})
  caffe2_binary_target("print_core_object_sizes_gpu.cc")

  if(BUILD_TEST)
    # Core overhead benchmark
    caffe2_binary_target("core_overhead_benchmark_gpu.cc")
    target_link_libraries(core_overhead_benchmark_gpu benchmark ${CUDA_curand_LIBRARY})
  endif()
endif()

if(USE_ROCM)
  caffe2_hip_binary_target("hip/inspect_gpu.cc")
  caffe2_hip_binary_target("hip/print_core_object_sizes_gpu.cc")

  if(BUILD_TEST)
    # Core overhead benchmark
    caffe2_hip_binary_target("hip/core_overhead_benchmark_gpu.cc")
    target_link_libraries(core_overhead_benchmark_gpu benchmark)
  endif()
endif()

if(USE_MPI)

@ -1,222 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
#define CAFFE2_SKIP_IF_NO_GPU \
|
||||
if (!caffe2::NumCudaDevices()) { \
|
||||
state.SkipWithError("No CUDA available, skipping benchmark."); \
|
||||
return; \
|
||||
}
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
static void BM_CUDAContextCreation(benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
volatile CUDAContext context_so_we_do_initialization_work;
|
||||
while (state.KeepRunning()) {
|
||||
volatile CUDAContext context;
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_CUDAContextCreation);
|
||||
|
||||
static void BM_CUDAContextStreamAccess(benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
CUDAContext context;
|
||||
while (state.KeepRunning()) {
|
||||
volatile cudaStream_t stream = context.cuda_stream();
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_CUDAContextStreamAccess);
|
||||
|
||||
static void BM_cudaGetDevice(benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
int id;
|
||||
while (state.KeepRunning()) {
|
||||
CUDA_ENFORCE(cudaGetDevice(&id));
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_cudaGetDevice);
|
||||
|
||||
static void BM_cudaSetDevice(benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
int total = NumCudaDevices();
|
||||
int i = 0;
|
||||
while (state.KeepRunning()) {
|
||||
CUDA_ENFORCE(cudaSetDevice((i++) % total));
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_cudaSetDevice);
|
||||
|
||||
static void BM_cudaSetAndGetDevice(benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
int total = NumCudaDevices();
|
||||
int i = 0;
|
||||
int id;
|
||||
while (state.KeepRunning()) {
|
||||
CUDA_ENFORCE(cudaSetDevice((i++) % total));
|
||||
CUDA_ENFORCE(cudaGetDevice(&id));
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_cudaSetAndGetDevice);
|
||||
|
||||
static void BM_cudaSetSameDevice(benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
while (state.KeepRunning()) {
|
||||
CUDA_ENFORCE(cudaSetDevice(0));
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_cudaSetSameDevice);
|
||||
|
||||
static void BM_cudaStreamCreateSyncDelete(benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
cudaStream_t stream;
|
||||
while (state.KeepRunning()) {
|
||||
CUDA_ENFORCE(cudaStreamCreate(&stream));
|
||||
CUDA_ENFORCE(cudaStreamSynchronize(stream));
|
||||
CUDA_ENFORCE(cudaStreamDestroy(stream));
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_cudaStreamCreateSyncDelete);
|
||||
|
||||
static void BM_cudaStreamSynchronize(benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
cudaStream_t stream;
|
||||
CUDA_ENFORCE(cudaStreamCreate(&stream));
|
||||
while (state.KeepRunning()) {
|
||||
CUDA_ENFORCE(cudaStreamSynchronize(stream));
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_cudaStreamSynchronize);
|
||||
|
||||
static void BM_cudaEventRecord(benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
cudaStream_t stream;
|
||||
cudaEvent_t event;
|
||||
CUDA_ENFORCE(cudaStreamCreate(&stream));
|
||||
CUDA_ENFORCE(cudaEventCreateWithFlags(
|
||||
&event, cudaEventDefault | cudaEventDisableTiming));
|
||||
while (state.KeepRunning()) {
|
||||
CUDA_ENFORCE(cudaEventRecord(event, stream));
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_cudaEventRecord);
|
||||
|
||||
static void BM_cudaStreamWaitEventThenStreamSynchronize(
|
||||
benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
cudaStream_t stream;
|
||||
cudaEvent_t event;
|
||||
CUDA_ENFORCE(cudaStreamCreate(&stream));
|
||||
CUDA_ENFORCE(cudaEventCreateWithFlags(
|
||||
&event, cudaEventDefault | cudaEventDisableTiming));
|
||||
CUDA_ENFORCE(cudaEventRecord(event, stream));
|
||||
CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
|
||||
CUDA_ENFORCE(cudaStreamSynchronize(stream));
|
||||
while (state.KeepRunning()) {
|
||||
CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
|
||||
CUDA_ENFORCE(cudaStreamSynchronize(stream));
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
|
||||
|
||||
static void BM_CudaPointerAffinity(benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
Tensor tensor(vector<int64_t>{1, 2, 3, 4}, CUDA);
|
||||
float* ptr = tensor.mutable_data<float>();
|
||||
while (state.KeepRunning()) {
|
||||
volatile int id = GetGPUIDForPointer(ptr);
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_CudaPointerAffinity);
|
||||
|
||||
namespace {
|
||||
template <class Context>
|
||||
class DummyEmptyOp : public Operator<Context> {
|
||||
public:
|
||||
DummyEmptyOp(const OperatorDef& def, Workspace* ws)
|
||||
: Operator<Context>(def, ws) {}
|
||||
|
||||
bool RunOnDevice() final { return true; }
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR(DummyEmpty, DummyEmptyOp<CPUContext>);
|
||||
REGISTER_CUDA_OPERATOR(DummyEmpty, DummyEmptyOp<CUDAContext>);
|
||||
OPERATOR_SCHEMA(DummyEmpty);
|
||||
} // namespace
|
||||
|
||||
static void BM_OperatorCreationCPU(benchmark::State& state) {
|
||||
std::unique_ptr<OperatorBase> op;
|
||||
OperatorDef def;
|
||||
Workspace ws;
|
||||
def.set_type("DummyEmpty");
|
||||
def.mutable_device_option()->set_device_type(PROTO_CPU);
|
||||
while (state.KeepRunning()) {
|
||||
op = CreateOperator(def, &ws);
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_OperatorCreationCPU);
|
||||
|
||||
static void BM_OperatorCreationCUDA(benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
std::unique_ptr<OperatorBase> op;
|
||||
OperatorDef def;
|
||||
Workspace ws;
|
||||
def.set_type("DummyEmpty");
|
||||
def.mutable_device_option()->set_device_type(PROTO_CUDA);
|
||||
while (state.KeepRunning()) {
|
||||
op = CreateOperator(def, &ws);
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_OperatorCreationCUDA);
|
||||
|
||||
static void BM_RawAllocDeallocCPU(benchmark::State& state) {
|
||||
while (state.KeepRunning()) {
|
||||
// Allocating only 1 byte in order to measure the overhead.
|
||||
auto data_ptr = GetCPUAllocator()->allocate(1);
|
||||
// Deallocated when it's out of scope
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_RawAllocDeallocCPU);
|
||||
|
||||
static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
|
||||
Tensor tensor(CPU);
|
||||
// small allocation
|
||||
tensor.Resize(32, 32);
|
||||
while (state.KeepRunning()) {
|
||||
CHECK(tensor.mutable_data<float>());
|
||||
tensor.FreeMemory();
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_TensorAllocDeallocCPU);
|
||||
|
||||
static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
|
||||
CAFFE2_SKIP_IF_NO_GPU;
|
||||
Tensor tensor(CUDA);
|
||||
// small allocation
|
||||
tensor.Resize(32, 32);
|
||||
while (state.KeepRunning()) {
|
||||
CHECK(tensor.mutable_data<float>());
|
||||
tensor.FreeMemory();
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_TensorAllocDeallocCUDA);
|
||||
|
||||
BENCHMARK_MAIN();
|
||||
@ -1,40 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "caffe2/proto/caffe2_pb.h"
|
||||
|
||||
#define PRINT_SIZE(cls) \
|
||||
std::cout << "Size of " #cls ": " << sizeof(cls) << " bytes." \
|
||||
<< std::endl;
|
||||
|
||||
int main(int /* unused */, char** /* unused */) {
|
||||
PRINT_SIZE(caffe2::Blob);
|
||||
PRINT_SIZE(caffe2::Tensor);
|
||||
PRINT_SIZE(caffe2::CPUContext);
|
||||
PRINT_SIZE(caffe2::CUDAContext);
|
||||
PRINT_SIZE(caffe2::OperatorBase);
|
||||
PRINT_SIZE(caffe2::OperatorDef);
|
||||
PRINT_SIZE(caffe2::Operator<caffe2::CPUContext>);
|
||||
PRINT_SIZE(caffe2::Operator<caffe2::CUDAContext>);
|
||||
PRINT_SIZE(caffe2::TypeMeta);
|
||||
PRINT_SIZE(caffe2::Workspace);
|
||||
return 0;
|
||||
}
|
||||
@ -1,49 +0,0 @@
|
||||
/**
|
||||
* Copyright (c) 2016-present, Facebook, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "caffe2/core/blob_serialization.h"
|
||||
#include "caffe2/core/db.h"
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/proto/caffe2_pb.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
|
||||
C10_DEFINE_string(f_in, "", "The input data file name.");
|
||||
C10_DEFINE_string(f_out, "", "The output data file name.");
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
caffe2::GlobalInit(&argc, &argv);
|
||||
std::ifstream f_in(FLAGS_f_in);
|
||||
std::ofstream f_out(FLAGS_f_out);
|
||||
std::string line;
|
||||
caffe2::TensorProtos tensor_protos;
|
||||
while (std::getline(f_in, line)) {
|
||||
caffe2::TensorProto* data = tensor_protos.add_protos();
|
||||
data->set_data_type(caffe2::TensorProto::STRING);
|
||||
data->add_dims(1);
|
||||
data->add_string_data(line);
|
||||
data->set_name("text");
|
||||
}
|
||||
f_in.close();
|
||||
std::string output_str;
|
||||
tensor_protos.SerializeToString(&output_str);
|
||||
f_out << output_str;
|
||||
f_out.close();
|
||||
return 0;
|
||||
}
|
||||
@@ -824,6 +824,7 @@ libtorch_python_core_sources = [
    "torch/csrc/mps/Module.cpp",
    "torch/csrc/mtia/Module.cpp",
    "torch/csrc/inductor/aoti_runner/pybind.cpp",
    "torch/csrc/inductor/aoti_eager/kernel_holder.cpp",
    "torch/csrc/jit/backends/backend_init.cpp",
    "torch/csrc/jit/python/init.cpp",
    "torch/csrc/jit/passes/onnx.cpp",

@@ -16,7 +16,7 @@ namespace c10 {
class DataPtr;

/**
 * Flags defining the behavior of events.
 * Note [Flags defining the behavior of events]
 *
 * PYTORCH_DEFAULT and BACKEND_DEFAULT are valid for all backends. The
 * BACKEND_DEFAULT is what a particular backend would select if no
@@ -223,7 +223,10 @@ struct C10_API DeviceGuardImplInterface {
  /**
   * Fetch the elapsed time between two recorded events.
   */
  virtual double elapsedTime(void* /*event1*/, void* /*event2*/) const {
  virtual double elapsedTime(
      void* /*event1*/,
      void* /*event2*/,
      const DeviceIndex /*device_index*/) const {
    TORCH_CHECK(false, "Backend doesn't support elapsedTime.");
  }

@@ -118,7 +118,7 @@ struct InlineEvent final {
        " does not match other's device type ",
        DeviceTypeName(other.device_type()),
        ".");
    return backend_.elapsedTime(event_, other.event_);
    return backend_.elapsedTime(event_, other.event_, device_index_);
  }

  void synchronize() const {

@@ -87,8 +87,9 @@ class VirtualGuardImpl final : public DeviceGuardImplInterface {
    impl_->recordDataPtrOnStream(data_ptr, stream);
  }

  double elapsedTime(void* event1, void* event2) const override {
    return impl_->elapsedTime(event1, event2);
  double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
      const override {
    return impl_->elapsedTime(event1, event2, device_index);
  }

  void synchronizeEvent(void* event) const override {

@@ -184,6 +184,7 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
    if (!event)
      return true;
    cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
    // Note: cudaEventQuery can be safely called from any device
    const cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(cuda_event));
    if (err != cudaErrorNotReady) {
      C10_CUDA_CHECK(err);

@@ -205,11 +206,44 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
    cuda_stream.synchronize();
  }

  void synchronizeEvent(void* event) const override {
    if (!event)
      return;
    cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
    if (C10_UNLIKELY(interp)) {
      (*interp)->trace_gpu_event_synchronization(
          c10::kCUDA, reinterpret_cast<uintptr_t>(cuda_event));
    }
    // Note: cudaEventSynchronize can be safely called from any device
    C10_CUDA_CHECK(cudaEventSynchronize(cuda_event));
  }

  void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream)
      const override {
    CUDAStream cuda_stream{stream};
    CUDACachingAllocator::recordStream(data_ptr, cuda_stream);
  }

  double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
      const override {
    TORCH_CHECK(
        event1 && event2,
        "Both events must be recorded before calculating elapsed time.");
    // Even though cudaEventElapsedTime can be safely called from any device, if
    // the current device is not initialized, it will create a new cuda context,
    // which will consume a lot of memory.
    DeviceIndex orig_device{-1};
    C10_CUDA_CHECK(c10::cuda::GetDevice(&orig_device));
    C10_CUDA_CHECK(c10::cuda::SetDevice(device_index));
    cudaEvent_t cuda_event1 = static_cast<cudaEvent_t>(event1);
    cudaEvent_t cuda_event2 = static_cast<cudaEvent_t>(event2);
    float time_ms = 0;
    // raise cudaErrorNotReady if either event is recorded but not yet completed
    C10_CUDA_CHECK(cudaEventElapsedTime(&time_ms, cuda_event1, cuda_event2));
    C10_CUDA_CHECK(c10::cuda::SetDevice(orig_device));
    return static_cast<double>(time_ms);
  }
};

} // namespace c10::cuda::impl

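elapsedTime now switches to the recording event's device before calling cudaEventElapsedTime and restores it afterwards, so timing from a different current device no longer spins up an extra CUDA context. The same record/record/elapsed pattern is what the event abstraction exposes to users; a sketch using the standard torch.cuda event API for illustration (that API is assumed to be the relevant user-facing surface, the diff itself only changes the c10 layer):

    import torch

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    torch.mm(torch.randn(1024, 1024, device="cuda"), torch.randn(1024, 1024, device="cuda"))
    end.record()

    torch.cuda.synchronize()
    print(start.elapsed_time(end))  # milliseconds between the two recorded events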
@ -20,8 +20,6 @@
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
#define STRONG_NODISCARD [[nodiscard]]
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__) && __MSC_VER < 1922
|
||||
#define STRONG_CONSTEXPR
|
||||
#else
|
||||
@ -127,18 +125,18 @@ public:
|
||||
swap(a.val, b.val);
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
constexpr T& value_of() & noexcept { return val;}
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
constexpr const T& value_of() const & noexcept { return val;}
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
constexpr T&& value_of() && noexcept { return std::move(val);}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend constexpr T& value_of(type& t) noexcept { return t.val;}
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend constexpr const T& value_of(const type& t) noexcept { return t.val;}
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend constexpr T&& value_of(type&& t) noexcept { return std::move(t).val;}
|
||||
private:
|
||||
T val;
|
||||
@ -192,7 +190,7 @@ namespace impl {
|
||||
template <
|
||||
typename T,
|
||||
typename = impl::WhenStrongType<T>>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
constexpr
|
||||
auto
|
||||
access(T&& t)
|
||||
@ -215,7 +213,7 @@ class equality::modifier<::strong::type<T, Tag, M...>>
|
||||
{
|
||||
using type = ::strong::type<T, Tag, M...>;
|
||||
public:
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto
|
||||
@ -228,7 +226,7 @@ public:
|
||||
return value_of(lh) == value_of(rh);
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto
|
||||
@ -251,7 +249,7 @@ namespace impl
|
||||
using TT = underlying_type_t<T>;
|
||||
using OT = underlying_type_t<Other>;
|
||||
public:
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto operator==(const T& lh, const Other& rh)
|
||||
@ -260,7 +258,7 @@ namespace impl
|
||||
{
|
||||
return value_of(lh) == impl::access(rh);
|
||||
}
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto operator==(const Other& lh, const T& rh)
|
||||
@ -269,7 +267,7 @@ namespace impl
|
||||
{
|
||||
return impl::access(lh) == value_of(rh) ;
|
||||
}
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto operator!=(const T& lh, const Other rh)
|
||||
@ -278,7 +276,7 @@ namespace impl
|
||||
{
|
||||
return value_of(lh) != impl::access(rh);
|
||||
}
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto operator!=(const Other& lh, const T& rh)
|
||||
@ -307,7 +305,7 @@ namespace impl
|
||||
using TT = underlying_type_t<T>;
|
||||
using OT = underlying_type_t<Other>;
|
||||
public:
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto operator<(const T& lh, const Other& rh)
|
||||
@ -316,7 +314,7 @@ namespace impl
|
||||
{
|
||||
return value_of(lh) < impl::access(rh);
|
||||
}
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto operator<(const Other& lh, const T& rh)
|
||||
@ -326,7 +324,7 @@ namespace impl
|
||||
return impl::access(lh) < value_of(rh) ;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto operator<=(const T& lh, const Other& rh)
|
||||
@ -335,7 +333,7 @@ namespace impl
|
||||
{
|
||||
return value_of(lh) <= impl::access(rh);
|
||||
}
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto operator<=(const Other& lh, const T& rh)
|
||||
@ -345,7 +343,7 @@ namespace impl
|
||||
return impl::access(lh) <= value_of(rh) ;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto operator>(const T& lh, const Other& rh)
|
||||
@ -354,7 +352,7 @@ namespace impl
|
||||
{
|
||||
return value_of(lh) > impl::access(rh);
|
||||
}
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto operator>(const Other& lh, const T& rh)
|
||||
@ -364,7 +362,7 @@ namespace impl
|
||||
return impl::access(lh) > value_of(rh) ;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto operator>=(const T& lh, const Other& rh)
|
||||
@ -373,7 +371,7 @@ namespace impl
|
||||
{
|
||||
return value_of(lh) >= impl::access(rh);
|
||||
}
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto operator>=(const Other& lh, const T& rh)
|
||||
@ -487,7 +485,7 @@ class ordered::modifier<::strong::type<T, Tag, M...>>
|
||||
{
|
||||
using type = ::strong::type<T, Tag, M...>;
|
||||
public:
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto
|
||||
@ -500,7 +498,7 @@ public:
|
||||
return value_of(lh) < value_of(rh);
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto
|
||||
@ -513,7 +511,7 @@ public:
|
||||
return value_of(lh) <= value_of(rh);
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto
|
||||
@ -526,7 +524,7 @@ public:
|
||||
return value_of(lh) > value_of(rh);
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
|
||||
@ -829,7 +827,7 @@ class affine_point<D>::modifier<::strong::type<T, Tag, M...>>
|
||||
public:
|
||||
using difference = std::conditional_t<std::is_same<D, void>{}, strong::type<base_diff_type, Tag, strong::difference>, D>;
|
||||
static_assert(std::is_constructible<difference, base_diff_type>::value, "");
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
difference
|
||||
@ -864,7 +862,7 @@ public:
|
||||
return lh;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
type
|
||||
@ -875,7 +873,7 @@ public:
|
||||
return lh += d;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
type
|
||||
@ -886,7 +884,7 @@ public:
|
||||
return rh+= d;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
type
|
||||
@ -911,7 +909,7 @@ class pointer::modifier<::strong::type<T, Tag, M...>>
|
||||
using type = strong::type<T, Tag, M...>;
|
||||
public:
|
||||
template <typename TT = T>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto
|
||||
@ -925,7 +923,7 @@ public:
|
||||
}
|
||||
|
||||
template <typename TT = T>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto
|
||||
@ -939,7 +937,7 @@ public:
|
||||
}
|
||||
|
||||
template <typename TT = T>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto
|
||||
@ -953,7 +951,7 @@ public:
|
||||
}
|
||||
|
||||
template <typename TT = T>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
auto
|
||||
@ -966,7 +964,7 @@ public:
|
||||
return value_of(t) != nullptr;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
STRONG_CONSTEXPR
|
||||
decltype(*std::declval<const T&>())
|
||||
operator*()
|
||||
@ -976,7 +974,7 @@ public:
|
||||
return *value_of(self);
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
STRONG_CONSTEXPR
|
||||
decltype(&(*std::declval<const T&>())) operator->() const { return &operator*();}
|
||||
};
|
||||
@ -987,7 +985,7 @@ struct arithmetic
|
||||
class modifier
|
||||
{
|
||||
public:
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
T
|
||||
@ -1058,7 +1056,7 @@ struct arithmetic
|
||||
return lh;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
T
|
||||
@ -1070,7 +1068,7 @@ struct arithmetic
|
||||
return lh;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
T
|
||||
@ -1082,7 +1080,7 @@ struct arithmetic
|
||||
return lh;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
T
|
||||
@ -1094,7 +1092,7 @@ struct arithmetic
|
||||
return lh;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
T
|
||||
@ -1107,7 +1105,7 @@ struct arithmetic
|
||||
}
|
||||
|
||||
template <typename TT = T, typename = decltype(value_of(std::declval<TT>()) % value_of(std::declval<TT>()))>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
T
|
||||
@ -1191,7 +1189,7 @@ struct bitarithmetic
|
||||
return lh;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
T
|
||||
@ -1203,7 +1201,7 @@ struct bitarithmetic
|
||||
return T(v);
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
T
|
||||
@ -1215,7 +1213,7 @@ struct bitarithmetic
|
||||
return lh;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
T
|
||||
@ -1227,7 +1225,7 @@ struct bitarithmetic
|
||||
return lh;
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
T
|
||||
@ -1240,7 +1238,7 @@ struct bitarithmetic
|
||||
}
|
||||
|
||||
template <typename C>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
T
|
||||
@ -1253,7 +1251,7 @@ struct bitarithmetic
|
||||
}
|
||||
|
||||
template <typename C>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
friend
|
||||
STRONG_CONSTEXPR
|
||||
T
|
||||
@ -1286,7 +1284,7 @@ struct indexed<void> {
|
||||
using type = strong::type<T, Tag, Ms...>;
|
||||
public:
|
||||
template<typename I>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
auto
|
||||
operator[](
|
||||
const I &i)
|
||||
@ -1298,7 +1296,7 @@ struct indexed<void> {
|
||||
}
|
||||
|
||||
template<typename I>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
auto
|
||||
operator[](
|
||||
const I &i)
|
||||
@ -1310,7 +1308,7 @@ struct indexed<void> {
|
||||
}
|
||||
|
||||
template<typename I>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
auto
|
||||
operator[](
|
||||
const I &i)
|
||||
@ -1322,7 +1320,7 @@ struct indexed<void> {
|
||||
}
|
||||
|
||||
template<typename I, typename C = cref>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
auto
|
||||
at(
|
||||
const I &i)
|
||||
@ -1333,7 +1331,7 @@ struct indexed<void> {
|
||||
}
|
||||
|
||||
template<typename I, typename R = ref>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
auto
|
||||
at(
|
||||
const I &i)
|
||||
@ -1344,7 +1342,7 @@ struct indexed<void> {
|
||||
}
|
||||
|
||||
template<typename I, typename R = rref>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
auto
|
||||
at(
|
||||
const I &i)
|
||||
@ -1362,7 +1360,7 @@ class indexed<I>::modifier<type<T, Tag, M...>>
|
||||
{
|
||||
using type = ::strong::type<T, Tag, M...>;
|
||||
public:
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
auto
|
||||
operator[](
|
||||
const I& i)
|
||||
@ -1374,7 +1372,7 @@ public:
|
||||
return value_of(self)[impl::access(i)];
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
auto
|
||||
operator[](
|
||||
const I& i)
|
||||
@ -1386,7 +1384,7 @@ public:
|
||||
return value_of(self)[impl::access(i)];
|
||||
}
|
||||
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
auto
|
||||
operator[](
|
||||
const I& i)
|
||||
@ -1399,7 +1397,7 @@ public:
|
||||
}
|
||||
|
||||
template <typename TT = T>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
auto
|
||||
at(
|
||||
const I& i)
|
||||
@ -1411,7 +1409,7 @@ public:
|
||||
}
|
||||
|
||||
template <typename TT = T>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
auto
|
||||
at(
|
||||
const I& i)
|
||||
@ -1423,7 +1421,7 @@ public:
|
||||
}
|
||||
|
||||
template <typename TT = T>
|
||||
STRONG_NODISCARD
|
||||
[[nodiscard]]
|
||||
auto
|
||||
at(
|
||||
const I& i)
|
||||
|
||||
@@ -48,6 +48,10 @@ struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
    return getCurrentXPUStream(d.index()).unwrap();
  }

  Stream getNewStream(Device d, int priority = 0) const override {
    return getStreamFromPool(priority, d.index());
  }

  Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false)
      const override {
    return getStreamFromPool(isHighPriority, d.index());

@@ -100,6 +104,7 @@ struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
    if (xpu_event)
      delete xpu_event;
    xpu_event = new sycl::event(xpu_stream.queue().ext_oneapi_submit_barrier());
    *event = reinterpret_cast<void*>(xpu_event);

    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
    if (C10_UNLIKELY(interp)) {

@@ -146,11 +151,29 @@ struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
    xpu_stream.synchronize();
  }

  void synchronizeEvent(void* event) const override {
    if (!event)
      return;
    auto* xpu_event = reinterpret_cast<sycl::event*>(event);
    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
    if (C10_UNLIKELY(interp)) {
      (*interp)->trace_gpu_event_synchronization(
          c10::kXPU, reinterpret_cast<uintptr_t>(xpu_event));
    }
    xpu_event->wait_and_throw();
  }

  void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream)
      const override {
    const XPUStream xpu_stream{stream};
    XPUCachingAllocator::recordStream(data_ptr, xpu_stream);
  }

  double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
      const override {
    TORCH_CHECK_NOT_IMPLEMENTED(
        false, "elapsedTime is not supported by XPU backend.");
  }
};

} // namespace c10::xpu::impl

@ -54,7 +54,9 @@ TEST(XPUGuardTest, EventBehavior) {
|
||||
c10::impl::VirtualGuardImpl impl(device.type());
|
||||
c10::Stream stream1 = impl.getStream(device);
|
||||
c10::Stream stream2 = impl.getStream(device);
|
||||
c10::Event event(device.type());
|
||||
c10::Event event1(device.type());
|
||||
// event is lazily created.
|
||||
EXPECT_FALSE(event1.eventId());
|
||||
|
||||
constexpr int numel = 1024;
|
||||
int hostData1[numel];
|
||||
@ -63,32 +65,54 @@ TEST(XPUGuardTest, EventBehavior) {
|
||||
clearHostData(hostData2, numel);
|
||||
|
||||
auto xpu_stream1 = c10::xpu::XPUStream(stream1);
|
||||
int* deviceData = sycl::malloc_device<int>(numel, xpu_stream1);
|
||||
int* deviceData1 = sycl::malloc_device<int>(numel, xpu_stream1);
|
||||
|
||||
// Copy hostData1 to deviceData via stream1, and then copy deviceData to
|
||||
// Copy hostData1 to deviceData1 via stream1, and then copy deviceData1 to
|
||||
// hostData2 via stream2.
|
||||
xpu_stream1.queue().memcpy(deviceData, hostData1, sizeof(int) * numel);
|
||||
xpu_stream1.queue().memcpy(deviceData1, hostData1, sizeof(int) * numel);
|
||||
// stream2 wait on stream1's completion.
|
||||
event.record(stream1);
|
||||
event.block(stream2);
|
||||
event1.record(stream1);
|
||||
event1.block(stream2);
|
||||
auto xpu_stream2 = c10::xpu::XPUStream(stream2);
|
||||
xpu_stream2.queue().memcpy(hostData2, deviceData, sizeof(int) * numel);
|
||||
xpu_stream2.queue().memcpy(hostData2, deviceData1, sizeof(int) * numel);
|
||||
xpu_stream2.synchronize();
|
||||
|
||||
EXPECT_TRUE(event.query());
|
||||
EXPECT_TRUE(event1.query());
|
||||
validateHostData(hostData2, numel);
|
||||
event.record(stream2);
|
||||
EXPECT_TRUE(event.query());
|
||||
event1.record(stream2);
|
||||
event1.synchronize();
|
||||
EXPECT_TRUE(event1.query());
|
||||
|
||||
clearHostData(hostData2, numel);
|
||||
xpu_stream1.queue().memcpy(deviceData, hostData1, sizeof(int) * numel);
|
||||
xpu_stream1.queue().memcpy(deviceData1, hostData1, sizeof(int) * numel);
|
||||
// stream2 wait on stream1's completion.
|
||||
event.record(stream1);
|
||||
event.block(stream2);
|
||||
// event will overwrite the previously captured state.
|
||||
event.record(stream2);
|
||||
xpu_stream2.queue().memcpy(hostData2, deviceData, sizeof(int) * numel);
|
||||
event1.record(stream1);
|
||||
event1.block(stream2);
|
||||
// event1 will overwrite the previously captured state.
|
||||
event1.record(stream2);
|
||||
xpu_stream2.queue().memcpy(hostData2, deviceData1, sizeof(int) * numel);
|
||||
xpu_stream2.synchronize();
|
||||
EXPECT_TRUE(event.query());
|
||||
EXPECT_TRUE(event1.query());
|
||||
validateHostData(hostData2, numel);
|
||||
|
||||
clearHostData(hostData2, numel);
|
||||
// ensure deviceData1 and deviceData2 are different buffers.
|
||||
int* deviceData2 = sycl::malloc_device<int>(numel, xpu_stream1);
|
||||
sycl::free(deviceData1, c10::xpu::get_device_context());
|
||||
c10::Event event2(device.type());
|
||||
|
||||
// Copy hostData1 to deviceData2 via stream1, and then copy deviceData2 to
|
||||
// hostData1 via stream1.
|
||||
xpu_stream1.queue().memcpy(deviceData2, hostData1, sizeof(int) * numel);
|
||||
event2.record(xpu_stream1);
|
||||
event2.synchronize();
|
||||
EXPECT_TRUE(event2.query());
|
||||
clearHostData(hostData1, numel);
|
||||
xpu_stream1.queue().memcpy(hostData1, deviceData2, sizeof(int) * numel);
|
||||
event2.record(xpu_stream1);
|
||||
event2.synchronize();
|
||||
EXPECT_TRUE(event2.query());
|
||||
EXPECT_NE(event1.eventId(), event2.eventId());
|
||||
ASSERT_THROW(event1.elapsedTime(event2), c10::Error);
|
||||
sycl::free(deviceData2, c10::xpu::get_device_context());
|
||||
}
|
||||
|
||||
@ -123,7 +123,6 @@ if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
|
||||
if(BUILD_CAFFE2_OPS)
|
||||
endif()
|
||||
add_subdirectory(proto)
|
||||
add_subdirectory(python)
|
||||
endif()
|
||||
if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
|
||||
add_subdirectory(proto)
|
||||
@ -1998,158 +1997,10 @@ if(BUILD_PYTHON)
|
||||
set_source_files_properties(../aten/src/ATen/native/EmbeddingBag.cpp PROPERTIES COMPILE_FLAGS -Wno-attributes)
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/../caffe2/operators/box_with_nms_limit_op.cc PROPERTIES COMPILE_FLAGS -Wno-attributes)
|
||||
endif()
|
||||
# Allow different install locations for libcaffe2
|
||||
# For setuptools installs (that all build Python), install libcaffe2 into
|
||||
# site-packages, alongside the torch libraries. The pybind11 library needs
|
||||
# an rpath to the torch library folder
|
||||
# For cmake installs, including c++ only installs, install libcaffe2 into
|
||||
# CMAKE_INSTALL_PREFIX/lib . The pybind11 library can have a hardcoded
|
||||
# rpath
|
||||
set(caffe2_pybind11_rpath "${_rpath_portable_origin}")
|
||||
if(${BUILDING_WITH_TORCH_LIBS})
|
||||
# site-packages/caffe2/python/caffe2_pybind11_state
|
||||
# site-packages/torch/lib
|
||||
set(caffe2_pybind11_rpath "${_rpath_portable_origin}/../../torch/lib")
|
||||
endif(${BUILDING_WITH_TORCH_LIBS})
|
||||
|
||||
# Must also include `CMAKE_SHARED_LINKER_FLAGS` in linker flags for
|
||||
# `caffe2_pybind11_state_*` targets because paths to required libraries may
|
||||
# need to be found there (e.g., specifying path to `libiomp5` with `LDFLAGS`).
|
||||
set(_caffe2_pybind11_state_linker_flags "${CMAKE_SHARED_LINKER_FLAGS}")
|
||||
if(APPLE)
|
||||
set(_caffe2_pybind11_state_linker_flags "${_caffe2_pybind11_state_linker_flags} -undefined dynamic_lookup")
|
||||
endif()
|
||||
|
||||
# ---[ Python.
|
||||
if(BUILD_CAFFE2)
|
||||
add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS})
|
||||
target_compile_definitions(torch PRIVATE BUILD_CAFFE2)
|
||||
target_compile_definitions(torch_python PRIVATE BUILD_CAFFE2)
|
||||
if(USE_NUMPY)
|
||||
target_compile_options(caffe2_pybind11_state PRIVATE "-DUSE_NUMPY")
|
||||
target_link_libraries(caffe2_pybind11_state PRIVATE numpy::numpy)
|
||||
endif()
|
||||
if(NOT MSVC)
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
|
||||
endif()
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES PREFIX "" DEBUG_POSTFIX "")
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "${_caffe2_pybind11_state_linker_flags}")
|
||||
target_include_directories(caffe2_pybind11_state PRIVATE $<INSTALL_INTERFACE:include>)
|
||||
target_include_directories(caffe2_pybind11_state PRIVATE ${Caffe2_CPU_INCLUDE})
|
||||
|
||||
target_link_libraries(caffe2_pybind11_state PRIVATE
|
||||
torch_library python::python pybind::pybind11)
|
||||
if(USE_MKLDNN)
|
||||
target_link_libraries(caffe2_pybind11_state PRIVATE caffe2::mkldnn)
|
||||
endif()
|
||||
if(WIN32)
|
||||
target_link_libraries(caffe2_pybind11_state PRIVATE onnx_proto)
|
||||
endif(WIN32)
|
||||
|
||||
# Install caffe2_pybind11_state(_gpu|hip) in site-packages/caffe2/python,
|
||||
# so it needs an rpath to find libcaffe2
|
||||
set_target_properties(
|
||||
caffe2_pybind11_state PROPERTIES LIBRARY_OUTPUT_DIRECTORY
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
install(TARGETS caffe2_pybind11_state DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
|
||||
if(MSVC AND BUILD_SHARED_LIBS)
|
||||
install(FILES $<TARGET_PDB_FILE:caffe2_pybind11_state> DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python" OPTIONAL)
|
||||
endif()
|
||||
set_target_properties(caffe2_pybind11_state PROPERTIES INSTALL_RPATH "${caffe2_pybind11_rpath}")
|
||||
|
||||
if(USE_CUDA)
|
||||
add_library(caffe2_pybind11_state_gpu MODULE ${Caffe2_GPU_PYTHON_SRCS})
|
||||
if(USE_NUMPY)
|
||||
target_compile_options(caffe2_pybind11_state_gpu PRIVATE "-DUSE_NUMPY")
|
||||
target_link_libraries(caffe2_pybind11_state_gpu PRIVATE numpy::numpy)
|
||||
endif()
|
||||
if(NOT MSVC)
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
|
||||
endif()
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES PREFIX "" DEBUG_POSTFIX "")
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "${_caffe2_pybind11_state_linker_flags}")
|
||||
target_include_directories(caffe2_pybind11_state_gpu PRIVATE $<INSTALL_INTERFACE:include>)
|
||||
target_include_directories(caffe2_pybind11_state_gpu PRIVATE ${Caffe2_CPU_INCLUDE})
|
||||
target_link_libraries(caffe2_pybind11_state_gpu PRIVATE
|
||||
torch_library python::python pybind::pybind11)
|
||||
if(USE_MKLDNN)
|
||||
target_link_libraries(caffe2_pybind11_state_gpu PRIVATE caffe2::mkldnn)
|
||||
endif()
|
||||
if(WIN32)
|
||||
target_link_libraries(caffe2_pybind11_state_gpu PRIVATE onnx_proto)
|
||||
endif(WIN32)
|
||||
|
||||
# Install with same rpath as non-gpu caffe2_pybind11_state
|
||||
set_target_properties(
|
||||
caffe2_pybind11_state_gpu PROPERTIES LIBRARY_OUTPUT_DIRECTORY
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
install(TARGETS caffe2_pybind11_state_gpu DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
|
||||
if(MSVC AND BUILD_SHARED_LIBS)
|
||||
install(FILES $<TARGET_PDB_FILE:caffe2_pybind11_state_gpu> DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python" OPTIONAL)
|
||||
endif()
|
||||
set_target_properties(caffe2_pybind11_state_gpu PROPERTIES INSTALL_RPATH "${caffe2_pybind11_rpath}")
|
||||
endif()
|
||||
|
||||
if(USE_ROCM)
|
||||
add_library(caffe2_pybind11_state_hip MODULE ${Caffe2_HIP_PYTHON_SRCS})
|
||||
if(USE_NUMPY)
|
||||
target_compile_options(caffe2_pybind11_state_hip PRIVATE "-DUSE_NUMPY")
|
||||
target_link_libraries(caffe2_pybind11_state_hip PRIVATE numpy::numpy)
|
||||
endif()
|
||||
if(NOT MSVC)
|
||||
target_compile_options(caffe2_pybind11_state_hip PRIVATE ${HIP_CXX_FLAGS} -fvisibility=hidden)
|
||||
endif()
|
||||
set_target_properties(caffe2_pybind11_state_hip PROPERTIES PREFIX "")
|
||||
set_target_properties(caffe2_pybind11_state_hip PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
|
||||
set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "${_caffe2_pybind11_state_linker_flags}")
|
||||
target_include_directories(caffe2_pybind11_state_hip PRIVATE $<INSTALL_INTERFACE:include>)
|
||||
target_include_directories(caffe2_pybind11_state_hip PRIVATE ${Caffe2_CPU_INCLUDE} ${Caffe2_HIP_INCLUDE})
|
||||
target_link_libraries(caffe2_pybind11_state_hip PRIVATE
|
||||
torch_library python::python pybind::pybind11)
|
||||
|
||||
# Install with same rpath as non-hip caffe2_pybind11_state
|
||||
set_target_properties(
|
||||
caffe2_pybind11_state_hip PROPERTIES LIBRARY_OUTPUT_DIRECTORY
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
install(TARGETS caffe2_pybind11_state_hip DESTINATION "${PYTHON_LIB_REL_PATH}/caffe2/python")
|
||||
set_target_properties(caffe2_pybind11_state_hip PROPERTIES INSTALL_RPATH "${caffe2_pybind11_rpath}")
|
||||
endif()
|
||||
|
||||
if(MSVC AND CMAKE_GENERATOR MATCHES "Visual Studio")
|
||||
# If we are building under windows, we will copy the file from
|
||||
# build/caffe2/python/{Debug,Release}/caffe2_pybind11_state.pyd
|
||||
# to its parent folder so that we can do in-build execution.
|
||||
add_custom_target(windows_python_copy_lib ALL)
|
||||
add_dependencies(windows_python_copy_lib caffe2_pybind11_state)
|
||||
add_custom_command(
|
||||
TARGET windows_python_copy_lib POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
$<TARGET_FILE:caffe2_pybind11_state>
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
if(USE_CUDA)
|
||||
add_dependencies(windows_python_copy_lib caffe2_pybind11_state_gpu)
|
||||
add_custom_command(
|
||||
TARGET windows_python_copy_lib POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
$<TARGET_FILE:caffe2_pybind11_state_gpu>
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
endif()
|
||||
if(USE_ROCM)
|
||||
add_dependencies(windows_python_copy_lib caffe2_pybind11_state_hip)
|
||||
add_custom_command(
|
||||
TARGET windows_python_copy_lib POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
$<TARGET_FILE:caffe2_pybind11_state_hip>
|
||||
${CMAKE_BINARY_DIR}/caffe2/python)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Finally, Copy all python files to build directory
|
||||
# Create a custom target that copies all python files.
|
||||
file(GLOB_RECURSE PYTHON_SRCS RELATIVE ${PROJECT_SOURCE_DIR}
|
||||
"${PROJECT_SOURCE_DIR}/caffe2/*.py")
|
||||
endif()
|
||||
|
||||
# generated pb files are copied from build/caffe2 to caffe2
|
||||
|
||||
@@ -1,20 +0,0 @@
if(USE_MKLDNN)
  message(STATUS "Including IDEEP operators")

  # ---[ CPU files.
  file(GLOB_RECURSE tmp *.cc)
  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
  # exclude test files and gpu files
  file(GLOB_RECURSE tmp *_test.cc)
  exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})

  # ---[ CPU test files - currently none but just to be safe
  file(GLOB_RECURSE tmp *_test.cc)
  set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})

  # ---[ Send the lists to the parent scope.
  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
  set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
else()
  message(STATUS "Excluding ideep operators as we are not using ideep")
endif()
@@ -1,48 +0,0 @@
#pragma once

#include <caffe2/core/macros.h> // For caffe2 macros.
#include <caffe2/utils/eigen_utils.h>
// All caffe2 ideep related headers
#include <ideep.hpp>
#include <caffe2/ideep/utils/ideep_context.h>
#include <caffe2/ideep/utils/ideep_operator.h>

namespace caffe2 {

enum ConvAlgorithm {
  CONV_ALGORITHM_AUTO = 0,
  CONV_ALGORITHM_WINOGRAD = 1,
  CONV_ALGORITHM_MAX
};

enum FusionType {
  FUSION_UNKNOWN = 0,
  FUSION_CONV_RELU = 1,
  FUSION_CONV_SUM = 2,
  FUSION_CONV_SUM_RELU = 3,
  FUSION_MAX
};

#define USE_IDEEP_DEF_ALIASES() \
  /* the tensor type created/handled by iDEEP */ \
  using itensor = ideep::tensor; \
  /* the data layout of iDEEP tensor */ \
  using iformat = ideep::format_tag; \
  /* the scales for iDEEP tensor with different data type */ \
  using iscale = ideep::scale_t; \
  /* the detailed algorithm for iDEEP operators, e.g. winograd */ \
  using ialgo = ideep::algorithm; \
  /* the kind of propagation for iDEEP operators, e.g. forward, training */ \
  using iprop = ideep::prop_kind; \
  /* the kind of low precision operators, e.g. signed/unsigned activation */ \
  using ilowp_kind = ideep::lowp_kind; \
  /* the data type of iDEEP tensor, e.g. f32, u8, s8 */ \
  using idtype = ideep::tensor::data_type; \
  /* the descriptor of iDEEP tensor */ \
  using itdesc = ideep::tensor::descriptor; \
  /* the attribute for operator to describe the details of inputs&fusion */ \
  using iattr = ideep::attr_t; \
  /* the detail flags for batch normalization */ \
  using ibn_flag = ideep::batch_normalization_flag;

} // namespace caffe2
@@ -1,160 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>

using namespace caffe2;

namespace {

void adam_ideep_compute(
    int N,
    const float* w,
    const float* g,
    const float* m,
    const float* v,
    float* nw,
    float* nm,
    float* nv,
    float beta1,
    float beta2,
    float eps_hat,
    float correction,
    const float* lr) {
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
  for (auto i = 0; i < N; ++i) {
    float gi = g[i];
    float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
    float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
    nw[i] = w[i] + lr[0] * correction * mi / (std::sqrt(vi) + eps_hat);
  }
}

void adam_ideep_compute_output_grad(
    int N,
    const float* w,
    const float* g,
    const float* m,
    const float* v,
    float* nw,
    float* nm,
    float* nv,
    float* ng,
    float beta1,
    float beta2,
    float eps_hat,
    float correction,
    const float* lr) {

#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
  for (auto i = 0; i < N; ++i) {
    float gi = g[i];
    float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
    float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
    float ngi = ng[i] = correction * mi / (std::sqrt(vi) + eps_hat);
    nw[i] = w[i] + lr[0] * ngi;
  }
}

template <typename T>
class IDEEPAdamOp final : public IDEEPOperator {
 public:
  USE_IDEEP_DEF_ALIASES();
  USE_IDEEP_OPERATOR_FUNCTIONS();

  IDEEPAdamOp(const OperatorDef& operator_def, Workspace* ws)
      : IDEEPOperator(operator_def, ws),
        beta1_(OperatorBase::GetSingleArgument<float>("beta1", 0.9f)),
        beta2_(OperatorBase::GetSingleArgument<float>("beta2", 0.999f)),
        epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5f)) {}
  bool RunOnDevice() override {
    // Iter lives on the CPU
    CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU));
    const auto& params = Input(PARAM);
    const auto& moment_1 = Input(MOMENT_1);
    const auto& moment_2 = Input(MOMENT_2);
    const auto& grad = Input(GRAD);
    // TODO: Use itensor after 0-dim is supported. Now use CPU tensor.
    const auto& lr = OperatorBase::Input<TensorCPU>(LR, CPU);
    auto* out_params = Output(OUTPUT_PARAM);
    auto* out_moment1 = Output(OUTPUT_MOMENT_1);
    auto* out_moment2 = Output(OUTPUT_MOMENT_2);

    CAFFE_ENFORCE(lr.size() == 1);
    CAFFE_ENFORCE(grad.get_nelems() == params.get_nelems());
    CAFFE_ENFORCE(grad.get_nelems() == moment_1.get_nelems());
    CAFFE_ENFORCE(grad.get_nelems() == moment_2.get_nelems());
    if (params != *out_params)
      out_params->init(params.get_descriptor());
    if (moment_1 != *out_moment1)
      out_moment1->init(moment_1.get_descriptor());
    if (moment_2 != *out_moment2)
      out_moment2->init(moment_2.get_descriptor());
    const auto w = static_cast<float *>(params.get_data_handle());
    const auto g = static_cast<float *>(grad.get_data_handle());
    const auto m = static_cast<float *>(moment_1.get_data_handle());
    const auto v = static_cast<float *>(moment_2.get_data_handle());
    auto nw = static_cast<float *>(out_params->get_data_handle());
    auto nm = static_cast<float *>(out_moment1->get_data_handle());
    auto nv = static_cast<float *>(out_moment2->get_data_handle());
    const auto nlr = lr.template data<T>();
    const auto iter =
        OperatorBase::Input<TensorCPU>(ITER, CPU).template data<int64_t>()[0];
    const auto t = iter + 1;
    const auto correction =
        std::sqrt(T(1.) - std::pow(beta2_, t)) / (T(1.) - std::pow(beta1_, t));
    if (OutputSize() == 3) {
      adam_ideep_compute(
          grad.get_nelems(),
          w,
          g,
          m,
          v,
          nw,
          nm,
          nv,
          beta1_,
          beta2_,
          epsilon_,
          correction,
          nlr);
    } else {
      auto* out_grad = Output(OUTPUT_GRAD);
      if (grad != *out_grad)
        out_grad->init(grad.get_descriptor());
      auto ng = static_cast<float *>(out_grad->get_data_handle());
      adam_ideep_compute_output_grad(
          grad.get_nelems(),
          w,
          g,
          m,
          v,
          nw,
          nm,
          nv,
          ng,
          beta1_,
          beta2_,
          epsilon_,
          correction,
          nlr);
    }

    return true;
  }

 protected:
  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes,cppcoreguidelines-avoid-magic-numbers)
  T beta1_{0.9};
  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes,cppcoreguidelines-avoid-magic-numbers)
  T beta2_{0.999};
  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes,cppcoreguidelines-avoid-magic-numbers)
  T epsilon_{1e-8};
  INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, GRAD, LR, ITER);
  OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_GRAD);
};

REGISTER_IDEEP_OPERATOR(Adam, IDEEPAdamOp<float>);

} // namespace
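For reference, the update rule that the adam_ideep_compute kernels above implement is the standard Adam step with the bias correction folded into the single `correction` scalar. This is a math sketch only; the sign of the step follows whatever sign the caffe2 learning-rate blob `lr` carries, as in the code above:

    m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
    v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
    c_t = \sqrt{1 - \beta_2^{\,t}} \,/\, (1 - \beta_1^{\,t})
    w_t = w_{t-1} + \mathrm{lr} \cdot c_t \cdot m_t / (\sqrt{v_t} + \hat{\epsilon})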
@@ -1,55 +0,0 @@
#include <caffe2/ideep/operators/conv_pool_base_op.h>

using namespace caffe2;

namespace {

class ChannelShuffleOp final : public IDEEPConvPoolOpBase {
 public:
  USE_IDEEP_DEF_ALIASES();
  USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();

  ChannelShuffleOp(const OperatorDef& operator_def, Workspace* ws)
      : IDEEPConvPoolOpBase(operator_def, ws) {}

  bool RunOnDeviceWithOrderNCHW() override {
    const auto& X = Input(INPUT);
    auto* Y = Output(OUTPUT);

    ideep::channel_shuffle_forward::compute(X, *Y, group_);

    return true;
  }

 private:
  INPUT_TAGS(INPUT);
  OUTPUT_TAGS(OUTPUT);
};

class ChannelShuffleGradientOp final : public IDEEPConvPoolOpBase {
 public:
  USE_IDEEP_DEF_ALIASES();
  USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();

  ChannelShuffleGradientOp(const OperatorDef& operator_def, Workspace* ws)
      : IDEEPConvPoolOpBase(operator_def, ws) {}

  bool RunOnDeviceWithOrderNCHW() override {
    const auto& dY = Input(OUTPUT_GRAD);
    auto* dX = Output(INPUT_GRAD);

    ideep::channel_shuffle_backward::compute(dY, *dX, group_);

    return true;
  }

 private:
  INPUT_TAGS(OUTPUT_GRAD);
  OUTPUT_TAGS(INPUT_GRAD);
};


REGISTER_IDEEP_OPERATOR(ChannelShuffle, ChannelShuffleOp);
REGISTER_IDEEP_OPERATOR(ChannelShuffleGradient, ChannelShuffleGradientOp);

} // namespace
@ -1,176 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
|
||||
#include <caffe2/operators/concat_split_op.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPConcatOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
using FALLBACK_OP = IDEEPFallbackOp<ConcatOp<CPUContext>, SkipIndices<0>>;
|
||||
|
||||
IDEEPConcatOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
fallback_(operator_def, ws) {
|
||||
CAFFE_ENFORCE(
|
||||
!(OperatorBase::HasArgument("axis") && OperatorBase::HasArgument("order")),
|
||||
"You shouldn't specify both the dim to concat, and the order "
|
||||
"in the case of 4-D images.");
|
||||
if (OperatorBase::HasArgument("axis")) {
|
||||
axis_ = OperatorBase::GetSingleArgument<int>("axis", -1);
|
||||
add_axis_ = OperatorBase::GetSingleArgument<int>("add_axis", 0);
|
||||
} else {
|
||||
axis_ = 1;
|
||||
add_axis_ = 0;
|
||||
}
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPConcatOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
bool fallback_to_cpu = false;
|
||||
vector<itensor> inputs_itensor;
|
||||
|
||||
for (int i = 0; i < InputSize(); ++i) {
|
||||
if (OperatorBase::InputBlob(i).template IsType<itensor>()) {
|
||||
auto& tensor_ideep = Input(i);
|
||||
if (tensor_ideep.ndims() == 0 || tensor_ideep.get_nelems() == 0)
|
||||
continue;
|
||||
inputs_itensor.emplace_back(tensor_ideep);
|
||||
} else {
|
||||
CAFFE_ENFORCE(
|
||||
BlobIsTensorType(OperatorBase::InputBlob(i), CPU),
|
||||
"Expect cpu tensor if not itensor");
|
||||
auto& tensor_cpu = OperatorBase::Input<Tensor>(i, CPU);
|
||||
if (tensor_cpu.sizes().size() == 0 || tensor_cpu.numel() == 0)
|
||||
continue;
|
||||
fallback_to_cpu = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!fallback_to_cpu) {
|
||||
int adj_size = inputs_itensor[0].ndims() + (add_axis_ ? 1 : 0);
|
||||
int canonical_axis = canonical_axis_index_(axis_, adj_size);
|
||||
auto* output = Output(OUTPUT);
|
||||
Tensor* axis_info = OutputTensor(AXIS_INFO,
|
||||
vector<int64_t>(1, InputSize()), at::dtype<int>().device(CPU));
|
||||
auto* axis_data = axis_info->template mutable_data<int>();
|
||||
auto axis_vdata =
|
||||
ideep::concat::compute(inputs_itensor, canonical_axis, add_axis_, *output);
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
for (int i = 0; i < axis_vdata.size(); i++) {
|
||||
axis_data[i] = axis_vdata[i];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
return fallback_.Run(0);
|
||||
}
|
||||
|
||||
private:
|
||||
int axis_;
|
||||
int add_axis_;
|
||||
FALLBACK_OP fallback_;
|
||||
|
||||
INPUT_TAGS(INPUT0);
|
||||
OUTPUT_TAGS(OUTPUT, AXIS_INFO);
|
||||
};
|
||||
|
||||
class IDEEPSplitOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPSplitOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
axis_offset_(OperatorBase::GetRepeatedArgument<int>("split")) {
|
||||
CAFFE_ENFORCE(
|
||||
!(OperatorBase::HasArgument("axis") && OperatorBase::HasArgument("order")),
|
||||
"You shouldn't specify both the dim to split, and the order "
|
||||
"in the case of 4-D images.");
|
||||
if (OperatorBase::HasArgument("axis")) {
|
||||
axis_ = OperatorBase::GetSingleArgument<int>("axis", -1);
|
||||
// only exists for computing the gradient of a Concat with 'add_axis'
|
||||
add_axis_ = OperatorBase::GetSingleArgument<int>("add_axis", 0);
|
||||
} else {
|
||||
axis_ = 1;
|
||||
add_axis_ = 0;
|
||||
}
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPSplitOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& input = Input(INPUT);
|
||||
int canonical_axis = canonical_axis_index_(axis_, input.ndims());
|
||||
const int input_channels = input.get_dim(canonical_axis);
|
||||
vector<int> axis_vdata(OutputSize(), 0);
|
||||
if (InputSize() == 2) {
|
||||
// We obtain split from the input tensor.
|
||||
CAFFE_ENFORCE_EQ(
|
||||
axis_offset_.size(),
|
||||
0,
|
||||
"If you set split with an input blob, do not pass in "
|
||||
"split in the argument.");
|
||||
auto& axis_info = OperatorBase::Input<Tensor>(AXIS_INFO, CPU);
|
||||
CAFFE_ENFORCE_EQ(axis_info.numel(), OutputSize());
|
||||
auto* axis_data = axis_info.template data<int>();
|
||||
axis_vdata.assign(axis_data, axis_data + OutputSize());
|
||||
} else if (axis_offset_.size() == 0) {
|
||||
CAFFE_ENFORCE_EQ(
|
||||
input_channels % OutputSize(),
|
||||
0,
|
||||
"If you did not specify split explicitly, the number of "
|
||||
"input channels should be divisible by the output size.");
|
||||
axis_vdata.assign(OutputSize(), input_channels / OutputSize());
|
||||
} else {
|
||||
// We obtain split from the parameters.
|
||||
CAFFE_ENFORCE_EQ(
|
||||
axis_offset_.size(),
|
||||
OutputSize(),
|
||||
"The number of splits specified should be equal to the "
|
||||
"number of outputs.");
|
||||
axis_vdata = axis_offset_;
|
||||
}
|
||||
|
||||
CAFFE_ENFORCE_EQ(
|
||||
add_axis_ ? OutputSize()
|
||||
: std::accumulate(
|
||||
axis_vdata.data(), axis_vdata.data() + OutputSize(), 0),
|
||||
input_channels,
|
||||
"Sum of split dimensions do not match: should be ",
|
||||
input_channels);
|
||||
|
||||
auto iten_vector = ideep::spliter::compute(
|
||||
input, axis_vdata, canonical_axis, add_axis_);
|
||||
CAFFE_ENFORCE_EQ(
|
||||
iten_vector.size(),
|
||||
OutputSize(),
|
||||
"Output size does not match: should be ",
|
||||
OutputSize());
|
||||
|
||||
for (int i = 0; i < OutputSize(); i++) {
|
||||
auto* output = Output(i);
|
||||
*output = iten_vector[i];
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
int axis_;
|
||||
int add_axis_;
|
||||
vector<int> axis_offset_;
|
||||
|
||||
INPUT_TAGS(INPUT, AXIS_INFO);
|
||||
};
|
||||
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(Concat, IDEEPConcatOp);
|
||||
REGISTER_IDEEP_OPERATOR(Split, IDEEPSplitOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,357 +0,0 @@
|
||||
#include <caffe2/ideep/operators/conv_pool_base_op.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPConvOp : public IDEEPConvPoolOpBase {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
|
||||
|
||||
IDEEPConvOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPConvPoolOpBase(operator_def, ws) {
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
order_ == StorageOrder::NCHW, "Unsupported storage order.");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
pad_l() == pad_r() && pad_t() == pad_b(),
|
||||
"Uneven padding not supported.");
|
||||
|
||||
fusion_type_ = FUSION_UNKNOWN;
|
||||
last_input_ = BIAS_OR_INPUT_S;
|
||||
|
||||
training_mode_ = OperatorBase::GetSingleArgument<int>("training_mode", 0);
|
||||
pk_ = training_mode_ ? iprop::forward_training : iprop::forward_inference;
|
||||
|
||||
algo_ = ialgo::convolution_direct;
|
||||
auto conv_algorithm = OperatorBase::GetSingleArgument<int>(
|
||||
"conv_algorithm", CONV_ALGORITHM_AUTO);
|
||||
if (conv_algorithm == CONV_ALGORITHM_WINOGRAD) {
|
||||
algo_ = ialgo::convolution_winograd;
|
||||
}
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPConvOp() override {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override {
|
||||
const auto& X = Input(INPUT_X);
|
||||
const auto& filter = Input(FILTER);
|
||||
auto* Y = Output(OUTPUT);
|
||||
|
||||
CAFFE_ENFORCE(4 == X.ndims());
|
||||
CAFFE_ENFORCE(4 == filter.ndims());
|
||||
CAFFE_ENFORCE_EQ(filter.get_dim(2), kernel_h());
|
||||
CAFFE_ENFORCE_EQ(filter.get_dim(3), kernel_w());
|
||||
CAFFE_ENFORCE(
|
||||
X.get_dim(1) == filter.get_dim(1) * group_,
|
||||
"Convolution op: input channels does not match: # of input channels ",
|
||||
X.get_dim(1),
|
||||
" is not equal to kernel channels * group:",
|
||||
filter.get_dim(1),
|
||||
"*",
|
||||
group_);
|
||||
|
||||
bool input_changed = (cached_X_descriptor_ != X.get_descriptor());
|
||||
if (input_changed) {
|
||||
cached_X_descriptor_ = X.dup_descriptor();
|
||||
}
|
||||
|
||||
bool weights_changed = (cached_weights_descriptor_ != filter.get_descriptor());
|
||||
if (!training_mode_ && weights_changed) {
|
||||
cached_weights_descriptor_ = filter.dup_descriptor();
|
||||
auto expected_descriptor =
|
||||
ideep::convolution_forward::expected_weights_desc(
|
||||
filter.get_dims(),
|
||||
idtype::f32,
|
||||
{stride_.begin(), stride_.end()},
|
||||
pad_tl(),
|
||||
pad_br(),
|
||||
{dilation_.begin(), dilation_.end()},
|
||||
group_,
|
||||
algo_,
|
||||
pk_,
|
||||
idtype::f32,
|
||||
X.get_dims());
|
||||
if (filter.get_descriptor() != expected_descriptor) {
|
||||
filter_.init(expected_descriptor);
|
||||
filter_.feed_from(filter);
|
||||
} else {
|
||||
filter_ = filter;
|
||||
}
|
||||
}
|
||||
|
||||
bool with_bias = InputSize() > last_input_;
|
||||
auto filter_in = training_mode_ ? filter : filter_;
|
||||
if (training_mode_ || input_changed || weights_changed) {
|
||||
auto Y_dims_conv = CalcOutputDims(X, filter.get_dim(0));
|
||||
if (with_bias) {
|
||||
ideep::convolution_forward::prepare(
|
||||
conv_param,
|
||||
X,
|
||||
filter_in,
|
||||
Input(BIAS_OR_INPUT_S),
|
||||
Y_dims_conv,
|
||||
*Y,
|
||||
{stride_.begin(), stride_.end()},
|
||||
{dilation_.begin(), dilation_.end()},
|
||||
pad_tl(),
|
||||
pad_br(),
|
||||
group_,
|
||||
dummy_scale_,
|
||||
dummy_scale_,
|
||||
dummy_scale_,
|
||||
attr_,
|
||||
algo_,
|
||||
pk_);
|
||||
} else {
|
||||
ideep::convolution_forward::prepare(
|
||||
conv_param,
|
||||
X,
|
||||
filter_in,
|
||||
Y_dims_conv,
|
||||
*Y,
|
||||
{stride_.begin(), stride_.end()},
|
||||
{dilation_.begin(), dilation_.end()},
|
||||
pad_tl(),
|
||||
pad_br(),
|
||||
group_,
|
||||
dummy_scale_,
|
||||
dummy_scale_,
|
||||
dummy_scale_,
|
||||
attr_,
|
||||
algo_,
|
||||
pk_);
|
||||
}
|
||||
}
|
||||
|
||||
if (with_bias) {
|
||||
ideep::convolution_forward::compute(conv_param, X, filter_in,
|
||||
Input(BIAS_OR_INPUT_S), *Y);
|
||||
} else {
|
||||
ideep::convolution_forward::compute(conv_param, X, filter_in, *Y);
|
||||
}
|
||||
|
||||
if (fusion_type_ == FUSION_CONV_SUM
|
||||
|| fusion_type_ == FUSION_CONV_SUM_RELU) {
|
||||
CAFFE_ENFORCE_EQ(Y, &(Input(InputSize() - 1)),
|
||||
"Convolution fusion op: InPlace is enforced for sum fusion.");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
iprop pk_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
ialgo algo_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
iattr attr_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
int last_input_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
bool training_mode_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
FusionType fusion_type_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
itensor filter_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
iscale dummy_scale_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
itensor::descriptor cached_X_descriptor_, cached_weights_descriptor_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
ideep::convolution_forward_params conv_param;
|
||||
|
||||
INPUT_TAGS(INPUT_X, FILTER, BIAS_OR_INPUT_S, INPUT_S);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
class IDEEPConvFusionOp final : public IDEEPConvOp {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
|
||||
|
||||
IDEEPConvFusionOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPConvOp(operator_def, ws) {
|
||||
CAFFE_ENFORCE(OperatorBase::HasArgument("fusion_type"),
|
||||
"You should specify the fusion type");
|
||||
fusion_type_ = static_cast<FusionType>(
|
||||
OperatorBase::GetSingleArgument<int>("fusion_type", FUSION_UNKNOWN));
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
fusion_type_ > FUSION_UNKNOWN && fusion_type_ < FUSION_MAX,
|
||||
"Undefined Conv fusion type.",
|
||||
fusion_type_);
|
||||
|
||||
switch (fusion_type_) {
|
||||
case FUSION_CONV_RELU:
|
||||
attr_ = iattr::fuse_relu();
|
||||
last_input_ = BIAS_OR_INPUT_S;
|
||||
break;
|
||||
case FUSION_CONV_SUM:
|
||||
attr_ = iattr::fuse_sum();
|
||||
last_input_ = INPUT_S;
|
||||
break;
|
||||
case FUSION_CONV_SUM_RELU:
|
||||
attr_ = iattr::residual();
|
||||
last_input_ = INPUT_S;
|
||||
break;
|
||||
default:
|
||||
CAFFE_THROW("Unsupported conv fusion type!");
|
||||
}
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPConvFusionOp() override {}
|
||||
};
|
||||
|
||||
const char* kConvFusionDoc = R"DOC(
|
||||
Note that other parameters, such as the stride and
|
||||
kernel size, or the pads' sizes in each direction are not necessary for input
|
||||
because they are provided by the ConvPoolOpBase operator. Various dimension
|
||||
checks are done implicitly, and the sizes are specified in the Input docs for
|
||||
this operator. As is expected, the filter is convolved with a subset of the
|
||||
image and the bias is added; this is done throughout the image data and the
|
||||
output is computed. As a side note on the implementation layout:
|
||||
conv_op_impl.h is the templated implementation of the conv_op.h file, which is
|
||||
why they are separate files.
|
||||
)DOC";
|
||||
|
||||
std::function<void(OpSchema&)> ConvFusionDocGenerator(const char* dim) {
|
||||
return [=](OpSchema& schema) {
|
||||
string doc = R"DOC(
|
||||
The convolution fusion operator consumes an input vector, a {dim}filter blob,
|
||||
a bias blob and another input vector and computes the output. This operator
|
||||
gives the chance to fuse the ReLU or element-wise Sum with a convolution
|
||||
operator. {conv_fusion_doc})DOC";
|
||||
c10::ReplaceAll(doc, "{dim}", dim);
|
||||
c10::ReplaceAll(doc, "{conv_fusion_doc}", kConvFusionDoc);
|
||||
schema.SetDoc(doc);
|
||||
schema.Input(
|
||||
0,
|
||||
"X",
|
||||
"Input data blob from previous layer; has size (N x C x H x W), "
|
||||
"where N is the batch size, C is the number of channels, "
|
||||
"and H and W are the height and width. Note that this is for the NCHW "
|
||||
"usage. On the other hand, the NHWC Op has a different set of "
|
||||
"dimension constraints. ");
|
||||
schema.Input(
|
||||
1,
|
||||
"filter",
|
||||
"The filter blob that will be used in the "
|
||||
"convolutions; has size (M x C x kH x kW), where C is the number of "
|
||||
"channels, and kH and kW are the height and width of the kernel.");
|
||||
schema.Input(
|
||||
2,
|
||||
"bias",
|
||||
"The 1D bias blob that is added through the "
|
||||
"convolution; has size (M).");
|
||||
schema.Input(
|
||||
3,
|
||||
"S",
|
||||
"Input data blob for element-wise Sum fusion from previous layer; "
|
||||
"has the same size of convolution output. Its input index should "
|
||||
"be 2 if no bias for this convolution, and it MUST be inplace with "
|
||||
"output Y.");
|
||||
schema.Output(
|
||||
0,
|
||||
"Y",
|
||||
"Output data blob that contains the result of the "
|
||||
"convolution fusion. The output dimensions are functions of the kernel "
|
||||
"size, stride size, and pad lengths."
|
||||
"");
|
||||
};
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,clang-diagnostic-unused-function)
|
||||
OPERATOR_SCHEMA(ConvFusion)
|
||||
.NumInputs(2, 4)
|
||||
.NumOutputs(1)
|
||||
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForConv)
|
||||
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
|
||||
ConvPoolOpBase<CPUContext>::CostInferenceForConv))
|
||||
.Arg("fusion_type", "Which fusion type is used")
|
||||
.AllowInplace({{2, 0}, {3, 0}})
|
||||
.FillUsing(ConvFusionDocGenerator(""));
|
||||
|
||||
class IDEEPConvGradientOp final : public IDEEPConvPoolOpBase {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
|
||||
|
||||
IDEEPConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPConvPoolOpBase(operator_def, ws),
|
||||
no_bias_(OperatorBase::GetSingleArgument<int>("no_bias", 0)) {
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
pad_l() == pad_r() && pad_t() == pad_b(),
|
||||
"Uneven padding not supported.");
|
||||
CAFFE_ENFORCE(
|
||||
!(no_bias_ && OutputSize() == 3),
|
||||
"If bias is not present, you should not have 3 grad output.");
|
||||
CAFFE_ENFORCE(
|
||||
OperatorBase::GetSingleArgument<int>("training_mode", 0),
|
||||
"In order to backward propagate weights correctly, "
|
||||
"please set training_mode=1");
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPConvGradientOp() override {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override {
|
||||
const auto& X = Input(INPUT);
|
||||
const auto& filter = Input(FILTER);
|
||||
const auto& dY = Input(OUTPUT_GRAD);
|
||||
auto* dfilter = Output(FILTER_GRAD);
|
||||
|
||||
if (no_bias_) {
|
||||
ideep::convolution_backward_weights::compute(
|
||||
X,
|
||||
dY,
|
||||
filter.get_dims(),
|
||||
*dfilter,
|
||||
{stride_.begin(), stride_.end()},
|
||||
{dilation_.begin(), dilation_.end()},
|
||||
pad_tl(),
|
||||
pad_br(),
|
||||
group_);
|
||||
} else {
|
||||
auto* dbias = Output(BIAS_OR_INPUT_GRAD);
|
||||
ideep::convolution_backward_weights::compute(
|
||||
X,
|
||||
dY,
|
||||
filter.get_dims(),
|
||||
*dfilter,
|
||||
*dbias,
|
||||
{stride_.begin(), stride_.end()},
|
||||
{dilation_.begin(), dilation_.end()},
|
||||
pad_tl(),
|
||||
pad_br(),
|
||||
group_);
|
||||
}
|
||||
|
||||
if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
|
||||
auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
|
||||
ideep::convolution_backward_data::compute(
|
||||
dY,
|
||||
filter,
|
||||
X.get_dims(),
|
||||
*dX,
|
||||
{stride_.begin(), stride_.end()},
|
||||
{dilation_.begin(), dilation_.end()},
|
||||
pad_tl(),
|
||||
pad_br(),
|
||||
group_);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
bool no_bias_;
|
||||
|
||||
INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
|
||||
OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(Conv, IDEEPConvOp);
|
||||
REGISTER_IDEEP_OPERATOR(ConvFusion, IDEEPConvFusionOp);
|
||||
REGISTER_IDEEP_OPERATOR(ConvGradient, IDEEPConvGradientOp);
|
||||
|
||||
} // namespace
|
||||
@@ -1,77 +0,0 @@
#ifndef CAFFE2_IDEEP_OPERATORS_CONV_POOL_BASE_OP_H_
#define CAFFE2_IDEEP_OPERATORS_CONV_POOL_BASE_OP_H_

#include <vector>

#include "caffe2/ideep/ideep_utils.h"
#include "caffe2/operators/conv_pool_op_base.h"

namespace caffe2 {

class IDEEPConvPoolOpBase : public ConvPoolOpBase<IDEEPContext> {
 public:
  IDEEPConvPoolOpBase(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<IDEEPContext>(operator_def, ws) {}
  ~IDEEPConvPoolOpBase() override {}

  inline const ideep::tensor& Input(int index) {
    return OperatorBase::template Input<ideep::tensor>(index);
  }
  inline ideep::tensor* Output(int index) {
    return OperatorBase::template Output<ideep::tensor>(index);
  }

  ideep::tensor::dims pad_tl() const {
    return {pad_t(), pad_l()};
  }

  ideep::tensor::dims pad_br() const {
    return {pad_b(), pad_r()};
  }

  ideep::tensor::dims CalcOutputDims(
      const ideep::tensor& input,
      int output_channel) {
    CAFFE_ENFORCE_GT(input.get_size(), 0);
    std::vector<int> output_dims;
    const auto input_dims = input.get_dims();
    std::vector<std::int64_t> input_Tdims(
        input_dims.cbegin(), input_dims.cend());
    InferOutputSize(
        input_Tdims,
        output_channel,
        StorageOrder::NCHW, //order_,
        global_pooling_,
        legacy_pad_,
        dilation_,
        stride_,
        &kernel_,
        &pads_,
        &output_dims);
    return {output_dims.begin(), output_dims.end()};
  }

  bool RunOnDevice() override {
    if (!global_pooling_) {
      for (const auto dim : c10::irange(kernel_.size())) {
        CAFFE_ENFORCE_GT(kernel_[dim], 0);
      }
    }

    try {
      return RunOnDeviceWithOrderNCHW();
    } catch (ideep::error& e) {
      LOG(ERROR) << "IDEEP error:" << e.message;
      throw;
    }
  }
};

#define USE_IDEEP_CONV_POOL_BASE_FUNCTIONS() \
  USE_OPERATOR_BASE_FUNCTIONS; \
  /* using override */ using IDEEPConvPoolOpBase::Input; \
  /* using override */ using IDEEPConvPoolOpBase::Output;

} // namespace caffe2

#endif // CAFFE2_IDEEP_OPERATORS_CONV_POOL_BASE_OP_H_
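For readers of CalcOutputDims above: with explicit (non-legacy) padding, the per-dimension size that ConvPoolOpBase::InferOutputSize yields for convolution/pooling takes the usual form. This is a reference sketch of the standard formula, not text taken from the deleted file:

    \text{out} = \left\lfloor \frac{\text{in} + \text{pad}_{head} + \text{pad}_{tail} - \big(\text{dilation}\cdot(\text{kernel}-1) + 1\big)}{\text{stride}} \right\rfloor + 1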
@ -1,160 +0,0 @@
|
||||
#include "caffe2/operators/conv_transpose_op.h"
|
||||
#include "caffe2/ideep/operators/conv_transpose_unpool_base_op.h"
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPConvTransposeOp final : public IDEEPConvTransposeUnpoolBase {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS();
|
||||
|
||||
IDEEPConvTransposeOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPConvTransposeUnpoolBase(operator_def, ws),
|
||||
training_mode_(
|
||||
OperatorBase::GetSingleArgument<int>("training_mode", 0)) {
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
pad_l() == pad_r() && pad_t() == pad_b(),
|
||||
"Uneven padding not supported.");
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPConvTransposeOp() override {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override {
|
||||
const auto& X = Input(INPUT);
|
||||
const auto& filter = Input(FILTER);
|
||||
auto* Y = Output(OUTPUT);
|
||||
CAFFE_ENFORCE_EQ(X.ndims(), 4);
|
||||
CAFFE_ENFORCE_EQ(filter.ndims(), 4);
|
||||
CAFFE_ENFORCE_EQ(filter.get_dim(2), kernel_h());
|
||||
CAFFE_ENFORCE_EQ(filter.get_dim(3), kernel_w());
|
||||
CAFFE_ENFORCE_EQ(filter.get_dim(0), X.get_dim(1),
|
||||
"filter number must be equal to input channel number");
|
||||
|
||||
auto Y_dims = CalcOutputDims(X, filter.get_dim(1));
|
||||
|
||||
bool weights_changed = (cached_weights_descriptor_ != filter.get_descriptor());
|
||||
if (!training_mode_ && weights_changed) {
|
||||
cached_weights_descriptor_ = filter.dup_descriptor();
|
||||
// NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
|
||||
auto filter_in = filter;
|
||||
|
||||
auto expected_descriptor =
|
||||
ideep::convolution_transpose_forward::expected_weights_desc(
|
||||
filter.get_dims(),
|
||||
filter.get_data_type(),
|
||||
{stride_.begin(), stride_.end()},
|
||||
pad_tl(),
|
||||
pad_br());
|
||||
if (filter_in.get_descriptor() != expected_descriptor) {
|
||||
filter_.init(expected_descriptor);
|
||||
filter_.feed_from(filter_in, /*is_deconv_weights=*/true);
|
||||
} else {
|
||||
filter_ = filter_in;
|
||||
}
|
||||
}
|
||||
|
||||
auto transposed_filter = training_mode_ ? filter : filter_;
|
||||
transposed_filter.transpose_(0, 1);
|
||||
|
||||
if (InputSize() > BIAS) {
|
||||
const auto& bias = Input(BIAS);
|
||||
CAFFE_ENFORCE_EQ(bias.ndims(), 1, "bias must be 1D tensor");
|
||||
CAFFE_ENFORCE_EQ(
|
||||
bias.get_dim(0), filter.get_dim(1),
|
||||
"bias dimension must be equal to output channel number");
|
||||
|
||||
ideep::convolution_transpose_forward::compute(
|
||||
X, transposed_filter, bias, Y_dims, *Y,
|
||||
{stride_.begin(), stride_.end()} , pad_tl(), pad_br());
|
||||
} else {
|
||||
ideep::convolution_transpose_forward::compute(
|
||||
X, transposed_filter, Y_dims, *Y,
|
||||
{stride_.begin(), stride_.end()}, pad_tl(), pad_br());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
INPUT_TAGS(INPUT, FILTER, BIAS);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
|
||||
const bool training_mode_;
|
||||
ideep::tensor filter_;
|
||||
ideep::tensor::descriptor cached_weights_descriptor_;
|
||||
};
|
||||
|
||||
class IDEEPConvTransposeGradientOp final : public IDEEPConvTransposeUnpoolBase {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS();
|
||||
|
||||
IDEEPConvTransposeGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPConvTransposeUnpoolBase(operator_def, ws),
|
||||
no_bias_(OperatorBase::GetSingleArgument<int>("no_bias", false)) {
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
pad_l() == pad_r() && pad_t() == pad_b(),
|
||||
"Uneven padding not supported.");
|
||||
CAFFE_ENFORCE(
|
||||
!(no_bias_ && OutputSize() == 3),
|
||||
"If bias is not present, you should not have 3 grad output.");
|
||||
CAFFE_ENFORCE(
|
||||
OperatorBase::GetSingleArgument<int>("training_mode", 0),
|
||||
"In order to backward propagate weights correctly, "
|
||||
"please set training_mode=1");
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPConvTransposeGradientOp() override {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override {
|
||||
const auto& X = Input(INPUT);
|
||||
const auto& filter = Input(FILTER);
|
||||
const auto& dY = Input(OUTPUT_GRAD);
|
||||
auto* dfilter = Output(FILTER_GRAD);
|
||||
auto transposed_filter = filter;
|
||||
transposed_filter.transpose_(0, 1);
|
||||
|
||||
if (no_bias_) {
|
||||
ideep::convolution_transpose_backward_weights::compute(
|
||||
X,
|
||||
dY,
|
||||
filter.get_dims(),
|
||||
*dfilter,
|
||||
{stride_.begin(), stride_.end()},
|
||||
pad_tl(),
|
||||
pad_br());
|
||||
} else {
|
||||
auto* dbias = Output(BIAS_OR_INPUT_GRAD);
|
||||
ideep::convolution_transpose_backward_weights::compute(
|
||||
X,
|
||||
dY,
|
||||
filter.get_dims(),
|
||||
*dfilter,
|
||||
*dbias,
|
||||
{stride_.begin(), stride_.end()},
|
||||
pad_tl(),
|
||||
pad_br());
|
||||
}
|
||||
|
||||
if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
|
||||
auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
|
||||
ideep::convolution_transpose_backward_data::compute(
|
||||
dY, transposed_filter, X.get_dims(), *dX,
|
||||
{stride_.begin(), stride_.end()}, pad_tl(), pad_br());
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
bool no_bias_;
|
||||
|
||||
INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
|
||||
OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(ConvTranspose, IDEEPConvTransposeOp);
|
||||
REGISTER_IDEEP_OPERATOR(ConvTransposeGradient, IDEEPConvTransposeGradientOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,261 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "caffe2/ideep/ideep_utils.h"
|
||||
#include "caffe2/proto/caffe2_legacy.pb.h"
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPConvTransposeUnpoolBase : public caffe2::IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPConvTransposeUnpoolBase(const caffe2::OperatorDef& operator_def, caffe2::Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
legacy_pad_(
|
||||
static_cast<caffe2::LegacyPadding>(OperatorBase::GetSingleArgument<int>(
|
||||
"legacy_pad",
|
||||
caffe2::LegacyPadding::NOTSET))),
|
||||
kernel_(OperatorBase::GetRepeatedArgument<int>("kernels")),
|
||||
stride_(OperatorBase::GetRepeatedArgument<int>("strides")),
|
||||
pads_(OperatorBase::GetRepeatedArgument<int>("pads")),
|
||||
adj_(OperatorBase::GetRepeatedArgument<int>("adjs")),
|
||||
shared_buffer_(
|
||||
OperatorBase::GetSingleArgument<int>("shared_buffer", 0)) {
|
||||
// For the padding, they should either be the legacy padding strategy
|
||||
// (VALID or SAME), or an explicit, non-negative value.
|
||||
if (legacy_pad_ == caffe2::LegacyPadding::VALID ||
|
||||
legacy_pad_ == caffe2::LegacyPadding::SAME) {
|
||||
CAFFE_ENFORCE(
|
||||
!OperatorBase::HasArgument("pads"),
|
||||
"If you use legacy padding VALID or SAME, you should not specify "
|
||||
"any specific padding values.");
|
||||
}
|
||||
// Get old arguments values.
|
||||
if (OperatorBase::HasArgument("kernel")) {
|
||||
kernel_.resize(2, OperatorBase::GetSingleArgument<int>("kernel", 0));
|
||||
} else if (
|
||||
OperatorBase::HasArgument("kernel_h") &&
|
||||
OperatorBase::HasArgument("kernel_w")) {
|
||||
kernel_.push_back(OperatorBase::GetSingleArgument<int>("kernel_h", 0));
|
||||
kernel_.push_back(OperatorBase::GetSingleArgument<int>("kernel_w", 0));
|
||||
}
|
||||
|
||||
if (OperatorBase::HasArgument("stride")) {
|
||||
stride_.resize(2, OperatorBase::GetSingleArgument<int>("stride", 0));
|
||||
} else if (
|
||||
OperatorBase::HasArgument("stride_h") &&
|
||||
OperatorBase::HasArgument("stride_w")) {
|
||||
stride_.push_back(OperatorBase::GetSingleArgument<int>("stride_h", 0));
|
||||
stride_.push_back(OperatorBase::GetSingleArgument<int>("stride_w", 0));
|
||||
}
|
||||
|
||||
if (OperatorBase::HasArgument("adj")) {
|
||||
adj_.resize(2, OperatorBase::GetSingleArgument<int>("adj", 0));
|
||||
} else if (
|
||||
OperatorBase::HasArgument("adj_h") &&
|
||||
OperatorBase::HasArgument("adj_w")) {
|
||||
adj_.push_back(OperatorBase::GetSingleArgument<int>("adj_h", 0));
|
||||
adj_.push_back(OperatorBase::GetSingleArgument<int>("adj_w", 0));
|
||||
}
|
||||
|
||||
if (OperatorBase::HasArgument("pad")) {
|
||||
CAFFE_ENFORCE(
|
||||
legacy_pad_ != caffe2::LegacyPadding::VALID &&
|
||||
legacy_pad_ != caffe2::LegacyPadding::SAME,
|
||||
"If you use legacy padding VALID or SAME, you should not specify "
|
||||
"any specific padding values.");
|
||||
pads_.resize(4, OperatorBase::GetSingleArgument<int>("pad", 0));
|
||||
} else if (
|
||||
OperatorBase::HasArgument("pad_t") &&
|
||||
OperatorBase::HasArgument("pad_l") &&
|
||||
OperatorBase::HasArgument("pad_b") &&
|
||||
OperatorBase::HasArgument("pad_r")) {
|
||||
CAFFE_ENFORCE(
|
||||
legacy_pad_ != caffe2::LegacyPadding::VALID &&
|
||||
legacy_pad_ != caffe2::LegacyPadding::SAME,
|
||||
"If you use legacy padding VALID or SAME, you should not specify "
|
||||
"any specific padding values.");
|
||||
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_t", 0));
|
||||
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_l", 0));
|
||||
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_b", 0));
|
||||
pads_.push_back(OperatorBase::GetSingleArgument<int>("pad_r", 0));
|
||||
}
|
||||
|
||||
// Fill default values.
|
||||
if (kernel_.empty()) {
|
||||
kernel_.assign({0, 0});
|
||||
}
|
||||
|
||||
if (stride_.empty()) {
|
||||
stride_.assign(kernel_.size(), 1);
|
||||
}
|
||||
|
||||
if (pads_.empty()) {
|
||||
pads_.assign(kernel_.size() * 2, 0);
|
||||
}
|
||||
|
||||
if (adj_.empty()) {
|
||||
adj_.assign(kernel_.size(), 0);
|
||||
}
|
||||
|
||||
CAFFE_ENFORCE_EQ(stride_.size(), kernel_.size());
|
||||
CAFFE_ENFORCE_EQ(adj_.size(), kernel_.size());
|
||||
|
||||
if (legacy_pad_ != caffe2::LegacyPadding::VALID &&
|
||||
legacy_pad_ != caffe2::LegacyPadding::SAME) {
|
||||
CAFFE_ENFORCE_EQ(pads_.size(), 2 * kernel_.size());
|
||||
}
|
||||
|
||||
for (const auto dim : c10::irange(kernel_.size())) {
|
||||
CAFFE_ENFORCE_GT(kernel_[dim], 0);
|
||||
CAFFE_ENFORCE_GT(stride_[dim], 0);
|
||||
CAFFE_ENFORCE_GE(adj_[dim], 0);
|
||||
CAFFE_ENFORCE_LE(adj_[dim], stride_[dim]);
|
||||
}
|
||||
}
|
||||
~IDEEPConvTransposeUnpoolBase() override {}
|
||||
|
||||
const ideep::tensor& Input(int index) {
|
||||
return OperatorBase::template Input<ideep::tensor>(index);
|
||||
}
|
||||
ideep::tensor* Output(int index) {
|
||||
return OperatorBase::template Output<ideep::tensor>(index);
|
||||
}
|
||||
|
||||
ideep::tensor::dims pad_tl() const {
|
||||
return {pad_t(), pad_l()};
|
||||
}
|
||||
|
||||
ideep::tensor::dims pad_br() const {
|
||||
return {pad_b(), pad_r()};
|
||||
}
|
||||
|
||||
ideep::tensor::dims CalcOutputDims(
|
||||
const ideep::tensor& input,
|
||||
int output_channel) {
|
||||
CAFFE_ENFORCE_GT(input.get_size(), 0);
|
||||
|
||||
int N = input.get_dim(0);
|
||||
ideep::tensor::dims output_dims;
|
||||
auto input_dims = input.get_dims();
|
||||
itensor::dims dims;
|
||||
dims.assign(input_dims.begin() + 2, input_dims.end());
|
||||
for (const auto dim : c10::irange(dims.size())) {
|
||||
int dim_size = 0;
|
||||
ComputeSizeAndPad(
|
||||
dims[dim],
|
||||
stride_[dim],
|
||||
kernel_[dim],
|
||||
adj_[dim],
|
||||
&pads_[dim],
|
||||
&pads_[dim + 2],
|
||||
&dim_size);
|
||||
output_dims.push_back(dim_size);
|
||||
}
|
||||
|
||||
output_dims.insert(output_dims.begin(), {N, output_channel});
|
||||
return output_dims;
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
try {
|
||||
return RunOnDeviceWithOrderNCHW();
|
||||
} catch (ideep::error& e) {
|
||||
LOG(ERROR) << "IDEEP error:" << e.message;
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
virtual bool RunOnDeviceWithOrderNCHW() {
|
||||
CAFFE_THROW("Not implemented");
|
||||
}
|
||||
|
||||
private:
|
||||
caffe2::LegacyPadding legacy_pad_;
|
||||
|
||||
protected:
|
||||
std::vector<int> kernel_;
|
||||
std::vector<int> stride_;
|
||||
std::vector<int> pads_;
|
||||
std::vector<int> adj_;
|
||||
bool shared_buffer_;
|
||||
|
||||
// Accessors for 2D conv params.
|
||||
|
||||
inline int pad_t() const {
|
||||
return pads_[0];
|
||||
}
|
||||
|
||||
inline int pad_l() const {
|
||||
return pads_[1];
|
||||
}
|
||||
|
||||
inline int pad_b() const {
|
||||
return pads_[2];
|
||||
}
|
||||
|
||||
inline int pad_r() const {
|
||||
return pads_[3];
|
||||
}
|
||||
|
||||
inline int kernel_h() const {
|
||||
return kernel_[0];
|
||||
}
|
||||
|
||||
inline int kernel_w() const {
|
||||
return kernel_[1];
|
||||
}
|
||||
|
||||
inline int stride_h() const {
|
||||
return stride_[0];
|
||||
}
|
||||
|
||||
inline int stride_w() const {
|
||||
return stride_[1];
|
||||
}
|
||||
|
||||
inline int adj_h() const {
|
||||
return adj_[0];
|
||||
}
|
||||
|
||||
inline int adj_w() const {
|
||||
return adj_[1];
|
||||
}
|
||||
|
||||
inline void ComputeSizeAndPad(
|
||||
const int in_size,
|
||||
const int stride,
|
||||
const int kernel,
|
||||
const int adj,
|
||||
int* pad_head,
|
||||
int* pad_tail,
|
||||
int* out_size) {
|
||||
switch (legacy_pad_) {
|
||||
case caffe2::LegacyPadding::NOTSET:
|
||||
CAFFE_ENFORCE_GE(*pad_head, 0);
|
||||
CAFFE_ENFORCE_GE(*pad_tail, 0);
|
||||
*out_size =
|
||||
(in_size - 1) * stride + kernel + adj - *pad_head - *pad_tail;
|
||||
break;
|
||||
// We handle cases of LegacyPadding::VALID and LegacyPadding::SAME
|
||||
// the same way
|
||||
case caffe2::LegacyPadding::VALID:
|
||||
case caffe2::LegacyPadding::SAME:
|
||||
*pad_head = 0;
|
||||
*pad_tail = 0;
|
||||
*out_size = (in_size - 1) * stride + kernel + adj;
|
||||
break;
|
||||
case caffe2::LegacyPadding::CAFFE_LEGACY_POOLING:
|
||||
LOG(FATAL) << "CAFFE_LEGACY_POOLING is no longer supported.";
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#define USE_IDEEP_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS() \
|
||||
USE_OPERATOR_BASE_FUNCTIONS; \
|
||||
/* using override */ using IDEEPConvTransposeUnpoolBase::Input; \
|
||||
/* using override */ using IDEEPConvTransposeUnpoolBase::Output;
|
||||
|
||||
} // namespace
|
||||
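The ComputeSizeAndPad helper in the deleted conv_transpose_unpool_base_op.h above makes the transposed-convolution output size explicit; in the NOTSET padding case it is exactly

    \text{out} = (\text{in} - 1)\cdot\text{stride} + \text{kernel} + \text{adj} - \text{pad}_{head} - \text{pad}_{tail}

and for the VALID/SAME legacy-padding cases both pads are forced to zero before the same formula is applied.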
@@ -1,94 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>

using namespace caffe2;

namespace {

class IDEEPDropoutOp final : public IDEEPOperator {
 public:
  USE_IDEEP_DEF_ALIASES();
  USE_IDEEP_OPERATOR_FUNCTIONS();

  IDEEPDropoutOp(const OperatorDef& operator_def, Workspace* ws)
      : IDEEPOperator(operator_def, ws),
        ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)),
        is_test_(
            OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
    CAFFE_ENFORCE_GE(ratio_, 0);
    CAFFE_ENFORCE_LT(ratio_, 1);
  }
  // NOLINTNEXTLINE(modernize-use-equals-default)
  ~IDEEPDropoutOp() override {}

  bool RunOnDevice() override {
    const auto& X = Input(INPUT);
    auto* Y = Output(OUTPUT);

    if (is_test_) {
      if (Y != &X) {
        ideep::direct_copy::compute(X, *Y);
      }
      return true;
    }

    auto* mask = Output(MASK);
    ideep::dropout_forward::compute(X, ratio_, *Y, *mask);

    return true;
  }

 private:
  float ratio_;
  bool is_test_;

  INPUT_TAGS(INPUT);
  OUTPUT_TAGS(OUTPUT, MASK);
};

class IDEEPDropoutGradientOp final : public IDEEPOperator {
 public:
  USE_IDEEP_DEF_ALIASES();
  USE_IDEEP_OPERATOR_FUNCTIONS();

  IDEEPDropoutGradientOp(const OperatorDef& operator_def, Workspace* ws)
      : IDEEPOperator(operator_def, ws),
        ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)),
        is_test_(
            OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
    CAFFE_ENFORCE_GE(ratio_, 0);
    CAFFE_ENFORCE_LT(ratio_, 1);
  }
  // NOLINTNEXTLINE(modernize-use-equals-default)
  ~IDEEPDropoutGradientOp() override {}

  bool RunOnDevice() override {
    const auto& dY = Input(OUTPUT_GRAD);
    auto* dX = Output(INPUT_GRAD);

    if (is_test_) {
      if (dX != &dY) {
        ideep::direct_copy::compute(dY, *dX);
      }
      return true;
    }

    const auto& mask = Input(MASK);
    ideep::dropout_backward::compute(mask, dY, *dX);

    return true;
  }

 protected:
  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
  float ratio_;
  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
  bool is_test_;

  INPUT_TAGS(OUTPUT_GRAD, MASK);
  OUTPUT_TAGS(INPUT_GRAD);
};

REGISTER_IDEEP_OPERATOR(Dropout, IDEEPDropoutOp);
REGISTER_IDEEP_OPERATOR(DropoutGrad, IDEEPDropoutGradientOp);

} // namespace
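As a math sketch only, and assuming ideep::dropout_forward follows the same inverted-dropout convention as the caffe2 CPU Dropout operator (an assumption, not something stated in the deleted file): with p = ratio,

    M_i \sim \mathrm{Bernoulli}(1-p), \qquad Y_i = \frac{X_i \, M_i}{1-p}, \qquad \frac{\partial L}{\partial X_i} = \frac{\partial L}{\partial Y_i}\cdot\frac{M_i}{1-p}

which is why the saved mask is the only extra state the gradient op needs.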
@@ -1,82 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include "caffe2/operators/utility_ops.h"
#include "caffe2/operators/elementwise_add_op.h"

using namespace caffe2;

namespace {

class IDEEPSumOp final : public IDEEPOperator {
 public:
  USE_IDEEP_DEF_ALIASES();
  USE_IDEEP_OPERATOR_FUNCTIONS();
  using FALLBACK_SUM = IDEEPFallbackOp<SumOp<CPUContext>, SkipIndices<0>>;
  using FALLBACK_ADD = IDEEPFallbackOp<BinaryElementwiseOp<
      NumericTypes, CPUContext, AddFunctor<CPUContext>>, SkipIndices<0>>;

  IDEEPSumOp(const OperatorDef& operator_def, Workspace* ws)
      : IDEEPOperator(operator_def, ws),
        fallback_sum_(operator_def, ws),
        fallback_add_(operator_def, ws) {}
  // NOLINTNEXTLINE(modernize-use-equals-default)
  ~IDEEPSumOp() override {}

  bool RunOnDevice() override {
    itensor::dims input_dims;
    bool fallback_to_cpu = false;
    vector<itensor> inputs_itensor;

    // We only support element-wise sum for ideep tensors here.
    // If a CPU tensor is detected in the input list, we have to fall back
    // to the corresponding CPU operator.
    for (int i = 0; i < InputSize(); ++i) {
      if (OperatorBase::InputBlob(i).template IsType<itensor>()) {
        auto& tensor_ideep = Input(i);
        if (input_dims.empty()) {
          input_dims = tensor_ideep.get_dims();
        } else if (input_dims != tensor_ideep.get_dims()) {
          fallback_to_cpu = true;
          break;
        }
        inputs_itensor.emplace_back(tensor_ideep);
      } else {
        CAFFE_ENFORCE(
            BlobIsTensorType(OperatorBase::InputBlob(i), CPU),
            "Expect cpu tensor if not itensor");
        fallback_to_cpu = true;
        break;
      }
    }

    if (!fallback_to_cpu) {
      auto* Y = Output(OUTPUT);
      if (InputSize() == 1) {
        const auto& X = Input(INPUT0);
        ideep::direct_copy::compute(X, *Y);
      } else {
        const vector<float> scales(InputSize(), 1.0);
        ideep::sum::compute(scales, inputs_itensor, *Y);
      }
      return true;
    }

    if (InputSize() == 2) {
      return fallback_add_.Run(0);
    }

    return fallback_sum_.Run(0);
  }

 private:
  FALLBACK_SUM fallback_sum_;
  FALLBACK_ADD fallback_add_;

  INPUT_TAGS(INPUT0);
  OUTPUT_TAGS(OUTPUT);
};

REGISTER_IDEEP_OPERATOR(Sum, IDEEPSumOp);
REGISTER_IDEEP_OPERATOR(Add, IDEEPSumOp);

} // namespace
@ -1,133 +0,0 @@
|
||||
#include "caffe2/operators/expand_squeeze_dims_op.h"
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPExpandDimsOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
using FALLBACK_OP = IDEEPFallbackOp<ExpandDimsOp<CPUContext>, SkipIndices<0>>;
|
||||
|
||||
IDEEPExpandDimsOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
fallback_(operator_def, ws) {
|
||||
dims_ = OperatorBase::GetRepeatedArgument<int>("dims");
|
||||
auto originalSize = dims_.size();
|
||||
CAFFE_ENFORCE_GT(originalSize, 0, "Parameter `dims` must be provided.");
|
||||
std::sort(dims_.begin(), dims_.end());
|
||||
dims_.erase(std::unique(dims_.begin(), dims_.end()), dims_.end());
|
||||
if (dims_.size() < originalSize) {
|
||||
LOG(WARNING) << "Parameter `dims` has repeated dimensions.";
|
||||
}
|
||||
CAFFE_ENFORCE_GE(dims_.front(), 0, "Dimension ids must be non-negative.");
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPExpandDimsOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
if (!OperatorBase::InputBlob(INPUT).template IsType<itensor>()) {
|
||||
return fallback_.Run(0);
|
||||
}
|
||||
|
||||
const auto& X = Input(INPUT);
|
||||
auto* Y = Output(OUTPUT);
|
||||
if (&X != Y) {
|
||||
// Copy if not inplace
|
||||
ideep::direct_copy::compute(X, *Y);
|
||||
}
|
||||
if (dims_.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto newDims = X.get_dims();
|
||||
CAFFE_ENFORCE_GE(
|
||||
newDims.size() + dims_.size(),
|
||||
dims_.back() + 1,
|
||||
"Input needs at least ",
|
||||
(1 + dims_.back() - dims_.size()),
|
||||
" dimensions given `dims`.");
|
||||
|
||||
for (const auto dim : dims_) {
|
||||
newDims.insert(newDims.begin() + dim, 1);
|
||||
}
|
||||
|
||||
Y->reshape(newDims);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<int> dims_;
|
||||
FALLBACK_OP fallback_;
|
||||
|
||||
INPUT_TAGS(INPUT);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
|
||||
class IDEEPSqueezeOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
using FALLBACK_OP = IDEEPFallbackOp<SqueezeOp<CPUContext>, SkipIndices<0>>;
|
||||
|
||||
IDEEPSqueezeOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
fallback_(operator_def, ws) {
|
||||
dims_ = OperatorBase::GetRepeatedArgument<int>("dims");
|
||||
auto originalSize = dims_.size();
|
||||
CAFFE_ENFORCE_GT(originalSize, 0, "Parameter `dims` must be provided.");
|
||||
|
||||
std::sort(dims_.begin(), dims_.end());
|
||||
dims_.erase(std::unique(dims_.begin(), dims_.end()), dims_.end());
|
||||
if (dims_.size() < originalSize) {
|
||||
LOG(WARNING) << "Parameter `dims` has repeated dimensions.";
|
||||
}
|
||||
CAFFE_ENFORCE_GE(dims_.front(), 0, "Dimension ids must be non-negative.");
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPSqueezeOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
if (!OperatorBase::InputBlob(INPUT).template IsType<itensor>()) {
|
||||
return fallback_.Run(0);
|
||||
}
|
||||
|
||||
const auto& X = Input(INPUT);
|
||||
auto* Y = Output(OUTPUT);
|
||||
|
||||
CAFFE_ENFORCE_GT(
|
||||
X.ndims(),
|
||||
dims_.back(),
|
||||
"Input needs at least ",
|
||||
(dims_.back() + 1),
|
||||
" dimensions.");
|
||||
const auto& ideep_dims = X.get_dims();
|
||||
std::vector<int64_t> dims(ideep_dims.begin(), ideep_dims.end());
|
||||
const auto new_dims = SqueezeOp<IDEEPContext>::ComputeDims(dims, dims_);
|
||||
itensor::dims new_dims_ideep(new_dims.begin(), new_dims.end());
|
||||
if (&X != Y) {
|
||||
// Copy if not inplace
|
||||
ideep::direct_copy::compute(X, *Y);
|
||||
}
|
||||
|
||||
Y->reshape(new_dims_ideep);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<int> dims_;
|
||||
FALLBACK_OP fallback_;
|
||||
|
||||
INPUT_TAGS(INPUT);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(ExpandDims, IDEEPExpandDimsOp);
|
||||
REGISTER_IDEEP_OPERATOR(Squeeze, IDEEPSqueezeOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,145 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPFullyConnectedOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPFullyConnectedOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)),
|
||||
axis_w_(OperatorBase::GetSingleArgument<int32_t>("axis_w", 1)),
|
||||
training_mode_(OperatorBase::GetSingleArgument<int>("training_mode", 0)) {}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPFullyConnectedOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(INPUT);
|
||||
const auto& filter = Input(FILTER);
|
||||
auto* Y = Output(OUTPUT);
|
||||
|
||||
itensor X_in = X;
|
||||
auto X_dims = CanonicalDims(X_in.get_dims(), axis_);
|
||||
if (X_in.get_dims() != X_dims) {
|
||||
X_in.reshape(X_dims);
|
||||
}
|
||||
|
||||
if (training_mode_) {
|
||||
filter_ = filter;
|
||||
auto filter_dims = CanonicalDims(filter_.get_dims(), axis_w_);
|
||||
if (filter_.get_dims() != filter_dims) {
|
||||
filter_.reshape(filter_dims);
|
||||
}
|
||||
|
||||
if (InputSize() > BIAS) {
|
||||
bias_ = Input(BIAS);
|
||||
}
|
||||
} else {
|
||||
if (cached_X_descriptor_ != X.get_descriptor()) {
|
||||
cached_X_descriptor_ = X.dup_descriptor();
|
||||
}
|
||||
|
||||
if (cached_weights_descriptor_ != filter.get_descriptor()) {
|
||||
cached_weights_descriptor_ = filter.dup_descriptor();
|
||||
|
||||
filter_ = filter.has_scale() ? filter.to_public() : filter;
|
||||
auto filter_dims = CanonicalDims(filter_.get_dims(), axis_w_);
|
||||
if (filter_.get_dims() != filter_dims) {
|
||||
filter_.reshape(filter_dims);
|
||||
}
|
||||
|
||||
if (InputSize() > BIAS) {
|
||||
const auto& bias = Input(BIAS);
|
||||
bias_ = bias.has_scale() ? bias.to_public() : bias;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (InputSize() > BIAS) {
|
||||
ideep::inner_product_forward::compute(
|
||||
X_in, filter_, bias_, *Y);
|
||||
} else {
|
||||
ideep::inner_product_forward::compute(X_in, filter_, *Y);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
size_t axis_{1};
|
||||
size_t axis_w_{1};
|
||||
bool training_mode_;
|
||||
|
||||
itensor filter_, bias_;
|
||||
itensor::descriptor cached_X_descriptor_, cached_weights_descriptor_;
|
||||
|
||||
INPUT_TAGS(INPUT, FILTER, BIAS);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
class IDEEPFullyConnectedGradientOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPFullyConnectedGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)),
|
||||
axis_w_(OperatorBase::GetSingleArgument<int32_t>("axis_w", 1)) {}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPFullyConnectedGradientOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(INPUT);
|
||||
const auto& filter = Input(FILTER);
|
||||
const auto& dY = Input(OUTPUT_GRAD);
|
||||
auto* dfilter = Output(FILTER_GRAD);
|
||||
auto* dbias = Output(BIAS_GRAD);
|
||||
|
||||
itensor X_in = X;
|
||||
auto X_dims = CanonicalDims(X_in.get_dims(), axis_);
|
||||
if (X_in.get_dims() != X_dims) {
|
||||
X_in.reshape(X_dims);
|
||||
}
|
||||
|
||||
itensor filter_in = filter;
|
||||
auto filter_dims = CanonicalDims(filter_in.get_dims(), axis_w_);
|
||||
if (filter_in.get_dims() != filter_dims) {
|
||||
filter_in.reshape(filter_dims);
|
||||
}
|
||||
|
||||
ideep::inner_product_backward_weights::compute(X_in, dY, *dfilter, *dbias);
|
||||
dfilter->to_default_format();
|
||||
|
||||
/**
|
||||
* In mkl-dnn,weight gradient shape is determined by X_in,
|
||||
* so we should ensure that weight gradient shape is consistent with weight shape.
|
||||
*/
|
||||
if (dfilter->get_dims() != filter.get_dims()) {
|
||||
dfilter->reshape(filter.get_dims());
|
||||
}
|
||||
|
||||
if (OutputSize() > INPUT_GRAD) {
|
||||
ideep::inner_product_backward_data::compute(
|
||||
dY, filter_in, X.get_dims(), *Output(INPUT_GRAD));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
size_t axis_{1};
|
||||
size_t axis_w_{1};
|
||||
|
||||
INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
|
||||
OUTPUT_TAGS(FILTER_GRAD, BIAS_GRAD, INPUT_GRAD);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(FC, IDEEPFullyConnectedOp);
|
||||
REGISTER_IDEEP_OPERATOR(FCGradient, IDEEPFullyConnectedGradientOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,87 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPLRNOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPLRNOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
size_(OperatorBase::GetSingleArgument<int>("size", 0)),
|
||||
alpha_(OperatorBase::GetSingleArgument<float>("alpha", 0)),
|
||||
beta_(OperatorBase::GetSingleArgument<float>("beta", 0)),
|
||||
bias_(OperatorBase::GetSingleArgument<float>("bias", 1)) {
|
||||
TORCH_DCHECK_GT(size_, 0);
|
||||
TORCH_DCHECK_EQ(size_ % 2, 1);
|
||||
TORCH_DCHECK_GT(alpha_, 0);
|
||||
TORCH_DCHECK_GT(beta_, 0);
|
||||
}
|
||||
~IDEEPLRNOp() override = default;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto& X = Input(INPUT);
|
||||
auto* Y = Output(OUTPUT);
|
||||
|
||||
ideep::lrn_forward::compute(X, *Y, size_, alpha_, beta_, bias_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
const int size_;
|
||||
const float alpha_;
|
||||
const float beta_;
|
||||
const float bias_;
|
||||
|
||||
INPUT_TAGS(INPUT);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
class IDEEPLRNGradientOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPLRNGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
size_(OperatorBase::GetSingleArgument<int>("size", 0)),
|
||||
alpha_(OperatorBase::GetSingleArgument<float>("alpha", 0)),
|
||||
beta_(OperatorBase::GetSingleArgument<float>("beta", 0)),
|
||||
bias_(OperatorBase::GetSingleArgument<float>("bias", 1)) {
|
||||
TORCH_DCHECK_GT(size_, 0);
|
||||
TORCH_DCHECK_EQ(size_ % 2, 1);
|
||||
TORCH_DCHECK_GT(alpha_, 0);
|
||||
TORCH_DCHECK_GT(beta_, 0);
|
||||
}
|
||||
~IDEEPLRNGradientOp() override = default;
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(INPUT);
|
||||
const auto& Y = Input(FILTER);
|
||||
const auto& dY = Input(OUTPUT_GRAD);
|
||||
auto* dX = Output(INPUT_GRAD);
|
||||
|
||||
ideep::lrn_backward::compute(X, dY, Y, *dX, size_, alpha_, beta_, bias_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
const int size_;
|
||||
const float alpha_;
|
||||
const float beta_;
|
||||
const float bias_;
|
||||
|
||||
INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
|
||||
OUTPUT_TAGS(INPUT_GRAD);
|
||||
};
|
||||
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(LRN, IDEEPLRNOp);
|
||||
REGISTER_IDEEP_OPERATOR(LRNGradient, IDEEPLRNGradientOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,131 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
void momentum_sgd_update(
|
||||
const int N,
|
||||
const float* g,
|
||||
const float* m,
|
||||
float* ng,
|
||||
float* nm,
|
||||
const float* lr,
|
||||
const float momentum,
|
||||
const bool nesterov,
|
||||
float* param) {
|
||||
const float LR = lr[0];
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for schedule(static)
|
||||
#endif
|
||||
for (auto i = 0; i < N; ++i) {
|
||||
if (!nesterov) {
|
||||
const float adjusted_gradient = LR * g[i] + momentum * m[i];
|
||||
nm[i] = adjusted_gradient;
|
||||
ng[i] = adjusted_gradient;
|
||||
} else {
|
||||
const float mi = m[i];
|
||||
const float mi_new = momentum * mi + LR * g[i];
|
||||
nm[i] = mi_new;
|
||||
ng[i] = (1 + momentum) * mi_new - momentum * mi;
|
||||
}
|
||||
|
||||
if (param) {
|
||||
param[i] -= ng[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class IDEEPMomentumSGDOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPMomentumSGDOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
momentum_(OperatorBase::GetSingleArgument<float>("momentum", 0.0)),
|
||||
nesterov_(OperatorBase::GetSingleArgument<int>("nesterov", 0)) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
CAFFE_ENFORCE(Input(GRAD).get_nelems() == Input(MOMENTUM).get_nelems());
|
||||
if (Input(GRAD) != *Output(OUTPUT_GRAD)) {
|
||||
Output(OUTPUT_GRAD)->init(Input(GRAD).get_descriptor());
|
||||
}
|
||||
if (Input(MOMENTUM) != *Output(OUTPUT_MOMENTUM)) {
|
||||
Output(OUTPUT_MOMENTUM)->init(Input(MOMENTUM).get_descriptor());
|
||||
}
|
||||
|
||||
// TODO: Use itensor after 0-dim is supported. Now use CPU tensor.
|
||||
const auto& lr = OperatorBase::Input<TensorCPU>(LR, CPU);
|
||||
CAFFE_ENFORCE(lr.numel() == 1);
|
||||
|
||||
momentum_sgd_update(
|
||||
Input(GRAD).get_nelems(),
|
||||
static_cast<float*>(Input(GRAD).get_data_handle()),
|
||||
static_cast<float*>(Input(MOMENTUM).get_data_handle()),
|
||||
static_cast<float*>(Output(OUTPUT_GRAD)->get_data_handle()),
|
||||
static_cast<float*>(Output(OUTPUT_MOMENTUM)->get_data_handle()),
|
||||
lr.template data<float>(),
|
||||
momentum_,
|
||||
nesterov_,
|
||||
nullptr);
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
float momentum_ = 0.9f;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
bool nesterov_;
|
||||
INPUT_TAGS(GRAD, MOMENTUM, LR);
|
||||
OUTPUT_TAGS(OUTPUT_GRAD, OUTPUT_MOMENTUM);
|
||||
};
|
||||
|
||||
class IDEEPMomentumSGDUpdateOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
IDEEPMomentumSGDUpdateOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
momentum_(OperatorBase::GetSingleArgument<float>("momentum", 0.0)),
|
||||
nesterov_(OperatorBase::GetSingleArgument<int>("nesterov", 0)) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
CAFFE_ENFORCE(Input(GRAD).get_nelems() == Input(MOMENTUM).get_nelems());
|
||||
if (Input(GRAD) != *Output(OUTPUT_GRAD)) {
|
||||
Output(OUTPUT_GRAD)->init(Input(GRAD).get_descriptor());
|
||||
}
|
||||
if (Input(MOMENTUM) != *Output(OUTPUT_MOMENTUM)) {
|
||||
Output(OUTPUT_MOMENTUM)->init(Input(MOMENTUM).get_descriptor());
|
||||
}
|
||||
|
||||
// TODO: Use itensor after 0-dim is supported. Now use CPU tensor.
|
||||
const auto& lr = OperatorBase::Input<TensorCPU>(LR, CPU);
|
||||
CAFFE_ENFORCE(lr.numel() == 1);
|
||||
|
||||
momentum_sgd_update(
|
||||
Input(GRAD).get_nelems(),
|
||||
static_cast<float*>(Input(GRAD).get_data_handle()),
|
||||
static_cast<float*>(Input(MOMENTUM).get_data_handle()),
|
||||
static_cast<float*>(Output(OUTPUT_GRAD)->get_data_handle()),
|
||||
static_cast<float*>(Output(OUTPUT_MOMENTUM)->get_data_handle()),
|
||||
lr.template data<float>(),
|
||||
momentum_,
|
||||
nesterov_,
|
||||
static_cast<float*>(Output(OUTPUT_PARAM)->get_data_handle()));
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
float momentum_ = 0.9f;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
bool nesterov_;
|
||||
INPUT_TAGS(GRAD, MOMENTUM, LR, PARAM);
|
||||
OUTPUT_TAGS(OUTPUT_GRAD, OUTPUT_MOMENTUM, OUTPUT_PARAM);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(MomentumSGD, IDEEPMomentumSGDOp);
|
||||
REGISTER_IDEEP_OPERATOR(MomentumSGDUpdate, IDEEPMomentumSGDUpdateOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,279 +0,0 @@
|
||||
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
|
||||
#include <caffe2/ideep/utils/ideep_operator.h>
|
||||
|
||||
#include <caffe2/operators/abs_op.h>
|
||||
#include <caffe2/operators/accuracy_op.h>
|
||||
#include <caffe2/operators/affine_channel_op.h>
|
||||
#include <caffe2/operators/atan_op.h>
|
||||
#include <caffe2/operators/batch_matmul_op.h>
|
||||
#include <caffe2/operators/cast_op.h>
|
||||
#include <caffe2/operators/clip_op.h>
|
||||
#include <caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h>
|
||||
#include <caffe2/operators/cross_entropy_op.h>
|
||||
#include <caffe2/operators/ctc_beam_search_decoder_op.h>
|
||||
#include <caffe2/operators/ctc_greedy_decoder_op.h>
|
||||
#include <caffe2/operators/distance_op.h>
|
||||
#include <caffe2/operators/dropout_op.h>
|
||||
#include <caffe2/operators/elementwise_add_op.h>
|
||||
#include <caffe2/operators/elementwise_div_op.h>
|
||||
#include <caffe2/operators/elementwise_mul_op.h>
|
||||
#include <caffe2/operators/elementwise_ops.h>
|
||||
#include <caffe2/operators/elementwise_sub_op.h>
|
||||
#include <caffe2/operators/expand_op.h>
|
||||
#include <caffe2/operators/filler_op.h>
|
||||
#include <caffe2/operators/flatten_op.h>
|
||||
#include <caffe2/operators/gather_op.h>
|
||||
#include <caffe2/operators/generate_proposals_op.h>
|
||||
#include <caffe2/operators/given_tensor_fill_op.h>
|
||||
#include <caffe2/operators/load_save_op.h>
|
||||
#include <caffe2/operators/loss_op.h>
|
||||
#include <caffe2/operators/normalize_op.h>
|
||||
#include <caffe2/operators/pad_op.h>
|
||||
#include <caffe2/operators/prelu_op.h>
|
||||
#include <caffe2/operators/reduce_ops.h>
|
||||
#include <caffe2/operators/rmac_regions_op.h>
|
||||
#include <caffe2/operators/roi_align_op.h>
|
||||
#include <caffe2/operators/roi_align_rotated_op.h>
|
||||
#include <caffe2/operators/roi_pool_op.h>
|
||||
#include <caffe2/operators/scale_op.h>
|
||||
#include <caffe2/operators/slice_op.h>
|
||||
#include <caffe2/operators/softmax_op.h>
|
||||
#include <caffe2/operators/softmax_with_loss_op.h>
|
||||
#include <caffe2/operators/sqrt_op.h>
|
||||
#include <caffe2/operators/stop_gradient.h>
|
||||
#include <caffe2/operators/tanh_op.h>
|
||||
#include <caffe2/operators/tensor_protos_db_input.h>
|
||||
#include <caffe2/operators/utility_ops.h>
|
||||
#include <caffe2/queue/queue_ops.h>
|
||||
#include <caffe2/sgd/iter_op.h>
|
||||
#include <caffe2/sgd/learning_rate_op.h>
|
||||
#include "caffe2/operators/bbox_transform_op.h"
|
||||
#include "caffe2/operators/box_with_nms_limit_op.h"
|
||||
|
||||
// can add more non-IDEEP operators if needed
|
||||
namespace caffe2 {
|
||||
|
||||
// Boolean operators
|
||||
REGISTER_IDEEP_COMPARE_OPERATOR(EQ);
|
||||
REGISTER_IDEEP_COMPARE_OPERATOR(GT);
|
||||
REGISTER_IDEEP_COMPARE_OPERATOR(GE);
|
||||
REGISTER_IDEEP_COMPARE_OPERATOR(LT);
|
||||
REGISTER_IDEEP_COMPARE_OPERATOR(LE);
|
||||
REGISTER_IDEEP_COMPARE_OPERATOR(NE);
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(Softmax, IDEEPFallbackOp<SoftmaxOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
LabelCrossEntropy,
|
||||
IDEEPFallbackOp<LabelCrossEntropyOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
AveragedLoss,
|
||||
IDEEPFallbackOp<AveragedLoss<float, CPUContext>, SkipIndices<0>>);
|
||||
REGISTER_IDEEP_OPERATOR(Flatten, IDEEPFallbackOp<FlattenOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(ResizeLike, IDEEPFallbackOp<ResizeLikeOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(Slice, IDEEPFallbackOp<SliceOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(Clip, IDEEPFallbackOp<ClipOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
ScatterAssign,
|
||||
IDEEPFallbackOp<ScatterAssignOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Cast,
|
||||
IDEEPFallbackOp<CastOp<CPUContext>>);
|
||||
|
||||
// filter operators
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
XavierFill,
|
||||
IDEEPFallbackOp<XavierFillOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
ConstantFill,
|
||||
IDEEPFallbackOp<ConstantFillOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
GaussianFill,
|
||||
IDEEPFallbackOp<GaussianFillOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
MSRAFill,
|
||||
IDEEPFallbackOp<MSRAFillOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
GivenTensorFill,
|
||||
IDEEPFallbackOp<GivenTensorFillOp<float, CPUContext>>);
|
||||
// Not supported tensor types in below FillOp
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
GivenTensorDoubleFill,
|
||||
IDEEPFallbackOp<GivenTensorFillOp<double, CPUContext>, SkipIndices<0>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
GivenTensorBoolFill,
|
||||
IDEEPFallbackOp<GivenTensorFillOp<bool, CPUContext>, SkipIndices<0>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
GivenTensorIntFill,
|
||||
IDEEPFallbackOp<GivenTensorFillOp<int, CPUContext>, SkipIndices<0>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
GivenTensorInt64Fill,
|
||||
IDEEPFallbackOp<GivenTensorFillOp<int64_t, CPUContext>, SkipIndices<0>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
GivenTensorStringFill,
|
||||
IDEEPFallbackOp<GivenTensorFillOp<std::string, CPUContext>, SkipIndices<0>>);
|
||||
REGISTER_IDEEP_OPERATOR(Load, IDEEPFallbackOp<LoadOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(Save, IDEEPFallbackOp<SaveOp<CPUContext>>);
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
RMACRegions,
|
||||
IDEEPFallbackOp<RMACRegionsOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(RoIPool, IDEEPFallbackOp<RoIPoolOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
RoIAlign,
|
||||
IDEEPFallbackOp<RoIAlignOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
RoIAlignRotated,
|
||||
IDEEPFallbackOp<RoIAlignRotatedOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
GenerateProposals,
|
||||
IDEEPFallbackOp<GenerateProposalsOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
GenerateProposalsCPP,
|
||||
IDEEPFallbackOp<GenerateProposalsOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
CollectAndDistributeFpnRpnProposals,
|
||||
IDEEPFallbackOp<CollectAndDistributeFpnRpnProposalsOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
BoxWithNMSLimit,
|
||||
IDEEPFallbackOp<BoxWithNMSLimitOp<CPUContext>, SkipIndices<0,1,2>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
BBoxTransform,
|
||||
IDEEPFallbackOp<BBoxTransformOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
AffineChannel,
|
||||
IDEEPFallbackOp<AffineChannelOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
StopGradient,
|
||||
IDEEPFallbackOp<StopGradientOp<CPUContext>>);
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
PadImage,
|
||||
IDEEPFallbackOp<PadImageOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
PRelu,
|
||||
IDEEPFallbackOp<PReluOp<float, CPUContext>>);
|
||||
|
||||
// ctc decoder operators
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
CTCGreedyDecoder,
|
||||
IDEEPFallbackOp<CTCGreedyDecoderOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
CTCBeamSearchDecoder,
|
||||
IDEEPFallbackOp<CTCBeamSearchDecoderOp<CPUContext>>);
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
AveragedLossGradient,
|
||||
IDEEPFallbackOp<AveragedLossGradient<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
LabelCrossEntropyGradient,
|
||||
IDEEPFallbackOp<LabelCrossEntropyGradientOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
SoftmaxGradient,
|
||||
IDEEPFallbackOp<SoftmaxGradientOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Iter,
|
||||
IDEEPFallbackOp<IterOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
LearningRate,
|
||||
IDEEPFallbackOp<LearningRateOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Abs,
|
||||
IDEEPFallbackOp<UnaryElementwiseOp<
|
||||
TensorTypes<float>, CPUContext, AbsFunctor<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Atan,
|
||||
IDEEPFallbackOp<UnaryElementwiseOp<
|
||||
TensorTypes<float>, CPUContext, AtanFunctor<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Sqrt,
|
||||
IDEEPFallbackOp<UnaryElementwiseOp<
|
||||
TensorTypes<float>, CPUContext, SqrtFunctor<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Sign,
|
||||
IDEEPFallbackOp<UnaryElementwiseOp<
|
||||
TensorTypes<float>,
|
||||
CPUContext,
|
||||
SignFunctor<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Div,
|
||||
IDEEPFallbackOp<BinaryElementwiseOp<
|
||||
NumericTypes, CPUContext, DivFunctor<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Mul,
|
||||
IDEEPFallbackOp<
|
||||
BinaryElementwiseOp<NumericTypes, CPUContext, MulFunctor<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Sub,
|
||||
IDEEPFallbackOp<BinaryElementwiseOp<
|
||||
NumericTypes, CPUContext, SubFunctor<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Tanh,
|
||||
IDEEPFallbackOp<UnaryElementwiseOp<
|
||||
TensorTypes<float>,
|
||||
CPUContext,
|
||||
TanhFunctor<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
L1Distance,
|
||||
IDEEPFallbackOp<L1DistanceOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(Scale, IDEEPFallbackOp<ScaleOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Accuracy,
|
||||
IDEEPFallbackOp<AccuracyOp<float, CPUContext>>);
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
AddGradient,
|
||||
IDEEPFallbackOp<BinaryElementwiseGradientOp<
|
||||
NumericTypes,
|
||||
CPUContext,
|
||||
AddFunctor<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
TanhGradient,
|
||||
IDEEPFallbackOp<BinaryElementwiseOp<
|
||||
TensorTypes<float>,
|
||||
CPUContext,
|
||||
TanhGradientFunctor<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
MulGradient,
|
||||
IDEEPFallbackOp<BinaryElementwiseGradientOp<
|
||||
NumericTypes,
|
||||
CPUContext,
|
||||
MulFunctor<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(TensorProtosDBInput, IDEEPFallbackOp<TensorProtosDBInput<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(CloseBlobsQueue, IDEEPFallbackOp<CloseBlobsQueueOp<CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
SoftmaxWithLoss,
|
||||
IDEEPFallbackOp<SoftmaxWithLossOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
SoftmaxWithLossGradient,
|
||||
IDEEPFallbackOp<SoftmaxWithLossGradientOp<float, CPUContext>>);
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Expand,
|
||||
IDEEPFallbackOp<ExpandOp<
|
||||
TensorTypes<std::int32_t, std::int64_t, float, double>,
|
||||
CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(Gather, IDEEPFallbackOp<GatherOp<CPUContext>>);
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
Normalize,
|
||||
IDEEPFallbackOp<NormalizeOp<float, CPUContext>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
ReduceL2,
|
||||
IDEEPFallbackOp<
|
||||
ReduceOp<TensorTypes<float>, CPUContext, L2Reducer<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
ReduceSum,
|
||||
IDEEPFallbackOp<ReduceOp<
|
||||
TensorTypes<std::int32_t, std::int64_t, float, double>,
|
||||
CPUContext,
|
||||
SumReducer<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
ReduceMean,
|
||||
IDEEPFallbackOp<ReduceOp<
|
||||
TensorTypes<float>, CPUContext, MeanReducer<CPUContext>>>);
|
||||
REGISTER_IDEEP_OPERATOR(
|
||||
BatchMatMul,
|
||||
IDEEPFallbackOp<BatchMatMulOp<CPUContext>>);
|
||||
|
||||
|
||||
} // namespace caffe2
|
||||
@ -1,190 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <caffe2/core/common.h>
|
||||
#include <caffe2/core/context.h>
|
||||
#include <caffe2/core/operator.h>
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
#include <caffe2/proto/caffe2_pb.h>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
/**
|
||||
* @brief A templated class to allow one to wrap a CPU operator as an IDEEP
|
||||
* operator.
|
||||
*
|
||||
* This class can be used when one does not have the IDEEP implementation ready
|
||||
* yet for an operator. Essentially, what this op does is to automatically
|
||||
* deal with data copy for you. Plausibly, this causes a lot of overhead and
|
||||
* is not optimal, so you should use this operator mostly for quick prototyping
|
||||
* purpose.
|
||||
*
|
||||
* All the input and output of the original operator should be TensorCPU.
|
||||
*
|
||||
* Example usage: if you have a class MyMagicOp that is CPU based, and you use
|
||||
* the registration code
|
||||
* REGISTER_CPU_OPERATOR(MyMagic, MyMagicOp);
|
||||
* to register the CPU side, you can create its corresponding IDEEP operator
|
||||
* (with performance hits of course) via
|
||||
* REGISTER_IDEEP_OPERATOR(MyMagic,
|
||||
* IDEEPFallbackOp<MyMagicOp>);
|
||||
*
|
||||
* Advanced usage: if you want to have some specific outputs never copied, you
|
||||
* can use the SkipOutputCopy template argument to do that. For example, if
|
||||
* MyMagic produces two outputs and the first output is always going to live on
|
||||
* the CPU, you can do
|
||||
* REGISTER_IDEEP_OPERATOR(MyMagic,
|
||||
* IDEEPFallbackOp<MyMagicOp, SkipIndices<0>>);
|
||||
*/
|
||||
template <class CPUOp, typename SkipOutputCopy = SkipIndices<>>
|
||||
class IDEEPFallbackOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPFallbackOp(const OperatorDef& def, Workspace* ws)
|
||||
: IDEEPOperator(def, ws) {
|
||||
CAFFE_ENFORCE_EQ(def.device_option().device_type(), PROTO_IDEEP);
|
||||
base_def_.CopyFrom(def);
|
||||
// base_def_ runs on CPU, so we will set its device option to CPU.
|
||||
// Copy to allow random_seed to be correctly propagated.
|
||||
base_def_.mutable_device_option()->CopyFrom(def.device_option());
|
||||
base_def_.mutable_device_option()->set_device_type(PROTO_CPU);
|
||||
// Create output blobs in parent workspace,
|
||||
// then forward output blobs to local workspace.
|
||||
std::unordered_map<string, string> forwarded_output_blobs;
|
||||
for (const auto i : c10::irange(base_def_.output_size())) {
|
||||
// For in-place case, the in/output tensor for local_ws must be
|
||||
// re-created, instead of forwarding from current workspace.
|
||||
string parent_name(base_def_.output(i));
|
||||
if (!SkipOutputCopy::Contains(i)) {
|
||||
parent_name += "_cpu_output_blob_" + base_def_.type();
|
||||
}
|
||||
local_output_blobs_.push_back(ws->CreateBlob(parent_name));
|
||||
TORCH_CHECK_NOTNULL(local_output_blobs_.back());
|
||||
forwarded_output_blobs[base_def_.output(i)] = parent_name;
|
||||
output_inplace_.push_back(false);
|
||||
for (const string &input_name : base_def_.input()) {
|
||||
if (input_name == base_def_.output(i)) {
|
||||
output_inplace_[i] = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
local_ws_.reset(new Workspace(ws, forwarded_output_blobs));
|
||||
// Set up the symbols for the local workspace.
|
||||
for (const string& name : base_def_.input()) {
|
||||
local_input_blobs_.push_back(local_ws_->CreateBlob(name));
|
||||
TORCH_CHECK_NOTNULL(local_input_blobs_.back());
|
||||
}
|
||||
input_share_.resize(local_input_blobs_.size(), false);
|
||||
base_op_.reset(new CPUOp(base_def_, local_ws_.get()));
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
for (const auto i : c10::irange(InputSize())) {
|
||||
if (InputIsType<itensor>(i)
|
||||
&& (Input(i).has_scale()
|
||||
|| Input(i).get_data_type() == idtype::f32)) {
|
||||
auto& input = Input(i);
|
||||
if (input_share_[i]) {
|
||||
local_input_blobs_[i]->Reset();
|
||||
input_share_[i] = false;
|
||||
}
|
||||
auto dtensor = BlobGetMutableTensor(local_input_blobs_[i], CPU);
|
||||
dtensor->Resize(input.get_dims());
|
||||
// If fallback from INT8, the public format of original input is nhwc.
|
||||
// While the required format is nchw, need to reorder to nchw.
|
||||
if (input.get_desc().is_nhwc()) {
|
||||
itensor temp_ten ({input.get_dims(), idtype::f32, iformat::nchw},
|
||||
dtensor->template mutable_data<float>());
|
||||
temp_ten.feed_from(input);
|
||||
} else if (!input.need_reorder()) {
|
||||
CAFFE_ENFORCE(!input.has_scale(),
|
||||
"Incorrect invocation of get_data_handle");
|
||||
dtensor->ShareExternalPointer(
|
||||
static_cast<float*>(input.get_data_handle()));
|
||||
} else {
|
||||
input.to_public(dtensor->template mutable_data<float>());
|
||||
}
|
||||
} else {
|
||||
VLOG(1) << "Input " << i << " is not ideep::tensor. Skipping copy.";
|
||||
if (OperatorBase::Inputs()[i]->GetRaw() != local_input_blobs_[i]->GetRaw()) {
|
||||
// Note(jiayq): This removes a const but conceptually
|
||||
// local_input_blobs will only be used as const blob input for the
|
||||
// base op so we are still fine.
|
||||
local_input_blobs_[i]->ShareExternal(
|
||||
const_cast<void *>(OperatorBase::Inputs()[i]->GetRaw()),
|
||||
OperatorBase::Inputs()[i]->meta());
|
||||
}
|
||||
input_share_[i] = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Some CPU ops inherited from OperatorBase directly might need this default
|
||||
// input argument '0' like 'PrefetchOperator'.
|
||||
if (!base_op_->Run(0)) {
|
||||
LOG(ERROR) << "Base op run failed in IDEEPFallbackOp. Def: "
|
||||
<< ProtoDebugString(this->debug_def());
|
||||
return false;
|
||||
}
|
||||
|
||||
for (const auto i : c10::irange(OutputSize())) {
|
||||
if (SkipOutputCopy::Contains(i)) {
|
||||
VLOG(1) << "Copy output: index " << i << " skipped.";
|
||||
continue;
|
||||
}
|
||||
CAFFE_ENFORCE(
|
||||
BlobIsTensorType(*local_output_blobs_[i], CPU),
|
||||
"IDEEP fallback op currently does not support non-TensorCPU "
|
||||
"output type who needs copying.");
|
||||
const auto& src = local_output_blobs_[i]->template Get<TensorCPU>();
|
||||
auto src_dims = src.sizes().vec();
|
||||
if (src.template IsType<float>() && src.dim() != 0 && base_op_->type() != "Python") {
|
||||
Blob* dst = OperatorBase::OutputBlob(i);
|
||||
// The output tensor must be ideep tensor with public format.
|
||||
// If reusing ideep tensor with non-public format, the tensor buffer
|
||||
// will be interpreted incorrectly.
|
||||
if (!dst->template IsType<itensor>() ||
|
||||
!dst->template Get<itensor>().is_public_format()) {
|
||||
dst->Reset(new itensor());
|
||||
}
|
||||
|
||||
itensor::dims dst_dims (src_dims.begin(), src_dims.end());
|
||||
auto dtensor = dst->template GetMutable<itensor>();
|
||||
if (dtensor->get_dims() != dst_dims) {
|
||||
dtensor->resize(dst_dims, idtype::f32);
|
||||
}
|
||||
if (output_inplace_[i]) {
|
||||
dtensor->feed_from(dst_dims, idtype::f32,
|
||||
const_cast<void*>(src.raw_data()));
|
||||
} else {
|
||||
CAFFE_ENFORCE(!dtensor->has_scale(),
|
||||
"Incorrect invocation of set_data_handle");
|
||||
dtensor->set_data_handle(const_cast<void *>(src.raw_data()));
|
||||
}
|
||||
} else {
|
||||
VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor";
|
||||
Blob* dst = OperatorBase::OutputBlob(i);
|
||||
if (output_inplace_[i]) {
|
||||
auto dtensor = BlobGetMutableTensor(dst, CPU);
|
||||
dtensor->CopyFrom(src);
|
||||
} else {
|
||||
dst->Reset(new Tensor(CPU));
|
||||
BlobSetTensor(dst, src.Alias());
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
vector<Blob*> local_input_blobs_;
|
||||
vector<Blob*> local_output_blobs_;
|
||||
vector<bool> output_inplace_;
|
||||
vector<bool> input_share_;
|
||||
std::unique_ptr<CPUOp> base_op_;
|
||||
std::unique_ptr<Workspace> local_ws_;
|
||||
OperatorDef base_def_;
|
||||
};
|
||||
|
||||
} // namespace caffe2
|
||||
@ -1,70 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPNHWC2NCHWOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_SIMPLE_IDEEP_CTOR_DTOR(IDEEPNHWC2NCHWOp);
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(0);
|
||||
CAFFE_ENFORCE_EQ(X.ndims(), 4);
|
||||
CAFFE_ENFORCE(X.get_desc().is_nhwc());
|
||||
|
||||
auto *Y = Output(OUTPUT);
|
||||
CAFFE_ENFORCE(Y != &X);
|
||||
|
||||
// NOTE: NHWC changes the shape in framework, but not in MKL-DNN
|
||||
// Thus, for iDEEP tensor, the shapes of NCHW and NHWC are identical.
|
||||
Y->init({X.get_dims(), X.get_data_type(), iformat::nchw});
|
||||
Y->feed_from(X);
|
||||
// NOTE: This ops is only used to quantization path, setting scale
|
||||
// to distinguish with fp32 path activation(always return NCHW format
|
||||
// even ideep tensor has NHWC format) when convert to numpy memory.
|
||||
Y->set_scale({1.0});
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
INPUT_TAGS(INPUT);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
class IDEEPNCHW2NHWCOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_SIMPLE_IDEEP_CTOR_DTOR(IDEEPNCHW2NHWCOp);
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(0);
|
||||
CAFFE_ENFORCE_EQ(X.ndims(), 4);
|
||||
CAFFE_ENFORCE(X.get_desc().is_nchw());
|
||||
|
||||
auto *Y = Output(OUTPUT);
|
||||
CAFFE_ENFORCE(Y != &X);
|
||||
|
||||
// NOTE: NHWC changes the shape in framework, but not in MKL-DNN
|
||||
// Thus, for iDEEP tensor, the shapes of NCHW and NHWC are identical.
|
||||
Y->init({X.get_dims(), X.get_data_type(), iformat::nhwc});
|
||||
Y->feed_from(X);
|
||||
// NOTE: This ops is only used to quantization path, setting scale
|
||||
// to distinguish with fp32 path activation(always return NCHW format
|
||||
// even ideep tensor has NHWC format) when convert to numpy memory.
|
||||
Y->set_scale({1.0});
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
INPUT_TAGS(INPUT);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(NHWC2NCHW, IDEEPNHWC2NCHWOp);
|
||||
REGISTER_IDEEP_OPERATOR(NCHW2NHWC, IDEEPNCHW2NHWCOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,120 +0,0 @@
|
||||
#include <caffe2/ideep/operators/conv_pool_base_op.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPPoolOp final : public IDEEPConvPoolOpBase {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
|
||||
|
||||
IDEEPPoolOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPConvPoolOpBase(operator_def, ws) {
|
||||
CAFFE_ENFORCE(
|
||||
(dilation_h() == 1) && (dilation_w() == 1),
|
||||
"Pooling op does not support dilation right now.");
|
||||
if (!global_pooling_) {
|
||||
CAFFE_ENFORCE(
|
||||
pad_t() < kernel_h() && pad_b() < kernel_h() &&
|
||||
pad_l() < kernel_w() && pad_r() < kernel_w(),
|
||||
"Pad should be smaller than kernel.");
|
||||
}
|
||||
|
||||
bool training_mode = OperatorBase::GetSingleArgument<int>("training_mode", 1);
|
||||
pk_ = training_mode ? iprop::forward_training : iprop::forward_inference;
|
||||
|
||||
// Figure out the pooling descriptor.
|
||||
if (operator_def.type().substr(0, 7) == "MaxPool") {
|
||||
algo_ = ialgo::pooling_max;
|
||||
} else if (operator_def.type().substr(0, 11) == "AveragePool") {
|
||||
algo_ = ialgo::pooling_avg_exclude_padding;
|
||||
} else {
|
||||
LOG(FATAL) << "Unsupported pooling method: " << operator_def.type();
|
||||
}
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPPoolOp() override {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override {
|
||||
auto& X = Input(INPUT);
|
||||
auto* Y = Output(OUTPUT);
|
||||
auto Y_dims = CalcOutputDims(X, X.get_dim(1));
|
||||
|
||||
if (cached_X_descriptor_ != X.get_descriptor()) {
|
||||
cached_X_descriptor_ = X.dup_descriptor();
|
||||
}
|
||||
|
||||
ideep::pooling_forward::compute(X, Y_dims, *Y,
|
||||
{stride_.begin(), stride_.end()},
|
||||
{kernel_.begin(), kernel_.end()},
|
||||
pad_tl(), pad_br(), algo_, pk_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
iprop pk_;
|
||||
ialgo algo_;
|
||||
itensor::descriptor cached_X_descriptor_;
|
||||
|
||||
INPUT_TAGS(INPUT);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
class IDEEPPoolGradientOp final : public IDEEPConvPoolOpBase {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
|
||||
|
||||
IDEEPPoolGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPConvPoolOpBase(operator_def, ws) {
|
||||
CAFFE_ENFORCE(
|
||||
(dilation_h() == 1) && (dilation_w() == 1),
|
||||
"Pooling op does not support dilation right now.");
|
||||
if (!global_pooling_) {
|
||||
CAFFE_ENFORCE(
|
||||
pad_t() < kernel_h() && pad_b() < kernel_h() &&
|
||||
pad_l() < kernel_w() && pad_r() < kernel_w(),
|
||||
"Pad should be smaller than kernel.");
|
||||
}
|
||||
// Figure out the pooling descriptor.
|
||||
if (operator_def.type().substr(0, 15) == "MaxPoolGradient") {
|
||||
algo_ = ialgo::pooling_max;
|
||||
} else if (operator_def.type().substr(0, 19) == "AveragePoolGradient") {
|
||||
algo_ = ialgo::pooling_avg_exclude_padding;
|
||||
} else {
|
||||
LOG(FATAL) << "Unsupported pooling method: " << operator_def.type();
|
||||
}
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPPoolGradientOp() override {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override {
|
||||
const auto& X = Input(INPUT);
|
||||
const auto& Y = Input(OUTPUT);
|
||||
const auto& dY = Input(OUTPUT_GRAD);
|
||||
auto* dX = Output(INPUT_GRAD);
|
||||
|
||||
ideep::pooling_backward::compute(dY, Y, X, *dX,
|
||||
{stride_.begin(), stride_.end()},
|
||||
{kernel_.begin(), kernel_.end()},
|
||||
pad_tl(), pad_br(), algo_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
ialgo algo_;
|
||||
|
||||
INPUT_TAGS(INPUT, OUTPUT, OUTPUT_GRAD);
|
||||
OUTPUT_TAGS(INPUT_GRAD);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(MaxPool, IDEEPPoolOp);
|
||||
REGISTER_IDEEP_OPERATOR(MaxPoolGradient, IDEEPPoolGradientOp);
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(AveragePool, IDEEPPoolOp);
|
||||
REGISTER_IDEEP_OPERATOR(AveragePoolGradient, IDEEPPoolGradientOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,77 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
template <bool ReluFused>
|
||||
class IDEEPInt8SumReluOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPInt8SumReluOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
scale_(this->template GetSingleArgument<float>("Y_scale", 1.0)),
|
||||
zero_point_(
|
||||
this->template GetSingleArgument<int32_t>("Y_zero_point", 0)) {
|
||||
if (ReluFused || zero_point_ == 0) {
|
||||
Y_data_type_ = idtype::u8;
|
||||
CAFFE_ENFORCE_EQ(zero_point_, 0, "Wrong zero point");
|
||||
} else {
|
||||
Y_data_type_ = idtype::s8;
|
||||
CAFFE_ENFORCE_EQ(zero_point_, 128, "Wrong zero point");
|
||||
}
|
||||
|
||||
Y_scales_ = ConvertScales({scale_});
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPInt8SumReluOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
itensor temp_ten;
|
||||
itensor::dims input_dims;
|
||||
vector<itensor> inputs_itensor;
|
||||
|
||||
CAFFE_ENFORCE_GT(InputSize(), 1, "Wrong input size (must > 1)");
|
||||
for (int i = 0; i < InputSize(); ++i) {
|
||||
CAFFE_ENFORCE(OperatorBase::InputBlob(i).template IsType<itensor>());
|
||||
auto& Xi = Input(i);
|
||||
if (input_dims.empty())
|
||||
input_dims = Xi.get_dims();
|
||||
CAFFE_ENFORCE(input_dims == Xi.get_dims());
|
||||
inputs_itensor.emplace_back(
|
||||
Xi.get_data_type() != idtype::f32 ? Xi.dequantize() : Xi);
|
||||
}
|
||||
|
||||
temp_ten.init({input_dims, idtype::f32});
|
||||
const vector<float> scales(InputSize(), 1.0);
|
||||
ideep::sum::compute(scales, inputs_itensor, temp_ten);
|
||||
if (ReluFused) {
|
||||
ideep::eltwise_forward::compute(temp_ten, temp_ten);
|
||||
}
|
||||
|
||||
auto* Y = Output(OUTPUT);
|
||||
Y->init({temp_ten.get_dims(), Y_data_type_, iformat::nhwc});
|
||||
Y->set_scale(Y_scales_);
|
||||
Y->feed_from(temp_ten);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
float scale_;
|
||||
int32_t zero_point_;
|
||||
iscale Y_scales_;
|
||||
idtype Y_data_type_;
|
||||
|
||||
INPUT_TAGS(INPUT0);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8Sum, DNNLOWP, IDEEPInt8SumReluOp<false>);
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8Add, DNNLOWP, IDEEPInt8SumReluOp<false>);
|
||||
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8SumRelu, DNNLOWP, IDEEPInt8SumReluOp<true>);
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8AddRelu, DNNLOWP, IDEEPInt8SumReluOp<true>);
|
||||
|
||||
} // namespace
|
||||
@ -1,258 +0,0 @@
|
||||
#include <caffe2/ideep/operators/conv_pool_base_op.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPInt8ConvOp : public IDEEPConvPoolOpBase {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
|
||||
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
|
||||
IDEEPInt8ConvOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPConvPoolOpBase(operator_def, ws),
|
||||
scale_(this->template GetSingleArgument<float>("Y_scale", 1.0)),
|
||||
zero_point_(
|
||||
this->template GetSingleArgument<int32_t>("Y_zero_point", 0)) {
|
||||
OPERATOR_NEEDS_FEATURE(pad_l() == pad_r() && pad_t() == pad_b(),
|
||||
"Uneven padding not supported.");
|
||||
fusion_type_ = FUSION_UNKNOWN;
|
||||
last_input_ = BIAS_OR_INPUT_S;
|
||||
algo_ = ialgo::convolution_direct;
|
||||
auto conv_algorithm = OperatorBase::GetSingleArgument<int>(
|
||||
"conv_algorithm", CONV_ALGORITHM_AUTO);
|
||||
if (conv_algorithm == CONV_ALGORITHM_WINOGRAD) {
|
||||
algo_ = ialgo::convolution_winograd;
|
||||
}
|
||||
CAFFE_ENFORCE(zero_point_ == 128 || zero_point_ == 0);
|
||||
Y_scales_ = ConvertScales({scale_});
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPInt8ConvOp() override {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override {
|
||||
const auto &X = Input(INPUT_X);
|
||||
const auto &filter = Input(FILTER);
|
||||
auto *Y = Output(OUTPUT);
|
||||
|
||||
CAFFE_ENFORCE(X.has_scale());
|
||||
CAFFE_ENFORCE(4 == X.ndims() && 4 == filter.ndims());
|
||||
CAFFE_ENFORCE(X.get_data_type() == idtype::s8
|
||||
|| X.get_data_type() == idtype::u8);
|
||||
CAFFE_ENFORCE(filter.get_dim(2) == kernel_h());
|
||||
CAFFE_ENFORCE(filter.get_dim(3) == kernel_w());
|
||||
CAFFE_ENFORCE(
|
||||
X.get_dim(1) == filter.get_dim(1) * group_,
|
||||
"Convolution op: input channels does not match: # of input channels ",
|
||||
X.get_dim(1), " is not equal to kernel channels * group:",
|
||||
filter.get_dim(1), "*", group_);
|
||||
|
||||
bool input_changed = (cached_X_descriptor_ != X.get_descriptor());
|
||||
if (input_changed) {
|
||||
cached_X_descriptor_ = X.dup_descriptor();
|
||||
}
|
||||
|
||||
bool weights_changed = (cached_weights_descriptor_ != filter.get_descriptor());
|
||||
if (weights_changed) {
|
||||
cached_weights_descriptor_ = filter.dup_descriptor();
|
||||
CAFFE_ENFORCE(filter.get_data_type() == idtype::s8 && filter.has_scale());
|
||||
|
||||
auto X_dt = X.get_data_type();
|
||||
lowp_kind_ = ilowp_kind::LOWP_U8S8;
|
||||
if (X_dt == idtype::s8) {
|
||||
lowp_kind_ = ilowp_kind::LOWP_S8S8;
|
||||
}
|
||||
|
||||
auto expected_descriptor =
|
||||
ideep::convolution_forward::expected_weights_desc(
|
||||
filter.get_dims(),
|
||||
idtype::s8,
|
||||
{stride_.begin(), stride_.end()},
|
||||
pad_tl(),
|
||||
pad_br(),
|
||||
{dilation_.begin(), dilation_.end()},
|
||||
group_,
|
||||
algo_,
|
||||
iprop::forward_inference,
|
||||
X_dt, X.get_dims());
|
||||
if (filter.get_desc() != expected_descriptor) {
|
||||
filter_.init(expected_descriptor);
|
||||
filter_.set_scale(filter.get_scale());
|
||||
filter_.feed_from(filter);
|
||||
} else {
|
||||
filter_ = filter;
|
||||
}
|
||||
|
||||
if (InputSize() > last_input_) {
|
||||
// NOTE: If the bias is shared by other operators in this module,
|
||||
// The existing bias scale should not satisfy current operator.
|
||||
// Thus, we have to requantize it by current input and filter scales.
|
||||
auto bias = Input(BIAS_OR_INPUT_S);
|
||||
bias_.init({bias.get_dims(), idtype::s32});
|
||||
iscale bias_scales (filter_.get_scale());
|
||||
for (auto &scale : bias_scales) { scale *= X.get_scale()[0]; }
|
||||
bias_.set_scale(bias_scales);
|
||||
bias_.feed_from(bias);
|
||||
}
|
||||
}
|
||||
|
||||
bool with_bias = InputSize() > last_input_;
|
||||
if (input_changed || weights_changed) {
|
||||
auto Y_dims = CalcOutputDims(X, filter.get_dim(0));
|
||||
if (with_bias) {
|
||||
ideep::convolution_forward::prepare(
|
||||
conv_param,
|
||||
X,
|
||||
filter_,
|
||||
bias_,
|
||||
Y_dims,
|
||||
*Y,
|
||||
{stride_.begin(), stride_.end()},
|
||||
{dilation_.begin(), dilation_.end()},
|
||||
pad_tl(),
|
||||
pad_br(),
|
||||
group_,
|
||||
iscale(),
|
||||
iscale(),
|
||||
Y_scales_,
|
||||
attr_,
|
||||
algo_,
|
||||
iprop::forward_inference,
|
||||
lowp_kind_);
|
||||
} else {
|
||||
ideep::convolution_forward::prepare(
|
||||
conv_param,
|
||||
X,
|
||||
filter_,
|
||||
Y_dims,
|
||||
*Y,
|
||||
{stride_.begin(), stride_.end()},
|
||||
{dilation_.begin(), dilation_.end()},
|
||||
pad_tl(),
|
||||
pad_br(),
|
||||
group_,
|
||||
iscale(),
|
||||
iscale(),
|
||||
Y_scales_,
|
||||
attr_,
|
||||
algo_,
|
||||
iprop::forward_inference,
|
||||
lowp_kind_);
|
||||
}
|
||||
}
|
||||
|
||||
if (with_bias) {
|
||||
ideep::convolution_forward::compute(conv_param, X, filter_, bias_, *Y);
|
||||
} else {
|
||||
ideep::convolution_forward::compute(conv_param, X, filter_, *Y);
|
||||
}
|
||||
|
||||
if (fusion_type_ != FUSION_CONV_RELU && fusion_type_ != FUSION_UNKNOWN) {
|
||||
CAFFE_ENFORCE(
|
||||
Y == &(Input(InputSize() - 1)),
|
||||
"Convolution fusion op: InPlace is enforced for sum fusion.");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
iattr attr_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
ialgo algo_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
float scale_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
int last_input_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
int32_t zero_point_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
ilowp_kind lowp_kind_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
FusionType fusion_type_;
|
||||
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
itensor filter_, bias_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
iscale Y_scales_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
itensor::descriptor cached_X_descriptor_, cached_weights_descriptor_;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
||||
ideep::convolution_forward_params conv_param;
|
||||
|
||||
INPUT_TAGS(INPUT_X, FILTER, BIAS_OR_INPUT_S, INPUT_S);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
class IDEEPInt8ConvReluOp final : public IDEEPInt8ConvOp {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
|
||||
|
||||
IDEEPInt8ConvReluOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPInt8ConvOp(operator_def, ws) {
|
||||
CAFFE_ENFORCE(zero_point_ == 0);
|
||||
last_input_ = BIAS_OR_INPUT_S;
|
||||
attr_ = iattr::fuse_relu();
|
||||
fusion_type_ = FUSION_CONV_RELU;
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPInt8ConvReluOp() override {}
|
||||
};
|
||||
|
||||
class IDEEPInt8ConvSumOp final : public IDEEPInt8ConvOp {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
|
||||
|
||||
IDEEPInt8ConvSumOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPInt8ConvOp(operator_def, ws) {
|
||||
last_input_ = INPUT_S;
|
||||
attr_ = iattr::fuse_sum();
|
||||
fusion_type_ = FUSION_CONV_SUM;
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPInt8ConvSumOp() override {}
|
||||
};
|
||||
|
||||
class IDEEPInt8ConvSumReluOp final : public IDEEPInt8ConvOp {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
|
||||
|
||||
IDEEPInt8ConvSumReluOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPInt8ConvOp(operator_def, ws) {
|
||||
last_input_ = INPUT_S;
|
||||
attr_ = iattr::residual();
|
||||
fusion_type_ = FUSION_CONV_SUM_RELU;
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPInt8ConvSumReluOp() override {}
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8Conv, DNNLOWP, IDEEPInt8ConvOp);
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8ConvRelu, DNNLOWP, IDEEPInt8ConvReluOp);
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8ConvSum, DNNLOWP, IDEEPInt8ConvSumOp);
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8ConvSumRelu, DNNLOWP, IDEEPInt8ConvSumReluOp);
|
||||
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,clang-diagnostic-unused-function)
|
||||
OPERATOR_SCHEMA(Int8ConvSum)
|
||||
.NumInputs(2, 4)
|
||||
.NumOutputs(1)
|
||||
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForConv)
|
||||
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
|
||||
ConvPoolOpBase<CPUContext>::CostInferenceForConv))
|
||||
.AllowInplace({{2, 0}, {3, 0}});
|
||||
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,clang-diagnostic-unused-function)
|
||||
OPERATOR_SCHEMA(Int8ConvSumRelu)
|
||||
.NumInputs(2, 4)
|
||||
.NumOutputs(1)
|
||||
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForConv)
|
||||
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
|
||||
ConvPoolOpBase<CPUContext>::CostInferenceForConv))
|
||||
.AllowInplace({{2, 0}, {3, 0}});
|
||||
|
||||
} // namespace
|
||||
@ -1,43 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPInt8DequantizeOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPInt8DequantizeOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws) {
|
||||
|
||||
if (HasArgument("output_order")) {
|
||||
Y_fmt_ = static_cast<iformat>(
|
||||
this->template GetSingleArgument<int>("output_order",
|
||||
static_cast<int>(iformat::nchw)));
|
||||
}
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPInt8DequantizeOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(0);
|
||||
auto* Y = Output(0);
|
||||
if (Y_fmt_ != iformat::undef) {
|
||||
Y->init(X.get_desc().to_type(idtype::f32).to_format(Y_fmt_));
|
||||
} else {
|
||||
Y->init(X.get_desc().to_type(idtype::f32));
|
||||
}
|
||||
Y->feed_from(X);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
iformat Y_fmt_ {iformat::undef};
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8Dequantize, DNNLOWP, IDEEPInt8DequantizeOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,96 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
|
||||
class IDEEPInt8FullyConnectedOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPInt8FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)),
|
||||
axis_w_(OperatorBase::GetSingleArgument<int32_t>("axis_w", 1)),
|
||||
scale_(this->template GetSingleArgument<float>("Y_scale", 1.0)),
|
||||
zero_point_(
|
||||
this->template GetSingleArgument<int32_t>("Y_zero_point", 0)) {
|
||||
CAFFE_ENFORCE(zero_point_ == 128 || zero_point_ == 0);
|
||||
if (zero_point_ == 0) {
|
||||
Y_data_type_ = idtype::u8;
|
||||
} else {
|
||||
Y_data_type_ = idtype::s8;
|
||||
}
|
||||
Y_scales_ = ConvertScales({scale_});
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPInt8FullyConnectedOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(INPUT);
|
||||
const auto& filter = Input(FILTER);
|
||||
auto* Y = Output(OUTPUT);
|
||||
|
||||
itensor X_in = X;
|
||||
auto X_dims = CanonicalDims(X_in.get_dims(), axis_);
|
||||
if (X_in.get_dims() != X_dims) {
|
||||
X_in.reshape(X_dims);
|
||||
}
|
||||
|
||||
if (cached_X_descriptor_ != X.get_descriptor()) {
|
||||
cached_X_descriptor_ = X.dup_descriptor();
|
||||
Y_.init({{X.get_dim(0), filter.get_dim(0)}, idtype::f32});
|
||||
}
|
||||
|
||||
if (cached_weights_descriptor_ != filter.get_descriptor()) {
|
||||
cached_weights_descriptor_ = filter.dup_descriptor();
|
||||
CAFFE_ENFORCE(filter.get_data_type() == idtype::s8 && filter.has_scale());
|
||||
|
||||
// INT8 FC is not supported so far.
|
||||
filter_ = filter.to_public();
|
||||
auto filter_dims = CanonicalDims(filter_.get_dims(), axis_w_);
|
||||
if (filter_.get_dims() != filter_dims) {
|
||||
filter_.reshape(filter_dims);
|
||||
}
|
||||
|
||||
if (InputSize() > BIAS) {
|
||||
bias_ = Input(BIAS).to_public();
|
||||
}
|
||||
|
||||
Y_.init({{X.get_dim(0), filter.get_dim(0)}, idtype::f32});
|
||||
}
|
||||
|
||||
X_in = X_in.to_public();
|
||||
if (InputSize() > BIAS) {
|
||||
ideep::inner_product_forward::compute(
|
||||
X_in, filter_, bias_, Y_);
|
||||
} else {
|
||||
ideep::inner_product_forward::compute(X_in, filter_, Y_);
|
||||
}
|
||||
Y->init({Y_.get_dims(), Y_data_type_});
|
||||
Y->set_scale(Y_scales_);
|
||||
Y->feed_from(Y_);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
size_t axis_{1};
|
||||
size_t axis_w_{1};
|
||||
float scale_;
|
||||
int32_t zero_point_;
|
||||
|
||||
idtype Y_data_type_;
|
||||
itensor filter_, bias_, Y_;
|
||||
iscale Y_scales_;
|
||||
itensor::descriptor cached_X_descriptor_, cached_weights_descriptor_;
|
||||
|
||||
INPUT_TAGS(INPUT, FILTER, BIAS);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8FC, DNNLOWP, IDEEPInt8FullyConnectedOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,149 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPInt8GivenTensorFillOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPInt8GivenTensorFillOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
zero_point_(
|
||||
this->template GetSingleArgument<int32_t>("Y_zero_point", 0)),
|
||||
shape_(this->template GetRepeatedArgument<itensor::dim>("shape")) {
|
||||
CAFFE_ENFORCE(shape_.size() == 4 || shape_.size() == 2 || shape_.size() == 1);
|
||||
    CAFFE_ENFORCE(zero_point_ == 0 || zero_point_ == 128,
        "Not support zero point");
    if (HasArgument("Y_scales")) {
      scales_ = this->template GetRepeatedArgument<float>("Y_scales");
    } else {
      auto scale = (this->template GetSingleArgument<float>("Y_scale", 1.0));
      scales_ = {scale};
    }

    if (shape_.size() == 4) {
      fmt_ = iformat::nhwc;
      auto C = shape_[3];
      shape_[3] = shape_[2];
      shape_[2] = shape_[1];
      shape_[1] = C;
    } else if (shape_.size() == 2) {
      fmt_ = iformat::nc;
    } else {
      fmt_ = iformat::x;
    }

    auto source_values = this->template GetSingleArgument<string>("values", "");
    auto src_size = source_values.size();
    values_.Resize(src_size);
    uint8_t* values_data = values_.template mutable_data<uint8_t>();
    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
    for (int i = 0; i < src_size; i++) {
      values_data[i] = static_cast<uint8_t>(source_values[i]);
    }
  }

  bool RunOnDevice() override {
    auto* output = Output(OUTPUT);
    auto data_type = zero_point_ == 0 ? idtype::u8 : idtype::s8;

    output->init({shape_, data_type});
    TORCH_DCHECK_EQ(output->get_nelems(), values_.numel())
        << "output size: " << output->get_nelems()
        << " given size: " << values_.numel();

    if (output->get_nelems() > 0) {
      itensor temp_ten;
      temp_ten.init({shape_, data_type, fmt_});
      auto* data_u8 = static_cast<uint8_t*>(temp_ten.get_data_handle());
      const auto* values_data = values_.template data<uint8_t>();
      context_.template CopySameDevice<uint8_t>(
          temp_ten.get_nelems(), values_data, data_u8);

      // Shift quantized data to s8 per zero point
      if (zero_point_ == 128) {
        auto* data_s8 = static_cast<int8_t*>(temp_ten.get_data_handle());
        auto nelems = temp_ten.get_nelems();
        for (int i = 0; i < nelems; i++) {
          data_s8[i] = data_s8[i] - zero_point_;
        }
      }

      output->feed_from(temp_ten);
    }

    output->set_scale(ConvertScales(scales_));
    return true;
  }

 private:
  iscale scales_;
  int32_t zero_point_;
  iformat fmt_;
  itensor::dims shape_;
  Tensor values_{CPU};

  OUTPUT_TAGS(OUTPUT);
};

class IDEEPInt8GivenIntTensorFillOp final : public IDEEPOperator {
 public:
  USE_IDEEP_DEF_ALIASES();
  USE_IDEEP_OPERATOR_FUNCTIONS();

  IDEEPInt8GivenIntTensorFillOp(const OperatorDef& operator_def, Workspace* ws)
      : IDEEPOperator(operator_def, ws),
        zero_point_(
            this->template GetSingleArgument<int32_t>("Y_zero_point", 0)),
        shape_(this->template GetRepeatedArgument<itensor::dim>("shape")) {
    CAFFE_ENFORCE(zero_point_ == 0, "Not support zero point");
    if (HasArgument("Y_scales")) {
      scales_ = this->template GetRepeatedArgument<float>("Y_scales");
    } else {
      auto scale = (this->template GetSingleArgument<float>("Y_scale", 1.0));
      scales_ = {scale};
    }

    auto source_values = this->template GetRepeatedArgument<int32_t>("values");
    auto src_size = source_values.size();
    values_.Resize(src_size);
    auto* values_data = values_.template mutable_data<int32_t>();
    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
    for (int i = 0; i < src_size; i++) {
      values_data[i] = static_cast<int32_t>(source_values[i]);
    }
  }

  bool RunOnDevice() override {
    auto* output = Output(OUTPUT);
    output->init({shape_, idtype::s32});
    output->set_scale(ConvertScales(scales_));
    TORCH_DCHECK_EQ(output->get_nelems(), values_.numel())
        << "output size: " << output->get_nelems()
        << " given size: " << values_.numel();

    if (output->get_nelems() > 0) {
      auto* data = static_cast<int32_t*>(output->get_data_handle());
      const int32_t* values_data = values_.template data<int32_t>();
      context_.template CopySameDevice<int32_t>(
          output->get_nelems(), values_data, data);
    }
    return true;
  }

 private:
  iscale scales_;
  int32_t zero_point_;
  itensor::dims shape_;
  Tensor values_{CPU};

  OUTPUT_TAGS(OUTPUT);
};

REGISTER_IDEEP_OPERATOR(Int8GivenTensorFill, IDEEPInt8GivenTensorFillOp);
REGISTER_IDEEP_OPERATOR(Int8GivenIntTensorFill, IDEEPInt8GivenIntTensorFillOp);

} // namespace
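The zero-point handling in RunOnDevice above is easiest to see in isolation: with Y_zero_point == 128 the stored u8 values are reinterpreted as s8 by subtracting 128, so 0 maps to -128, 128 to 0, and 255 to 127. A minimal standalone sketch of that shift (the function name and the use of std::vector are illustrative, not part of the operator):

#include <cstdint>
#include <vector>

// Reinterpret u8 data quantized with zero point 128 as s8 data, mirroring the
// in-place loop in IDEEPInt8GivenTensorFillOp::RunOnDevice.
std::vector<int8_t> ShiftU8ToS8(const std::vector<uint8_t>& src) {
  std::vector<int8_t> dst(src.size());
  for (size_t i = 0; i < src.size(); ++i) {
    dst[i] = static_cast<int8_t>(static_cast<int32_t>(src[i]) - 128);
  }
  return dst;
}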
@ -1,65 +0,0 @@
|
||||
#include <caffe2/ideep/operators/conv_pool_base_op.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPInt8PoolOp final : public IDEEPConvPoolOpBase {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
|
||||
|
||||
IDEEPInt8PoolOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPConvPoolOpBase(operator_def, ws) {
|
||||
CAFFE_ENFORCE(
|
||||
(dilation_h() == 1) && (dilation_w() == 1),
|
||||
"Pooling op does not support dilation right now.");
|
||||
if (!global_pooling_) {
|
||||
CAFFE_ENFORCE(
|
||||
pad_t() < kernel_h() && pad_b() < kernel_h() &&
|
||||
pad_l() < kernel_w() && pad_r() < kernel_w(),
|
||||
"Pad should be smaller than kernel.");
|
||||
}
|
||||
|
||||
// Figure out the pooling descriptor.
|
||||
if (operator_def.type().substr(0, 11) == "Int8MaxPool") {
|
||||
algo_ = ialgo::pooling_max;
|
||||
} else if (operator_def.type().substr(0, 15) == "Int8AveragePool") {
|
||||
algo_ = ialgo::pooling_avg_exclude_padding;
|
||||
} else {
|
||||
LOG(FATAL) << "Unsupported pooling method: " << operator_def.type();
|
||||
}
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPInt8PoolOp() override {}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override {
|
||||
auto& X = Input(INPUT);
|
||||
auto* Y = Output(OUTPUT);
|
||||
auto Y_dims = CalcOutputDims(X, X.get_dim(1));
|
||||
|
||||
if (cached_X_descriptor_ != X.get_descriptor()) {
|
||||
cached_X_descriptor_ = X.dup_descriptor();
|
||||
}
|
||||
|
||||
ideep::pooling_forward::compute(X, Y_dims, *Y,
|
||||
{stride_.begin(), stride_.end()},
|
||||
{kernel_.begin(), kernel_.end()},
|
||||
pad_tl(), pad_br(), algo_,
|
||||
iprop::forward_inference);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
ialgo algo_;
|
||||
itensor::descriptor cached_X_descriptor_;
|
||||
|
||||
INPUT_TAGS(INPUT);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8MaxPool, DNNLOWP, IDEEPInt8PoolOp);
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8AveragePool, DNNLOWP, IDEEPInt8PoolOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,61 +0,0 @@
#include <caffe2/ideep/ideep_utils.h>

using namespace caffe2;

namespace {

class IDEEPInt8QuantizeOp final : public IDEEPOperator {
 public:
  USE_IDEEP_DEF_ALIASES();
  USE_IDEEP_OPERATOR_FUNCTIONS();

  IDEEPInt8QuantizeOp(const OperatorDef& operator_def, Workspace* ws)
      : IDEEPOperator(operator_def, ws),
        scale_(this->template GetSingleArgument<float>("Y_scale", 1.0)),
        zero_point_(
            this->template GetSingleArgument<int32_t>("Y_zero_point", 0)) {

    if (HasArgument("output_order")) {
      Y_fmt_ = static_cast<iformat>(
          this->template GetSingleArgument<int>("output_order",
                                                static_cast<int>(iformat::nchw)));
    }

    CAFFE_ENFORCE(zero_point_ == 0 || zero_point_ == 128,
        "Not support this zero point");
    Y_data_type_ = zero_point_ == 0 ? idtype::u8 : idtype::s8;
    Y_scales_ = ConvertScales({scale_});
  }
  // NOLINTNEXTLINE(modernize-use-equals-default)
  ~IDEEPInt8QuantizeOp() override {}

  bool RunOnDevice() override {
    const auto& X = Input(0);
    CAFFE_ENFORCE(X.get_data_type() == idtype::f32, "Not support data type");

    auto* Y = Output(0);
    if (Y_fmt_ != iformat::undef) {
      Y->init(X.get_desc().to_type(Y_data_type_).to_format(Y_fmt_));
    } else {
      Y->init(X.get_desc().to_type(Y_data_type_));
    }
    Y->set_scale(Y_scales_);
    Y->feed_from(X);

    return true;
  }

 private:
  float scale_;
  int32_t zero_point_;
  iscale Y_scales_;
  idtype Y_data_type_;
  iformat Y_fmt_ {iformat::undef};

  INPUT_TAGS(INPUT0);
  OUTPUT_TAGS(OUTPUT);
};

REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8Quantize, DNNLOWP, IDEEPInt8QuantizeOp);

} // namespace
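For reference, the affine mapping this operator configures (Y_scale together with a zero point of 0 for u8 or 128 for s8) can be sketched outside of ideep as follows. This is a simplified reference model of the quantization step, not the code path that feed_from actually executes, and the rounding mode may differ:

#include <algorithm>
#include <cmath>
#include <cstdint>

// q = clamp(round(x / scale), 0, 255) for Y_zero_point == 0 (u8 output).
// For Y_zero_point == 128 the same quantity is stored as s8 by subtracting 128.
uint8_t QuantizeU8(float x, float scale) {
  const float q = std::nearbyint(x / scale);
  return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, q)));
}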
@ -1,43 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPInt8ReluOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPInt8ReluOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws), alpha_(0.0) {
|
||||
// Figure out the Relu descriptor.
|
||||
if (operator_def.type().substr(0, 8) == "Int8Relu") {
|
||||
alpha_ = 0.0;
|
||||
} else {
|
||||
LOG(FATAL) << "Unsupported Relu method: " << operator_def.type();
|
||||
}
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPInt8ReluOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(INPUT);
|
||||
auto* Y = Output(OUTPUT);
|
||||
|
||||
ideep::eltwise_forward::compute(
|
||||
X, *Y, ialgo::eltwise_relu, iprop::forward_inference, alpha_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
float alpha_;
|
||||
|
||||
INPUT_TAGS(INPUT);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR_WITH_ENGINE(Int8Relu, DNNLOWP, IDEEPInt8ReluOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,74 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
#include <caffe2/queue/blobs_queue.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPCreateBlobsQueueOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPCreateBlobsQueueOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
ws_(ws),
|
||||
name(operator_def.output().Get(0)) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto capacity = GetSingleArgument("capacity", 1);
|
||||
const auto numBlobs = GetSingleArgument("num_blobs", 1);
|
||||
const auto enforceUniqueName =
|
||||
GetSingleArgument("enforce_unique_name", false);
|
||||
const auto fieldNames =
|
||||
OperatorBase::template GetRepeatedArgument<std::string>("field_names");
|
||||
CAFFE_ENFORCE_EQ(this->OutputSize(), 1);
|
||||
auto queuePtr = OperatorBase::Outputs()[0]
|
||||
->template GetMutable<std::shared_ptr<BlobsQueue>>();
|
||||
|
||||
CAFFE_ENFORCE(queuePtr);
|
||||
*queuePtr = std::make_shared<BlobsQueue>(
|
||||
ws_, name, capacity, numBlobs, enforceUniqueName, fieldNames);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
Workspace* ws_{nullptr};
|
||||
const std::string name;
|
||||
};
|
||||
|
||||
class IDEEPSafeEnqueueBlobsOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPSafeEnqueueBlobsOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto queue =
|
||||
OperatorBase::Inputs()[0]->template Get<std::shared_ptr<BlobsQueue>>();
|
||||
CAFFE_ENFORCE(queue);
|
||||
auto size = queue->getNumBlobs();
|
||||
CAFFE_ENFORCE(
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
OutputSize() == size + 1,
|
||||
"Expected " + caffe2::to_string(size + 1) + ", " +
|
||||
" got: " + caffe2::to_string(size));
|
||||
bool status = queue->blockingWrite(OperatorBase::Outputs());
|
||||
|
||||
auto st = OperatorBase::Output<TensorCPU>(1, CPU);
|
||||
st->Resize();
|
||||
auto stat = st->template mutable_data<bool>();
|
||||
stat[0] = !status;
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(CreateBlobsQueue, IDEEPCreateBlobsQueueOp);
|
||||
SHOULD_NOT_DO_GRADIENT(IDEEPCreateBlobsQueueOp);
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(SafeEnqueueBlobs, IDEEPSafeEnqueueBlobsOp);
|
||||
SHOULD_NOT_DO_GRADIENT(IDEEPSafeEnqueueBlobsOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,91 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPReluOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPReluOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws), alpha_(0.0) {
|
||||
// Figure out the Relu descriptor.
|
||||
if (operator_def.type().substr(0, 4) == "Relu") {
|
||||
alpha_ = 0.0;
|
||||
} else if (operator_def.type().substr(0, 9) == "LeakyRelu") {
|
||||
if (HasArgument("alpha")) {
|
||||
alpha_ = static_cast<float>(
|
||||
OperatorBase::GetSingleArgument<float>("alpha", 0.01));
|
||||
}
|
||||
} else {
|
||||
LOG(FATAL) << "Unsupported Relu method: " << operator_def.type();
|
||||
}
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPReluOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(INPUT);
|
||||
auto* Y = Output(OUTPUT);
|
||||
|
||||
ideep::eltwise_forward::compute(
|
||||
X, *Y, ialgo::eltwise_relu, iprop::forward_training, alpha_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
float alpha_;
|
||||
|
||||
INPUT_TAGS(INPUT);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
class IDEEPReluGradientOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPReluGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws), alpha_(0.0) {
|
||||
// Figure out the Relu descriptor.
|
||||
if (operator_def.type().substr(0, 12) == "ReluGradient") {
|
||||
alpha_ = 0.0;
|
||||
} else if (operator_def.type().substr(0, 17) == "LeakyReluGradient") {
|
||||
if (HasArgument("alpha")) {
|
||||
alpha_ = static_cast<float>(
|
||||
OperatorBase::GetSingleArgument<float>("alpha", 0.01));
|
||||
}
|
||||
} else {
|
||||
LOG(FATAL) << "Unsupported Relu method: " << operator_def.type();
|
||||
}
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPReluGradientOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& Y = Input(OUTPUT);
|
||||
const auto& dY = Input(OUTPUT_GRAD);
|
||||
auto* dX = Output(INPUT_GRAD);
|
||||
|
||||
ideep::eltwise_backward::compute(Y, dY, *dX, ialgo::eltwise_relu, alpha_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
float alpha_;
|
||||
|
||||
INPUT_TAGS(OUTPUT, OUTPUT_GRAD);
|
||||
OUTPUT_TAGS(INPUT_GRAD);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(Relu, IDEEPReluOp);
|
||||
REGISTER_IDEEP_OPERATOR(ReluGradient, IDEEPReluGradientOp);
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(LeakyRelu, IDEEPReluOp);
|
||||
REGISTER_IDEEP_OPERATOR(LeakyReluGradient, IDEEPReluGradientOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,134 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
// Takes a shape and data tensor and reshapes it
|
||||
class IDEEPReshapeOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPReshapeOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
new_shape_(OperatorBase::GetRepeatedArgument<itensor::dim>("shape")) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
ideep::tensor::dims actual_new_shape = new_shape_;
|
||||
if (InputSize() == 2) {
|
||||
CAFFE_ENFORCE(
|
||||
!OperatorBase::HasArgument("shape"),
|
||||
"New shape is specified by the input blob, do not pass in "
|
||||
"the argument `shape`.");
|
||||
|
||||
// shape info live on CPU
|
||||
auto& shape = OperatorBase::Input<TensorCPU>(1, CPU);
|
||||
CAFFE_ENFORCE(shape.ndim() == 1, "Shape should be 1-D");
|
||||
actual_new_shape.reserve(shape.size());
|
||||
if (shape.template IsType<int>()) {
|
||||
const int* shape_data = shape.template data<int>();
|
||||
actual_new_shape.assign(shape_data, shape_data + shape.size());
|
||||
} else if (shape.template IsType<int64_t>()) {
|
||||
const int64_t* shape_data = shape.template data<int64_t>();
|
||||
for (int i = 0; i < shape.size(); ++i) {
|
||||
actual_new_shape.push_back(static_cast<int>(shape_data[i]));
|
||||
}
|
||||
} else {
|
||||
CAFFE_THROW(
|
||||
"IDEEP reshape only supports shape data in int32_t or int64_t");
|
||||
}
|
||||
} else {
|
||||
CAFFE_ENFORCE(
|
||||
OperatorBase::HasArgument("shape"), "Argument `shape` is missing.");
|
||||
}
|
||||
|
||||
auto& input = Input(0);
|
||||
// Copy over the dimensions for those that are specified zero.
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
for (int i = 0; i < actual_new_shape.size() && i < input.ndims(); ++i) {
|
||||
if (actual_new_shape[i] == 0) {
|
||||
actual_new_shape[i] = input.get_dim(i);
|
||||
}
|
||||
}
|
||||
|
||||
// Checks if the new shape is valid and fills in the missing dimension
|
||||
// specified by -1.
|
||||
// NOTE: At most one dimension can be -1.
|
||||
auto total_size = input.get_nelems();
|
||||
int size = 1;
|
||||
int unknown_idx = -1;
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
for (int i = 0; i < actual_new_shape.size(); ++i) {
|
||||
const auto dim = actual_new_shape[i];
|
||||
if (dim == -1) {
|
||||
CAFFE_ENFORCE(
|
||||
unknown_idx == -1,
|
||||
"Argument `shape` has more than one missing dimension.");
|
||||
unknown_idx = i;
|
||||
} else {
|
||||
size *= dim;
|
||||
}
|
||||
}
|
||||
if (size == 0 && total_size != 0) {
|
||||
CAFFE_THROW(
|
||||
"Can not reshape a non-zero size (",
|
||||
total_size,
|
||||
") tensor to zero size.");
|
||||
}
|
||||
|
||||
if (unknown_idx != -1) {
|
||||
CAFFE_ENFORCE_NE(
|
||||
size,
|
||||
0,
|
||||
"New shape at dim ",
|
||||
unknown_idx,
|
||||
" can not be inferred since new size is zero.");
|
||||
CAFFE_ENFORCE(
|
||||
total_size % size == 0,
|
||||
"Argument `shape` does not agree with the input data.",
|
||||
" (",
|
||||
total_size,
|
||||
" vs ",
|
||||
size,
|
||||
")");
|
||||
actual_new_shape[unknown_idx] = total_size / size;
|
||||
} else {
|
||||
CAFFE_ENFORCE_EQ(
|
||||
total_size,
|
||||
size,
|
||||
"Argument `shape` does not agree with the input data.",
|
||||
" (",
|
||||
total_size,
|
||||
" != ",
|
||||
size,
|
||||
")");
|
||||
}
|
||||
|
||||
// Write the original shape to the second output.
|
||||
// shape info live on CPU
|
||||
TensorCPU* old_shape = OperatorBase::Output<TensorCPU>(1, CPU);
|
||||
old_shape->Resize(input.ndims());
|
||||
int* old_shape_data = old_shape->template mutable_data<int>();
|
||||
for (int i = 0; i < input.ndims(); ++i) {
|
||||
old_shape_data[i] = input.get_dim(i);
|
||||
}
|
||||
|
||||
auto* output = Output(0);
|
||||
if (output != &input) {
|
||||
// If we are not doing in-place computation, a copy is needed.
|
||||
output->reinit_like(input);
|
||||
ideep::direct_copy::compute(input, *output);
|
||||
}
|
||||
|
||||
output->reshape(actual_new_shape);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
ideep::tensor::dims new_shape_;
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(Reshape, IDEEPReshapeOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,70 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
// RecordShapeOp records the shape of the input tensor to a vector of int. You
|
||||
// mostly don't need this operator explicitly, and it is mostly used in the
|
||||
// autodiff process.
|
||||
class IDEEPShapeOp : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPShapeOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
axes_(OperatorBase ::GetRepeatedArgument<int>("axes")) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
int numDims = 0;
|
||||
int numAxes = axes_.size();
|
||||
vector<int64_t> dims;
|
||||
const char* data_dims = nullptr;
|
||||
auto* output = OperatorBase::Output<Tensor>(OUTPUT, CPU);
|
||||
|
||||
if (OperatorBase::InputBlob(DATA).template IsType<itensor>()) {
|
||||
auto& data = Input(DATA);
|
||||
numDims = data.ndims();
|
||||
auto idims = data.get_dims();
|
||||
dims.assign(idims.begin(), idims.end());
|
||||
data_dims = reinterpret_cast<const char*>(dims.data());
|
||||
} else {
|
||||
auto& data = OperatorBase::Input<Tensor>(DATA, CPU);
|
||||
numDims = data.dim();
|
||||
data_dims = reinterpret_cast<const char*>(data.sizes().data());
|
||||
}
|
||||
|
||||
if (numAxes == 0) {
|
||||
output->Resize(numDims);
|
||||
int64_t* output_data = output->template mutable_data<int64_t>();
|
||||
context_.CopyBytesSameDevice(
|
||||
numDims * sizeof(int64_t), data_dims, output_data);
|
||||
return true;
|
||||
}
|
||||
|
||||
output->Resize(numAxes);
|
||||
auto out = reinterpret_cast<char*>(output->template mutable_data<int64_t>());
|
||||
for (int i = 0; i < numAxes; i++) {
|
||||
auto axis = axes_[i];
|
||||
CAFFE_ENFORCE_LT(axis, numDims, "Axis out of range");
|
||||
CAFFE_ENFORCE_GE(axis, 0, "Each axis should be non-negative");
|
||||
context_.CopyBytesSameDevice(
|
||||
sizeof(int64_t), data_dims + axis * sizeof(int64_t), out);
|
||||
out += sizeof(int64_t);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
vector<int> axes_;
|
||||
|
||||
INPUT_TAGS(DATA);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(Shape, IDEEPShapeOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,64 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPSigmoidOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPSigmoidOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws) {
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPSigmoidOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(INPUT);
|
||||
auto* Y = Output(OUTPUT);
|
||||
|
||||
ideep::eltwise_forward::compute(
|
||||
X, *Y, ialgo::eltwise_logistic, iprop::forward_training);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
INPUT_TAGS(INPUT);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
class IDEEPSigmoidGradientOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPSigmoidGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws) {
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPSigmoidGradientOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& Y = Input(OUTPUT);
|
||||
const auto& dY = Input(OUTPUT_GRAD);
|
||||
auto* dX = Output(INPUT_GRAD);
|
||||
|
||||
ideep::eltwise_backward::compute(Y, dY, *dX, ialgo::eltwise_logistic);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
INPUT_TAGS(OUTPUT, OUTPUT_GRAD);
|
||||
OUTPUT_TAGS(INPUT_GRAD);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(Sigmoid, IDEEPSigmoidOp);
|
||||
REGISTER_IDEEP_OPERATOR(SigmoidGradient, IDEEPSigmoidGradientOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,110 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPSpatialBNOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPSpatialBNOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
is_test_(OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)),
|
||||
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
|
||||
momentum_(OperatorBase::GetSingleArgument<float>("momentum", 0.9)) {
|
||||
CAFFE_ENFORCE(
|
||||
(is_test_ && OutputSize() > OUTPUT)
|
||||
|| (!is_test_ && OutputSize() > SAVED_VAR));
|
||||
CAFFE_ENFORCE_GT(epsilon_, 0);
|
||||
CAFFE_ENFORCE_GE(momentum_, 0);
|
||||
CAFFE_ENFORCE_LE(momentum_, 1);
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPSpatialBNOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(INPUT);
|
||||
const auto& scale = Input(SCALE);
|
||||
const auto& bias = Input(BIAS);
|
||||
auto* Y = Output(OUTPUT);
|
||||
|
||||
TORCH_DCHECK_EQ(scale.ndims(), 1);
|
||||
TORCH_DCHECK_EQ(bias.ndims(), 1);
|
||||
TORCH_DCHECK_EQ(scale.get_dim(0), X.get_dim(1));
|
||||
TORCH_DCHECK_EQ(bias.get_dim(0), X.get_dim(1));
|
||||
|
||||
if (is_test_) {
|
||||
const auto& est_mean = Input(EST_MEAN);
|
||||
const auto& est_var = Input(EST_VAR);
|
||||
auto X_ = X.get_data_type() != idtype::f32 ? X.dequantize() : X;
|
||||
ideep::batch_normalization_forward_inference::compute(
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
|
||||
X_, est_mean, est_var, scale, bias, *Y, epsilon_);
|
||||
} else {
|
||||
auto* saved_mean = Output(SAVED_MEAN);
|
||||
auto* saved_var = Output(SAVED_VAR);
|
||||
auto* running_mean = Output(RUNNING_MEAN);
|
||||
auto* running_var = Output(RUNNING_VAR);
|
||||
ideep::batch_normalization_forward_training::compute(
|
||||
X, scale, bias, *Y, *saved_mean, *saved_var,
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
|
||||
*running_mean, *running_var, momentum_, epsilon_);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
bool is_test_;
|
||||
double epsilon_;
|
||||
double momentum_;
|
||||
|
||||
INPUT_TAGS(INPUT, SCALE, BIAS, EST_MEAN, EST_VAR);
|
||||
OUTPUT_TAGS(OUTPUT, RUNNING_MEAN, RUNNING_VAR, SAVED_MEAN, SAVED_VAR);
|
||||
};
|
||||
|
||||
class IDEEPSpatialBNGradientOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPSpatialBNGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)) {
|
||||
CAFFE_ENFORCE(InputSize() > SAVED_VAR);
|
||||
CAFFE_ENFORCE(OutputSize() > BIAS_GRAD);
|
||||
}
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPSpatialBNGradientOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(INPUT);
|
||||
const auto& scale = Input(SCALE);
|
||||
const auto& dY = Input(OUTPUT_GRAD);
|
||||
const auto& saved_mean = Input(SAVED_MEAN);
|
||||
const auto& saved_var = Input(SAVED_VAR);
|
||||
auto* dX = Output(INPUT_GRAD);
|
||||
auto* dscale = Output(SCALE_GRAD);
|
||||
auto* dbias = Output(BIAS_GRAD);
|
||||
|
||||
ideep::batch_normalization_backward::compute(
|
||||
X, saved_mean, saved_var, dY, scale,
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
|
||||
*dX, *dscale, *dbias, epsilon_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
double epsilon_;
|
||||
|
||||
INPUT_TAGS(INPUT, SCALE, OUTPUT_GRAD, SAVED_MEAN, SAVED_VAR);
|
||||
OUTPUT_TAGS(INPUT_GRAD, SCALE_GRAD, BIAS_GRAD);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(SpatialBN, IDEEPSpatialBNOp);
|
||||
REGISTER_IDEEP_OPERATOR(SpatialBNGradient, IDEEPSpatialBNGradientOp)
|
||||
|
||||
} // namespace
|
||||
@ -1,36 +0,0 @@
|
||||
#include <caffe2/ideep/ideep_utils.h>
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class IDEEPTransposeOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPTransposeOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws),
|
||||
axes_(this->template GetRepeatedArgument<int>("axes")){ }
|
||||
// NOLINTNEXTLINE(modernize-use-equals-default)
|
||||
~IDEEPTransposeOp() override {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(INPUT);
|
||||
auto* Y = Output(OUTPUT);
|
||||
|
||||
Y->transpose_from(X.to_public(nullptr, X.get_data_type()), axes_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<int> axes_;
|
||||
|
||||
INPUT_TAGS(INPUT);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(Transpose, IDEEPTransposeOp);
|
||||
|
||||
} // namespace
|
||||
@ -1,133 +0,0 @@
|
||||
#include "caffe2/operators/utility_ops.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/ideep/ideep_utils.h"
|
||||
|
||||
using namespace caffe2;
|
||||
|
||||
namespace {
|
||||
|
||||
class CopyCPUToIDEEPOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_SIMPLE_IDEEP_CTOR_DTOR(CopyCPUToIDEEPOp);
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = OperatorBase::Input<Tensor>(0, CPU);
|
||||
auto* Y = OperatorBase::OutputBlob(0);
|
||||
itensor::dims src_dims(X.sizes().begin(), X.sizes().end());
|
||||
if (!(Y->template IsType<itensor>() &&
|
||||
Y->Get<itensor>().get_data_type() == itensor::data_type::f32) ||
|
||||
Y->Get<itensor>().get_dims() != src_dims) {
|
||||
Y->Reset(new itensor());
|
||||
Y->GetMutable<itensor>()->resize(src_dims, itensor::data_type::f32);
|
||||
}
|
||||
Y->GetMutable<itensor>()->feed_from(
|
||||
src_dims, itensor::data_type::f32, X.raw_data());
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
class IDEEPCopyOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_SIMPLE_IDEEP_CTOR_DTOR(IDEEPCopyOp);
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = OperatorBase::Input<itensor>(0);
|
||||
auto* Y = Output(0);
|
||||
if (X != *Y) {
|
||||
Y->reinit_like(X);
|
||||
ideep::direct_copy::compute(X, *Y);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
class CopyIDEEPToCPUOp final : public IDEEPOperator {
|
||||
public:
|
||||
USE_SIMPLE_IDEEP_CTOR_DTOR(CopyIDEEPToCPUOp);
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
bool RunOnDevice() override {
|
||||
const auto& input_blob = OperatorBase::InputBlob(0);
|
||||
if (BlobIsTensorType(input_blob, CPU)) {
|
||||
VLOG(2) << "Directing sharing of TensorCPU";
|
||||
const auto& X = OperatorBase::Input<Tensor>(0, CPU);
|
||||
OutputTensorCopyFrom(0, at::device(CPU), X);
|
||||
} else {
|
||||
const auto& X = OperatorBase::Input<itensor>(0);
|
||||
if (X.get_data_type() == itensor::data_type::f32) {
|
||||
std::vector<int64_t> dims;
|
||||
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
|
||||
for (int i = 0; i < X.get_dims().size(); ++i) {
|
||||
dims.push_back(X.get_dims()[i]);
|
||||
}
|
||||
auto* Y =
|
||||
OperatorBase::OutputTensor(0, dims, at::dtype<float>().device(CPU));
|
||||
itensor temp_ten(
|
||||
X.get_desc().to_default_format(),
|
||||
Y->template mutable_data<float>());
|
||||
X.reorder_to(temp_ten);
|
||||
} else {
|
||||
CAFFE_THROW("Unsupported ideep type: ",
|
||||
static_cast<int>(X.get_data_type()));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
class IDEEPWeightedSumOp : public IDEEPOperator {
|
||||
public:
|
||||
USE_IDEEP_DEF_ALIASES();
|
||||
USE_IDEEP_OPERATOR_FUNCTIONS();
|
||||
|
||||
IDEEPWeightedSumOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: IDEEPOperator(operator_def, ws) {}
|
||||
bool RunOnDevice() override {
|
||||
CAFFE_ENFORCE_EQ(InputSize() % 2, 0);
|
||||
auto ndims = Input(0).ndims();
|
||||
auto nelems = Input(0).get_nelems();
|
||||
auto w_nelems = Input(1).get_nelems();
|
||||
CAFFE_ENFORCE_GT(nelems, 0);
|
||||
CAFFE_ENFORCE_EQ(w_nelems, 1);
|
||||
auto* output = Output(0);
|
||||
std::vector<float> scales;
|
||||
scales.reserve(InputSize() / 2);
|
||||
std::vector<itensor> inputs;
|
||||
inputs.reserve(InputSize() / 2);
|
||||
for (int i = 0; i < InputSize(); i += 2) {
|
||||
auto& X = Input(i);
|
||||
CAFFE_ENFORCE(X.ndims() == ndims);
|
||||
CAFFE_ENFORCE(X.get_nelems() == nelems);
|
||||
CAFFE_ENFORCE(Input(i + 1).get_nelems() == w_nelems);
|
||||
inputs.push_back(X);
|
||||
auto scale = static_cast<float *>(Input(i + 1).get_data_handle());
|
||||
scales.push_back(scale[0]);
|
||||
}
|
||||
|
||||
ideep::sum::compute(scales, inputs, *output);
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_IDEEP_OPERATOR(CopyCPUToIDEEP, CopyCPUToIDEEPOp);
|
||||
REGISTER_IDEEP_OPERATOR(CopyIDEEPToCPU, CopyIDEEPToCPUOp);
|
||||
REGISTER_IDEEP_OPERATOR(Copy, IDEEPCopyOp);
|
||||
REGISTER_IDEEP_OPERATOR(WeightedSum, IDEEPWeightedSumOp);
|
||||
|
||||
// NOLINTNEXTLINE(clang-diagnostic-unused-function,cppcoreguidelines-avoid-non-const-global-variables)
|
||||
OPERATOR_SCHEMA(CopyCPUToIDEEP)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(1)
|
||||
.Input(0, "cpu_blob", "The input TensorCPU to copy")
|
||||
.Output(0, "ideep_blob", "The output IDEEP tensort to copy to");
|
||||
// NOLINTNEXTLINE(clang-diagnostic-unused-function,cppcoreguidelines-avoid-non-const-global-variables)
|
||||
OPERATOR_SCHEMA(CopyIDEEPToCPU)
|
||||
.NumInputs(1)
|
||||
.NumOutputs(1)
|
||||
.Input(0, "ideep_blob", "The input IDEEP tensort to copy")
|
||||
.Output(0, "cpu_blob", "The output TensorCPU to copy to");
|
||||
|
||||
} // namespace
|
||||
@ -1,171 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
#include <random>
|
||||
|
||||
#include <caffe2/core/context.h>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class IDEEPContext final : public BaseContext {
|
||||
public:
|
||||
typedef std::mt19937 rand_gen_type;
|
||||
IDEEPContext() : random_seed_(RandomNumberSeed()) {}
|
||||
explicit IDEEPContext(const DeviceOption& option)
|
||||
: random_seed_(
|
||||
option.has_random_seed() ? option.random_seed()
|
||||
: RandomNumberSeed()) {
|
||||
CAFFE_ENFORCE_EQ(option.device_type(), PROTO_IDEEP);
|
||||
}
|
||||
explicit IDEEPContext(const at::Device& device)
|
||||
: IDEEPContext(DeviceToOption(device)) {}
|
||||
|
||||
~IDEEPContext() noexcept override {}
|
||||
|
||||
inline void SwitchToDevice(int64_t /*stream_id*/) override {}
|
||||
using BaseContext::SwitchToDevice;
|
||||
|
||||
inline void WaitEvent(const Event& ev) override {
|
||||
ev.Wait(IDEEP, this);
|
||||
}
|
||||
|
||||
inline void Record(Event* ev, const char* err_msg = nullptr) const override {
|
||||
CAFFE_ENFORCE(ev, "Event must not be null.");
|
||||
ev->Record(IDEEP, this, err_msg);
|
||||
}
|
||||
|
||||
|
||||
inline void FinishDeviceComputation() override {}
|
||||
|
||||
inline rand_gen_type& RandGenerator() {
|
||||
if (!random_generator_.get()) {
|
||||
random_generator_.reset(new rand_gen_type(random_seed_));
|
||||
}
|
||||
return *random_generator_.get();
|
||||
}
|
||||
|
||||
inline static at::DataPtr New(size_t nbytes) {
|
||||
return GetAllocator(CPU)->allocate(nbytes);
|
||||
}
|
||||
|
||||
void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
|
||||
if (nbytes == 0) {
|
||||
return;
|
||||
}
|
||||
CAFFE_ENFORCE(src);
|
||||
CAFFE_ENFORCE(dst);
|
||||
memcpy(dst, src, nbytes);
|
||||
}
|
||||
|
||||
void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
|
||||
CopyBytesSameDevice(nbytes, src, dst);
|
||||
}
|
||||
|
||||
void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
|
||||
CopyBytesSameDevice(nbytes, src, dst);
|
||||
}
|
||||
|
||||
bool SupportsNonFundamentalTypes() const override {
|
||||
// IDEEP meta copy is OK
|
||||
return true;
|
||||
}
|
||||
|
||||
// Two copy functions that deals with cross-device copies.
|
||||
template <class SrcContext, class DstContext>
|
||||
inline void CopyBytes(size_t nbytes, const void* src, void* dst);
|
||||
|
||||
template <typename T, class SrcContext, class DstContext>
|
||||
inline void Copy(size_t n, const T* src, T* dst) {
|
||||
if (c10::guts::is_fundamental<T>::value) {
|
||||
CopyBytes<SrcContext, DstContext>(
|
||||
n * sizeof(T),
|
||||
static_cast<const void*>(src),
|
||||
static_cast<void*>(dst));
|
||||
} else {
|
||||
for (const auto i : c10::irange(n)) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class SrcContext, class DstContext>
|
||||
inline void
|
||||
CopyItems(const TypeMeta meta, size_t n, const void* src, void* dst) {
|
||||
if (meta.copy()) {
|
||||
meta.copy()(src, dst, n);
|
||||
} else {
|
||||
CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
|
||||
}
|
||||
}
|
||||
|
||||
static bool HasAsyncPartDefault() {
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool SupportsAsyncScheduling() {
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool IsStreamFree(const DeviceOption& /* unused */, int /* unused */) {
|
||||
return true;
|
||||
}
|
||||
|
||||
at::Device device() const override {
|
||||
return at::Device(IDEEP);
|
||||
}
|
||||
|
||||
DeviceType device_type() const override {
|
||||
return IDEEP;
|
||||
}
|
||||
|
||||
static constexpr DeviceType GetDeviceType() {
|
||||
return IDEEP;
|
||||
}
|
||||
|
||||
protected:
|
||||
// TODO(jiayq): instead of hard-coding a generator, make it more flexible.
|
||||
int random_seed_{1701};
|
||||
std::unique_ptr<rand_gen_type> random_generator_;
|
||||
};
|
||||
|
||||
template <>
|
||||
inline void IDEEPContext::CopyBytes<IDEEPContext, IDEEPContext>(
|
||||
size_t nbytes,
|
||||
const void* src,
|
||||
void* dst) {
|
||||
if (nbytes == 0) {
|
||||
return;
|
||||
}
|
||||
CAFFE_ENFORCE(src);
|
||||
CAFFE_ENFORCE(dst);
|
||||
memcpy(dst, src, nbytes);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void IDEEPContext::CopyBytes<CPUContext, IDEEPContext>(
|
||||
size_t nbytes,
|
||||
const void* src,
|
||||
void* dst) {
|
||||
if (nbytes == 0) {
|
||||
return;
|
||||
}
|
||||
CAFFE_ENFORCE(src);
|
||||
CAFFE_ENFORCE(dst);
|
||||
memcpy(dst, src, nbytes);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void IDEEPContext::CopyBytes<IDEEPContext, CPUContext>(
|
||||
size_t nbytes,
|
||||
const void* src,
|
||||
void* dst) {
|
||||
if (nbytes == 0) {
|
||||
return;
|
||||
}
|
||||
CAFFE_ENFORCE(src);
|
||||
CAFFE_ENFORCE(dst);
|
||||
memcpy(dst, src, nbytes);
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
@ -1,150 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <ideep.hpp>
|
||||
#include <caffe2/core/operator.h>
|
||||
#include <caffe2/proto/caffe2_pb.h>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
C10_DECLARE_REGISTRY(
|
||||
IDEEPOperatorRegistry,
|
||||
OperatorBase,
|
||||
const OperatorDef&,
|
||||
Workspace*);
|
||||
|
||||
#define REGISTER_IDEEP_OPERATOR_CREATOR(key, ...) \
|
||||
C10_REGISTER_CREATOR(IDEEPOperatorRegistry, key, __VA_ARGS__)
|
||||
#define REGISTER_IDEEP_OPERATOR(name, ...) \
|
||||
C10_REGISTER_CLASS(IDEEPOperatorRegistry, name, __VA_ARGS__)
|
||||
#define REGISTER_IDEEP_OPERATOR_WITH_ENGINE(name, engine, ...) \
|
||||
C10_REGISTER_CLASS(IDEEPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__)
|
||||
#define REGISTER_IDEEP_OPERATOR_STR(str_name, ...) \
|
||||
C10_REGISTER_TYPED_CLASS(IDEEPOperatorRegistry, str_name, __VA_ARGS__)
|
||||
#define REGISTER_IDEEP_COMPARE_OPERATOR(Op) \
|
||||
REGISTER_IDEEP_OPERATOR( \
|
||||
Op, \
|
||||
IDEEPFallbackOp<BinaryElementwiseOp< \
|
||||
TensorTypes<bool, int32_t, int64_t, float, double>, \
|
||||
CPUContext, \
|
||||
Op##Functor<CPUContext>, \
|
||||
FixedType<bool>>>)
|
||||
|
||||
|
||||
// IDEEPOperator is the base scaffolding of the operators that uses IDEEP. It
|
||||
// provides a few operators that are useful to IDEEP specific implementations.
|
||||
class IDEEPOperator : public OperatorBase {
|
||||
public:
|
||||
explicit IDEEPOperator(const OperatorDef& operator_def, Workspace* ws)
|
||||
: OperatorBase(operator_def, ws),
|
||||
context_(operator_def.device_option()),
|
||||
order_(StringToStorageOrder(
|
||||
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
|
||||
}
|
||||
~IDEEPOperator() override {}
|
||||
|
||||
inline const ideep::tensor& Input(int index) {
|
||||
return OperatorBase::template Input<ideep::tensor>(index);
|
||||
}
|
||||
inline ideep::tensor* Output(int index) {
|
||||
return OperatorBase::template Output<ideep::tensor>(index);
|
||||
}
|
||||
|
||||
// The run function of Operator switches to the device, and then carries out
|
||||
// the actual computation with RunOnDevice(). You should implement RunOnDevice
|
||||
// instead of Run().
|
||||
bool Run(int /* unused */ /*stream_id*/) final {
|
||||
// Since IDEEP does not need to do SwithToDevice and
|
||||
// FinishDeviceComputation,
|
||||
// it is always just a re-route to RunOnDevice().
|
||||
try {
|
||||
StartAllObservers();
|
||||
bool result = RunOnDevice();
|
||||
StopAllObservers();
|
||||
return result;
|
||||
} catch (EnforceNotMet& err) {
|
||||
TORCH_RETHROW(err, getErrorMsg());
|
||||
} catch (ideep::error& e) {
|
||||
LOG(ERROR) << "IDEEP error:" << e.message;
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
// Waits for a previous event. Note that to properly wait and run
|
||||
// asynchronously, WaitEvent, RunAsync and Record should all be executed
|
||||
// on the same CPU thread.
|
||||
void WaitEvent(const Event& ev, int /* unused */) final {
|
||||
context_.WaitEvent(ev);
|
||||
}
|
||||
|
||||
void WaitEvents(const std::vector<const Event*>& events, int /* unused */)
|
||||
final {
|
||||
for (const auto& ev : events) {
|
||||
context_.WaitEvent(*ev);
|
||||
}
|
||||
}
|
||||
|
||||
void RecordEvent(const char* err_msg = nullptr) final {
|
||||
if (event_) {
|
||||
context_.Record(event_.get(), err_msg);
|
||||
}
|
||||
}
|
||||
|
||||
virtual bool RunOnDevice() = 0;
|
||||
|
||||
protected:
|
||||
std::string getErrorMsg() {
|
||||
if (has_debug_def()) {
|
||||
return "Error from operator: " + ProtoDebugString(debug_def());
|
||||
} else {
|
||||
return "Error from operator: no op def";
|
||||
}
|
||||
}
|
||||
|
||||
IDEEPContext context_;
|
||||
StorageOrder order_;
|
||||
};
|
||||
|
||||
#define USE_IDEEP_OPERATOR_FUNCTIONS() \
|
||||
USE_OPERATOR_BASE_FUNCTIONS; \
|
||||
/* using override */ using IDEEPOperator::Input; \
|
||||
/* using override */ using IDEEPOperator::Output; \
|
||||
/* using override */ using IDEEPOperator::order_; \
|
||||
/* using override */ using IDEEPOperator::context_;
|
||||
|
||||
#define USE_SIMPLE_IDEEP_CTOR_DTOR(name) \
|
||||
name(const OperatorDef& operator_def, Workspace* ws) \
|
||||
: IDEEPOperator(operator_def, ws) {} \
|
||||
~name() override {}
|
||||
|
||||
// Convert zero_point scales to min_max scales
// NOTE:
//   The scales in the operator are saved in FBGEMM format,
//   while FBGEMM scales are the reciprocals of MKL-DNN scales.
//   This function converts scales from the FBGEMM format to the MKL-DNN format.
inline ideep::scale_t ConvertScales(
    const std::vector<float> scales_z) {
  ideep::scale_t scales (scales_z);
  for (auto it = scales.begin(); it != scales.end(); it++) {
    *it = 1.0f / *it;
  }
  return scales;
}
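// Illustration (values made up for this note, not from the original source):
// an operator argument Y_scale = 0.05f in the FBGEMM convention
// (real_value ~= Y_scale * quantized_value) becomes the MKL-DNN/oneDNN scale
// ConvertScales({0.05f}) == {20.0f}, i.e. quantized_value ~= real_value * 20.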

inline ideep::tensor::dims CanonicalDims(
    ideep::tensor::dims adims, int32_t axis) {
  CAFFE_ENFORCE(axis < (int32_t)adims.size(), "Invalid axis!");
  CAFFE_ENFORCE(axis > (int32_t)-adims.size(), "Invalid axis!");
  if (adims.size() == 2 || axis == 1)
    return adims;
  if (axis < 0) {
    axis += (int32_t)adims.size();
  }

  auto dim0 = std::accumulate(adims.begin(), adims.begin() + axis, 1,
                              std::multiplies<ideep::tensor::dim_t>());
  auto dim1 = std::accumulate(adims.begin() + axis, adims.end(), 1,
                              std::multiplies<ideep::tensor::dim_t>());
  return ideep::tensor::dims({dim0, dim1});
}

} // namespace caffe2
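The aliases, base class, and registration macros in this header are the scaffolding that every deleted operator file in this diff builds on. A minimal sketch of the pattern, using a hypothetical NoOpExample operator that simply copies its input (not an operator from the original tree):

class IDEEPNoOpExampleOp final : public IDEEPOperator {
 public:
  USE_IDEEP_DEF_ALIASES();
  USE_IDEEP_OPERATOR_FUNCTIONS();
  USE_SIMPLE_IDEEP_CTOR_DTOR(IDEEPNoOpExampleOp);

  bool RunOnDevice() override {
    const auto& X = Input(0);  // ideep::tensor input
    auto* Y = Output(0);       // ideep::tensor output
    if (Y != &X) {
      Y->reinit_like(X);
      ideep::direct_copy::compute(X, *Y);
    }
    return true;
  }
};

REGISTER_IDEEP_OPERATOR(NoOpExample, IDEEPNoOpExampleOp);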
@ -1,63 +0,0 @@
|
||||
#include <caffe2/core/event_cpu.h>
|
||||
#include <caffe2/core/operator.h>
|
||||
#include <caffe2/proto/caffe2_pb.h>
|
||||
#include <ideep/tensor.hpp>
|
||||
#include "ideep_context.h"
|
||||
|
||||
namespace at {
|
||||
REGISTER_CONTEXT(DeviceType::IDEEP, caffe2::IDEEPContext);
|
||||
|
||||
namespace {
|
||||
void CopyBytesWrapper(
|
||||
size_t nbytes,
|
||||
const void* src,
|
||||
Device src_device,
|
||||
void* dst,
|
||||
Device dst_device) {
|
||||
if (nbytes == 0) {
|
||||
return;
|
||||
}
|
||||
CAFFE_ENFORCE(src);
|
||||
CAFFE_ENFORCE(dst);
|
||||
memcpy(dst, src, nbytes);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
REGISTER_COPY_BYTES_FUNCTION(
|
||||
DeviceType::IDEEP,
|
||||
DeviceType::CPU,
|
||||
CopyBytesWrapper);
|
||||
REGISTER_COPY_BYTES_FUNCTION(
|
||||
DeviceType::CPU,
|
||||
DeviceType::IDEEP,
|
||||
CopyBytesWrapper);
|
||||
REGISTER_COPY_BYTES_FUNCTION(
|
||||
DeviceType::IDEEP,
|
||||
DeviceType::IDEEP,
|
||||
CopyBytesWrapper);
|
||||
} // namespace at
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
CAFFE_KNOWN_TYPE(ideep::tensor);
|
||||
|
||||
C10_DEFINE_REGISTRY(
|
||||
IDEEPOperatorRegistry,
|
||||
OperatorBase,
|
||||
const OperatorDef&,
|
||||
Workspace*);
|
||||
|
||||
CAFFE_REGISTER_DEVICE_TYPE(DeviceType::IDEEP, IDEEPOperatorRegistry);
|
||||
|
||||
REGISTER_EVENT_CREATE_FUNCTION(IDEEP, EventCreateCPU);
|
||||
REGISTER_EVENT_RECORD_FUNCTION(IDEEP, EventRecordCPU);
|
||||
REGISTER_EVENT_WAIT_FUNCTION(IDEEP, IDEEP, EventWaitCPUCPU);
|
||||
REGISTER_EVENT_WAIT_FUNCTION(IDEEP, CPU, EventWaitCPUCPU);
|
||||
REGISTER_EVENT_WAIT_FUNCTION(CPU, IDEEP, EventWaitCPUCPU);
|
||||
REGISTER_EVENT_FINISH_FUNCTION(IDEEP, EventFinishCPU);
|
||||
REGISTER_EVENT_QUERY_FUNCTION(IDEEP, EventQueryCPU);
|
||||
REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(IDEEP, EventErrorMessageCPU);
|
||||
REGISTER_EVENT_SET_FINISHED_FUNCTION(IDEEP, EventSetFinishedCPU);
|
||||
REGISTER_EVENT_RESET_FUNCTION(IDEEP, EventResetCPU);
|
||||
|
||||
} // namespace caffe2
|
||||
@ -1,11 +0,0 @@
add_subdirectory(contrib)

# CPU source, test sources, binary sources
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)

# GPU source, test sources, binary sources
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
@ -1,15 +0,0 @@
|
||||
add_subdirectory(ios)
|
||||
|
||||
if(USE_NNAPI)
|
||||
add_subdirectory(nnapi)
|
||||
endif()
|
||||
|
||||
# CPU source, test sources, binary sources
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
|
||||
|
||||
# GPU source, test sources, binary sources
|
||||
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
|
||||
@ -1,17 +0,0 @@
|
||||
# TODO: figure out conflict between contrib/nnpack/nnpack_ops.cc and mobile_nnpack.cc
|
||||
if(IOS)
|
||||
# Basic ios srcs.
|
||||
set(Caffe2_CONTRIB_IOS_SRC
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/ios_caffe.cc"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/ios_caffe_predictor.cc"
|
||||
# "${CMAKE_CURRENT_SOURCE_DIR}/mobile_nnpack.cc"
|
||||
)
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_IOS_SRC})
|
||||
|
||||
if(USE_METAL)
|
||||
# metal/mpscnn files
|
||||
add_subdirectory(mpscnn)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
||||
@ -1,52 +0,0 @@
|
||||
|
||||
#include "ios_caffe.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
#include "caffe2/mobile/contrib/ios/ios_caffe_predictor.h"
|
||||
#include "caffe2/predictor/predictor.h"
|
||||
|
||||
Caffe2IOSPredictor* MakeCaffe2Predictor(const std::string& init_net_str,
|
||||
const std::string& predict_net_str,
|
||||
bool disableMultithreadProcessing,
|
||||
bool allowMetalOperators,
|
||||
std::string& errorMessage) {
|
||||
caffe2::NetDef init_net, predict_net;
|
||||
init_net.ParseFromString(init_net_str);
|
||||
predict_net.ParseFromString(predict_net_str);
|
||||
|
||||
Caffe2IOSPredictor* predictor = NULL;
|
||||
try {
|
||||
predictor = Caffe2IOSPredictor::NewCaffe2IOSPredictor(
|
||||
init_net, predict_net, disableMultithreadProcessing, allowMetalOperators);
|
||||
} catch (const std::exception& e) {
|
||||
std::string error = e.what();
|
||||
errorMessage.swap(error);
|
||||
return NULL;
|
||||
}
|
||||
return predictor;
|
||||
}
|
||||
|
||||
void GenerateStylizedImage(std::vector<float>& originalImage,
|
||||
const std::string& init_net_str,
|
||||
const std::string& predict_net_str,
|
||||
int height,
|
||||
int width,
|
||||
std::vector<float>& dataOut) {
|
||||
caffe2::NetDef init_net, predict_net;
|
||||
init_net.ParseFromString(init_net_str);
|
||||
predict_net.ParseFromString(predict_net_str);
|
||||
caffe2::Predictor p(init_net, predict_net);
|
||||
|
||||
std::vector<int> dims({1, 3, height, width});
|
||||
caffe2::Tensor input(caffe2::CPU);
|
||||
input.Resize(dims);
|
||||
input.ShareExternalPointer(originalImage.data());
|
||||
caffe2::Predictor::TensorList input_vec;
|
||||
input_vec.emplace_back(std::move(input));
|
||||
caffe2::Predictor::TensorList output_vec;
|
||||
p(input_vec, &output_vec);
|
||||
assert(output_vec.size() == 1);
|
||||
caffe2::TensorCPU* output = &output_vec.front();
|
||||
// output is our styled image
|
||||
float* outputArray = output->mutable_data<float>();
|
||||
dataOut.assign(outputArray, outputArray + output->size());
|
||||
}
|
||||
@ -1,25 +0,0 @@
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "caffe2/mobile/contrib/ios/ios_caffe_defines.h"
|
||||
#include "caffe2/mobile/contrib/ios/ios_caffe_predictor.h"
|
||||
#include "caffe2/predictor/predictor.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
IOS_CAFFE_EXPORT Caffe2IOSPredictor* MakeCaffe2Predictor(const std::string& init_net_str,
|
||||
const std::string& predict_net_str,
|
||||
bool disableMultithreadProcessing,
|
||||
bool allowMetalOperators,
|
||||
std::string& errorMessage);
|
||||
IOS_CAFFE_EXPORT void GenerateStylizedImage(std::vector<float>& originalImage,
|
||||
const std::string& init_net_str,
|
||||
const std::string& predict_net_str,
|
||||
int height,
|
||||
int width,
|
||||
std::vector<float>& dataOut);
|
||||
}
|
||||
|
||||
#endif
|
||||
@ -1,2 +0,0 @@
|
||||
|
||||
#define IOS_CAFFE_EXPORT __attribute__((visibility("default")))
|
||||
@ -1,68 +0,0 @@
|
||||
#include "caffe2/mobile/contrib/ios/ios_caffe_predictor.h"
|
||||
#include "caffe2/core/flags.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
|
||||
#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
|
||||
#include "caffe2/mobile/contrib/ios/mpscnn/mpscnn.h"
|
||||
#endif
|
||||
|
||||
C10_DECLARE_bool(caffe2_force_shared_col_buffer);
|
||||
|
||||
Caffe2IOSPredictor* Caffe2IOSPredictor::NewCaffe2IOSPredictor(const caffe2::NetDef& init_net,
|
||||
const caffe2::NetDef& predict_net,
|
||||
bool disableMultithreadProcessing,
|
||||
bool allowMetalOperators) {
|
||||
caffe2::NetDef metal_predict_net;
|
||||
bool usingMetalOperators = false;
|
||||
#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
|
||||
if (allowMetalOperators) {
|
||||
caffe2::dumpDef(predict_net);
|
||||
if (caffe2::tryConvertToMPSCNN(init_net, predict_net, &metal_predict_net)) {
|
||||
LOG(INFO) << "Successfully converted to MPSCNN";
|
||||
caffe2::dumpDef(metal_predict_net);
|
||||
usingMetalOperators = true;
|
||||
} else {
|
||||
LOG(ERROR) << "Failed converting model to MPSCNN";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return new Caffe2IOSPredictor(init_net,
|
||||
usingMetalOperators ? metal_predict_net : predict_net,
|
||||
disableMultithreadProcessing,
|
||||
usingMetalOperators);
|
||||
}
|
||||
|
||||
Caffe2IOSPredictor::Caffe2IOSPredictor(const caffe2::NetDef& init_net,
|
||||
const caffe2::NetDef& predict_net,
|
||||
bool disableMultithreadProcessing,
|
||||
bool usingMetalOperators)
|
||||
: usingMetalOperators(usingMetalOperators), predictor_(init_net, predict_net) {
|
||||
#ifdef C10_MOBILE
|
||||
if (disableMultithreadProcessing) {
|
||||
caffe2::ThreadPool* threadpool = predictor_.ws()->GetThreadPool();
|
||||
if (threadpool != nullptr) {
|
||||
threadpool->setMinWorkSize(std::numeric_limits<size_t>::max());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void Caffe2IOSPredictor::run(const Tensor& inData, Tensor& outData, std::string& errorMessage) {
|
||||
FLAGS_caffe2_force_shared_col_buffer = true;
|
||||
caffe2::Tensor input = caffe2::empty(inData.dims, at::dtype<uint8_t>().device(caffe2::CPU));
|
||||
input.ShareExternalPointer(inData.data);
|
||||
caffe2::Predictor::TensorList input_vec;
|
||||
input_vec.emplace_back(std::move(input));
|
||||
caffe2::Predictor::TensorList output_vec;
|
||||
try {
|
||||
predictor_(input_vec, &output_vec);
|
||||
} catch (const std::exception& e) {
|
||||
std::string error = e.what();
|
||||
errorMessage.swap(error);
|
||||
return;
|
||||
}
|
||||
caffe2::Tensor* output = &output_vec.front();
|
||||
outData.data = output->mutable_data<uint8_t>();
|
||||
outData.dims = output->sizes().vec();
|
||||
}
|
||||
@ -1,36 +0,0 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/mobile/contrib/ios/ios_caffe_defines.h"
|
||||
#include "caffe2/predictor/predictor.h"
|
||||
|
||||
struct Tensor {
|
||||
std::vector<int64_t> dims;
|
||||
uint8_t* data;
|
||||
};
|
||||
|
||||
class IOS_CAFFE_EXPORT Caffe2IOSPredictor final {
|
||||
public:
|
||||
/**
|
||||
@allowMetalOperators Allow converting eligible operators to Metal GPU framework accelerated
|
||||
operators. Setting this flag to true doesn't guarantee predictor will be using Metal operators;
|
||||
Client code must check usingMetalOperators flag to determine predictor is using them.
|
||||
*/
|
||||
static Caffe2IOSPredictor* NewCaffe2IOSPredictor(const caffe2::NetDef& init_net,
|
||||
const caffe2::NetDef& predict_net,
|
||||
bool disableMultithreadProcessing,
|
||||
bool allowMetalOperators);
|
||||
void run(const Tensor& inData, Tensor& outData, std::string& errorMessage);
|
||||
~Caffe2IOSPredictor(){};
|
||||
|
||||
const bool usingMetalOperators;
|
||||
|
||||
private:
|
||||
Caffe2IOSPredictor(const caffe2::NetDef& init_net,
|
||||
const caffe2::NetDef& predict_net,
|
||||
bool disableMultithreadProcessing,
|
||||
bool usingMetalOperators);
|
||||
caffe2::Predictor predictor_;
|
||||
};
|
||||
@ -1,7 +0,0 @@
|
||||
if(USE_METAL)
|
||||
file(GLOB_RECURSE tmp *.mm *.cc)
|
||||
# exclude test files
|
||||
file(GLOB_RECURSE test_files *_test.cc)
|
||||
exclude(tmp "${tmp}" ${test_files})
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
|
||||
endif()
|
||||
File diff suppressed because it is too large
@ -1,23 +0,0 @@

#pragma once
#include "caffe2/core/net.h"
#include "caffe2/utils/math.h"

namespace caffe2 {
static constexpr const char* kMPSCNNReadCountArg = "__mpscnn_read_count__";
static constexpr const char* kMPSCNNOutputIsTempImageArg = "__mpscnn_output_is_temp_img__";
static constexpr const int kMetalMaxTextureArrLength = 2048;
// We currently only try to convert a fixed set of operators that handle a subset of a full
// CNN. We also only run when MPSCNN is available and provides a speedup.
// On failure, returns false. On success, returns true, and sets the MPSCNN net in the output
// parameter.

bool tryConvertToMPSCNN(const NetDef& initNet, const NetDef& predictNet, NetDef* mpscnnPredictNet);

// Exposed for testing.
NetDef annotateDefWithReadCounts(const NetDef& net);
NetDef rewriteForMetal(const NetDef& net);
NetDef runMPSCNNFusion(const NetDef& net);
void dumpDef(const NetDef& d);
void mpscnnRecordExecutionFinish();
} // namespace caffe2
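A minimal usage sketch of the conversion entry point declared above, following the call pattern in ios_caffe_predictor.cc (the NetDef variables here are placeholders assumed to be parsed already):

caffe2::NetDef init_net, predict_net, metal_predict_net;
// ... parse init_net and predict_net from serialized strings ...
if (caffe2::tryConvertToMPSCNN(init_net, predict_net, &metal_predict_net)) {
  // metal_predict_net now holds the MPSCNN-rewritten predict net.
} else {
  // Fall back to running the original predict_net on CPU.
}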
File diff suppressed because it is too large
Some files were not shown because too many files have changed in this diff.