Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
2021-04-21 nightly release (1e03a2505f9cee92587bf45fbbbbfedede5cb9ec)
@@ -125,7 +125,7 @@ def FalsePred(_):
 def TruePred(_):
     return True

-_VC2019 = VcSpec(2019, ["14", "28", "29333"], hide_version=True)
+_VC2019 = VcSpec(2019)

 WORKFLOW_DATA = [
     # VS2019 CUDA-10.1
@@ -6786,7 +6786,7 @@ workflows:
           python_version: "3.6"
           use_cuda: "1"
           vc_product: BuildTools
-          vc_version: "14.28.29333"
+          vc_version: ""
           vc_year: "2019"
       - pytorch_windows_test:
           build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
@@ -6799,7 +6799,7 @@ workflows:
           test_name: pytorch-windows-test1
           use_cuda: "1"
           vc_product: BuildTools
-          vc_version: "14.28.29333"
+          vc_version: ""
           vc_year: "2019"
       - pytorch_windows_test:
           build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
@@ -6812,7 +6812,7 @@ workflows:
           test_name: pytorch-windows-test2
           use_cuda: "1"
           vc_product: BuildTools
-          vc_version: "14.28.29333"
+          vc_version: ""
           vc_year: "2019"
       - pytorch_windows_build:
           build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
@@ -6821,7 +6821,7 @@ workflows:
           python_version: "3.6"
           use_cuda: "1"
           vc_product: BuildTools
-          vc_version: "14.28.29333"
+          vc_version: ""
           vc_year: "2019"
       - pytorch_windows_test:
           build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
@@ -6840,7 +6840,7 @@ workflows:
           test_name: pytorch-windows-test1
           use_cuda: "1"
           vc_product: BuildTools
-          vc_version: "14.28.29333"
+          vc_version: ""
           vc_year: "2019"
       - pytorch_windows_test:
           build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
@@ -6859,7 +6859,7 @@ workflows:
           test_name: pytorch-windows-test2
           use_cuda: "1"
           vc_product: BuildTools
-          vc_version: "14.28.29333"
+          vc_version: ""
           vc_year: "2019"
       - pytorch_windows_build:
           build_environment: pytorch-win-vs2019-cpu-py3
@@ -6868,7 +6868,7 @@ workflows:
           python_version: "3.6"
           use_cuda: "0"
           vc_product: BuildTools
-          vc_version: "14.28.29333"
+          vc_version: ""
           vc_year: "2019"
       - pytorch_windows_test:
           build_environment: pytorch-win-vs2019-cpu-py3
@@ -6886,7 +6886,7 @@ workflows:
           test_name: pytorch-windows-test1
           use_cuda: "0"
           vc_product: BuildTools
-          vc_version: "14.28.29333"
+          vc_version: ""
           vc_year: "2019"
       - pytorch_windows_test:
           build_environment: pytorch-win-vs2019-cpu-py3
@@ -6904,7 +6904,7 @@ workflows:
           test_name: pytorch-windows-test2
           use_cuda: "0"
           vc_product: BuildTools
-          vc_version: "14.28.29333"
+          vc_version: ""
           vc_year: "2019"
       - pytorch_windows_test:
           build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
@@ -6922,7 +6922,7 @@ workflows:
           test_name: pytorch-windows-test1
           use_cuda: "0"
           vc_product: BuildTools
-          vc_version: "14.28.29333"
+          vc_version: ""
           vc_year: "2019"
       - update_s3_htmls:
           context: org-member
.github/scripts/generate_linux_ci_workflows.py (new vendored executable file, 164 lines)
@@ -0,0 +1,164 @@
#!/usr/bin/env python

from pathlib import Path

import jinja2

DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com"

GITHUB_DIR = Path(__file__).parent.parent

CPU_TEST_RUNNER = "linux.2xlarge"
CUDA_TEST_RUNNER = "linux.8xlarge.nvidia.gpu"


class PyTorchLinuxWorkflow:
    def __init__(self, build_environment: str, docker_image_base: str):
        self.build_environment = build_environment
        self.docker_image_base = docker_image_base
        self.test_runner_type = CPU_TEST_RUNNER
        if "cuda" in build_environment:
            self.test_runner_type = CUDA_TEST_RUNNER

    def generate_workflow_file(
        self, workflow_template: jinja2.Template, jinja_env: jinja2.Environment
    ) -> Path:
        output_file_path = GITHUB_DIR.joinpath(
            f"workflows/{self.build_environment}.yml"
        )
        with open(output_file_path, "w") as output_file:
            output_file.write(
                workflow_template.render(
                    build_environment=self.build_environment,
                    docker_image_base=self.docker_image_base,
                    test_runner_type=self.test_runner_type
                )
            )
            output_file.write('\n')
        return output_file_path


WORKFLOWS = [
    PyTorchLinuxWorkflow(
        build_environment="pytorch-linux-xenial-py3.6-gcc5.4",
        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
    ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-parallelnative-linux-xenial-py3.6-gcc5.4",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-pure_torch-linux-xenial-py3.6-gcc5.4",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3.6-gcc7",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-asan",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang7-onnx",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-libtorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-bionic-py3.6-clang9-noarch",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-xla-linux-bionic-py3.6-clang9",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-vulkan-linux-bionic-py3.6-clang9",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-bionic-py3.8-gcc9-coverage",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.8-gcc9",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-bionic-rocm3.9-py3.6",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm3.9-py3.6",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-mobile",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-mobile-custom-dynamic",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-mobile-custom-static",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-mobile-code-analysis",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
    # PyTorchLinuxWorkflow(
    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a",
    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
    # ),
]


if __name__ == "__main__":
    jinja_env = jinja2.Environment(
        variable_start_string="!{{",
        loader=jinja2.FileSystemLoader(str(GITHUB_DIR.joinpath("templates"))),
    )
    workflow_template = jinja_env.get_template("linux_ci_workflow.yml.in")
    for workflow in WORKFLOWS:
        print(
            workflow.generate_workflow_file(
                workflow_template=workflow_template,
                jinja_env=jinja_env
            )
        )
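The generator instantiates Jinja with a non-default `variable_start_string` of `!{{`, so template placeholders can coexist with GitHub Actions' own `${{ ... }}` expressions, which must pass through to the rendered workflow untouched. A minimal sketch of that delimiter trick (standalone, with an inline template instead of the repository's `linux_ci_workflow.yml.in`):

```python
import jinja2

# "!{{ }}" is rendered by Jinja; "${{ }}" survives verbatim for GitHub
# Actions to evaluate at workflow run time.
env = jinja2.Environment(variable_start_string="!{{")
template = env.from_string(
    "name: Linux CI (!{{ build_environment }})\n"
    "image: ${{ needs.calculate-docker-image.outputs.docker_image }}"
)
print(template.render(build_environment="pytorch-linux-xenial-py3.6-gcc5.4"))
# name: Linux CI (pytorch-linux-xenial-py3.6-gcc5.4)
# image: ${{ needs.calculate-docker-image.outputs.docker_image }}
```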
.github/scripts/install_nvidia_utils_linux.sh (new vendored executable file, 43 lines)
@@ -0,0 +1,43 @@
#!/usr/bin/env bash

set -eou pipefail

DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) \
DRIVER_FN="NVIDIA-Linux-x86_64-460.39.run"
YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"

install_nvidia_docker2_amzn2() {
    (
        set -x
        # Needed for yum-config-manager
        sudo yum install -y yum-utils
        sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
        sudo yum install -y nvidia-docker2
        sudo systemctl restart docker
    )
}

install_nvidia_driver() {
    (
        set -x
        sudo yum groupinstall -y "Development Tools"
        curl -fsL -o nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
        sudo /bin/bash nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
        nvidia-smi
    )
}

# Install container toolkit based on distribution
echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
case "${DISTRIBUTION}" in
    amzn*)
        install_nvidia_docker2_amzn2
        ;;
    *)
        echo "ERROR: Unknown distribution ${DISTRIBUTION}"
        exit 1
        ;;
esac

echo "== Installing nvidia driver ${DRIVER_FN} =="
install_nvidia_driver
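The `amzn*)` case is driven by `DISTRIBUTION`, which the script assembles from `/etc/os-release`. A sketch of what that evaluates to (the values are illustrative; they come from the distro's own os-release file):

```bash
# On Amazon Linux 2, /etc/os-release defines ID="amzn" and VERSION_ID="2",
# so DISTRIBUTION becomes "amzn2" and matches the amzn*) branch above.
. /etc/os-release
echo "$ID$VERSION_ID"   # e.g. amzn2 (or ubuntu18.04 on Ubuntu 18.04)
```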
.github/scripts/report_git_status.sh (new vendored executable file, 5 lines)
@@ -0,0 +1,5 @@
#!/usr/bin/env bash
CHANGES=$(git status --porcelain)
echo "$CHANGES"
git diff
[ -z "$CHANGES" ]
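The final `[ -z "$CHANGES" ]` doubles as the script's exit status: it succeeds only when `git status --porcelain` printed nothing, so any CI step that runs it fails exactly when the working tree is dirty. A sketch of the intended usage, mirroring the `templates` lint job added below:

```bash
# Regenerate checked-in files, then fail the step if they no longer match
# what is committed.
.github/scripts/generate_linux_ci_workflows.py
.github/scripts/report_git_status.sh   # nonzero exit if anything changed
```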
.github/templates/linux_ci_workflow.yml.in (new vendored file, 193 lines)
@@ -0,0 +1,193 @@
# @generated by .github/scripts/generate_linux_ci_workflows.py, Do not update manually
#
# Template is at:    .github/templates/linux_ci_workflow.yml
# Generation script: .github/scripts/generate_linux_ci_workflows.py
name: Linux CI (!{{ build_environment }})

on:
  # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
  # pull_request:
  push:
    branches:
      - master
      - release/*
  workflow_dispatch:

env:
  BUILD_ENVIRONMENT: !{{ build_environment }}
  DOCKER_IMAGE_BASE: !{{ docker_image_base }}
  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
  TORCH_CUDA_ARCH_LIST: 5.2
  IN_CI: 1

jobs:
  calculate-docker-image:
    runs-on: ubuntu-18.04
    outputs:
      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
    steps:
      - name: Checkout PyTorch
        uses: actions/checkout@v2
      - name: Calculate docker image tag
        id: calculate-tag
        run: |
          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
  build:
    runs-on: linux.2xlarge
    needs: calculate-docker-image
    env:
      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
    steps:
      - name: Chown workspace
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
      - name: Checkout PyTorch
        uses: actions/checkout@v2
        with:
          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
          submodules: recursive
      - name: Log in to ECR
        run: |
          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
          bash /tmp/ecr-login.sh
          rm /tmp/ecr-login.sh
      - name: Pull docker image
        run: |
          docker pull "${DOCKER_IMAGE}"
      - name: Create test binary build directories
        run: |
          mkdir -pv ../custom-op-build
          mkdir -pv ../custom-backend-build
          mkdir -pv ../jit-hook-build
      - name: Build PyTorch
        run: |
          SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
          MEMORY_LIMIT_MAX_JOBS=8  # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
          export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
          # Why the three volume mounts here? So test binaries are put in the correct spot
          # NOTE: You cannot volume mount ${GITHUB_WORKSPACE}../:/var/lib/jenkins since sccache connection will hang
          #       See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
          # TODO: Stop building test binaries as part of the build phase
          docker run \
            -e BUILD_ENVIRONMENT \
            -e MAX_JOBS \
            -e SCCACHE_BUCKET \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e TORCH_CUDA_ARCH_LIST \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --tty \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
            -v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
            -v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}" \
            sh -c 'sudo chown -R jenkins ../ && .jenkins/pytorch/build.sh'
      - name: Chown workspace
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)/../":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
      - name: Archive artifacts into zip
        run: |
          (cd "${GITHUB_WORKSPACE}/../" && zip -r pytorch/artifacts.zip pytorch/dist pytorch/build custom-op-build/ custom-backend-build/ jit-hook-build/)
      - uses: actions/upload-artifact@v2
        name: Store PyTorch Build Artifacts
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
          retention-days: 30
          if-no-files-found: error
          path:
            artifacts.zip
      - name: Clean up docker images
        if: always()
        run: |
          # Prune all of the docker images
          docker system prune -af
  test:
    runs-on: !{{ test_runner_type }}
    needs:
      - calculate-docker-image
      - build
    env:
      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
    steps:
      - name: Chown workspace
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
      - name: Checkout PyTorch
        uses: actions/checkout@v2
      - name: Log in to ECR
        run: |
          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
          bash /tmp/ecr-login.sh
          rm /tmp/ecr-login.sh
      - name: Pull docker image
        run: |
          docker pull "${DOCKER_IMAGE}"
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }}
        run: |
          bash .github/scripts/install_nvidia_utils_linux.sh
          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
      - name: Determine shm-size
        run: |
          shm_size="1g"
          case "${BUILD_ENVIRONMENT}" in
            *cuda*)
              shm_size="2g"
              ;;
            *rocm*)
              shm_size="8g"
              ;;
          esac
          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
      - uses: actions/download-artifact@v2
        name: Download PyTorch Build Artifacts
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
      - name: Unzip artifacts
        run: |
          (cd "${GITHUB_WORKSPACE}/../" && unzip -q pytorch/artifacts.zip)
      - name: Output disk space left
        run: |
          sudo df -H
      - name: Test PyTorch
        run: |
          SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
          MEMORY_LIMIT_MAX_JOBS=8  # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
          export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
          # Why the three volume mounts here? So test binaries are put in the correct spot
          # NOTE: You cannot volume mount ${GITHUB_WORKSPACE}../:/var/lib/jenkins since sccache connection will hang
          #       See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
          # TODO: Stop building test binaries as part of the build phase
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086
          docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e IN_CI \
            -e MAX_JOBS \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
            -v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
            -v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}" \
            sh -c 'sudo chown -R jenkins ../ && pip install dist/*.whl && .jenkins/pytorch/test.sh'
      - name: Clean up docker images
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
          # Prune all of the docker images
          docker system prune -af
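The `calculate-docker-image` job keys the image tag off `git rev-parse HEAD:.circleci/docker`, which resolves to the git tree hash of that directory, so the tag (and therefore the pulled image) only changes when something under the Docker build context changes. A sketch of the tag calculation (the hash shown is illustrative):

```bash
# Tree hash of .circleci/docker at HEAD; stable until a file in that
# directory is modified.
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
# -> 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/...:9a3986fa...
```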
.github/workflows/auto_label.yml (vendored, 2 changed lines)
@@ -25,7 +25,7 @@ jobs:
           ISSUE_NUMBER="${PR_NUMBER}"
         else
           TITLE="${ISSUE_TITLE}"
-          ISSUE_NUMBER="${ISSUE_NUMBER}"
+          # ISSUE_NUMBER is already set
         fi
         echo ::set-output name=TITLE::"${TITLE}"
         echo ::set-output name=ISSUE_NUMBER::"${ISSUE_NUMBER}"
.github/workflows/lint.yml (vendored, 72 changed lines)
@@ -34,10 +34,10 @@ jobs:
       - name: Extract scripts from GitHub Actions workflows
         run: tools/extract_scripts.py --out=.extracted_scripts
       - name: ShellCheck
-        # https://github.com/koalaman/shellcheck/tree/v0.7.1#installing-a-pre-compiled-binary
+        # https://github.com/koalaman/shellcheck/tree/v0.7.2#installing-a-pre-compiled-binary
         run: |
           set -x
-          scversion="v0.7.1"
+          scversion="v0.7.2"
           wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv
           sudo cp "shellcheck-${scversion}/shellcheck" /usr/bin/
           rm -r "shellcheck-${scversion}"
@@ -45,19 +45,23 @@ jobs:
           tools/run_shellcheck.sh .jenkins/pytorch .extracted_scripts
       - name: Ensure correct trailing newlines
         run: |
-          (! git grep -Il '' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude)**.expect' ':(exclude)tools/clang_format_hash' | tools/trailing_newlines.py || (echo "The above files do not have correct trailing newlines; please normalize them"; false))
+          (! git --no-pager grep -Il '' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude)**.expect' ':(exclude)tools/clang_format_hash' | tools/trailing_newlines.py || (echo "The above files do not have correct trailing newlines; please normalize them"; false))
       - name: Ensure no trailing spaces
         run: |
-          (! git grep -In '[[:blank:]]$' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' || (echo "The above lines have trailing spaces; please remove them"; false))
+          (! git --no-pager grep -In '[[:blank:]]$' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' || (echo "The above lines have trailing spaces; please remove them"; false))
       - name: Ensure no tabs
         run: |
-          (! git grep -In $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above lines have tabs; please convert them to spaces"; false))
+          (! git --no-pager grep -In $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above lines have tabs; please convert them to spaces"; false))
       - name: Ensure no non-breaking spaces
         run: |
-          (! git grep -In $'\u00a0' -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
+          (! git --no-pager grep -In $'\u00a0' -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
       - name: Ensure canonical include
         run: |
-          (! git grep -In $'#include "' -- ./c10 ./aten ./torch/csrc ':(exclude)aten/src/ATen/native/quantized/cpu/qnnpack/**' || (echo "The above lines have include with quotes; please convert them to #include <xxxx>"; false))
+          (! git --no-pager grep -In $'#include "' -- ./c10 ./aten ./torch/csrc ':(exclude)aten/src/ATen/native/quantized/cpu/qnnpack/**' || (echo "The above lines have include with quotes; please convert them to #include <xxxx>"; false))
+      - name: Ensure no unqualified noqa
+        run: |
+          # shellcheck disable=SC2016
+          (! git --no-pager grep -InP '# noqa(?!: [A-Z]+\d{3})' -- '**.py' ':(exclude)caffe2' || (echo 'The above lines have unqualified `noqa`; please convert them to `noqa: XXXX`'; false))
       # note that this next step depends on a clean checkout;
       # if you run it locally then it will likely complain
       # about all the generated files in torch/test
@@ -75,7 +79,7 @@ jobs:
           python torch/testing/check_kernel_launches.py |& tee "${GITHUB_WORKSPACE}"/cuda_kernel_launch_checks.txt
       - name: Ensure no direct cub include
         run: |
-          (! git grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false))
+          (! git --no-pager grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false))

   python2-setup-compat:
     runs-on: ubuntu-18.04
@@ -91,6 +95,23 @@ jobs:
         run: |
           python2 setup.py | grep "Python 2 has reached end-of-life and is no longer supported by PyTorch."

+  templates:
+    runs-on: ubuntu-18.04
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.x
+          architecture: x64
+      - name: Install Jinja2
+        run: pip install Jinja2
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+      - name: Regenerate workflows
+        run: .github/scripts/generate_linux_ci_workflows.py
+      - name: Assert that regenerating the workflows didn't change them
+        run: .github/scripts/report_git_status.sh
+
   toc:
     runs-on: ubuntu-18.04
     # https://github.com/actions/virtual-environments/issues/599#issuecomment-602754687
@@ -107,16 +128,11 @@ jobs:
         run: |
           set -eux
           export PATH=~/.npm-global/bin:"$PATH"
-          for FILE in {CONTRIBUTING,README}.md; do
+          for FILE in $(git grep -Il '<!-- toc -->' -- '**.md'); do
             markdown-toc --bullets='-' -i "$FILE"
           done
       - name: Assert that regenerating the ToCs didn't change them
-        run: |
-          set -eux
-          CHANGES=$(git status --porcelain)
-          echo "$CHANGES"
-          git diff
-          [ -z "$CHANGES" ]
+        run: .github/scripts/report_git_status.sh

   flake8-py3:
     runs-on: ubuntu-18.04
@@ -137,21 +153,23 @@ jobs:
           mkdir flake8-output
           cd flake8-output
           echo "$HEAD_SHA" > commit-sha.txt
-      - name: Run flake8
+      - name: Install dependencies
         run: |
           set -eux
           pip install typing-extensions # for tools/translate_annotations.py
           pip install -r requirements-flake8.txt
           flake8 --version
+      - name: Run flake8
+        run: |
+          set -eux
           flake8 | tee "${GITHUB_WORKSPACE}"/flake8-output.txt
-          cp flake8-output.txt flake8-output/annotations.json
       - name: Translate annotations
         if: github.event_name == 'pull_request'
         env:
           HEAD_SHA: ${{ github.event.pull_request.head.sha }}
         run: |
           tools/translate_annotations.py \
-            --file=flake8-output.txt \
+            --file="${GITHUB_WORKSPACE}"/flake8-output.txt \
            --regex='^(?P<filename>.*?):(?P<lineNumber>\d+):(?P<columnNumber>\d+): (?P<errorCode>\w+\d+) (?P<errorDesc>.*)' \
            --commit="$HEAD_SHA" \
            > flake8-output/annotations.json
@@ -202,10 +220,7 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y clang-tidy-11
           sudo update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-11 1000
-      - name: Run clang-tidy
-        env:
-          BASE_SHA: ${{ github.event.pull_request.base.sha }}
-          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+      - name: Generate build files
         run: |
           set -eux
           git remote add upstream https://github.com/pytorch/pytorch
@@ -229,6 +244,12 @@ jobs:
             --native-functions-path aten/src/ATen/native/native_functions.yaml \
             --nn-path aten/src
           fi
+      - name: Run clang-tidy
+        env:
+          BASE_SHA: ${{ github.event.pull_request.base.sha }}
+          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+        run: |
+          set -eux

           # Run Clang-Tidy
           # The negative filters below are to exclude files that include onnx_pb.h or
@@ -283,13 +304,16 @@ jobs:
           architecture: x64
       - name: Fetch PyTorch
         uses: actions/checkout@v2
-      - name: Run cmakelint
+      - name: Install dependencies
         run: |
           set -eux
           pip install cmakelint
           cmakelint --version
+      - name: Run cmakelint
+        run: |
+          set -eux
           git ls-files -z -- bootstrap '*.cmake' '*.cmake.in' '*CMakeLists.txt' | \
-          grep -E -z -v '^(cmake/Modules/|cmake/Modules_CUDA_fix/)' | \
+          grep -E -z -v '^(cmake/Modules/|cmake/Modules_CUDA_fix/|cmake/Caffe2Config.cmake.in|aten/src/ATen/ATenConfig.cmake.in|cmake/Caffe2ConfigVersion.cmake.in|cmake/TorchConfig.cmake.in|cmake/TorchConfigVersion.cmake.in|cmake/cmake_uninstall.cmake.in)' | \
           xargs -0 cmakelint --config=.cmakelintrc --spaces=2 --quiet

   mypy:
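All of the quick-check steps above share one shell idiom: grep succeeds when it finds an offending line, `!` inverts that into a failure, and the `|| (echo ...; false)` arm prints an explanation while forcing a nonzero exit. Reduced to its generic shape:

```bash
# Fails (and explains why) iff the grep finds a match; succeeds silently
# otherwise. --no-pager keeps git from paging output in CI logs.
(! git --no-pager grep -In '[[:blank:]]$' -- . \
  || (echo "The above lines have trailing spaces; please remove them"; false))
```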
.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml (new vendored file, 193 lines)
@@ -0,0 +1,193 @@
# @generated by .github/scripts/generate_linux_ci_workflows.py, Do not update manually
#
# Template is at:    .github/templates/linux_ci_workflow.yml
# Generation script: .github/scripts/generate_linux_ci_workflows.py
name: Linux CI (pytorch-linux-xenial-py3.6-gcc5.4)

on:
  # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
  # pull_request:
  push:
    branches:
      - master
      - release/*
  workflow_dispatch:

env:
  BUILD_ENVIRONMENT: pytorch-linux-xenial-py3.6-gcc5.4
  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4
  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
  TORCH_CUDA_ARCH_LIST: 5.2
  IN_CI: 1

jobs:
  calculate-docker-image:
    runs-on: ubuntu-18.04
    outputs:
      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
    steps:
      - name: Checkout PyTorch
        uses: actions/checkout@v2
      - name: Calculate docker image tag
        id: calculate-tag
        run: |
          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
  build:
    runs-on: linux.2xlarge
    needs: calculate-docker-image
    env:
      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
    steps:
      - name: Chown workspace
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
      - name: Checkout PyTorch
        uses: actions/checkout@v2
        with:
          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
          submodules: recursive
      - name: Log in to ECR
        run: |
          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
          bash /tmp/ecr-login.sh
          rm /tmp/ecr-login.sh
      - name: Pull docker image
        run: |
          docker pull "${DOCKER_IMAGE}"
      - name: Create test binary build directories
        run: |
          mkdir -pv ../custom-op-build
          mkdir -pv ../custom-backend-build
          mkdir -pv ../jit-hook-build
      - name: Build PyTorch
        run: |
          SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
          MEMORY_LIMIT_MAX_JOBS=8  # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
          export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
          # Why the three volume mounts here? So test binaries are put in the correct spot
          # NOTE: You cannot volume mount ${GITHUB_WORKSPACE}../:/var/lib/jenkins since sccache connection will hang
          #       See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
          # TODO: Stop building test binaries as part of the build phase
          docker run \
            -e BUILD_ENVIRONMENT \
            -e MAX_JOBS \
            -e SCCACHE_BUCKET \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e TORCH_CUDA_ARCH_LIST \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --tty \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
            -v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
            -v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}" \
            sh -c 'sudo chown -R jenkins ../ && .jenkins/pytorch/build.sh'
      - name: Chown workspace
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)/../":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
      - name: Archive artifacts into zip
        run: |
          (cd "${GITHUB_WORKSPACE}/../" && zip -r pytorch/artifacts.zip pytorch/dist pytorch/build custom-op-build/ custom-backend-build/ jit-hook-build/)
      - uses: actions/upload-artifact@v2
        name: Store PyTorch Build Artifacts
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
          retention-days: 30
          if-no-files-found: error
          path:
            artifacts.zip
      - name: Clean up docker images
        if: always()
        run: |
          # Prune all of the docker images
          docker system prune -af
  test:
    runs-on: linux.2xlarge
    needs:
      - calculate-docker-image
      - build
    env:
      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
    steps:
      - name: Chown workspace
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
      - name: Checkout PyTorch
        uses: actions/checkout@v2
      - name: Log in to ECR
        run: |
          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
          bash /tmp/ecr-login.sh
          rm /tmp/ecr-login.sh
      - name: Pull docker image
        run: |
          docker pull "${DOCKER_IMAGE}"
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }}
        run: |
          bash .github/scripts/install_nvidia_utils_linux.sh
          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
      - name: Determine shm-size
        run: |
          shm_size="1g"
          case "${BUILD_ENVIRONMENT}" in
            *cuda*)
              shm_size="2g"
              ;;
            *rocm*)
              shm_size="8g"
              ;;
          esac
          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
      - uses: actions/download-artifact@v2
        name: Download PyTorch Build Artifacts
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
      - name: Unzip artifacts
        run: |
          (cd "${GITHUB_WORKSPACE}/../" && unzip -q pytorch/artifacts.zip)
      - name: Output disk space left
        run: |
          sudo df -H
      - name: Test PyTorch
        run: |
          SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
          MEMORY_LIMIT_MAX_JOBS=8  # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
          export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
          # Why the three volume mounts here? So test binaries are put in the correct spot
          # NOTE: You cannot volume mount ${GITHUB_WORKSPACE}../:/var/lib/jenkins since sccache connection will hang
          #       See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
          # TODO: Stop building test binaries as part of the build phase
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086
          docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e IN_CI \
            -e MAX_JOBS \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
            -v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
            -v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}" \
            sh -c 'sudo chown -R jenkins ../ && pip install dist/*.whl && .jenkins/pytorch/test.sh'
      - name: Clean up docker images
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
          # Prune all of the docker images
          docker system prune -af
.gitignore (vendored, 9 lines added)
@@ -292,3 +292,12 @@ bazel-*
 # direnv, posh-direnv
 .envrc
 .psenvrc
+
+# generated shellcheck directories
+.shellcheck_generated*/
+
+# zip archives
+*.zip
+
+# core dump files
+core.*
@@ -59,6 +59,17 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
   export BUILD_SPLIT_CUDA=ON
 fi

+if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then
+  export BUILD_CAFFE2=OFF
+fi
+
+if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
+  export ATEN_THREADING=TBB
+  export USE_TBB=1
+elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
+  export ATEN_THREADING=NATIVE
+fi
+
 # TODO: Don't run this...
 pip_install -r requirements.txt || true
@@ -234,7 +245,7 @@ else
     CUSTOM_OP_TEST="$PWD/test/custom_operator"
     python --version
     SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-    mkdir "$CUSTOM_OP_BUILD"
+    mkdir -p "$CUSTOM_OP_BUILD"
     pushd "$CUSTOM_OP_BUILD"
     cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
     make VERBOSE=1
@@ -246,7 +257,7 @@ else
     JIT_HOOK_TEST="$PWD/test/jit_hooks"
     python --version
     SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-    mkdir "$JIT_HOOK_BUILD"
+    mkdir -p "$JIT_HOOK_BUILD"
     pushd "$JIT_HOOK_BUILD"
     cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
     make VERBOSE=1
@@ -257,7 +268,7 @@ else
     CUSTOM_BACKEND_BUILD="$PWD/../custom-backend-build"
     CUSTOM_BACKEND_TEST="$PWD/test/custom_backend"
     python --version
-    mkdir "$CUSTOM_BACKEND_BUILD"
+    mkdir -p "$CUSTOM_BACKEND_BUILD"
     pushd "$CUSTOM_BACKEND_BUILD"
     cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
     make VERBOSE=1
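The `mkdir` to `mkdir -p` switch matters because the new GitHub Actions workflow pre-creates these directories in its "Create test binary build directories" step; under the script's error handling, a plain `mkdir` would abort the build when the directory already exists. A two-line illustration:

```bash
mkdir /tmp/custom-op-build      # errors out if the directory already exists
mkdir -p /tmp/custom-op-build   # idempotent: succeeds whether or not it exists
```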
@@ -72,7 +72,16 @@ if [[ "$BUILD_ENVIRONMENT" != *pytorch-win-* ]]; then
     # Save sccache logs to file
     sccache --stop-server || true
     rm ~/sccache_error.log || true
-    if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
+    if [[ -n "${SKIP_SCCACHE_INITIALIZATION:-}" ]]; then
+      # sccache --start-server seems to hang forever on self hosted runners for GHA
+      # so let's just go ahead and skip the --start-server altogether since it seems
+      # as though sccache still gets used even when the sccache server isn't started
+      # explicitly
+      echo "Skipping sccache server initialization, setting environment variables"
+      export SCCACHE_IDLE_TIMEOUT=1200
+      export SCCACHE_ERROR_LOG=~/sccache_error.log
+      export RUST_LOG=sccache::server=error
+    elif [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
       SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server
     else
       # increasing SCCACHE_IDLE_TIMEOUT so that extension_backend_test.cpp can build after this PR:
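The `SKIP_SCCACHE_INITIALIZATION` branch configures sccache purely through environment variables. The assumption, stated in the comment above, is that the first sccache compile invocation spawns the server itself and inherits those variables, so an explicit `--start-server` is unnecessary. A sketch of that mode:

```bash
# Assumed behavior per the comment above: the server autostarts on first use
# and picks these settings up from the environment.
export SCCACHE_IDLE_TIMEOUT=1200
export SCCACHE_ERROR_LOG=~/sccache_error.log
export RUST_LOG=sccache::server=error
sccache gcc -c hello.c -o hello.o   # no prior `sccache --start-server` needed
```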
@@ -26,7 +26,7 @@ if [ ! -d "${WORKSPACE_DIR}/miniconda3" ]; then
   retry bash "${WORKSPACE_DIR}"/miniconda3.sh -b -p "${WORKSPACE_DIR}"/miniconda3
 fi
 export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
-# shellcheck disable=SC1090
+# shellcheck disable=SC1091
 source "${WORKSPACE_DIR}"/miniconda3/bin/activate
 retry conda install -y mkl mkl-include numpy=1.18.5 pyyaml=5.3 setuptools=46.0.0 cmake cffi ninja typing_extensions dataclasses pip
 # The torch.hub tests make requests to GitHub.
@@ -51,7 +51,11 @@ test_python_all() {
   export GLOO_SOCKET_IFNAME=lo0
   echo "Ninja version: $(ninja --version)"

-  if [ -n "$CIRCLE_PULL_REQUEST" ]; then
+  # Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
+  # CIRCLE_PULL_REQUEST comes from CircleCI
+  # GITHUB_HEAD_REF comes from Github Actions
+  IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
+  if [ -n "$IN_PULL_REQUEST" ]; then
     DETERMINE_FROM=$(mktemp)
     file_diff_from_base "$DETERMINE_FROM"
   fi
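`${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}` nests two default expansions: take `CIRCLE_PULL_REQUEST` when it is set and non-empty, otherwise `GITHUB_HEAD_REF`, otherwise the empty string; the inner `:-` keeps the expression safe even under `set -u`. A standalone sketch:

```bash
set -u
unset CIRCLE_PULL_REQUEST GITHUB_HEAD_REF
GITHUB_HEAD_REF="my-feature-branch"          # as set by GitHub Actions on PRs
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
echo "${IN_PULL_REQUEST:-<empty>}"           # -> my-feature-branch
```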
@@ -115,7 +115,11 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX2-* ]]; then
   export ATEN_CPU_CAPABILITY=avx
 fi

-if [ -n "$CIRCLE_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
+# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
+# CIRCLE_PULL_REQUEST comes from CircleCI
+# GITHUB_HEAD_REF comes from Github Actions
+IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
+if [ -n "$IN_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
   DETERMINE_FROM=$(mktemp)
   file_diff_from_base "$DETERMINE_FROM"
 fi
@@ -42,12 +42,16 @@ fi

 export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers

-if [ -n "$CIRCLE_PULL_REQUEST" ]; then
+# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
+# CIRCLE_PULL_REQUEST comes from CircleCI
+# GITHUB_HEAD_REF comes from Github Actions
+IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
+if [ -n "$IN_PULL_REQUEST" ]; then
   DETERMINE_FROM="${TMP_DIR}/determine_from"
   file_diff_from_base "$DETERMINE_FROM"
 fi

-if [[ "${CIRCLE_JOB}" == *11* ]]; then
+if [[ "${BUILD_ENVIRONMENT}" == *cuda11* ]]; then
   export BUILD_SPLIT_CUDA=ON
 fi
@@ -10,6 +10,7 @@
 - [Unit testing](#unit-testing)
   - [Python Unit Testing](#python-unit-testing)
   - [Better local unit tests with `pytest`](#better-local-unit-tests-with-pytest)
+  - [Local linting](#local-linting)
   - [Running `mypy`](#running-mypy)
 - [C++ Unit Testing](#c-unit-testing)
 - [Writing documentation](#writing-documentation)
@@ -357,13 +358,44 @@ The above is an example of testing a change to all Loss functions: this
 command runs tests such as `TestNN.test_BCELoss` and
 `TestNN.test_MSELoss` and can be useful to save keystrokes.

+### Local linting
+
+You can run the same linting steps that are used in CI locally via `make`:
+
+```bash
+make lint -j 6 # run lint (using 6 parallel jobs)
+```
+
+These jobs may require extra dependencies that aren't dependencies of PyTorch
+itself, so you can install them via this command, which you should only have to
+run once:
+
+```bash
+make setup_lint
+```
+
+To run a specific linting step, use one of these targets or see the
+[`Makefile`](Makefile) for a complete list of options.
+
+```bash
+# Check for tabs, trailing newlines, etc.
+make quick_checks
+
+make flake8
+
+make mypy
+
+make cmakelint
+```
+
 ### Running `mypy`

 `mypy` is an optional static type checker for Python. We have multiple `mypy`
 configs for the PyTorch codebase, so you can run them all using this command:

 ```bash
-for CONFIG in mypy*.ini; do mypy --config="$CONFIG"; done
+make mypy
 ```

 See [Guide for adding type annotations to
@@ -1,6 +1,7 @@
 # PyTorch Glossary

-- [PyTorch Glossary](#pytorch-glossary)
+<!-- toc -->
+
 - [Operation and Kernel](#operation-and-kernel)
   - [ATen](#aten)
   - [Operation](#operation)
@@ -19,6 +20,8 @@
   - [Tracing](#tracing)
   - [Scripting](#scripting)

+<!-- tocstop -->
+
 # Operation and Kernel

 ## ATen
Makefile (55 lines changed)
@@ -14,8 +14,63 @@ ios:
 clean: # This will remove ALL build folders.
 	@rm -r build*/
+	@$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER)

 linecount:
 	@cloc --read-lang-def=caffe.cloc caffe2 || \
 		echo "Cloc is not available on the machine. You can install cloc with " && \
 		echo " sudo apt-get install cloc"
+
+SHELLCHECK_GHA_GENERATED_FOLDER=.shellcheck_generated_gha
+shellcheck-gha:
+	@$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER)
+	tools/extract_scripts.py --out=$(SHELLCHECK_GHA_GENERATED_FOLDER)
+	tools/run_shellcheck.sh $(SHELLCHECK_GHA_GENERATED_FOLDER)
+
+generate-gha-workflows:
+	./.github/scripts/generate_linux_ci_workflows.py
+	$(MAKE) shellcheck-gha
+
+setup_lint:
+	python tools/actions_local_runner.py --file .github/workflows/lint.yml \
+		--job 'flake8-py3' --step 'Install dependencies'
+	python tools/actions_local_runner.py --file .github/workflows/lint.yml \
+		--job 'cmakelint' --step 'Install dependencies'
+	pip install jinja2
+
+quick_checks:
+	# TODO: This is broken when 'git config submodule.recurse' is 'true'
+	@python tools/actions_local_runner.py \
+		--file .github/workflows/lint.yml \
+		--job 'quick-checks' \
+		--step 'Ensure no trailing spaces' \
+		--step 'Ensure no tabs' \
+		--step 'Ensure no non-breaking spaces' \
+		--step 'Ensure canonical include' \
+		--step 'Ensure no unqualified noqa' \
+		--step 'Ensure no direct cub include' \
+		--step 'Ensure correct trailing newlines'
+
+flake8:
+	@python tools/actions_local_runner.py \
+		--file .github/workflows/lint.yml \
+		--job 'flake8-py3' \
+		--step 'Run flake8'
+
+mypy:
+	@python tools/actions_local_runner.py \
+		--file .github/workflows/lint.yml \
+		--job 'mypy' \
+		--step 'Run mypy'
+
+cmakelint:
+	@python tools/actions_local_runner.py \
+		--file .github/workflows/lint.yml \
+		--job 'cmakelint' \
+		--step 'Run cmakelint'
+
+clang_tidy:
+	echo "clang-tidy local lint is not yet implemented"
+	exit 1
+
+lint: flake8 mypy quick_checks cmakelint generate-gha-workflows
@@ -201,8 +201,7 @@ After that, you can use libtorch C++ API from your native code.
 namespace pytorch_testapp_jni {
 namespace {
   struct JITCallGuard {
-    torch::autograd::AutoGradMode no_autograd_guard{false};
-    torch::AutoNonVariableTypeMode non_var_guard{true};
+    c10::InferenceMode guard;
     torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard{false};
   };
 }
@@ -26,14 +26,8 @@ namespace pytorch_jni {
 namespace {

 struct JITCallGuard {
-  // AutoGrad is disabled for mobile by default.
-  torch::autograd::AutoGradMode no_autograd_guard{false};
-  // VariableType dispatch is not included in default mobile build. We need set
-  // this guard globally to avoid dispatch error (only for dynamic dispatch).
-  // Thanks to the unification of Variable class and Tensor class it's no longer
-  // required to toggle the NonVariableTypeMode per op - so it doesn't hurt to
-  // always set NonVariableTypeMode for inference only use case.
-  torch::AutoNonVariableTypeMode non_var_guard{true};
+  // Inference only workload.
+  c10::InferenceMode guard;
   // Disable graph optimizer to ensure list of unused ops are not changed for
   // custom mobile build.
   torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard{false};
@@ -17,14 +17,8 @@ namespace pytorch_jni {
 namespace {

 struct LiteJITCallGuard {
-  // VariableType dispatch is not included in default mobile build. We need set
-  // this guard globally to avoid dispatch error (only for dynamic dispatch).
-  // Thanks to the unification of Variable class and Tensor class it's no longer
-  // required to toggle the NonVariableTypeMode per op - so it doesn't hurt to
-  // always set NonVariableTypeMode for inference only use case.
-  // TODO: avoid having to set this guard for custom mobile build with mobile
-  // interpreter.
-  torch::AutoNonVariableTypeMode non_var_guard{true};
+  // Inference only workload.
+  c10::InferenceMode guard;
 };

 } // namespace
@@ -24,8 +24,7 @@ void log(const char* m, T t) {
 }

 struct JITCallGuard {
-  torch::autograd::AutoGradMode no_autograd_guard{false};
-  torch::AutoNonVariableTypeMode non_var_guard{true};
+  c10::InferenceMode guard;
   torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard{false};
 };
 } // namespace
@@ -40,6 +40,10 @@ TORCH_LIBRARY_IMPL(_, AutogradCPU, m) {
   m.fallback(torch::CppFunction::makeFallthrough());
 }

+TORCH_LIBRARY_IMPL(_, AutogradXPU, m) {
+  m.fallback(torch::CppFunction::makeFallthrough());
+}
+
 TORCH_LIBRARY_IMPL(_, AutogradCUDA, m) {
   m.fallback(torch::CppFunction::makeFallthrough());
 }
@@ -202,6 +202,17 @@ public:
     }
     return mask;
   }
+  Vec256<T> isnan() const {
+    Vec256<T> vec;
+    for (int64_t i = 0; i != size(); i++) {
+      if (_isnan(values[i])) {
+        std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T));
+      } else {
+        std::memset(static_cast<void*>(vec.values + i), 0, sizeof(T));
+      }
+    }
+    return vec;
+  }
   Vec256<T> map(T (*f)(T)) const {
     Vec256<T> ret;
     for (int64_t i = 0; i != size(); i++) {
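The generic `isnan()` encodes its result in the Vec256 mask convention: a lane reads as "true" when all of its bits are set and "false" when they are all clear, which lets callers combine the mask with bitwise blend operations. A scalar sketch of the same convention, outside the Vec256 machinery:

```cpp
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>

// Per-lane mask as Vec256<T>::isnan() builds it: all bits set for NaN,
// all bits clear otherwise, stored back into a float slot.
float nan_mask(float v) {
    std::uint32_t bits = std::isnan(v) ? 0xFFFFFFFFu : 0u;
    float m;
    std::memcpy(&m, &bits, sizeof(m));
    return m;
}

int main() {
    float mask = nan_mask(std::nanf(""));
    std::uint32_t bits;
    std::memcpy(&bits, &mask, sizeof(bits));
    std::cout << std::hex << bits << '\n';  // prints ffffffff
}
```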
@@ -96,6 +96,9 @@ public:
     __m256d cmp = _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_EQ_OQ);
     return _mm256_movemask_pd(cmp);
   }
+  Vec256<double> isnan() const {
+    return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q);
+  }
   Vec256<double> map(double (*f)(double)) const {
     __at_align32__ double tmp[size()];
     store(tmp);
@@ -103,6 +103,9 @@ public:
     __m256 cmp = _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ);
     return _mm256_movemask_ps(cmp);
   }
+  Vec256<float> isnan() const {
+    return _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_UNORD_Q);
+  }
   Vec256<float> map(float (*f)(float)) const {
     __at_align32__ float tmp[size()];
     store(tmp);
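On AVX the same mask falls out of a single compare: a `_CMP_UNORD_Q` comparison is true for a lane whenever either operand is NaN, so comparing against zero flags exactly the NaN lanes with all-ones. A minimal sketch (build with `-mavx`; the lane values are arbitrary):

```cpp
#include <immintrin.h>
#include <cmath>
#include <cstdio>

int main() {
    __m256 v = _mm256_setr_ps(1.0f, std::nanf(""), 3.0f, 4.0f,
                              std::nanf(""), 6.0f, 7.0f, 8.0f);
    // Unordered compare: a lane is all-ones iff that lane of v is NaN.
    __m256 m = _mm256_cmp_ps(v, _mm256_set1_ps(0.0f), _CMP_UNORD_Q);
    // movemask packs each lane's sign bit; NaN lanes 1 and 4 give 0x12.
    std::printf("%#x\n", _mm256_movemask_ps(m));
}
```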
@@ -283,6 +283,19 @@ public:
     }
     return mask;
   }
+  Vec256<float> isnan() const {
+    __at_align32__ float tmp[size()];
+    __at_align32__ float res[size()];
+    store(tmp);
+    for (int i = 0; i < size(); i++) {
+      if (_isnan(tmp[i])) {
+        std::memset(static_cast<void*>(&res[i]), 0xFF, sizeof(float));
+      } else {
+        std::memset(static_cast<void*>(&res[i]), 0, sizeof(float));
+      }
+    }
+    return loadu(res);
+  };
   Vec256<float> map(float (*f)(float)) const {
     __at_align32__ float tmp[size()];
     store(tmp);
@ -5,6 +5,55 @@
|
||||
|
||||
|
||||
namespace at {
|
||||
namespace meta {
|
||||
TORCH_META_FUNC(adaptive_max_pool2d) (const Tensor& input, IntArrayRef output_size) {
|
||||
for (int64_t i = 0; i < input.ndimension(); i++) {
|
||||
TORCH_CHECK(
|
||||
input.size(i) > 0,
|
||||
"adaptive_max_pool2d: expected input to have non-empty spatial dimensions, "
|
||||
"but input has sizes ",
|
||||
input.sizes(),
|
||||
" with dimension ",
|
||||
i,
|
||||
" being "
|
||||
"empty");
|
||||
}
|
||||
|
||||
TORCH_CHECK(
|
||||
(input.ndimension() == 3 || input.ndimension() == 4),
|
||||
"non-empty 3D or 4D (batch mode) tensor expected for input");
|
||||
|
||||
TORCH_CHECK(
|
||||
output_size.size() == 2,
|
||||
"adaptive_max_pool2d: internal error: output_size.size() must be 2");
|
||||
|
||||
int dimH = 1;
|
||||
int64_t sizeB = 1;
|
||||
int64_t sizeD = 0;
|
||||
|
||||
if (input.ndimension() == 4) {
|
||||
sizeB = input.size(0);
|
||||
dimH++;
|
||||
}
|
||||
|
||||
sizeD = input.size(dimH - 1);
|
||||
|
||||
int64_t osizeH = output_size[0];
|
||||
int64_t osizeW = output_size[1];
|
||||
|
||||
/* resize output */
|
||||
if (input.ndimension() == 3) {
|
||||
set_output(0, {sizeD, osizeH, osizeW}, input.options());
|
||||
/* indices will contain i,j locations for each output point */
|
||||
set_output(1, {sizeD, osizeH, osizeW}, input.options().dtype(kLong));
|
||||
} else {
|
||||
set_output(0, {sizeB, sizeD, osizeH, osizeW}, input.options());
|
||||
/* indices will contain i,j locations for each output point */
|
||||
set_output(1, {sizeB, sizeD, osizeH, osizeW}, input.options().dtype(kLong));
|
||||
}
|
||||
}
|
||||
} // namespace meta
|
||||
|
||||
namespace native {
|
||||
|
||||
namespace {
|
||||
@ -115,102 +164,6 @@ static void adaptive_max_pool2d_out_frame(
  });
}

void adaptive_max_pool2d_out_cpu_template(
          Tensor& output,
          Tensor& indices,
          const Tensor& input,
          IntArrayRef output_size)
{
  int dimW = 2;
  int dimH = 1;
  int64_t sizeB = 1;
  int64_t sizeD = 0;
  int64_t isizeH = 0;
  int64_t isizeW = 0;

  int64_t istrideD = 0;
  int64_t istrideH = 0;
  int64_t istrideW = 0;
  int64_t istrideB = 0;

  for (int64_t i = 0; i < input.ndimension(); i++) {
    TORCH_CHECK(input.size(i) > 0,
      "adaptive_max_pool2d: expected input to have non-empty spatial dimensions, "
      "but input has sizes ", input.sizes(), " with dimension ", i, " being "
      "empty");
  }

  TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
    "non-empty 3D or 4D (batch mode) tensor expected for input");

  TORCH_CHECK(output_size.size() == 2,
    "adaptive_max_pool2d: internal error: output_size.size() must be 2");

  if (input.ndimension() == 4)
  {
    istrideB = input.stride(0);
    sizeB = input.size(0);
    dimW++;
    dimH++;
  }

  /* sizes */
  sizeD = input.size(dimH-1);
  isizeH = input.size(dimH);
  isizeW = input.size(dimW);
  /* strides */
  istrideD = input.stride(dimH-1);
  istrideH = input.stride(dimH);
  istrideW = input.stride(dimW);

  int64_t osizeH = output_size[0];
  int64_t osizeW = output_size[1];

  /* resize output */
  if (input.ndimension() == 3)
  {
    output.resize_({sizeD, osizeH, osizeW});
    /* indices will contain i,j locations for each output point */
    indices.resize_({sizeD, osizeH, osizeW});

    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "adaptive_max_pool2d_cpu", [&] {
      auto input_data = input.data_ptr<scalar_t>();
      auto output_data = output.data_ptr<scalar_t>();
      auto indices_data = indices.data_ptr<int64_t>();

      adaptive_max_pool2d_single_out_frame<scalar_t>(input_data, output_data,
                                                     indices_data,
                                                     sizeD,
                                                     isizeH, isizeW,
                                                     osizeH, osizeW,
                                                     istrideD,
                                                     istrideH, istrideW);
      }
    );
  }
  else
  {
    output.resize_({sizeB, sizeD, osizeH, osizeW});
    /* indices will contain i,j locations for each output point */
    indices.resize_({sizeB, sizeD, osizeH, osizeW});

    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "adaptive_max_pool2d_cpu", [&] {
      auto input_data = input.data_ptr<scalar_t>();
      auto output_data = output.data_ptr<scalar_t>();
      auto indices_data = indices.data_ptr<int64_t>();

      adaptive_max_pool2d_out_frame<scalar_t>(input_data, output_data,
                                              indices_data,
                                              sizeB, sizeD,
                                              isizeH, isizeW,
                                              osizeH, osizeW,
                                              istrideB, istrideD,
                                              istrideH, istrideW);
      }
    );
  }
}

template <typename scalar_t>
static void adaptive_max_pool2d_backward_single_out_frame(
          scalar_t *gradInput_p,
@ -346,31 +299,83 @@ Tensor& adaptive_max_pool2d_backward_out_cpu_template(
} // namespace

std::tuple<Tensor&, Tensor&> adaptive_max_pool2d_out_cpu(const Tensor& input,
  IntArrayRef output_size,
  Tensor& output,
  Tensor& indices)
{
  adaptive_max_pool2d_out_cpu_template(
    output,
    indices,
    input,
    output_size);
  return std::tuple<Tensor&, Tensor&>(output, indices);
}
TORCH_IMPL_FUNC(adaptive_max_pool2d_out_cpu)
(const Tensor& input, IntArrayRef output_size, const Tensor& output, const Tensor& indices) {
  int dimW = 2;
  int dimH = 1;
  int64_t sizeB = 1;
  int64_t sizeD = 0;
  int64_t isizeH = 0;
  int64_t isizeW = 0;

std::tuple<Tensor, Tensor> adaptive_max_pool2d_cpu(
  const Tensor& input,
  IntArrayRef output_size)
{
  Tensor output = at::empty({0}, input.options());
  Tensor indices = at::empty({0}, input.options().dtype(kLong));
  adaptive_max_pool2d_out_cpu_template(
    output,
    indices,
    input,
    output_size);
  return std::tuple<Tensor, Tensor>(output, indices);
  int64_t istrideD = 0;
  int64_t istrideH = 0;
  int64_t istrideW = 0;
  int64_t istrideB = 0;

  if (input.ndimension() == 4) {
    istrideB = input.stride(0);
    sizeB = input.size(0);
    dimW++;
    dimH++;
  }

  /* sizes */
  sizeD = input.size(dimH - 1);
  isizeH = input.size(dimH);
  isizeW = input.size(dimW);
  /* strides */
  istrideD = input.stride(dimH - 1);
  istrideH = input.stride(dimH);
  istrideW = input.stride(dimW);

  int64_t osizeH = output_size[0];
  int64_t osizeW = output_size[1];

  /* resize output */
  if (input.ndimension() == 3) {
    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(), "adaptive_max_pool2d_cpu", [&] {
          auto input_data = input.data_ptr<scalar_t>();
          auto output_data = output.data_ptr<scalar_t>();
          auto indices_data = indices.data_ptr<int64_t>();

          adaptive_max_pool2d_single_out_frame<scalar_t>(
              input_data,
              output_data,
              indices_data,
              sizeD,
              isizeH,
              isizeW,
              osizeH,
              osizeW,
              istrideD,
              istrideH,
              istrideW);
        });
  } else {
    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(), "adaptive_max_pool2d_cpu", [&] {
          auto input_data = input.data_ptr<scalar_t>();
          auto output_data = output.data_ptr<scalar_t>();
          auto indices_data = indices.data_ptr<int64_t>();

          adaptive_max_pool2d_out_frame<scalar_t>(
              input_data,
              output_data,
              indices_data,
              sizeB,
              sizeD,
              isizeH,
              isizeW,
              osizeH,
              osizeW,
              istrideB,
              istrideD,
              istrideH,
              istrideW);
        });
  }
}

Tensor& adaptive_max_pool2d_backward_out_cpu(const Tensor& gradOutput_,
@ -5,6 +5,57 @@

namespace at {
namespace meta {
TORCH_META_FUNC(adaptive_max_pool3d) (const Tensor& input, IntArrayRef output_size) {
  for (int64_t i = 0; i < input.ndimension(); i++) {
    TORCH_CHECK(
        input.size(i) > 0,
        "adaptive_max_pool3d: expected input to have non-empty spatial dimensions, "
        "but input has sizes ",
        input.sizes(),
        " with dimension ",
        i,
        " being "
        "empty");
  }

  TORCH_CHECK(
      (input.ndimension() == 4 || input.ndimension() == 5),
      "non-empty 4D or 5D (batch mode) tensor expected for input");

  TORCH_CHECK(
      output_size.size() == 3,
      "adaptive_max_pool3d: internal error: output_size.size() must be 3");

  int dimD = 0;
  int64_t sizeB = 1;
  int64_t sizeD = 0;

  if (input.ndimension() == 5) {
    sizeB = input.size(0);
    dimD++;
  }

  /* sizes */
  sizeD = input.size(dimD);

  int64_t osizeT = output_size[0];
  int64_t osizeH = output_size[1];
  int64_t osizeW = output_size[2];

  /* resize output */
  if (input.ndimension() == 4) {
    set_output(0, {sizeD, osizeT, osizeH, osizeW}, input.options());
    /* indices will contain max input locations for each output point */
    set_output(1, {sizeD, osizeT, osizeH, osizeW}, input.options().dtype(kLong));
  } else {
    set_output(0, {sizeB, sizeD, osizeT, osizeH, osizeW}, input.options());
    /* indices will contain max input locations for each output point */
    set_output(1, {sizeB, sizeD, osizeT, osizeH, osizeW}, input.options().dtype(kLong));
  }
}
} // namespace meta

namespace native {

namespace {
@ -393,31 +444,97 @@ Tensor& adaptive_max_pool3d_backward_out_cpu_template(
} // namespace

std::tuple<Tensor&, Tensor&> adaptive_max_pool3d_out_cpu(const Tensor& input,
  IntArrayRef output_size,
  Tensor& output,
  Tensor& indices)
{
  adaptive_max_pool3d_out_cpu_template(
    output,
    indices,
    input,
    output_size);
  return std::tuple<Tensor&, Tensor&>(output, indices);
}
TORCH_IMPL_FUNC(adaptive_max_pool3d_out_cpu)
(const Tensor& input, IntArrayRef output_size, const Tensor& output, const Tensor& indices) {
  int dimD = 0;
  int dimT = 1;
  int dimH = 2;
  int dimW = 3;
  int64_t sizeB = 1;
  int64_t sizeD = 0;
  int64_t isizeT = 0;
  int64_t isizeH = 0;
  int64_t isizeW = 0;

std::tuple<Tensor, Tensor> adaptive_max_pool3d_cpu(
  const Tensor& input,
  IntArrayRef output_size)
{
  Tensor output = at::empty({0}, input.options());
  Tensor indices = at::empty({0}, input.options().dtype(kLong));
  adaptive_max_pool3d_out_cpu_template(
    output,
    indices,
    input,
    output_size);
  return std::tuple<Tensor, Tensor>(output, indices);
  int64_t istrideB = 0;
  int64_t istrideD = 0;
  int64_t istrideT = 0;
  int64_t istrideH = 0;
  int64_t istrideW = 0;

  if (input.ndimension() == 5) {
    istrideB = input.stride(0);
    sizeB = input.size(0);
    dimD++;
    dimT++;
    dimH++;
    dimW++;
  }

  /* sizes */
  sizeD = input.size(dimD);
  isizeT = input.size(dimT);
  isizeH = input.size(dimH);
  isizeW = input.size(dimW);
  /* strides */
  istrideD = input.stride(dimD);
  istrideT = input.stride(dimT);
  istrideH = input.stride(dimH);
  istrideW = input.stride(dimW);

  int64_t osizeT = output_size[0];
  int64_t osizeH = output_size[1];
  int64_t osizeW = output_size[2];

  if (input.ndimension() == 4) {
    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(), "adaptive_max_pool3d_cpu", [&] {
          auto input_data = input.data_ptr<scalar_t>();
          auto output_data = output.data_ptr<scalar_t>();
          auto indices_data = indices.data_ptr<int64_t>();

          adaptive_max_pool3d_single_out_frame<scalar_t>(
              input_data,
              output_data,
              indices_data,
              sizeD,
              isizeT,
              isizeH,
              isizeW,
              osizeT,
              osizeH,
              osizeW,
              istrideD,
              istrideT,
              istrideH,
              istrideW);
        });
  } else {
    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(), "adaptive_max_pool3d_cpu", [&] {
          auto input_data = input.data_ptr<scalar_t>();
          auto output_data = output.data_ptr<scalar_t>();
          auto indices_data = indices.data_ptr<int64_t>();

          adaptive_max_pool3d_out_frame<scalar_t>(
              input_data,
              output_data,
              indices_data,
              sizeB,
              sizeD,
              isizeT,
              isizeH,
              isizeW,
              osizeT,
              osizeH,
              osizeW,
              istrideB,
              istrideD,
              istrideT,
              istrideH,
              istrideW);
        });
  }
}

Tensor& adaptive_max_pool3d_backward_out_cpu(const Tensor& gradOutput_,
@ -918,8 +918,7 @@ static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor
  // - 2-dimensional (2D) tensor or batch of 2D tensors (matrix case)
  // original torch.solve supported only the matrix case, while NumPy works for both cases
  // for the batched input we need to be able to distinguish them
  auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim()-1); // input.shape[:-1]
  bool vector_case = other.dim() == 1 || (input.dim()-1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
  bool vector_case = linalg_solve_is_vector_rhs(input, other);

  bool is_batched_column_major = false;
  if (vector_case) {
@ -929,7 +928,7 @@ static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor
  }

  // if 'other' is a batch of 2D tensors, then 'input' can be non-batched and will be broadcasted
  auto expected_shape = expected_batched_rhs_shape;
  auto expected_shape = IntArrayRef(input.sizes().data(), input.dim() - 1); // input.shape[:-1]
  if (!vector_case && other.dim() > 2) {
    expected_shape = other.sizes();
  }
@ -1020,8 +1019,7 @@ Tensor& linalg_solve_out(const Tensor& input, const Tensor& other, Tensor& resul

  // Now check LAPACK/MAGMA error codes
  // batchCheckErrors(Tensor, char*) calls 'infos = infos.to(kCPU)'
  auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim()-1); // input.shape[:-1]
  bool vector_case = other.dim() == 1 || (input.dim()-1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
  bool vector_case = linalg_solve_is_vector_rhs(input, other);
  if (vector_case ? result.dim() > 1 : result.dim() > 2) {
    batchCheckErrors(infos, "linalg_solve");
  } else {
@ -1606,9 +1604,8 @@ std::tuple<Tensor&, Tensor&> triangular_solve_out(const Tensor& self, const Tens

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

template<typename scalar_t>
static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
                        std::vector<int64_t>& infos) {
template <typename scalar_t>
static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n) {
#ifndef USE_LAPACK
  AT_ERROR("qr: LAPACK library not found in compilation");
#else
@ -1627,6 +1624,7 @@ static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
  int lwork = -1;
  scalar_t wkopt;
  lapackGeqrf<scalar_t>(m, n, self_data, m, tau_data, &wkopt, lwork, &info);
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0);
  lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
  Tensor work = at::empty({lwork}, self.options());

@ -1636,10 +1634,10 @@ static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,

    // now compute the actual R and TAU
    lapackGeqrf<scalar_t>(m, n, self_working_ptr, m, tau_working_ptr, work.data_ptr<scalar_t>(), lwork, &info);
    infos[i] = info;
    if (info != 0) {
      return;
    }

    // info from lapackGeqrf only reports if the i-th parameter is wrong
    // so we don't need to check it all the time
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0);
  }
#endif
}
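// Editorial note (added for clarity): the lwork = -1 call above is LAPACK's
// standard workspace query. With lwork == -1, geqrf performs no factorization
// and instead writes the optimal workspace size into wkopt; the actual
// factorization then runs with a buffer of exactly that size.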
@ -1647,7 +1645,6 @@ static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
std::tuple<Tensor, Tensor> _linalg_qr_helper_cpu(const Tensor& self, std::string mode) {
  bool compute_q, reduced;
  std::tie(compute_q, reduced) = _parse_qr_mode(mode);
  std::vector<int64_t> infos(batchCount(self), 0);
  int64_t m = self.size(-2), n = self.size(-1);

  // Setup inputs for apply_geqrf
@ -1682,13 +1679,8 @@ std::tuple<Tensor, Tensor> _linalg_qr_helper_cpu(const Tensor& self, std::string
  q_working_copy.narrow(-1, 0, n).copy_(self);

  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "qr_cpu", [&]{
    apply_geqrf<scalar_t>(q_working_copy, tau_working_copy, m, n, infos);
    apply_geqrf<scalar_t>(q_working_copy, tau_working_copy, m, n);
  });
  if (self.dim() > 2) {
    batchCheckErrors(infos, "qr_cpu");
  } else {
    singleCheckErrors(infos[0], "qr_cpu");
  }

  R = q_working_copy.slice(-2, 0, n_columns_q).slice(-1, 0, n).triu();
  if (!compute_q) {
@ -2977,204 +2969,378 @@ Tensor& _lstsq_helper_cpu(
#endif
}

std::tuple<Tensor, Tensor, Tensor, Tensor> linalg_lstsq(
    const Tensor& self, const Tensor& b,
    c10::optional<double> cond,
    c10::optional<std::string> driver) {
  TORCH_CHECK(
    self.device().type() == b.device().type(),
    "torch.linalg.lstsq: input tensors should be on the same device"
  );
  TORCH_CHECK(
    self.scalar_type() == b.scalar_type(),
    "torch.linalg.lstsq: input tensors should be of the same dtype"
  );
  TORCH_CHECK(
    self.dim() >= 2,
    "torch.linalg.lstsq: input `self` Tensor should be at least 2D"
  );
  TORCH_CHECK(
    b.dim() >= 1,
    "torch.linalg.lstsq: input `b` Tensor should be at least 1D"
  );
  auto dim_diff = self.dim() - b.dim();
  TORCH_CHECK(
    0 <= dim_diff && dim_diff <= 1,
    "torch.linalg.lstsq: self.dim() must be greater or equal to b.dim() and "
    "(self.dim() - b.dim()) <= 1"
  );
  Tensor b_2d = dim_diff ? b.unsqueeze(-1) : b;
  TORCH_CHECK(
    self.size(-2) == b_2d.size(-2),
    dim_diff ? "torch.linalg.lstsq: self.size(-2) should match b.size(-1)" :
      "torch.linalg.lstsq: self.size(-2) should match b.size(-2)"
  );
/*
  Solves a least squares problem. That is, minimizing the squared Frobenius norm of |B - A X|.

  // if `driver` is empty, we use `driver_opt` to be set to
  // c10::nullopt if working with CUDA tensors,
  Input args:
  * 'input' - Tensor containing batches of m-by-n matrix A.
  * 'other' - Tensor containing batches of max(m, n)-by-nrhs matrix B.
  * 'cond' - relative tolerance for determining rank of A.
  * 'driver' - the name of the LAPACK driver that is used to compute the solution.
  Output args (modified in-place):
  * 'solution' - Tensor to store the solution matrix X.
  * 'residuals' - Tensor to store values of the residual sum of squares for each column of the solution.
  * 'rank' - Tensor to store the rank of A.
  * 'singular_values' - Tensor to store the singular values of A.
  * 'infos' - Tensor to store error codes of linear algebra math library.

  For further details, please see the LAPACK documentation for GELS/GELSY/GELSS/GELSD routines.
*/
static void linalg_lstsq_out_info(
    Tensor& solution,
    Tensor& residuals,
    Tensor& rank,
    Tensor& singular_values,
    Tensor& infos,
    const Tensor& input,
    const Tensor& other,
    double rcond,
    std::string& driver) {
  // These internal asserts make explicit the assumptions in the implementation
  // Error checks with the actual error messages are done on the higher level of
  // the hierarchy of calls
  TORCH_INTERNAL_ASSERT(input.dim() >= 2);
  TORCH_INTERNAL_ASSERT(other.dim() >= 1);

  auto dim_diff = input.dim() - other.dim();
  TORCH_INTERNAL_ASSERT(0 <= dim_diff && dim_diff <= 1);

  TORCH_INTERNAL_ASSERT(input.scalar_type() == other.scalar_type());
  TORCH_INTERNAL_ASSERT(input.device() == other.device());

  TORCH_INTERNAL_ASSERT(solution.scalar_type() == input.scalar_type());
  TORCH_INTERNAL_ASSERT(solution.device() == input.device());

  TORCH_INTERNAL_ASSERT(residuals.device() == input.device());

  TORCH_INTERNAL_ASSERT(rank.scalar_type() == at::kLong);
  TORCH_INTERNAL_ASSERT(rank.device() == input.device());

  auto real_dtype = toValueType(input.scalar_type());
  TORCH_INTERNAL_ASSERT(singular_values.scalar_type() == real_dtype);
  TORCH_INTERNAL_ASSERT(singular_values.device() == input.device());

  TORCH_INTERNAL_ASSERT(infos.scalar_type() == at::kInt);
  TORCH_INTERNAL_ASSERT(infos.device() == input.device());
  TORCH_INTERNAL_ASSERT(infos.numel() == std::max<int64_t>(1, batchCount(input)));
  TORCH_INTERNAL_ASSERT(infos.is_contiguous());

  bool vector_case = linalg_solve_is_vector_rhs(input, other);
  // we need to unsqueeze 'other' because 2-dimensional tensors are expected in the implementation
  Tensor other_2d = vector_case ? other.unsqueeze(-1) : other;

  TORCH_INTERNAL_ASSERT(input.size(-2) == other_2d.size(-2));

  std::vector<int64_t> expected_solution_shape = broadcast_batch_size(input, other_2d, input.dim() - 2);
  // the actual shape of the solution returned is (*, n,) or (*, n, nrhs)
  // but LAPACK requires extra dimensions to store raw residuals
  // so the expected shape is (*, max(m, n),) or (*, max(m, n), nrhs)
  auto m = input.size(-2);
  auto n = input.size(-1);
  auto nrhs = other.size(-1);
  expected_solution_shape.push_back(std::max(m, n));
  if (!vector_case) {
    expected_solution_shape.push_back(nrhs);
  }

  // if 'solution' has no elements we can modify it
  if (solution.numel() == 0) {
    if (vector_case) {
      solution.resize_(expected_solution_shape, MemoryFormat::Contiguous);
    } else {
      auto shape_transposed = expected_solution_shape;
      std::swap(shape_transposed.end()[-1], shape_transposed.end()[-2]);
      solution.resize_(shape_transposed, MemoryFormat::Contiguous);
      solution.transpose_(-2, -1);
    }
  }

  // if 'solution' is non-empty it must have the expected shape
  TORCH_INTERNAL_ASSERT(solution.sizes().equals(expected_solution_shape));

  // 'solution' must be in batched column major order (Fortran contiguous) for 2D inputs
  // or C contiguous for 1D input
  if (vector_case) {
    TORCH_INTERNAL_ASSERT(solution.is_contiguous());
  } else {
    TORCH_INTERNAL_ASSERT(solution.transpose(-2, -1).is_contiguous());
  }

  // for 1-dimensional 'other', we need to unsqueeze the 'solution' before passing to "apply_solve"
  if (vector_case) {
    solution = solution.unsqueeze_(-1);
  }

  // _lstsq_helper_ performs calculations in-place and 'solution' must be a copy of other_2d
  solution.narrow(-2, 0, other_2d.size(-2)).copy_(other_2d);

  // if 'rank' is empty we might resize it
  auto input_batch_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2);
  if (rank.numel() == 0 && driver != "gels") { // gels driver doesn't set 'rank'
    rank.resize_(input_batch_shape, MemoryFormat::Contiguous);
  }

  // if 'rank' is non-empty it must have the expected shape and be contiguous
  if (driver != "gels") {
    TORCH_INTERNAL_ASSERT(rank.sizes().equals(input_batch_shape));
    TORCH_INTERNAL_ASSERT(rank.is_contiguous());
  }

  // if 'singular_values' is empty we might resize it
  auto singular_values_shape = input_batch_shape.vec();
  singular_values_shape.push_back(std::min(m, n));
  if (singular_values.numel() == 0 && (driver == "gelsd" || driver == "gelss")) {
    singular_values.resize_(singular_values_shape, MemoryFormat::Contiguous);
  }

  // if 'singular_values' is non-empty it must have the expected shape and be contiguous
  if (driver == "gelsd" || driver == "gelss") {
    TORCH_INTERNAL_ASSERT(singular_values.sizes().equals(singular_values_shape));
    TORCH_INTERNAL_ASSERT(singular_values.is_contiguous());
  }

  // 'input' is modified in-place so we need a column-major copy
  auto input_working_copy = copyBatchedColumnMajor(input);

  // now the actual call that computes the result in-place (apply_lstsq)
  at::_lstsq_helper_(solution, rank, singular_values, infos, input_working_copy, rcond, driver);

  if (m > n && driver != "gelsy") {
    // LAPACK stores residuals data for postprocessing in rows n:m (the m - n trailing rows)
    auto raw_residuals = solution.narrow(/*dim=*/-2, /*start=*/n, /*length*/m - n);
    if (raw_residuals.is_complex()) {
      raw_residuals.mul_(raw_residuals.conj());
      raw_residuals = at::real(raw_residuals);
    } else {
      raw_residuals.pow_(2);
    }
    at::sum_out(residuals, raw_residuals, /*dim=*/-2, /*keepdim=*/false, /*dtype*/real_dtype);
  }
  solution = solution.narrow(/*dim=*/-2, /*start=*/0, /*length*/n);
  if (m == 0) {
    solution.zero_();
  }

  // for 1-dimensional 'other', we need to squeeze the solution after "apply_lstsq"
  if (vector_case) {
    solution = solution.squeeze_(-1);
  }
}

static std::string get_default_lstsq_driver(c10::optional<std::string> driver, const Tensor& input) {
  // if `driver` is empty, we set driver_str to "gels" if working with CUDA tensors,
  // otherwise to "gelsy" driver.
  // CUDA tensors are treated specially because MAGMA
  // has only 'gels' driver supported.
  c10::optional<std::string> driver_opt = driver;
  std::string driver_str;
  // check whether the user provided name is a valid driver name
  if (driver.has_value()) {
    auto driver_str = driver.value();
    driver_str = driver.value();
    // convert `driver_str` to lower case in place.
    std::transform(driver_str.begin(), driver_str.end(), driver_str.begin(),
      [](unsigned char c) { return std::tolower(c); });
    static std::unordered_set<std::string> allowed_drivers = {
      "gels", "gelsy", "gelsd", "gelss"
    };
    if (at::kCPU == self.device().type()) {
    if (input.device() == at::kCPU) {
      TORCH_CHECK(
        allowed_drivers.find(driver_str) != allowed_drivers.end(),
        "torch.linalg.lstsq: parameter `driver` should be one of "
        "(gels, gelsy, gelsd, gelss)"
      );
    }
    //else if (at::kCUDA == self.device().type()) {
    else {
    } else { // else if (input.is_cuda())
      TORCH_CHECK(
        driver_str == "gels",
        "torch.linalg.lstsq: `driver` other than `gels` is not supported on CUDA"
      );
    }
  } else {
    // if driver name is not provided, set to default 'gelsy' if on CPU,
    // or to `gels` if on CUDA.
    driver_str = input.is_cuda() ? "gels" : "gelsy";
  }
  // if driver name is not provided, set to default 'gelsy' if on CPU,
  // or to `gels` if on CUDA.
  else {
    driver_opt = (at::kCPU == self.device().type())
      ? c10::optional<std::string>("gelsy")
      : c10::optional<std::string>("gels");
  return driver_str;
}
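// Editorial examples (illustrative) of the selection logic above:
//   get_default_lstsq_driver(c10::nullopt, cpu_tensor)          -> "gelsy"
//   get_default_lstsq_driver(c10::nullopt, cuda_tensor)         -> "gels"
//   get_default_lstsq_driver(std::string("GELSD"), cpu_tensor)  -> "gelsd"
// (names are lower-cased; anything other than "gels" raises on CUDA)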

std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> linalg_lstsq_out(
    const Tensor& input,
    const Tensor& other,
    c10::optional<double> rcond,
    c10::optional<std::string> driver,
    Tensor& solution,
    Tensor& residuals,
    Tensor& rank,
    Tensor& singular_values) {
  TORCH_CHECK(input.dim() >= 2, "torch.linalg.lstsq: input must have at least 2 dimensions.");
  TORCH_CHECK(other.dim() >= 1, "torch.linalg.lstsq: other must have at least 1 dimension.");
  TORCH_CHECK(
      input.scalar_type() == other.scalar_type(),
      "torch.linalg.lstsq: Expected input and other to have the same dtype, but got input's dtype ",
      input.scalar_type(),
      " and other's dtype ",
      other.scalar_type());

  auto dim_diff = input.dim() - other.dim();
  TORCH_CHECK(
      0 <= dim_diff && dim_diff <= 1,
      "torch.linalg.lstsq: input.dim() must be greater or equal to other.dim() and (input.dim() - other.dim()) <= 1");
  Tensor other_2d = dim_diff ? other.unsqueeze(-1) : other;
  TORCH_CHECK(
      input.size(-2) == other_2d.size(-2),
      dim_diff ? "torch.linalg.lstsq: input.size(-2) should match other.size(-1)"
               : "torch.linalg.lstsq: input.size(-2) should match other.size(-2)");

  checkSameDevice("torch.linalg.lstsq", other, input, "other");
  checkSameDevice("torch.linalg.lstsq", solution, input, "solution");
  checkSameDevice("torch.linalg.lstsq", residuals, input, "residuals");
  checkSameDevice("torch.linalg.lstsq", rank, input, "rank");
  checkSameDevice("torch.linalg.lstsq", singular_values, input, "singular_values");

  // 'solution' is expected to have same dtype as input
  checkLinalgCompatibleDtype("torch.linalg.lstsq", solution, input, "solution");

  // 'residuals' is expected to have real float dtype
  ScalarType real_dtype = c10::toValueType(input.scalar_type());
  checkLinalgCompatibleDtype("torch.linalg.lstsq", residuals.scalar_type(), real_dtype, "solution");

  // 'rank' is expected to have integer dtype
  // actual LAPACK calls use int32_t type for rank, but we promote it to int64_t
  // to be consistent with torch.linalg.matrix_rank output dtype
  ScalarType rank_expected_type = ScalarType::Long;
  checkLinalgCompatibleDtype("torch.linalg.lstsq", rank.scalar_type(), rank_expected_type, "rank");

  // 'singular_values' is expected to have real float dtype
  checkLinalgCompatibleDtype("torch.linalg.lstsq", singular_values.scalar_type(), real_dtype, "singular_values");

  std::string driver_name = get_default_lstsq_driver(driver, input);

  // set default rcond value
  // TODO: Change this to match non-legacy NumPy behaviour
  double rcond_value = rcond.has_value() && (rcond.value() > 0)
      ? rcond.value()
      : _get_epsilon(c10::toValueType(input.scalar_type()));

  auto infos = at::zeros({std::max<int64_t>(1, batchCount(input))}, input.options().dtype(kInt));

  // now check whether the provided output tensors can be used directly

  // Two types of 'other' tensors are supported:
  // - 1-dimensional (1D) tensor or batch of 1D tensors (vector case)
  // - 2-dimensional (2D) tensor or batch of 2D tensors (matrix case)
  // original torch.lstsq supported only the matrix case, while NumPy works for both cases
  // for the batched input we need to be able to distinguish them
  // auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim() - 1); // input.shape[:-1]
  // bool vector_case = other.dim() == 1 || (input.dim() - 1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
  bool vector_case = linalg_solve_is_vector_rhs(input, other);

  // provided output tensor can be used directly if:
  // 1. the shape matches the expected shape
  // 2. the dtype matches the expected dtype
  // 3. the tensor is contiguous

  // Checks for the 'solution' tensor
  std::vector<int64_t> expected_solution_shape = broadcast_batch_size(input, other_2d, input.dim() - 2);
  // the actual shape of the solution returned is (*, n,) or (*, n, nrhs)
  // but LAPACK requires extra dimensions so the expected shape is (*, max(m, n),) or (*, max(m, n), nrhs)
  expected_solution_shape.push_back(std::max(input.size(-1), input.size(-2)));
  if (!vector_case && other.dim() > 2) {
    expected_solution_shape.push_back(other.size(-1));
  }

  // CUDA has only `gels` driver now which ONLY works with overdetermined systems
  if (at::kCUDA == self.device().type()) {
    TORCH_CHECK(
      self.size(-2) >= self.size(-1),
      "torch.linalg.lstsq: only overdetermined systems (m >= n) are allowed on CUDA"
    );
  bool solution_equal_expected_shape = solution.sizes().equals(expected_solution_shape);
  bool solution_input_same_type = (solution.scalar_type() == input.scalar_type());

  bool is_solution_batched_column_major = false;
  if (vector_case) {
    is_solution_batched_column_major = solution.is_contiguous();
  } else if (!vector_case && solution.dim() >= 2) {
    is_solution_batched_column_major = solution.transpose(-2, -1).is_contiguous();
  }

  // LAPACK/MAGMA requires inputs to be in the column-major-order.
  auto self_working_copy = copyBatchedColumnMajor(self);
  // 'residuals' is not checked here because at::sum_out(residuals, ...) does that

  // Tensor b must be of size (..., max(m, n), nrhs)
  // and in the column-major order.
  // We allow the batch dims of `self` to broadcast over the batch
  // dims of `b` so that it is possible to solve multiple systems
  // with the same lhs (encoded by `self`) / rhs (encoded by `b`).
  // `b_working_copy` is modified in-place and the combination of
  // batch broadcasting plus LAPACK/MAGMA requirements impose the following
  // restrictions on sizes/strides of `b`:
  // 1. b.size = (broadcasted_batch_size(self, b), max(m, n), nrhs).
  // 2. b.stride should correspond to an almost contiguous Tensor in the column-major-order,
  //    i.e. b.stride = b.transpose(-2, -1).contiguous().transpose(-2, -1).strides()
  auto m = self.size(-2);
  auto n = self.size(-1);
  auto b_working_copy = copyBatchedColumnMajor(b_2d,
                        /*nrows=*/std::max(m, n),
                        /*desired_batch_sizes=*/broadcast_batch_size(self, b_2d, self.dim() - 2));
  auto input_batch_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2);

  double rcond = cond.has_value() && (cond.value() > 0)
    ? cond.value()
    : _get_epsilon(c10::toValueType(self.scalar_type()));

  auto batch_shape = IntArrayRef(self.sizes().cbegin(), self.sizes().cend() - 2);
  Tensor rank = at::empty({0}, self.options().dtype(at::kLong));
  if (driver_opt.value() != "gels") {
    rank.resize_(batch_shape, MemoryFormat::Contiguous);
  // Checks for the 'rank' tensor
  // rank is a scalar value for each matrix in the batch so
  // rank's expected shape is equal to input.shape[0:input.ndim-2]
  bool rank_equal_expected_shape = true;
  bool rank_equal_expected_type = true;
  bool rank_is_contiguous = true;
  if (driver_name != "gels") { // gels driver doesn't set 'rank'
    rank_equal_expected_shape = rank.sizes().equals(input_batch_shape);
    rank_equal_expected_type = (rank.scalar_type() == at::kLong);
    rank_is_contiguous = rank.is_contiguous();
  }

  auto singular_values_shape = batch_shape.vec();
  singular_values_shape.push_back(std::min(m, n));
  auto real_dtype = c10::toValueType(self.scalar_type());
  Tensor singular_values = at::empty({0}, self.options().dtype(real_dtype));
  if (driver_opt.value() == "gelsd" || driver_opt.value() == "gelss") {
    singular_values.resize_(singular_values_shape, MemoryFormat::Contiguous);
  // Checks for the 'singular_values' tensor
  // singular values are computed only with "gelsd" and "gelss" drivers currently
  bool singular_values_equal_expected_shape = true;
  bool singular_values_equal_expected_type = true;
  bool singular_values_is_contiguous = true;
  if (driver_name == "gelsd" || driver_name == "gelss") {
    auto singular_values_shape = input_batch_shape.vec();
    singular_values_shape.push_back(std::min(input.size(-1), input.size(-2)));
    singular_values_equal_expected_shape = singular_values.sizes().equals(singular_values_shape);
    singular_values_equal_expected_type = (singular_values.scalar_type() == real_dtype);
    singular_values_is_contiguous = singular_values.is_contiguous();
  }

  Tensor infos = at::zeros({std::max<int64_t>(1, batchCount(self))}, self.options().dtype(kInt).device(kCPU));
  // if solution is not empty and not in batched column major format
  bool copy_needed = (solution.numel() != 0 && !is_solution_batched_column_major);
  copy_needed |= !solution_input_same_type; // or solution does not have the same dtype as input
  copy_needed |= (solution.numel() != 0 && !solution_equal_expected_shape); // or solution does not have the expected shape

  Tensor x, residuals;
  copy_needed |= !rank_equal_expected_type;
  copy_needed |= (rank.numel() != 0 && !rank_equal_expected_shape);
  copy_needed |= (rank.numel() != 0 && !rank_is_contiguous);

  // path if neither `self` nor `b` is empty
  if (self.numel() && b.numel()) {
    x = at::_lstsq_helper_(b_working_copy, rank, singular_values, infos, self_working_copy, rcond, driver_opt.value());
    if (m > n && driver_opt.value() != "gelsy") {
      residuals = x.narrow(-2, n, std::max(m, n) - n).abs().pow_(2).sum(-2);
    }
    x = x.narrow(-2, 0, n);
  }
  // if either `self` or `b` is empty, return an empty tensor or,
  // if non-zero sizes, return a tensor of zeros.
  else {
    x = b_working_copy.zero_().narrow(-2, 0, n);
  copy_needed |= !singular_values_equal_expected_type;
  copy_needed |= (singular_values.numel() != 0 && !singular_values_equal_expected_shape);
  copy_needed |= (singular_values.numel() != 0 && !singular_values_is_contiguous);

  if (copy_needed) { // we have to allocate temporary tensors
    Tensor solution_tmp = at::empty({0}, input.options());
    Tensor residuals_tmp = at::empty({0}, input.options().dtype(real_dtype));
    Tensor rank_tmp = at::empty({0}, input.options().dtype(at::kLong));
    Tensor singular_values_tmp = at::empty({0}, input.options().dtype(real_dtype));

    linalg_lstsq_out_info(solution_tmp, residuals_tmp, rank_tmp, singular_values_tmp, infos, input, other, rcond_value, driver_name);

    at::native::resize_output(solution, solution_tmp.sizes());
    solution.copy_(solution_tmp);

    at::native::resize_output(residuals, residuals_tmp.sizes());
    residuals.copy_(residuals_tmp);

    at::native::resize_output(rank, rank_tmp.sizes());
    rank.copy_(rank_tmp);

    at::native::resize_output(singular_values, singular_values_tmp.sizes());
    singular_values.copy_(singular_values_tmp);
  } else {
    // else use the provided output storage directly
    linalg_lstsq_out_info(solution, residuals, rank, singular_values, infos, input, other, rcond_value, driver_name);
  }

  auto return_empty_if_undefined = [&self](Tensor& t,
      c10::optional<at::ScalarType> dtype = c10::nullopt,
      c10::optional<std::vector<int64_t>> shape = c10::nullopt) {
    if (t.defined()) {
      return t;
    }
    else {
      auto output_dtype = dtype.has_value() ? dtype.value() : self.scalar_type();
      if (shape.has_value()) {
        return at::empty(shape.value(), self.options().dtype(output_dtype));
      }
      else {
        return at::empty({0}, self.options().dtype(output_dtype));
      }
    }
  };

  // Some output stays undefined for some values of driver.
  // Instead of returning undefined tensors which get exposed as
  // Nones in the Python interface, we return empty tensors.
  // This way we follow the convention of output types in the
  // torch.linalg namespace.
  // NOTE: we run drivers only if both inputs are non-empty!
  // Hence the code below explicitly handles each and every output
  // if `self` is empty.

  // Numpy and Scipy always return ranks for empty matrices,
  // even for drivers which are not rank-revealing.
  if (self.numel()) {
    rank = return_empty_if_undefined(rank, at::kLong);
  }
  else {
    rank = at::zeros(batch_shape, self.options().dtype(at::kLong));
  }

  // undefined residuals could only be an empty Tensor of shape (0)
  residuals = return_empty_if_undefined(residuals);

  if (!self.numel()
      && (driver_opt.value() == "gelss" || driver_opt.value() == "gelsd")) {
    // when `self` is empty, return singular_values of shape
    // (*self.shape[:-2], 0) only if driver is in ('gelss', 'gelsd')
    auto singular_values_empty_shape = batch_shape.vec();
    singular_values_empty_shape.push_back(0);
    singular_values = return_empty_if_undefined(
      singular_values,
      at::toValueType(self.scalar_type()),
      singular_values_empty_shape);
  }
  else {
    // otherwise return an empty tensor of shape (0)
    singular_values = return_empty_if_undefined(
      singular_values,
      at::toValueType(self.scalar_type()));
  }

  if (self.dim() > 2) {
    if (infos.numel() > 1) {
      batchCheckErrors(infos, "torch.linalg.lstsq");
    } else {
      singleCheckErrors(infos.item().toInt(), "torch.linalg.lstsq");
      singleCheckErrors(infos.item<int64_t>(), "torch.linalg.lstsq");
    }

  return std::make_tuple(x, residuals, rank, singular_values);
  return std::tuple<Tensor&, Tensor&, Tensor&, Tensor&>(solution, residuals, rank, singular_values);
}

std::tuple<Tensor, Tensor, Tensor, Tensor> linalg_lstsq(
    const Tensor& input, const Tensor& other,
    c10::optional<double> rcond,
    c10::optional<std::string> driver) {
  Tensor solution = at::empty({0}, input.options());
  Tensor residuals = at::empty({0}, input.options().dtype(toValueType(input.scalar_type())));
  Tensor rank = at::empty({0}, input.options().dtype(at::kLong));
  Tensor singular_values = at::empty({0}, input.options().dtype(toValueType(input.scalar_type())));
  std::tie(solution, residuals, rank, singular_values) =
      at::linalg_lstsq_outf(input, other, rcond, driver, solution, residuals, rank, singular_values);
  return std::make_tuple(solution, residuals, rank, singular_values);
}
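// Editorial sketch (not part of this diff): end-to-end behaviour of the
// functional variant above, assuming the usual ATen API.
//
//   at::Tensor A = at::randn({5, 3});   // m = 5, n = 3 (overdetermined)
//   at::Tensor b = at::randn({5});      // 1D right-hand side (vector case)
//   at::Tensor sol, res, rank, sv;
//   std::tie(sol, res, rank, sv) =
//       at::linalg_lstsq(A, b, c10::nullopt, c10::nullopt);
//   // With the CPU default driver "gelsy": sol has shape {3} and rank is a
//   // 0-dim tensor; res and sv come back empty, since residuals are skipped
//   // for gelsy and singular values are filled only for gelsd/gelss.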

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lu_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -1,9 +1,7 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/NativeFunctions.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/native/Pool.h>
#include <tuple>


namespace at {
@ -11,117 +9,10 @@ namespace native {

namespace {

template <typename scalar_t>
static void max_pool2d_with_indices_single_out_frame(
          scalar_t *input_p,
          scalar_t *output_p,
          int64_t *ind_p,
          int64_t nslices,
          int64_t iwidth,
          int64_t iheight,
          int64_t owidth,
          int64_t oheight,
          int kW,
          int kH,
          int dW,
          int dH,
          int padW,
          int padH,
          int dilationW,
          int dilationH
          )
{
  at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
    for (auto k = start; k < end; k++)
    {
      /* loop over output */
      int64_t i, j;
      scalar_t *ip = input_p + k*iwidth*iheight;
      for(i = 0; i < oheight; i++)
      {
        for(j = 0; j < owidth; j++)
        {
          int64_t hstart = i * dH - padH;
          int64_t wstart = j * dW - padW;
          int64_t hend = std::min(hstart + (kH - 1) * dilationH + 1, iheight);
          int64_t wend = std::min(wstart + (kW - 1) * dilationW + 1, iwidth);
          while(hstart < 0)
            hstart += dilationH;
          while(wstart < 0)
            wstart += dilationW;

          /* local pointers */
          scalar_t *op = output_p + k*owidth*oheight + i*owidth + j;
          int64_t *indp = ind_p + k*owidth*oheight + i*owidth + j;

          /* compute local max: */
          int64_t maxindex = hstart*iwidth + wstart;
          scalar_t maxval = -std::numeric_limits<scalar_t>::infinity();
          for(int64_t y = hstart; y < hend; y += dilationH)
          {
            for(int64_t x = wstart; x < wend; x += dilationW)
            {
              int64_t tcntr = y*iwidth + x;
              scalar_t val = *(ip + tcntr);
              if ((val > maxval) || std::isnan(val))
              {
                maxval = val;
                maxindex = tcntr;
              }
            }
          }

          /* set output to local max */
          *op = maxval;

          /* store location of max */
          *indp = maxindex;
        }
      }
    }
  });
}

template <typename scalar_t>
static void max_pool2d_with_indices_out_frame(
          scalar_t *input_data,
          scalar_t *output_data,
          int64_t *indices_data,
          int64_t nbatch,
          int64_t nInputPlane,
          int64_t inputWidth,
          int64_t inputHeight,
          int64_t outputWidth,
          int64_t outputHeight,
          int kW,
          int kH,
          int dW,
          int dH,
          int padW,
          int padH,
          int dilationW,
          int dilationH)
{
  at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
    for (auto p = start; p < end; p++) {
      max_pool2d_with_indices_single_out_frame(
        input_data+p*nInputPlane*inputWidth*inputHeight,
        output_data+p*nInputPlane*outputWidth*outputHeight,
        indices_data+p*nInputPlane*outputWidth*outputHeight,
        nInputPlane,
        inputWidth, inputHeight,
        outputWidth, outputHeight,
        kW, kH, dW, dH,
        padW, padH,
        dilationW, dilationH);
    }
  });
}

void max_pool2d_with_indices_out_cpu_template(
          Tensor& output,
          Tensor& indices,
          const Tensor& input_,
          const Tensor& input,
          IntArrayRef kernel_size,
          IntArrayRef stride,
          IntArrayRef padding,
@ -152,152 +43,50 @@ void max_pool2d_with_indices_out_cpu_template(
  const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
  const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);

  TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4),
  TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
    "non-empty 3D or 4D (batch mode) tensor expected for input");

  TORCH_CHECK(input.dtype() == output.dtype(),
    "expected dtype ", input.dtype(), " for `output` but got dtype ", output.dtype());

  /* sizes */
  const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
  const int64_t nInputPlane = input_.size(-3);
  const int64_t inputHeight = input_.size(-2);
  const int64_t inputWidth = input_.size(-1);
  const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1;
  const int64_t nInputPlane = input.size(-3);
  const int64_t inputHeight = input.size(-2);
  const int64_t inputWidth = input.size(-1);

  const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, dilationH, ceil_mode);
  const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, dilationW, ceil_mode);
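  // Editorial note (added for clarity): pooling_output_shape computes the
  // standard pooling size, using ceil instead of floor when ceil_mode is set:
  //   out = floor((in + 2 * pad - dilation * (kernel - 1) - 1) / stride) + 1
  // Worked example: in = 7, k = 3, pad = 0, dilation = 1, stride = 2 gives
  //   floor((7 - 2 - 1) / 2) + 1 = 3.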

  pool2d_shape_check(
    input_,
    input,
    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
    nInputPlane,
    inputHeight, inputWidth,
    outputHeight, outputWidth, input_.suggest_memory_format());
    outputHeight, outputWidth, input.suggest_memory_format());

  /* get contiguous input */
  Tensor input = input_.contiguous();

  /* resize output */
  if (input.ndimension() == 3)
  {
  /* resize output and indices */
  if (input.ndimension() == 3) {
    output.resize_({nInputPlane, outputHeight, outputWidth});
    /* indices will contain the locations for each output point */
    indices.resize_({nInputPlane, outputHeight, outputWidth});

    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
      "max_pool2d_with_indices_cpu",
      [&] {
        /* get raw pointers */
        scalar_t *input_data = input.data_ptr<scalar_t>();
        scalar_t *output_data = output.data_ptr<scalar_t>();
        int64_t *indices_data = indices.data_ptr<int64_t>();

        max_pool2d_with_indices_single_out_frame(
          input_data, output_data,
          indices_data,
          nInputPlane,
          inputWidth, inputHeight,
          outputWidth, outputHeight,
          kW, kH, dW, dH,
          padW, padH,
          dilationW, dilationH);
      }
    );
  }
  else
  {
    output.resize_({nbatch, nInputPlane, outputHeight, outputWidth});
  } else {
    output.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, input.suggest_memory_format());
    /* indices will contain the locations for each output point */
    indices.resize_({nbatch, nInputPlane, outputHeight, outputWidth});

    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
      "max_pool2d_with_indices_cpu",
      [&] {
        scalar_t *input_data = input.data_ptr<scalar_t>();
        scalar_t *output_data = output.data_ptr<scalar_t>();
        int64_t *indices_data = indices.data_ptr<int64_t>();

        max_pool2d_with_indices_out_frame(
          input_data,
          output_data,
          indices_data,
          nbatch,
          nInputPlane,
          inputWidth, inputHeight,
          outputWidth, outputHeight,
          kW, kH, dW, dH,
          padW, padH,
          dilationW, dilationH); }
    );
    indices.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, input.suggest_memory_format());
  }
}

template <typename scalar_t>
static void max_pool2d_with_indices_backward_single_out_frame(
          scalar_t *gradInput_p,
          scalar_t *gradOutput_p,
          int64_t *ind_p,
          int64_t nInputPlane,
          int64_t inputWidth,
          int64_t inputHeight,
          int64_t outputWidth,
          int64_t outputHeight,
          int dW,
          int dH)
{
  at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) {
    for (auto k = start; k < end; k++)
    {
      scalar_t *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight;
      scalar_t *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight;
      int64_t *ind_p_k = ind_p + k*outputWidth*outputHeight;

      /* calculate max points */
      int64_t i, j;
      for(i = 0; i < outputHeight; i++)
      {
        for(j = 0; j < outputWidth; j++)
        {
          /* retrieve position of max */
          int64_t maxp = ind_p_k[i*outputWidth + j];
          if (maxp != -1) {
            /* update gradient */
            gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j];
          }
        }
      }
    }
  });
}

template <typename scalar_t>
static void max_pool2d_with_indices_backward_out_frame(
          scalar_t *gradInput_data,
          scalar_t *gradOutput_data,
          int64_t *indices_data,
          int64_t nbatch,
          int64_t nInputPlane,
          int64_t inputWidth,
          int64_t inputHeight,
          int64_t outputWidth,
          int64_t outputHeight,
          int dW,
          int dH)
{
  at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
    for (auto p = start; p < end; p++) {
      max_pool2d_with_indices_backward_single_out_frame<scalar_t>(
        gradInput_data+p*nInputPlane*inputWidth*inputHeight,
        gradOutput_data+p*nInputPlane*outputWidth*outputHeight,
        indices_data+p*nInputPlane*outputWidth*outputHeight,
        nInputPlane,
        inputWidth, inputHeight,
        outputWidth, outputHeight,
        dW, dH);
    }
  });
  max_pool2d_kernel(
      kCPU, output, indices, input,
      kW, kH,
      dW, dH,
      padW, padH,
      dilationW, dilationH);
}

Tensor& max_pool2d_with_indices_backward_out_cpu_template(
          Tensor& gradInput,
          const Tensor& gradOutput_,
          const Tensor& gradOutput,
          const Tensor& input,
          const Tensor& indices,
          IntArrayRef kernel_size,
@ -333,11 +122,13 @@ Tensor& max_pool2d_with_indices_backward_out_cpu_template(
  TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
    "non-empty 3D or 4D (batch mode) tensor expected for input");

  /* get contiguous gradOutput */
  const Tensor gradOutput = gradOutput_.contiguous();
  TORCH_CHECK(input.dtype() == gradOutput.dtype(),
    "expected dtype ", input.dtype(), " for `gradOutput` but got dtype ", gradOutput.dtype());
  TORCH_CHECK(input.dtype() == gradInput.dtype(),
    "expected dtype ", input.dtype(), " for `gradInput` but got dtype ", gradInput.dtype());

  /* resize */
  gradInput.resize_as_(input);
  gradInput.resize_(input.sizes(), input.suggest_memory_format());
  gradInput.zero_();

  /* sizes */
@ -354,7 +145,7 @@ Tensor& max_pool2d_with_indices_backward_out_cpu_template(

  max_pool2d_backward_shape_check(
    input,
    gradOutput_,
    gradOutput,
    indices,
    nbatch,
    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
@ -363,48 +154,7 @@ Tensor& max_pool2d_with_indices_backward_out_cpu_template(
    outputHeight_for_shape_check, outputWidth_for_shape_check,
    input.suggest_memory_format());

  /* backprop */
  if (input.ndimension() == 3)
  {
    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
      "max_pool2d_with_indices_backward",
      [&] {
        /* get raw pointers */
        scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
        scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
        int64_t *indices_data = indices.data_ptr<int64_t>();

        max_pool2d_with_indices_backward_single_out_frame(
          gradInput_data, gradOutput_data,
          indices_data,
          nInputPlane,
          inputWidth, inputHeight,
          outputWidth, outputHeight,
          dW, dH);
      }
    );
  }
  else
  {
    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
      "max_pool2d_with_indices_backward",
      [&] {
        /* get raw pointers */
        scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
        scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
        int64_t *indices_data = indices.data_ptr<int64_t>();

        max_pool2d_with_indices_backward_out_frame<scalar_t>(
          gradInput_data, gradOutput_data,
          indices_data,
          nbatch,
          nInputPlane,
          inputWidth, inputHeight,
          outputWidth, outputHeight,
          dW, dH);
      }
    );
  }
  max_pool2d_backward_kernel(kCPU, gradInput, gradOutput, indices);

  return gradInput;
}
@ -461,7 +211,8 @@ std::tuple<Tensor, Tensor> max_pool2d_with_indices_cpu(
  return std::tuple<Tensor, Tensor>(output, indices);
}

Tensor& max_pool2d_with_indices_backward_out_cpu(const Tensor& gradOutput_,
Tensor& max_pool2d_with_indices_backward_out_cpu(
  const Tensor& gradOutput,
  const Tensor& input,
  IntArrayRef kernel_size,
  IntArrayRef stride,
@ -473,7 +224,7 @@ Tensor& max_pool2d_with_indices_backward_out_cpu(const Tensor& gradOutput_,
{
  max_pool2d_with_indices_backward_out_cpu_template(
    gradInput,
    gradOutput_,
    gradOutput,
    input,
    indices,
    kernel_size,
@ -485,7 +236,7 @@ Tensor& max_pool2d_with_indices_backward_out_cpu(const Tensor& gradOutput_,
}

Tensor max_pool2d_with_indices_backward_cpu(
  const Tensor& gradOutput_,
  const Tensor& gradOutput,
  const Tensor& input,
  IntArrayRef kernel_size,
  IntArrayRef stride,
@ -494,10 +245,10 @@ Tensor max_pool2d_with_indices_backward_cpu(
  bool ceil_mode,
  const Tensor& indices)
{
  auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  auto gradInput = at::empty({0}, input.options());
  max_pool2d_with_indices_backward_out_cpu_template(
    gradInput,
    gradOutput_,
    gradOutput,
    input,
    indices,
    kernel_size,
@ -508,5 +259,8 @@ Tensor max_pool2d_with_indices_backward_cpu(
  return gradInput;
}

DEFINE_DISPATCH(max_pool2d_kernel);
DEFINE_DISPATCH(max_pool2d_backward_kernel);

} // at::native
} // at
@ -514,4 +514,20 @@ static inline void checkLinalgCompatibleDtype(const std::string& fn_name, Scalar
    out_name, " with dtype ", out_type);
}

/*
  Two types of 'other' tensors are supported when solving
  a system of linear equations matmul(input, x) = other:
  * 1-dimensional (1D) tensor or batch of 1D tensors (vector case)
  * 2-dimensional (2D) tensor or batch of 2D tensors (matrix case).
  The original torch.solve supported only the matrix case, while NumPy works for both cases.
  For the batched input we need to be able to distinguish them.
  Let input.shape = (batch_dimensions, m, n), then 'other' is of vector type if other.shape == (batch_dimensions, m).
  This rule is compatible with NumPy, see https://github.com/numpy/numpy/blob/v1.20.0/numpy/linalg/linalg.py#L384-L389
*/
static inline bool linalg_solve_is_vector_rhs(const Tensor& input, const Tensor& other) {
  auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim() - 1); // input.shape[:-1]
  bool vector_case = other.dim() == 1 || (input.dim() - 1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
  return vector_case;
}
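// Editorial examples (illustrative) of the rule above: for input.shape == (4, 3, 3),
//   other.shape == (4, 3)    -> vector case (matches input.shape[:-1])
//   other.shape == (4, 3, 2) -> matrix case (batch of 2D right-hand sides)
//   other.shape == (3,)      -> vector case (other.dim() == 1)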

}} // namespace at::native
@ -1,14 +1,20 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/NativeFunctions.h>
#include <ATen/div_rtn.h>
#include <tuple>
#include <ATen/native/DispatchStub.h>

#pragma once

namespace at {
namespace native {

using max_pool2d_fn = void(*)(Tensor& output, Tensor& indices, const Tensor& input,
    int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH);
using max_pool2d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output, const Tensor& indices);

DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel);
DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel);
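// Editorial note (added for clarity): these declarations follow ATen's
// DispatchStub pattern. A kernel file registers a backend implementation,
// e.g. (sketch; max_pool2d_kernel_impl is a hypothetical placeholder name):
//
//   // in aten/src/ATen/native/cpu/MaxPoolKernel.cpp
//   REGISTER_DISPATCH(max_pool2d_kernel, &max_pool2d_kernel_impl);
//
// and operator code invokes the stub with an explicit device type, as in
// max_pool2d_kernel(kCPU, output, indices, input, ...) above.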

namespace {

template <typename dest_t, typename src_t>
359
aten/src/ATen/native/cpu/MaxPoolKernel.cpp
Normal file
@ -0,0 +1,359 @@
#include <ATen/ATen.h>

#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/cpu/vec256/vec256.h>
#include <ATen/native/Pool.h>
#include <ATen/native/cpu/utils.h>

namespace at { namespace native {

namespace {

template <typename scalar_t>
void cpu_max_pool(
    Tensor& output_,
    Tensor indices_,
    const Tensor& input_,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    int dilationW, int dilationH) {
  auto input = input_.contiguous();
  auto output = output_.contiguous();
  auto indices = indices_.contiguous();

  auto input_data = input.data_ptr<scalar_t>();
  auto output_data = output.data_ptr<scalar_t>();
  auto indices_data = indices.data_ptr<int64_t>();

  int64_t numel = output.numel();
  int64_t ndim = input.ndimension();
  // treat batch size and channels as one dimension
  int64_t channels = ndim == 3 ? input.size(0) : input.size(0) * input.size(1);
  int64_t input_height = input.size(-2);
  int64_t input_width = input.size(-1);
  int64_t output_height = output.size(-2);
  int64_t output_width = output.size(-1);

  // parallel on dim N, C, H, W
  at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
    int64_t c = 0;
    int64_t oh = 0;
    int64_t ow = 0;
    data_index_init(begin, c, channels, oh, output_height, ow, output_width);
|
||||
|
||||
for (int64_t i = begin; i < end; i++) {
|
||||
int64_t ih0 = oh * dH - padH;
|
||||
int64_t iw0 = ow * dW - padW;
|
||||
int64_t ih1 = std::min(ih0 + (kH - 1) * dilationH + 1, input_height);
|
||||
int64_t iw1 = std::min(iw0 + (kW - 1) * dilationW + 1, input_width);
|
||||
while(ih0 < 0) { ih0 += dilationH; }
|
||||
while(iw0 < 0) { iw0 += dilationW; }
|
||||
|
||||
// local pointers
|
||||
scalar_t* input_ptr = input_data + c * input_height * input_width;
|
||||
|
||||
// compute local max
|
||||
int64_t maxindex = ih0 * input_width + iw0;
|
||||
scalar_t maxval = -std::numeric_limits<scalar_t>::infinity();
|
||||
for (int64_t ih = ih0; ih < ih1; ih += dilationH) {
|
||||
for (int64_t iw = iw0; iw < iw1; iw += dilationW) {
|
||||
int64_t index = ih * input_width + iw;
|
||||
scalar_t val = input_ptr[index];
|
||||
if ((val > maxval) || std::isnan(val)) {
|
||||
maxval = val;
|
||||
maxindex = index;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// set output to local max and store location of max
|
||||
output_data[i] = maxval;
|
||||
indices_data[i] = maxindex;
|
||||
|
||||
// move on to next output index
|
||||
data_index_step(c, channels, oh, output_height, ow, output_width);
|
||||
}
|
||||
});
|
||||
|
||||
if (!output_.is_contiguous()) {
|
||||
output_.copy_(output);
|
||||
}
|
||||
if (!indices_.is_contiguous()) {
|
||||
indices_.copy_(indices);
|
||||
}
|
||||
}
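
[Editor's note] data_index_init and data_index_step (from ATen/native/cpu/utils.h) recover (c, oh, ow) from the flat parallel_for index once, then advance with carries instead of a div/mod per element. A standalone sketch of the same idea, under illustrative sizes:

  #include <cstdint>
  #include <cstdio>

  int main() {
    const int64_t C = 2, H = 3, W = 4;
    int64_t begin = 17;                  // flat start index of this chunk
    int64_t ow = begin % W;              // one-time decomposition,
    int64_t oh = (begin / W) % H;        // what data_index_init does
    int64_t c  = begin / (H * W);
    for (int64_t i = begin; i < C * H * W; ++i) {
      std::printf("%lld -> (%lld, %lld, %lld)\n",
                  (long long)i, (long long)c, (long long)oh, (long long)ow);
      // increment with carry, what data_index_step does
      if (++ow == W) { ow = 0; if (++oh == H) { oh = 0; ++c; } }
    }
  }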

template <typename scalar_t>
void cpu_max_pool_channels_last(
    Tensor& output_,
    Tensor indices_,
    const Tensor& input_,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    int dilationW, int dilationH) {
  TORCH_CHECK(input_.ndimension() == 4,
              "max pooling with channels last format supports tensors with 4 dims");
  auto memory_format = at::MemoryFormat::ChannelsLast;
  auto input = input_.contiguous(memory_format);
  auto output = output_.contiguous(memory_format);
  auto indices = indices_.contiguous(memory_format);

  auto input_data = input.data_ptr<scalar_t>();
  auto output_data = output.data_ptr<scalar_t>();
  auto indices_data = indices.data_ptr<int64_t>();

  int64_t nbatch = input.size(0);
  int64_t channels = input.size(1);
  int64_t input_height = input.size(2);
  int64_t input_width = input.size(3);
  int64_t output_height = output.size(2);
  int64_t output_width = output.size(3);

  using Vec = vec256::Vec256<scalar_t>;
  using integer_t = vec256::int_same_size_t<scalar_t>;
  using iVec = vec256::Vec256<integer_t>;
  // for the convenience of vectorization, use an integer of the same size as scalar_t,
  // e.g. int32_t for float, int64_t for double;
  // need to make sure the index does not overflow
  TORCH_CHECK(input_height <= std::ceil((double)std::numeric_limits<integer_t>::max() / (double)input_width));

  // parallel on dim N, H, W
  at::parallel_for(0, nbatch * output_height * output_width, 0, [&](int64_t begin, int64_t end) {
    int64_t n = 0;
    int64_t oh = 0;
    int64_t ow = 0;
    data_index_init(begin, n, nbatch, oh, output_height, ow, output_width);

    int64_t size = channels;
    int64_t len = size - (size % Vec::size());
    // temp buffer holding index with integer_t
    std::unique_ptr<integer_t []> index_buffer(new integer_t[len]);

    for (int64_t i = begin; i < end; i++) {
      int64_t ih0 = oh * dH - padH;
      int64_t iw0 = ow * dW - padW;
      int64_t ih1 = std::min(ih0 + (kH - 1) * dilationH + 1, input_height);
      int64_t iw1 = std::min(iw0 + (kW - 1) * dilationW + 1, input_width);
      while(ih0 < 0) { ih0 += dilationH; }
      while(iw0 < 0) { iw0 += dilationW; }

      scalar_t* out = output_data + i * channels;
      int64_t* ind = indices_data + i * channels;

      // Pass I: init out lane
      iVec index0_vec = iVec(ih0 * input_width + iw0);
      Vec out_vec = Vec(-std::numeric_limits<scalar_t>::infinity());
      int64_t d1 = 0;
      for (; d1 < len; d1 += Vec::size()) {
        index0_vec.store(index_buffer.get() + d1);
        out_vec.store(out + d1);
      }
      for (; d1 < size; d1++) {
        ind[d1] = ih0 * input_width + iw0;
        out[d1] = -std::numeric_limits<scalar_t>::infinity();
      }
      // Pass II: compute local max
      for (int64_t ih = ih0; ih < ih1; ih += dilationH) {
        for (int64_t iw = iw0; iw < iw1; iw += dilationW) {
          scalar_t* in = input_data + n * input_height * input_width * channels +
              ih * input_width * channels + iw * channels;

          int64_t d2 = 0;
          for (; d2 < len; d2 += Vec::size()) {
            iVec index_vec = iVec(ih * input_width + iw);
            Vec val_vec = Vec::loadu(in + d2);
            iVec maxindex_vec = iVec::loadu(index_buffer.get() + d2);
            Vec maxval_vec = Vec::loadu(out + d2);

            // true = all ones, false = all zeros
            Vec mask = (val_vec > maxval_vec) | val_vec.isnan();
            iVec imask = vec256::cast<integer_t>(mask);
            Vec out_vec = Vec::blendv(maxval_vec, val_vec, mask);
            iVec ind_vec = iVec::blendv(maxindex_vec, index_vec, imask);

            out_vec.store(out + d2);
            ind_vec.store(index_buffer.get() + d2);
          }
          for (; d2 < size; d2++) {
            int64_t index = ih * input_width + iw;
            scalar_t val = in[d2];
            int64_t maxindex = ind[d2];
            scalar_t maxval = out[d2];

            bool mask = (val > maxval) || std::isnan(val);
            out[d2] = mask ? val : maxval;
            ind[d2] = mask ? index : maxindex;
          }
        }
      }
      // convert indices data type
      vec256::convert<integer_t, int64_t>(index_buffer.get(), ind, len);

      // move on to next output index
      data_index_step(n, nbatch, oh, output_height, ow, output_width);
    }
  });

  if (!output_.is_contiguous(memory_format)) {
    output_.copy_(output);
  }
  if (!indices_.is_contiguous(memory_format)) {
    indices_.copy_(indices);
  }
}
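
[Editor's note] Pass II keeps a running (max value, max index) pair per channel lane with compare + blendv rather than branches; NaN compares false against everything, so it is OR-ed in explicitly to propagate, matching the scalar tail loop. The same select-by-mask update, spelled out lane by lane in plain C++:

  #include <array>
  #include <cmath>
  #include <cstdio>

  int main() {
    std::array<float, 4> maxval{1.f, 5.f, 2.f, 0.f};
    std::array<int,   4> maxidx{0, 0, 0, 0};
    std::array<float, 4> val{3.f, 4.f, NAN, -1.f};  // candidates at window position 7
    const int idx = 7;
    for (int l = 0; l < 4; ++l) {
      bool m = (val[l] > maxval[l]) || std::isnan(val[l]);  // NaN wins, as in the kernel
      maxval[l] = m ? val[l] : maxval[l];   // ~ Vec::blendv(maxval, val, mask)
      maxidx[l] = m ? idx : maxidx[l];      // ~ iVec::blendv(maxidx, idx, mask)
    }
    for (int l = 0; l < 4; ++l)
      std::printf("lane %d: max=%g idx=%d\n", l, maxval[l], maxidx[l]);
  }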

template <typename scalar_t>
void cpu_max_pool_backward(
    Tensor& grad_input_,
    const Tensor& grad_output_,
    const Tensor& indices_) {
  auto grad_output = grad_output_.contiguous();
  auto indices = indices_.contiguous();
  auto grad_input = grad_input_.contiguous();

  auto grad_output_data = grad_output.data_ptr<scalar_t>();
  auto indices_data = indices.data_ptr<int64_t>();
  auto grad_input_data = grad_input.data_ptr<scalar_t>();

  int64_t ndim = grad_output.ndimension();
  // treat batch size and channels as one dimension
  int64_t channels = ndim == 3 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
  int64_t input_height = grad_input.size(-2);
  int64_t input_width = grad_input.size(-1);
  int64_t output_height = grad_output.size(-2);
  int64_t output_width = grad_output.size(-1);

  // parallel on dim of N, C
  at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) {
    for (int64_t c = begin; c < end; c++) {
      scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width;
      scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width;
      int64_t* indices_ptr = indices_data + c * output_height * output_width;

      for (int64_t oh = 0; oh < output_height; oh++) {
        for (int64_t ow = 0; ow < output_width; ow++) {
          // retrieve position of max
          int64_t index = oh * output_width + ow;
          int64_t maxindex = indices_ptr[index];
          if (maxindex != -1) {
            // update gradient
            grad_input_ptr[maxindex] += grad_output_ptr[index];
          }
        }
      }
    }
  });

  if (!grad_input_.is_contiguous()) {
    grad_input_.copy_(grad_input);
  }
}

template <typename scalar_t>
void cpu_max_pool_backward_channels_last(
    Tensor& grad_input_,
    const Tensor& grad_output_,
    const Tensor& indices_) {
  TORCH_CHECK(grad_output_.ndimension() == 4,
              "max pooling backward with channels last format supports tensors with 4 dims.");
  auto memory_format = at::MemoryFormat::ChannelsLast;
  auto grad_input = grad_input_.contiguous(memory_format);
  auto grad_output = grad_output_.contiguous(memory_format);
  auto indices = indices_.contiguous(memory_format);

  auto grad_input_data = grad_input.data_ptr<scalar_t>();
  auto grad_output_data = grad_output.data_ptr<scalar_t>();
  auto indices_data = indices.data_ptr<int64_t>();

  int64_t nbatch = grad_input.size(0);
  int64_t channels = grad_input.size(1);
  int64_t input_height = grad_input.size(2);
  int64_t input_width = grad_input.size(3);
  int64_t output_height = grad_output.size(2);
  int64_t output_width = grad_output.size(3);

  // parallel on dim N
  at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) {
    for (int64_t n = begin; n < end; n++) {
      scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels;
      scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels;
      int64_t* indices_ptr = indices_data + n * output_height * output_width * channels;

      for (int64_t oh = 0; oh < output_height; oh++) {
        for (int64_t ow = 0; ow < output_width; ow++) {
          scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels;
          int64_t* ind = indices_ptr + oh * output_width * channels + ow * channels;
          // TODO: gcc vectorization
          for (int64_t c = 0; c < channels; c++) {
            int64_t maxindex = ind[c];
            if (maxindex != -1) {
              grad_input_ptr[maxindex * channels + c] += gout[c];
            }
          }
        }
      }
    }
  });

  if (!grad_input_.is_contiguous(memory_format)) {
    grad_input_.copy_(grad_input);
  }
}

void max_pool2d_kernel_impl(
    Tensor& output,
    Tensor& indices,
    const Tensor& input,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    int dilationW, int dilationH) {
  switch (input.suggest_memory_format()) {
    case at::MemoryFormat::Contiguous: {
      AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_pool2d", [&] {
        cpu_max_pool<scalar_t>(output, indices, input, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
      });
      break;
    }
    case at::MemoryFormat::ChannelsLast: {
      AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_pool2d_channels_last", [&] {
        cpu_max_pool_channels_last<scalar_t>(output, indices, input, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
      });
      break;
    }
    default:
      TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
  }
}

void max_pool2d_backward_kernel_impl(
    Tensor& grad_input,
    const Tensor& grad_output,
    const Tensor& indices) {
  switch (grad_output.suggest_memory_format()) {
    case at::MemoryFormat::Contiguous: {
      AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_pool2d_backward", [&] {
        cpu_max_pool_backward<scalar_t>(grad_input, grad_output, indices);
      });
      break;
    }
    case at::MemoryFormat::ChannelsLast: {
      AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_pool2d_backward_channels_last", [&] {
        cpu_max_pool_backward_channels_last<scalar_t>(grad_input, grad_output, indices);
      });
      break;
    }
    default:
      TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
  }
}

} // anonymous namespace

REGISTER_DISPATCH(max_pool2d_kernel, &max_pool2d_kernel_impl);
REGISTER_DISPATCH(max_pool2d_backward_kernel, &max_pool2d_backward_kernel_impl);

}} // at::native
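
[Editor's note] The DECLARE_DISPATCH / DEFINE_DISPATCH / REGISTER_DISPATCH triple used above is ATen's dispatch-stub pattern: the header declares a stub, the generic translation unit defines it, and the kernel translation unit (compiled per instruction set) registers its implementation. A minimal self-contained emulation of that shape, with hypothetical names:

  #include <cstdio>

  using pool_fn = void (*)(int);

  extern pool_fn pool_stub;               // ~ DECLARE_DISPATCH (Pool.h)
  pool_fn pool_stub = nullptr;            // ~ DEFINE_DISPATCH (MaxPooling.cpp)

  static void pool_impl(int n) { std::printf("pooling %d elements\n", n); }
  static struct Registrar {               // ~ REGISTER_DISPATCH (MaxPoolKernel.cpp)
    Registrar() { pool_stub = &pool_impl; }
  } registrar;

  int main() { pool_stub(42); }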

@@ -190,109 +190,6 @@ __global__ void atomicadaptivemaxgradinput(
  }
}

-// 4d tensor B x D x H x W
-
-void adaptive_max_pool2d_out_cuda_template(
-  Tensor& output,
-  Tensor& indices,
-  const Tensor& input,
-  IntArrayRef output_size)
-{
-  TensorArg output_arg{ output, "output", 1 };
-  TensorArg indices_arg{ indices, "indices", 2 };
-  TensorArg input_arg{ input, "input", 3 };
-
-  checkAllSameGPU("adaptive_max_pool2d_cuda", {output_arg, indices_arg, input_arg});
-
-  for (int64_t i = 0; i < input.ndimension(); i++) {
-    TORCH_CHECK(input.size(i) > 0,
-      "adaptive_max_pool2d_cuda(): expected input to have non-empty spatial dimensions, "
-      "but input has sizes ", input.sizes(), " with dimension ", i, " being "
-      "empty");
-  }
-
-  TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
-    "non-empty 3D or 4D (batch mode) tensor expected for input");
-
-  TORCH_CHECK(output_size.size() == 2,
-    "adaptive_max_pool2d: internal error: output_size.size() must be 2");
-
-  int64_t osizeH = output_size[0];
-  int64_t osizeW = output_size[1];
-
-  if (input.ndimension() == 3) {
-    int64_t sizeD = input.size(0);
-    int64_t isizeH = input.size(1);
-    int64_t isizeW = input.size(2);
-
-    int64_t istrideD = input.stride(0);
-    int64_t istrideH = input.stride(1);
-    int64_t istrideW = input.stride(2);
-
-    AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
-      "adaptive_max_pool2d_cuda",
-      [&] {
-        output.resize_({sizeD, osizeH, osizeW});
-        indices.resize_({sizeD, osizeH, osizeW});
-
-        scalar_t *input_data = input.data_ptr<scalar_t>();
-        scalar_t *output_data = output.data_ptr<scalar_t>();
-        int64_t *indices_data = indices.data_ptr<int64_t>();
-
-        // cuda blocks & threads:
-        int blocksH = (int)(16L / sizeD);
-        blocksH = blocksH < 1 ? 1 : blocksH;
-        dim3 blocks(sizeD, blocksH);
-        dim3 threads(32, 8);
-
-        // run maxpool kernel
-        adaptivemaxpool <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>> (
-          input_data, output_data,
-          indices_data,
-          isizeH, isizeW, osizeH, osizeW,
-          istrideD, istrideH, istrideW);
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
-      }
-    );
-  } else {
-    Tensor input_ = input.contiguous();
-    int64_t sizeB = input_.size(0);
-    int64_t sizeD = input_.size(1);
-    int64_t isizeH = input_.size(2);
-    int64_t isizeW = input_.size(3);
-
-    int64_t istrideD = input_.stride(1);
-    int64_t istrideH = input_.stride(2);
-    int64_t istrideW = input_.stride(3);
-
-    AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input_.scalar_type(),
-      "adaptive_max_pool2d_cuda",
-      [&] {
-        output.resize_({sizeB, sizeD, osizeH, osizeW});
-        indices.resize_({sizeB, sizeD, osizeH, osizeW});
-
-        scalar_t *input_data = input_.data_ptr<scalar_t>();
-        scalar_t *output_data = output.data_ptr<scalar_t>();
-        int64_t *indices_data = indices.data_ptr<int64_t>();
-
-        // cuda blocks & threads:
-        int blocksH = (int)(16L / sizeD);
-        blocksH = blocksH < 1 ? 1 : blocksH;
-        dim3 blocks(sizeB*sizeD, blocksH);
-        dim3 threads(32, 8);
-
-        // run maxpool kernel
-        adaptivemaxpool <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>> (
-          input_data, output_data,
-          indices_data,
-          isizeH, isizeW, osizeH, osizeW,
-          istrideD, istrideH, istrideW);
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
-      }
-    );
-  }
-}
-
void adaptive_max_pool2d_backward_out_cuda_template(
  Tensor& gradInput,
  const Tensor& gradOutput_,
@@ -409,31 +306,108 @@ void adaptive_max_pool2d_backward_out_cuda_template(

} // namespace

-std::tuple<Tensor&, Tensor&> adaptive_max_pool2d_out_cuda(const Tensor& input,
-  IntArrayRef output_size,
-  Tensor& output,
-  Tensor& indices)
-{
-  adaptive_max_pool2d_out_cuda_template(
-    output,
-    indices,
-    input,
-    output_size);
-  return std::tuple<Tensor&, Tensor&>(output, indices);
-}
-// 4d tensor B x D x H x W
-
-std::tuple<Tensor, Tensor> adaptive_max_pool2d_cuda(
-  const Tensor& input,
-  IntArrayRef output_size)
-{
-  Tensor output = at::empty({0}, input.options());
-  Tensor indices = at::empty({0}, input.options().dtype(kLong));
-  adaptive_max_pool2d_out_cuda_template(
-    output,
-    indices,
-    input,
-    output_size);
-  return std::tuple<Tensor, Tensor>(output, indices);
+TORCH_IMPL_FUNC(adaptive_max_pool2d_out_cuda)
+(const Tensor& input,
+ IntArrayRef output_size,
+ const Tensor& output,
+ const Tensor& indices) {
+  TensorArg output_arg{output, "output", 1};
+  TensorArg indices_arg{indices, "indices", 2};
+  TensorArg input_arg{input, "input", 3};
+
+  checkAllSameGPU(
+      "adaptive_max_pool2d_cuda", {output_arg, indices_arg, input_arg});
+
+  int64_t osizeH = output_size[0];
+  int64_t osizeW = output_size[1];
+
+  if (input.ndimension() == 3) {
+    int64_t sizeD = input.size(0);
+    int64_t isizeH = input.size(1);
+    int64_t isizeW = input.size(2);
+
+    int64_t istrideD = input.stride(0);
+    int64_t istrideH = input.stride(1);
+    int64_t istrideW = input.stride(2);
+
+    AT_DISPATCH_FLOATING_TYPES_AND2(
+        kHalf, kBFloat16, input.scalar_type(), "adaptive_max_pool2d_cuda", [&] {
+          scalar_t* input_data = input.data_ptr<scalar_t>();
+          scalar_t* output_data = output.data_ptr<scalar_t>();
+          int64_t* indices_data = indices.data_ptr<int64_t>();
+
+          // cuda blocks & threads:
+          int blocksH = (int)(16L / sizeD);
+          blocksH = blocksH < 1 ? 1 : blocksH;
+          dim3 blocks(sizeD, blocksH);
+          dim3 threads(32, 8);
+
+          // run maxpool kernel
+          adaptivemaxpool<<<
+              blocks,
+              threads,
+              0,
+              at::cuda::getCurrentCUDAStream()>>>(
+              input_data,
+              output_data,
+              indices_data,
+              isizeH,
+              isizeW,
+              osizeH,
+              osizeW,
+              istrideD,
+              istrideH,
+              istrideW);
+          C10_CUDA_KERNEL_LAUNCH_CHECK();
+        });
+  } else {
+    Tensor input_ = input.contiguous();
+    int64_t sizeB = input_.size(0);
+    int64_t sizeD = input_.size(1);
+    int64_t isizeH = input_.size(2);
+    int64_t isizeW = input_.size(3);
+
+    int64_t istrideD = input_.stride(1);
+    int64_t istrideH = input_.stride(2);
+    int64_t istrideW = input_.stride(3);
+
+    AT_DISPATCH_FLOATING_TYPES_AND2(
+        kHalf,
+        kBFloat16,
+        input_.scalar_type(),
+        "adaptive_max_pool2d_cuda",
+        [&] {
+          scalar_t* input_data = input_.data_ptr<scalar_t>();
+          scalar_t* output_data = output.data_ptr<scalar_t>();
+          int64_t* indices_data = indices.data_ptr<int64_t>();
+
+          // cuda blocks & threads:
+          int blocksH = (int)(16L / sizeD);
+          blocksH = blocksH < 1 ? 1 : blocksH;
+          dim3 blocks(sizeB * sizeD, blocksH);
+          dim3 threads(32, 8);
+
+          // run maxpool kernel
+          adaptivemaxpool<<<
+              blocks,
+              threads,
+              0,
+              at::cuda::getCurrentCUDAStream()>>>(
+              input_data,
+              output_data,
+              indices_data,
+              isizeH,
+              isizeW,
+              osizeH,
+              osizeW,
+              istrideD,
+              istrideH,
+              istrideW);
+          C10_CUDA_KERNEL_LAUNCH_CHECK();
+        });
+  }
}
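
[Editor's note] Every kernel launch above is followed by C10_CUDA_KERNEL_LAUNCH_CHECK(), which surfaces bad launch configurations immediately instead of at the next synchronization point. A bare CUDA sketch of the same discipline, with illustrative names:

  #include <cstdio>
  #include <cuda_runtime.h>

  __global__ void fill(float* p, float v, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p[i] = v;
  }

  int main() {
    const int n = 1024;
    float* d = nullptr;
    cudaMalloc(&d, n * sizeof(float));
    fill<<<(n + 255) / 256, 256>>>(d, 1.0f, n);
    // roughly what C10_CUDA_KERNEL_LAUNCH_CHECK boils down to: check right away
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
      std::printf("launch failed: %s\n", cudaGetErrorString(err));
    cudaFree(d);
    return 0;
  }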

Tensor& adaptive_max_pool2d_backward_out_cuda(const Tensor& gradOutput_,

@@ -291,90 +291,6 @@ void atomicadaptivemaxgradinput_loop(
  }
}

-// 5d tensor B x D x T x H x W
-
-void adaptive_max_pool3d_out_cuda_template(
-  Tensor& output,
-  Tensor& indices,
-  const Tensor& input_,
-  IntArrayRef output_size)
-{
-  TensorArg output_arg{ output, "output", 1 };
-  TensorArg indices_arg{ indices, "indices", 2 };
-  TensorArg input_arg{ input_, "input_", 3 };
-
-  checkAllSameGPU("adaptive_max_pool3d_cuda", {output_arg, indices_arg, input_arg});
-
-  for (int64_t i = 0; i < input_.ndimension(); i++) {
-    TORCH_CHECK(input_.size(i) > 0,
-      "adaptive_max_pool3d_cuda(): expected input to have non-empty spatial dimensions, "
-      "but input has sizes ", input_.sizes(), " with dimension ", i, " being "
-      "empty");
-  }
-
-  TORCH_CHECK((input_.ndimension() == 4 || input_.ndimension() == 5),
-    "non-empty 4D or 5D (batch mode) tensor expected for input");
-
-  TORCH_CHECK(output_size.size() == 3,
-    "adaptive_max_pool3d: internal error: output_size.size() must be 3");
-
-  int64_t osizeT = output_size[0];
-  int64_t osizeH = output_size[1];
-  int64_t osizeW = output_size[2];
-
-  int64_t sizeD, isizeT, isizeH, isizeW;
-  int64_t istrideD, istrideT, istrideH, istrideW;
-  int64_t totalZ;
-
-  const Tensor& input = input_.ndimension() == 4 ? input_ : input_.contiguous();
-
-  if (input.ndimension() == 4) {
-    sizeD = input.size(0);
-    isizeT = input.size(1);
-    isizeH = input.size(2);
-    isizeW = input.size(3);
-
-    istrideD = input.stride(0);
-    istrideT = input.stride(1);
-    istrideH = input.stride(2);
-    istrideW = input.stride(3);
-
-    output.resize_({sizeD, osizeT, osizeH, osizeW});
-    indices.resize_({sizeD, osizeT, osizeH, osizeW});
-
-    totalZ = sizeD * osizeT;
-  } else {
-    int64_t sizeB = input.size(0);
-    sizeD = input.size(1);
-    isizeT = input.size(2);
-    isizeH = input.size(3);
-    isizeW = input.size(4);
-
-    istrideD = input.stride(1);
-    istrideT = input.stride(2);
-    istrideH = input.stride(3);
-    istrideW = input.stride(4);
-
-    output.resize_({sizeB, sizeD, osizeT, osizeH, osizeW});
-    indices.resize_({sizeB, sizeD, osizeT, osizeH, osizeW});
-
-    totalZ = sizeB * sizeD * osizeT;
-  }
-
-  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
-    "adaptive_max_pool3d_cuda",
-    [&] {
-      scalar_t *input_data = input.data_ptr<scalar_t>();
-      scalar_t *output_data = output.data_ptr<scalar_t>();
-      int64_t *indices_data = indices.data_ptr<int64_t>();
-
-      adaptivemaxpool_loop(
-        input_data, output_data, indices_data, totalZ, isizeT, isizeH, isizeW,
-        osizeT, osizeH, osizeW, istrideD, istrideT, istrideH, istrideW);
-    }
-  );
-}
-
void adaptive_max_pool3d_backward_out_cuda_template(
  Tensor& gradInput,
  const Tensor& gradOutput_,
@@ -460,31 +376,79 @@ void adaptive_max_pool3d_backward_out_cuda_template(

} // namespace

-std::tuple<Tensor&, Tensor&> adaptive_max_pool3d_out_cuda(const Tensor& input,
-  IntArrayRef output_size,
-  Tensor& output,
-  Tensor& indices)
-{
-  adaptive_max_pool3d_out_cuda_template(
-    output,
-    indices,
-    input,
-    output_size);
-  return std::tuple<Tensor&, Tensor&>(output, indices);
-}
-// 5d tensor B x D x T x H x W
-
-std::tuple<Tensor, Tensor> adaptive_max_pool3d_cuda(
-  const Tensor& input,
-  IntArrayRef output_size)
-{
-  Tensor output = at::empty({0}, input.options());
-  Tensor indices = at::empty({0}, input.options().dtype(kLong));
-  adaptive_max_pool3d_out_cuda_template(
-    output,
-    indices,
-    input,
-    output_size);
-  return std::tuple<Tensor, Tensor>(output, indices);
+TORCH_IMPL_FUNC(adaptive_max_pool3d_out_cuda)
+(const Tensor& input,
+ IntArrayRef output_size,
+ const Tensor& output,
+ const Tensor& indices) {
+  TensorArg output_arg{output, "output", 1};
+  TensorArg indices_arg{indices, "indices", 2};
+  TensorArg input_arg{input, "input", 3};
+
+  checkAllSameGPU(
+      "adaptive_max_pool3d_cuda", {output_arg, indices_arg, input_arg});
+
+  int64_t osizeT = output_size[0];
+  int64_t osizeH = output_size[1];
+  int64_t osizeW = output_size[2];
+
+  int64_t sizeD, isizeT, isizeH, isizeW;
+  int64_t istrideD, istrideT, istrideH, istrideW;
+  int64_t totalZ;
+
+  const Tensor& input_ = input.ndimension() == 4 ? input : input.contiguous();
+
+  if (input_.ndimension() == 4) {
+    sizeD = input_.size(0);
+    isizeT = input_.size(1);
+    isizeH = input_.size(2);
+    isizeW = input_.size(3);
+
+    istrideD = input_.stride(0);
+    istrideT = input_.stride(1);
+    istrideH = input_.stride(2);
+    istrideW = input_.stride(3);
+
+    totalZ = sizeD * osizeT;
+  } else {
+    int64_t sizeB = input_.size(0);
+    sizeD = input_.size(1);
+    isizeT = input_.size(2);
+    isizeH = input_.size(3);
+    isizeW = input_.size(4);
+
+    istrideD = input_.stride(1);
+    istrideT = input_.stride(2);
+    istrideH = input_.stride(3);
+    istrideW = input_.stride(4);
+
+    totalZ = sizeB * sizeD * osizeT;
+  }
+
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      kHalf, kBFloat16, input_.scalar_type(), "adaptive_max_pool3d_cuda", [&] {
+        scalar_t* input_data = input_.data_ptr<scalar_t>();
+        scalar_t* output_data = output.data_ptr<scalar_t>();
+        int64_t* indices_data = indices.data_ptr<int64_t>();
+
+        adaptivemaxpool_loop(
+            input_data,
+            output_data,
+            indices_data,
+            totalZ,
+            isizeT,
+            isizeH,
+            isizeW,
+            osizeT,
+            osizeH,
+            osizeW,
+            istrideD,
+            istrideT,
+            istrideH,
+            istrideW);
+      });
}

Tensor& adaptive_max_pool3d_backward_out_cuda(const Tensor& gradOutput_,

@@ -1192,7 +1192,41 @@ void magmaGels<c10::complex<double>>(
      reinterpret_cast<magmaDoubleComplex*>(hwork), lwork, info);
  AT_CUDA_CHECK(cudaGetLastError());
}
#endif

+namespace {
+
+/*
+  MAGMA can return errors both as a return value and in the info argument.
+  The return value and info should always be identical.
+  In general, the meaning is as given in this table.
+  Predefined error codes are large negative numbers. Using the symbolic
+  constants below is preferred, but the numeric values can be found in
+  include/magma_types.h.
+
+  Info                      | Description
+  ------------------------- | -----------
+  info = 0 (MAGMA_SUCCESS)  | Successful exit
+  info < 0, but small       | For info = -i, the i-th argument had an illegal value
+  info > 0                  | Function-specific error such as singular matrix
+  MAGMA_ERR_DEVICE_ALLOC    | Could not allocate GPU device memory
+  MAGMA_ERR_HOST_ALLOC      | Could not allocate CPU host memory
+  MAGMA_ERR_ILLEGAL_VALUE   | An argument had an illegal value (deprecated; instead it should return -i to say the i-th argument was bad)
+  MAGMA_ERR_INVALID_PTR     | Can't free pointer
+  MAGMA_ERR_NOT_IMPLEMENTED | Function or option not implemented
+  MAGMA_ERR_NOT_SUPPORTED   | Function or option not supported on the current architecture
+*/
+void checkMagmaInternalError(magma_int_t info, const std::string& magma_function_name) {
+  // if info > 0 the error is function-specific, do nothing in this case
+  TORCH_CHECK(info >= 0,
+      "MAGMA error: ",
+      magma_strerror(info),
+      ", info = ", info,
+      ", when calling ", magma_function_name);
+}
+
+} // anonymous namespace

#endif // USE_MAGMA
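
[Editor's note] With checkMagmaInternalError in place, each raw MAGMA call can validate its info code on the spot, which is what lets the apply_qr hunks that follow drop their per-batch infos bookkeeping. The call-site shape, in brief (variable names taken from the surrounding diff):

  magma_int_t info = 0;
  magmaGeqrf<scalar_t>(m, n, r_working_ptr, m, tau_data, work_data, &info, /*is_v2=*/true);
  checkMagmaInternalError(info, "geqrf");  // throws on info < 0 (internal error)
  // info > 0 is function-specific (e.g. singular input) and left to the caller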

#define ALLOCATE_ARRAY(name, type, size) \
  auto storage_##name = pin_memory<type>(size); \
@@ -1968,7 +2002,7 @@ Tensor& orgqr_kernel_impl(Tensor& result, const Tensor& tau, int64_t n_columns)

template <typename scalar_t>
static void apply_qr(Tensor& Q, Tensor& R, int64_t q_size_minus_2, int64_t r_size_minus_1, int64_t n_columns,
-                     bool compute_q, std::vector<int64_t>& infos) {
+                     bool compute_q) {
#ifndef USE_MAGMA
AT_ERROR("qr: MAGMA library not found in "
    "compilation. Please rebuild with MAGMA.");
@@ -1996,10 +2030,7 @@ AT_ERROR("qr: MAGMA library not found in "
  for (int64_t i = 0; i < batch_size; i++) {
    scalar_t* r_working_ptr = &r_data[i * r_matrix_stride];
    magmaGeqrf<scalar_t>(m, n, r_working_ptr, m, tau_data, work_data, &info, /*is_v2=*/true);
-    infos[i] = info;
-    if (info != 0) {
-      return;
-    }
+    checkMagmaInternalError(info, "geqrf");
  }
  if (!compute_q) {
    // this is for mode='r'
@@ -2017,15 +2048,10 @@ AT_ERROR("qr: MAGMA library not found in "
  for (int64_t i = 0; i < batch_size; i++) {
    scalar_t* q_working_ptr = &q_data[i * q_matrix_stride];
    magmaGeqrf<scalar_t>(m, n, q_working_ptr, m, tau_data, work_data, &info, /*is_v2=*/false);
-    infos[i] = info;
-    if (info != 0) {
-      return;
-    }
+    checkMagmaInternalError(info, "geqrf");

    magmaOrgqr<scalar_t>(m, n_columns, k, q_working_ptr, m, tau_data, work_data, nb, &info);
-    infos[i] = info;
-    if (info != 0) {
-      return;
-    }
+    checkMagmaInternalError(info, "orgqr");
  }
#endif
}
@@ -2033,7 +2059,6 @@ AT_ERROR("qr: MAGMA library not found in "
std::tuple<Tensor,Tensor> _linalg_qr_helper_cuda(const Tensor& self, std::string mode) {
  bool compute_q, reduced;
  std::tie(compute_q, reduced) = _parse_qr_mode(mode);
-  std::vector<int64_t> infos(batchCount(self), 0);

  // Setup input geometry and inputs for apply_qr
  std::vector<int64_t> q_sizes, q_strides;
@@ -2066,13 +2091,8 @@ std::tuple<Tensor,Tensor> _linalg_qr_helper_cuda(const Tensor& self, std::string
  int64_t n = r_working_copy.size(-1);

  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "qr_cuda", [&]{
-    apply_qr<scalar_t>(q_working_copy, r_working_copy, m, n, n_columns_q, compute_q, infos);
+    apply_qr<scalar_t>(q_working_copy, r_working_copy, m, n, n_columns_q, compute_q);
  });
-  if (self.dim() > 2) {
-    batchCheckErrors(infos, "qr_cuda");
-  } else {
-    singleCheckErrors(infos[0], "qr_cuda");
-  }

  if (compute_q) {
    q_working_copy = q_working_copy.narrow(-1, 0, n_columns_q);
@@ -2647,6 +2667,11 @@ TORCH_CHECK(false, "torch.linalg.lstsq: MAGMA library not found in "
  auto trans = MagmaNoTrans;
  auto m = magma_int_cast(a.size(-2), "m");
  auto n = magma_int_cast(a.size(-1), "n");
+
+  TORCH_CHECK(
+      m >= n,
+      "torch.linalg.lstsq: only overdetermined systems (input.size(-2) >= input.size(-1)) are allowed on CUDA");
+
  auto nrhs = magma_int_cast(b.size(-1), "nrhs");
  auto ldda = std::max<magma_int_t>(1, m);
  auto lddb = std::max<magma_int_t>(1, std::max(m, n));

@@ -669,7 +669,7 @@ cunn_SoftMaxBackward(scalar_t *gradInput, outscalar_t *output, outscalar_t *grad
  const int grad_output_shift = ((uint64_t)gradOutput) % ALIGN_BYTES / sizeof(outscalar_t);

  accscalar_t threadSum = ilpReduce<AddFloat, ILP, outscalar_t, accscalar_t>(
-      shift, gradOutput, classes, AddFloat<outscalar_t, accscalar_t>(), accscalar_t(0));
+      grad_output_shift, gradOutput, classes, AddFloat<outscalar_t, accscalar_t>(), accscalar_t(0));
  accscalar_t sum_k = blockReduce<Add, accscalar_t>(
      sdata, threadSum, Add<accscalar_t>(), accscalar_t(0));

@@ -99,7 +99,7 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
    indices.resize_as_(self);
    indices.zero_();
  }
-  return {values, indices};
+  return std::forward_as_tuple(values, indices);
}

Tensor self_;
@@ -153,7 +153,7 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
  }

  if (numel == 0) {
-    return {values, indices};
+    return std::forward_as_tuple(values, indices);
  }

  int64_t numel_or_intmax = std::min(numel, static_cast<int64_t>(std::numeric_limits<int>::max()));
@@ -206,7 +206,7 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
  if (indices_tmp.defined()) {
    indices.copy_(indices_tmp);
  }
-  return {values, indices};
+  return std::forward_as_tuple(values, indices);
}
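
[Editor's note] std::forward_as_tuple builds a std::tuple of references bound directly to its arguments, which is exactly what a std::tuple<Tensor&, Tensor&> return type needs; the braced-init form it replaces has to go through tuple's converting constructors instead. A minimal illustration:

  #include <cassert>
  #include <tuple>

  int main() {
    int values = 1, indices = 2;
    std::tuple<int&, int&> out = std::forward_as_tuple(values, indices);
    std::get<0>(out) = 10;   // writes through to `values`
    assert(values == 10);
    (void)indices;
  }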

std::tuple<Tensor &,Tensor &> sort_out_cuda(const Tensor & self, int64_t dim, bool descending, Tensor & values, Tensor & indices) {

@@ -590,7 +590,7 @@ kernel void reshape(texture2d_array<half, access::read> in_arr[[texture(0), func
    // we compute the "linear index" of the output element,
    // and convert it to the equivalent "linear index" of the input element.
    ushort offset = 4 * s2 + idx;
-    ushort linear_idx = n2 * C2 * H2 * W2 + offset * H2 * W2 + gid.y * W2 + gid.x;
+    int64_t linear_idx = n2 * C2 * H2 * W2 + offset * H2 * W2 + gid.y * W2 + gid.x;
    if(linear_idx >= numel1){
      value[idx] = 0;
      continue;
@@ -615,6 +615,98 @@ kernel void reshape(texture2d_array<half, access::read> in_arr[[texture(0), func
  }
}

+constant bool transpose_in_is_arr = (ushort_arg_3 > 1 || ushort_arg_4 > 4);
+constant bool transpose_in_is_tex = !transpose_in_is_arr;
+constant bool transpose_out_is_arr = (ushort_arg_5 > 1 || ushort_arg_6 > 4);
+constant bool transpose_out_is_tex = !transpose_out_is_arr;
+kernel void transpose(texture2d_array<half, access::read>in_arr[[texture(0),function_constant(transpose_in_is_arr)]],
+                      texture2d<half, access::read> in_tex[[texture(0), function_constant(transpose_in_is_tex)]],
+                      texture2d_array<half, access::write>out_arr[[texture(1),function_constant(transpose_out_is_arr)]],
+                      texture2d<half, access::write> out_tex[[texture(1), function_constant(transpose_out_is_tex)]],
+                      constant ushort* inSizeBuffer [[buffer(0)]],
+                      constant ushort* outSizeBuffer [[buffer(1)]],
+                      device ushort* indexBuffer [[buffer(2)]],
+                      ushort3 gid[[thread_position_in_grid]]) {
+
+  const ushort dim0 = ushort_arg_0;
+  const ushort dim1 = ushort_arg_1;
+  const ushort dim = ushort_arg_2;
+  const ushort N1 = ushort_arg_3;
+  const ushort C1 = ushort_arg_4;
+  const ushort N2 = ushort_arg_5;
+  const ushort C2 = ushort_arg_6;
+  ushort W1, W2, H1, H2;
+  if (transpose_in_is_arr) {
+    W1 = in_arr.get_width();
+    H1 = in_arr.get_height();
+  } else {
+    W1 = in_tex.get_width();
+    H1 = in_tex.get_height();
+  }
+  if (transpose_out_is_arr) {
+    W2 = out_arr.get_width();
+    H2 = out_arr.get_height();
+  } else {
+    W2 = out_tex.get_width();
+    H2 = out_tex.get_height();
+  }
+  if (gid.x >= W2 || gid.y >= H2) {
+    return;
+  }
+  const int numel = H2 * W2 * C2 * N2;
+  const ushort slices2 = divRoundUp(C2, 4);
+  const ushort slices1 = divRoundUp(C1, 4);
+  const ushort n2 = gid.z / slices2;
+  const ushort s2 = gid.z - n2 * slices2;
+  half4 value;
+  for (int idx = 0; idx < 4; ++idx) {
+    ushort offset = 4 * s2 + idx;
+    int64_t linear_idx2 = n2 * C2 * H2 * W2 + offset * H2 * W2 + gid.y * W2 + gid.x;
+    if (linear_idx2 >= numel) {
+      value[idx] = 0;
+      continue;
+    }
+
+    ushort d2 = 0;
+    for (int j = dim - 1; j >= 0; --j) {
+      d2 = outSizeBuffer[j];
+      indexBuffer[j] = linear_idx2 % d2;
+      linear_idx2 /= d2;
+    }
+
+    // swap dims
+    ushort tmp = indexBuffer[dim0];
+    indexBuffer[dim0] = indexBuffer[dim1];
+    indexBuffer[dim1] = tmp;
+
+    int64_t linear_idx1 = 0;
+    ushort m = 1;
+    ushort d1 = 0;
+    for (int k = dim - 1; k >= 0; --k) {
+      d1 = indexBuffer[k];
+      linear_idx1 += d1 * m;
+      m *= inSizeBuffer[k];
+    }
+
+    auto x1 = linear_idx1 % W1;
+    auto y1 = ((int)(linear_idx1 / W1)) % H1;
+    auto c1 = ((int)(linear_idx1 / W1 / H1) % C1);
+    auto n1 = ((int)(linear_idx1 / W1 / H1 / C1) % N1);
+    auto z1 = (int)c1 / 4 + n1 * slices1;
+    auto pos = c1 % 4;
+    if (transpose_in_is_arr) {
+      value[idx] = in_arr.read(ushort2(x1, y1), z1)[pos];
+    } else {
+      value[idx] = in_tex.read(ushort2(x1, y1))[pos];
+    }
+  }
+  if (transpose_out_is_arr) {
+    out_arr.write(value, gid.xy, gid.z);
+  } else {
+    out_tex.write(value, gid.xy);
+  }
+}
+
)PT_METAL_SHADERS";

#endif /* MPSCNNShaders_h */
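
[Editor's note] The transpose kernel permutes linear indices: decompose the output index into coordinates, swap the two transposed dimensions, then re-linearize against the input sizes. The same arithmetic in plain C++, with illustrative shapes:

  #include <array>
  #include <cstdint>
  #include <cstdio>
  #include <utility>

  int main() {
    const int dim = 3, dim0 = 0, dim1 = 2;
    std::array<int64_t, 3> in_sizes{2, 3, 4};
    std::array<int64_t, 3> out_sizes{4, 3, 2};  // dims 0 and 2 swapped
    int64_t out_idx = 13;                       // arbitrary output element
    std::array<int64_t, 3> coord{};
    int64_t rest = out_idx;
    for (int j = dim - 1; j >= 0; --j) { coord[j] = rest % out_sizes[j]; rest /= out_sizes[j]; }
    std::swap(coord[dim0], coord[dim1]);        // output coords -> input coords
    int64_t in_idx = 0, stride = 1;
    for (int k = dim - 1; k >= 0; --k) { in_idx += coord[k] * stride; stride *= in_sizes[k]; }
    std::printf("output %lld reads input %lld\n", (long long)out_idx, (long long)in_idx);
  }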

@@ -1,15 +1,22 @@
#include <ATen/Tensor.h>
+#include <ATen/native/metal/mpscnn/MPSCNNContext.h>
#include <ATen/native/metal/MetalCommandBuffer.h>
#include <ATen/native/metal/MetalTensorImpl.h>
#include <ATen/native/metal/MetalTensorImplStorage.h>
#include <vector>

+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+typedef float16_t fp16_t;
+#else
+typedef uint16_t fp16_t;
+#endif
+
namespace at {
namespace native {
namespace metal {

-std::vector<uint16_t> Fp32ToFp16(const std::vector<float>& src);
-std::vector<float> Fp16ToFp32(const std::vector<uint16_t>& src);
+std::vector<fp16_t> Fp32ToFp16(const std::vector<float>& src);
+std::vector<float> Fp16ToFp32(const std::vector<fp16_t>& src);

std::vector<float> NCHWToNC4(
    const float* src,
@@ -67,6 +74,15 @@ static inline MetalCommandBuffer* getCommandBufferFromTensor(
  return cmdBuffer;
}

+template<typename T>
+id<MTLBuffer> makeMTLBuffer(const std::vector<T>& src) {
+  id<MTLBuffer> buffer = [[MPSCNNContext sharedInstance].device
+      newBufferWithLength:src.size() * sizeof(T)
+                  options:MTLResourceOptionCPUCacheModeWriteCombined];
+  memcpy(buffer.contents, src.data(), src.size() * sizeof(T));
+  return buffer;
+}
+
} // namespace metal
} // namespace native
} // namespace at
@@ -1,28 +1,27 @@
#import <ATen/native/metal/MetalUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
#import <Accelerate/Accelerate.h>

namespace at {
namespace native {
namespace metal {

-std::vector<uint16_t> Fp32ToFp16(const std::vector<float>& src) {
-  unsigned long count = src.size();
-  std::vector<uint16_t> output(count, 0);
-  vImage_Buffer float32{(void*)src.data(), 1, count, count * sizeof(float)};
-  vImage_Buffer float16{
-      (void*)output.data(), 1, count, count * sizeof(uint16_t)};
-  if (vImageConvert_PlanarFtoPlanar16F(&float32, &float16, 0) !=
-      kvImageNoError) {
-    TORCH_CHECK(false);
-  }
-
+std::vector<fp16_t> Fp32ToFp16(const std::vector<float>& src) {
+  unsigned long count = src.size();
+  std::vector<fp16_t> output(count, 0);
+  vImage_Buffer float32{(void*)src.data(), 1, count, count * sizeof(float)};
+  vImage_Buffer float16{(void*)output.data(), 1, count, count * sizeof(fp16_t)};
+  if (vImageConvert_PlanarFtoPlanar16F(&float32, &float16, 0) !=
+      kvImageNoError) {
+    TORCH_CHECK(false);
+  }
  return output;
}

-std::vector<float> Fp16ToFp32(const std::vector<uint16_t>& src) {
+std::vector<float> Fp16ToFp32(const std::vector<fp16_t>& src) {
  unsigned long count = src.size();
  std::vector<float> output(count, 0);
-  vImage_Buffer float16{(void*)src.data(), 1, count, count * sizeof(uint16_t)};
+  vImage_Buffer float16{(void*)src.data(), 1, count, count * sizeof(fp16_t)};
  vImage_Buffer float32{(void*)output.data(), 1, count, count * sizeof(float)};
  if (vImageConvert_Planar16FtoPlanarF(&float16, &float32, 0) !=
      kvImageNoError) {
@@ -1,3 +1,4 @@
+#import <ATen/native/metal/MetalUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNClampOp.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
@@ -35,9 +36,9 @@
  [encoder setTexture:[_X texture] atIndex:0];
  [encoder setTexture:[_Y texture] atIndex:1];
  id<MTLBuffer> clampBuffer = [[MPSCNNContext sharedInstance].device
-      newBufferWithLength:2 * sizeof(fp16)
+      newBufferWithLength:2 * sizeof(fp16_t)
                  options:MTLResourceOptionCPUCacheModeWriteCombined];
-  fp16* clampBufferPtr = (fp16*)[clampBuffer contents];
+  fp16_t* clampBufferPtr = (fp16_t*)[clampBuffer contents];
  clampBufferPtr[0] = _min.floatValue;
  clampBufferPtr[1] = _max.floatValue;
  [encoder setBuffer:clampBuffer offset:0 atIndex:0];

@@ -1,12 +1,6 @@
#import <Metal/Metal.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>

-#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
-typedef float16_t fp16;
-#else
-typedef uint16_t fp16;
-#endif
-
@protocol MPSCNNOp<NSObject>

@property(nonatomic, strong) MPSCNNKernel* kernel;
@@ -1,6 +1,7 @@
#import <ATen/Tensor.h>
#import <ATen/native/metal/MetalCommandBuffer.h>
#import <ATen/native/metal/MetalTensorImpl.h>
+#import <ATen/native/metal/MetalUtils.h>

#import <MetalPerformanceShaders/MetalPerformanceShaders.h>

@@ -10,7 +11,7 @@ namespace metal {

MPSImage* createStaticImage(const std::vector<int64_t>& sizes);
MPSImage* createStaticImage(
-    const uint16_t* src,
+    const fp16_t* src,
    const std::vector<int64_t>& sizes);
MPSImage* createStaticImage(
    const float* src,
@@ -35,7 +36,7 @@ MPSTemporaryImage* createTemporaryImage(

void copyToHost(float* dst, MPSImage* image);

-std::vector<uint16_t> staticImageToFp16Array(MPSImage* image);
+std::vector<fp16_t> staticImageToFp16Array(MPSImage* image);
at::Tensor staticImageToTensor(MPSImage* image);

static inline MPSImage* imageFromTensor(const Tensor& tensor) {

@@ -25,7 +25,7 @@ MPSImage* createStaticImage(const std::vector<int64_t>& sizes) {
}

MPSImage* createStaticImage(
-    const uint16_t* src,
+    const fp16_t* src,
    const std::vector<int64_t>& sizes) {
  int64_t N = sizes[0];
  int64_t C = sizes[1];
@@ -45,7 +45,7 @@ MPSImage* createStaticImage(

  int64_t slices = (C + 3) / 4 * N;
  int64_t numComponents = image.featureChannels < 3 ? image.featureChannels : 4;
-  int64_t bytesPerRow = W * numComponents * sizeof(uint16_t);
+  int64_t bytesPerRow = W * numComponents * sizeof(fp16_t);
  uint8_t* ptr = (uint8_t*)src;
  for (int i = 0; i < slices; ++i) {
    [image.texture replaceRegion:MTLRegionMake2D(0, 0, W, H)
@@ -256,7 +256,7 @@ void copyToHost(float* dst, MPSImage* image) {
  memcpy(dst, buffer.contents, buffer.length);
}

-std::vector<uint16_t> staticImageToFp16Array(MPSImage* image) {
+std::vector<fp16_t> staticImageToFp16Array(MPSImage* image) {
  if (image.pixelFormat == MTLPixelFormatR16Float ||
      image.pixelFormat == MTLPixelFormatRG16Float ||
      image.pixelFormat == MTLPixelFormatRGBA16Float) {
@@ -265,8 +265,8 @@ std::vector<uint16_t> staticImageToFp16Array(MPSImage* image) {
    int64_t numComponents =
        image.featureChannels < 3 ? image.featureChannels : 4;
    int64_t count = image.width * image.height * image.numberOfImages * C;
-    std::vector<uint16_t> output(count, 0);
-    int64_t bytesPerRow = image.width * numComponents * sizeof(uint16_t);
+    std::vector<fp16_t> output(count, 0);
+    int64_t bytesPerRow = image.width * numComponents * sizeof(fp16_t);
    uint8_t* buffer = (uint8_t*)output.data();
    for (int i = 0; i < slices * image.numberOfImages; ++i) {
      [image.texture getBytes:buffer
@@ -285,8 +285,8 @@ std::vector<uint16_t> staticImageToFp16Array(MPSImage* image) {

at::Tensor staticImageToTensor(MPSImage* image) {
  auto outputSize = [image sizes];
-  std::vector<uint16_t> fp16 = staticImageToFp16Array(image);
-  auto fp32 = metal::Fp16ToFp32(fp16);
+  std::vector<fp16_t> fp16Array = staticImageToFp16Array(image);
+  auto fp32 = metal::Fp16ToFp32(fp16Array);
  std::vector<float> fp32_nchw = metal::NC4ToNCHW(fp32.data(), outputSize);
  auto tensor = at::empty(outputSize);
  int64_t size_bytes = c10::multiply_integers(outputSize) * sizeof(float);

@@ -24,6 +24,9 @@ bool test_div();
bool test_div_broadcast();
bool test_div_broadcast2();
bool test_t();
+bool test_transpose();
+bool test_transpose2();
+bool test_transpose3();
bool test_view();
bool test_view2();
bool test_view3();
@@ -43,5 +46,8 @@ bool test_upsampling_nearest2d_vec();
bool test_adaptive_avg_pool2d();
bool test_hardtanh_();
bool test_reshape();
+bool test_mean_dim();
+bool test_mean_dim2();
+bool test_mean_dim3();

#endif

@@ -4,7 +4,6 @@
#import <ATen/native/metal/mpscnn/MPSImageUtils.h>
#import <ATen/native/metal/mpscnn/tests/MPSCNNTests.h>
#import <ATen/native/metal/ops/MetalConvolution.h>
-#import <ATen/native/metal/ops/MetalTranspose.h>

#import <Foundation/Foundation.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
@@ -490,7 +489,7 @@ bool test_t() {
    auto X1 = at::rand({H, W}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
    auto Y1 = at::t(X1).contiguous();
    auto X2 = X1.metal();
-    auto Y2 = at::native::metal::t(X2).cpu();
+    auto Y2 = at::t(X2).cpu();
    return almostEqual(Y1, Y2);
  });
  if (!b) {
@@ -500,6 +499,39 @@ bool test_t() {
  return result;
}

+bool test_transpose() {
+  __block std::vector<int64_t> size{1, 2, 2, 5};
+  return TEST(size, __PRETTY_FUNCTION__, ^bool {
+    auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+    auto Y1 = at::transpose(X1, 1, 3).contiguous();
+    auto X2 = X1.metal();
+    auto Y2 = at::transpose(X2, 1, 3).cpu();
+    return almostEqual(Y1, Y2);
+  });
+}
+
+bool test_transpose2() {
+  __block std::vector<int64_t> size{1, 2, 58, 28, 28};
+  return TEST(size, __PRETTY_FUNCTION__, ^bool {
+    auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+    auto Y1 = at::transpose(X1, 1, 2).contiguous();
+    auto X2 = X1.metal();
+    auto Y2 = at::transpose(X2, 1, 2).cpu();
+    return almostEqual(Y1, Y2);
+  });
+}
+
+bool test_transpose3() {
+  __block std::vector<int64_t> size{4, 5, 6};
+  return TEST(size, __PRETTY_FUNCTION__, ^bool {
+    auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+    auto Y1 = at::transpose(X1, 2, 0).contiguous();
+    auto X2 = X1.metal();
+    auto Y2 = at::transpose(X2, 2, 0).cpu();
+    return almostEqual(Y1, Y2);
+  });
+}
+
bool test_view() {
  // array -> array
  __block std::vector<int64_t> size{1, 10, 2, 2};
@@ -775,3 +807,38 @@ bool test_hardtanh_() {
  return true;
#endif
}
+
+bool test_mean_dim() {
+  __block std::vector<int64_t> size{1, 5, 2, 2};
+  return TEST(size, __PRETTY_FUNCTION__, ^bool {
+    auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+    auto Y1 = at::mean(X1, {2, 3}, true);
+    auto X2 = X1.metal();
+    auto Y2 = at::mean(X2, {2, 3}, true).cpu();
+    return almostEqual(Y1, Y2);
+  });
+}
+
+bool test_mean_dim2() {
+  __block std::vector<int64_t> size{1, 5, 2, 2};
+  return TEST(size, __PRETTY_FUNCTION__, ^bool {
+    auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+    auto Y1 = at::mean(X1, {1, 3}, false);
+    auto X2 = X1.metal();
+    auto Y2 = at::mean(X2, {1, 3}, false).cpu();
+    return almostEqual(Y1, Y2);
+  });
+}
+
+bool test_mean_dim3() {
+  __block std::vector<int64_t> size{1, 5, 2, 2};
+  return TEST(size, __PRETTY_FUNCTION__, ^bool {
+    auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+    auto Y1 = at::mean(X1, {0, 1, 2, 3});
+    PRINT_TENSOR("Y1", Y1);
+    auto X2 = X1.metal();
+    auto Y2 = at::mean(X2, {0, 1, 2, 3}).cpu();
+    PRINT_TENSOR("Y2", Y2);
+    return almostEqual(Y1, Y2);
+  });
+}

aten/src/ATen/native/metal/ops/MetalReduce.mm (new file, 84 lines)
@@ -0,0 +1,84 @@
#include <ATen/Tensor.h>
#import <ATen/native/metal/MetalCommandBuffer.h>
#import <ATen/native/metal/MetalTensorImpl.h>
#import <ATen/native/metal/MetalTensorImplStorage.h>
#import <ATen/native/metal/MetalUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
#import <ATen/native/metal/mpscnn/MPSImage+Tensor.h>
#import <ATen/native/metal/mpscnn/MPSImageUtils.h>

#include <ATen/ATen.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <torch/library.h>

namespace at {
namespace native {
namespace metal {

API_AVAILABLE(ios(11.3), macos(10.13))
static inline MPSNNReduceUnary* kernelForReducedDim(int dim) {
  id<MTLDevice> device = [MPSCNNContext sharedInstance].device;
  if (dim == 3) {
    return [[MPSNNReduceRowMean alloc] initWithDevice:device];
  } else if (dim == 2) {
    return [[MPSNNReduceColumnMean alloc] initWithDevice:device];
  } else if (dim == 1) {
    return [[MPSNNReduceFeatureChannelsMean alloc] initWithDevice:device];
  }
  return nil;
}

Tensor wrapper_mean_dim(
    const Tensor& input,
    IntArrayRef dims,
    bool keepdim,
    c10::optional<ScalarType> dtype) {
  if (@available(iOS 11.3, *)) {
    MPSImage* X = imageFromTensor(input);
    auto textureSize = input.sizes().vec();
    TORCH_CHECK(textureSize.size() == 4);
    // TODO: [T87340633] Support reducing the batch dimension
    TORCH_CHECK(textureSize[0] == 1);
    auto mask = make_dim_mask(dims, input.dim());
    MetalCommandBuffer* commandBuffer = getCommandBufferFromTensor(input);
    MPSImage* Y = nil;
    for (int dim : dims) {
      textureSize[dim] = 1;
      MPSNNReduceUnary* kernel = kernelForReducedDim(dim);
      if (kernel) {
        Y = createTemporaryImage(commandBuffer, textureSize);
        [kernel encodeToCommandBuffer:commandBuffer.buffer
                          sourceImage:X
                     destinationImage:Y];
        X = Y;
      }
    }
    MetalTensorImplStorage mt{textureSize};
    mt.texture()->setCommandBuffer(commandBuffer);
    mt.texture()->copyFromTexture(Y);
    auto shape = DimVector(input.sizes());
    for (int dim = shape.size() - 1; dim >= 0; dim--) {
      if (mask[dim]) {
        if (keepdim) {
          shape[dim] = 1;
        } else {
          shape.erase(shape.begin() + dim);
        }
      }
    }
    auto output = makeTensor(std::move(mt), input.options()).view(shape);
    return output;
  } else {
    // TODO: [T87350528] Fall back to shader kernels for iOS 10.0 users
    TORCH_CHECK(
        false, "MPSNNReduceUnary is only available on iOS 11.3 and above");
  }
}

TORCH_LIBRARY_IMPL(aten, Metal, m) {
  m.impl("mean.dim", TORCH_FN(wrapper_mean_dim));
};

}
}
}
@@ -1,17 +0,0 @@
-#ifndef MetalCopy_h
-#define MetalCopy_h
-
-#include <ATen/Tensor.h>
-
-namespace at {
-namespace native {
-namespace metal {
-
-// TODO: Remove the header once we are able to call it through dispatcher
-Tensor t(const Tensor& input);
-
-} // namespace metal
-} // namespace native
-} // namespace at
-
-#endif
@ -3,37 +3,94 @@
#import <ATen/native/metal/MetalTensorImplStorage.h>
#import <ATen/native/metal/MetalUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
#import <ATen/native/metal/mpscnn/MPSCNNUtils.h>
#import <ATen/native/metal/mpscnn/MPSImage+Tensor.h>
#import <ATen/native/metal/mpscnn/MPSImageUtils.h>

#include <ATen/ATen.h>
#include <torch/library.h>

namespace at {
namespace native {
namespace metal {

Tensor transpose(const Tensor& input, int64_t dim0, int64_t dim1) {
TORCH_CHECK(input.is_metal());
auto ndims = input.dim();
dim0 = maybe_wrap_dim(dim0, ndims);
dim1 = maybe_wrap_dim(dim1, ndims);
if (dim0 == dim1) {
return input;
}
auto outputSizes = input.sizes().vec();
std::swap(outputSizes[dim0], outputSizes[dim1]);
MPSImage* X = imageFromTensor(input);
MetalCommandBuffer* commandBuffer = getCommandBufferFromTensor(input);
if (input.dim() == 2) {
MetalTensorImplStorage mt{outputSizes};
mt.texture()->allocateTemporaryTextureStorage(outputSizes, commandBuffer);
MPSImage* Y = mt.texture()->image();
MPSImageTranspose* transpose = [[MPSImageTranspose alloc]
initWithDevice:[MPSCNNContext sharedInstance].device];
[transpose encodeToCommandBuffer:commandBuffer.buffer
sourceImage:X
destinationImage:Y];
auto output = makeTensor(std::move(mt), input.options());
return output;
} else {
id<MTLBuffer> sizeBuf1 = makeMTLBuffer<ushort>(
std::vector<ushort>{input.sizes().begin(), input.sizes().end()});
id<MTLBuffer> sizeBuf2 = makeMTLBuffer<ushort>(
std::vector<ushort>{outputSizes.begin(), outputSizes.end()});
id<MTLBuffer> indexBuf = makeMTLBuffer(std::vector<ushort>(input.dim(), 1));
MetalTensorImplStorage mt{outputSizes};
mt.texture()->allocateTemporaryTextureStorage(outputSizes, commandBuffer);
MPSImage* Y = mt.texture()->image();
id<MTLComputeCommandEncoder> encoder =
[commandBuffer.buffer computeCommandEncoder];
id<MTLComputePipelineState> state =
[[MPSCNNContext sharedInstance] specializedPipelineState:@"transpose"
Constants:@[
@(dim0),
@(dim1),
@(input.dim()),
@(X.numberOfImages),
@(X.featureChannels),
@(Y.numberOfImages),
@(Y.featureChannels),
]];

[encoder setComputePipelineState:state];
[encoder setTexture:[X texture] atIndex:0];
[encoder setTexture:[Y texture] atIndex:1];
[encoder setBuffer:sizeBuf1 offset:0 atIndex:0];
[encoder setBuffer:sizeBuf2 offset:0 atIndex:1];
[encoder setBuffer:indexBuf offset:0 atIndex:2];

const auto& launchParams =
mpscnn::spatialPointwiseKernelLaunchParams(state, Y);
[encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
[encoder endEncoding];
[X markRead];
[Y markRead];

auto output = makeTensor(std::move(mt), input.options());
return output;
}
}

Tensor t(const Tensor& input) {
TORCH_CHECK(input.is_metal());
return metal::transpose(input, 0, input.dim() < 2 ? 0 : 1);
}

TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl("t", TORCH_FN(t));
m.impl("transpose.int", TORCH_FN(transpose));
};

}
}
}
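As a side note on the dimension handling in transpose() above, here is a small self-contained sketch of the wrap-and-swap logic; wrap_dim is a hypothetical stand-in for at::maybe_wrap_dim, which maps negative dimension indices into [0, ndims).

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for at::maybe_wrap_dim.
int64_t wrap_dim(int64_t dim, int64_t ndims) {
  if (dim < 0) {
    dim += ndims;  // e.g. -1 on a 3-d tensor becomes 2
  }
  assert(dim >= 0 && dim < ndims);
  return dim;
}

int main() {
  std::vector<int64_t> sizes{2, 3, 4};
  int64_t d0 = wrap_dim(-1, sizes.size());  // 2
  int64_t d1 = wrap_dim(0, sizes.size());   // 0
  std::swap(sizes[d0], sizes[d1]);          // output sizes: {4, 3, 2}
  assert((sizes == std::vector<int64_t>{4, 3, 2}));
  return 0;
}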
@ -5034,9 +5034,6 @@
structured_delegate: digamma.out
variants: method

- func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!)
variants: method

- func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!)
variants: method
dispatch:
@ -5804,6 +5801,11 @@
dispatch:
CompositeExplicitAutograd: polygamma

- func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!)
variants: method
dispatch:
CompositeExplicitAutograd: polygamma_

- func: erfinv(Tensor self) -> Tensor
structured_delegate: erfinv.out
variants: method, function
@ -7439,6 +7441,7 @@
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
python_module: nn
structured: True
dispatch:
CPU: adaptive_max_pool2d_out_cpu
CUDA: adaptive_max_pool2d_out_cuda
@ -7446,9 +7449,7 @@
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)
python_module: nn
dispatch:
CPU: adaptive_max_pool2d_cpu
CUDA: adaptive_max_pool2d_cuda
structured_delegate: adaptive_max_pool2d.out

- func: adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
python_module: nn
@ -7465,6 +7466,7 @@
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
python_module: nn
structured: True
dispatch:
CPU: adaptive_max_pool3d_out_cpu
CUDA: adaptive_max_pool3d_out_cuda
@ -7472,9 +7474,7 @@
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor)
python_module: nn
dispatch:
CPU: adaptive_max_pool3d_cpu
CUDA: adaptive_max_pool3d_cuda
structured_delegate: adaptive_max_pool3d.out

- func: adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
python_module: nn
@ -8637,6 +8637,12 @@
dispatch:
CompositeExplicitAutograd: linalg_lstsq

- func: linalg_lstsq.out(Tensor self, Tensor b, float? cond=None, *, str? driver=None, Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values) -> (Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values)
python_module: linalg
variants: function
dispatch:
CPU, CUDA: linalg_lstsq_out

- func: _lstsq_helper_(Tensor(a!) self, Tensor(b!) rank, Tensor(c!) singular_values, Tensor(d!) infos, Tensor a, float cond, str driver_name) -> Tensor(a!)
variants: function
dispatch:
@ -12,6 +12,8 @@ namespace {
template <typename T>
class MinMax : public ::testing::Test {};
template <typename T>
class Nan : public ::testing::Test {};
template <typename T>
class Interleave : public ::testing::Test {};
template <typename T>
class SignManipulation : public ::testing::Test {};
@ -67,6 +69,7 @@ namespace {
TYPED_TEST_CASE(Comparison, RealFloatIntTestedTypes);
TYPED_TEST_CASE(Bitwise, FloatIntTestedTypes);
TYPED_TEST_CASE(MinMax, RealFloatIntTestedTypes);
TYPED_TEST_CASE(Nan, RealFloatTestedTypes);
TYPED_TEST_CASE(Interleave, RealFloatIntTestedTypes);
TYPED_TEST_CASE(SignManipulation, FloatIntTestedTypes);
TYPED_TEST_CASE(Rounding, RealFloatTestedTypes);
@ -435,9 +438,29 @@ namespace {
[](const vec& v) { return v.erfinv(); },
createDefaultUnaryTestCase<vec>(TestSeed(), false, true));
}



TYPED_TEST(Nan, IsNan) {
using vec = TypeParam;
using VT = ValueType<TypeParam>;
CACHE_ALIGN VT test_vals[vec::size()];
CACHE_ALIGN VT expected_vals[vec::size()];
auto vals = 1 << (vec::size());
for (int val = 0; val < vals; ++val) {
for (int i = 0; i < vec::size(); ++i) {
if (val & (1 << i)) {
test_vals[i] = std::numeric_limits<VT>::quiet_NaN();
// All bits are set to 1 if true, otherwise 0.
// same rule as at::Vec256<T>::binary_pred.
std::memset(static_cast<void*>(&expected_vals[i]), 0xFF, sizeof(VT));
} else {
test_vals[i] = (VT)0.123;
std::memset(static_cast<void*>(&expected_vals[i]), 0, sizeof(VT));
}
}
vec actual = vec::loadu(test_vals).isnan();
vec expected = vec::loadu(expected_vals);
AssertVec256<vec>(NAME_INFO(isnan), expected, actual).check();
}
}
TYPED_TEST(LGamma, LGamma) {
using vec = TypeParam;
using UVT = UvalueType<vec>;
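The comment inside the test points at a convention worth spelling out: vectorized predicates such as isnan() return lanes that are all 1-bits for true and all 0-bits for false, not the scalar values 1 and 0. A scalar sketch of one lane, with lane_isnan as a hypothetical helper:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// One lane of a SIMD-style predicate: all bits set for true, zero for false.
float lane_isnan(float x) {
  uint32_t bits = std::isnan(x) ? 0xFFFFFFFFu : 0u;
  float out;
  std::memcpy(&out, &bits, sizeof(out));  // reinterpret the mask as a float
  return out;
}

int main() {
  uint32_t t_bits, f_bits;
  float t = lane_isnan(std::nanf(""));
  float f = lane_isnan(0.123f);
  std::memcpy(&t_bits, &t, sizeof(t_bits));
  std::memcpy(&f_bits, &f, sizeof(f_bits));
  assert(t_bits == 0xFFFFFFFFu);
  assert(f_bits == 0u);
  return 0;
}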
@ -32,8 +32,7 @@ static void FusedOverhead(benchmark::State& state) {
}

static void UnfusedOverhead(benchmark::State& state) {
torch::NoGradGuard ng;
torch::AutoNonVariableTypeMode nv;
c10::InferenceMode guard;
overrideCanFuseOnCPU(false);

Module m("m");

@ -244,7 +244,7 @@ if __name__ == '__main__':
vlrnns = ['vl_cudnn', 'vl_jit', 'vl_py']

if args.print_json:
print_stderr = lambda *args, **kwargs: None # noqa
print_stderr = lambda *args, **kwargs: None # noqa: E731,F811
print_stderr(args)

bench_args = copy.deepcopy(vars(args))
@ -239,7 +239,6 @@ def varlen_lstm_factory(cell, script):
def dynamic_rnn(sequences: List[Tensor], hiddens: Tuple[Tensor, Tensor], wih: Tensor,
whh: Tensor, bih: Tensor, bhh: Tensor
) -> Tuple[List[Tensor], Tuple[List[Tensor], List[Tensor]]]:
# noqa
hx, cx = hiddens
hxs = hx.unbind(1)
cxs = cx.unbind(1)
@ -1,11 +1,11 @@
import operator_benchmark as op_bench
from pt import ( # noqa
add_test, as_strided_test, batchnorm_test, binary_test, cat_test, # noqa
channel_shuffle_test, chunk_test, conv_test, diag_test, embeddingbag_test, # noqa
fill_test, gather_test, linear_test, matmul_test, nan_to_num_test, pool_test, # noqa
softmax_test, hardsigmoid_test, hardswish_test, layernorm_test, # noqa
groupnorm_test, interpolate_test, instancenorm_test, remainder_test, softmax_test, # noqa
split_test, sum_test, tensor_to_test # noqa
from pt import ( # noqa: F401
add_test, as_strided_test, batchnorm_test, binary_test, cat_test,
channel_shuffle_test, chunk_test, conv_test, diag_test, embeddingbag_test,
fill_test, gather_test, linear_test, matmul_test, nan_to_num_test, pool_test,
softmax_test, hardsigmoid_test, hardswish_test, layernorm_test,
groupnorm_test, interpolate_test, instancenorm_test, remainder_test,
split_test, sum_test, tensor_to_test
)

if __name__ == "__main__":

@ -1,5 +1,5 @@
import operator_benchmark as op_bench
from pt import ( # noqa
from pt import ( # noqa: F401
qactivation_test,
qarithmetic_test,
qbatchnorm_test,
@ -1,9 +1,9 @@
import operator_benchmark as op_bench
from pt import ( # noqa
unary_test, # noqa
from pt import ( # noqa: F401
unary_test,
)
import benchmark_all_other_test # noqa
import benchmark_all_quantized_test # noqa
import benchmark_all_other_test # noqa: F401
import benchmark_all_quantized_test # noqa: F401

if __name__ == "__main__":
op_bench.benchmark_runner.main()

@ -7,7 +7,7 @@ import copy
import ast

# needs to be imported after torch
import torch.utils.cpp_extension as cpp_extension # noqa
import torch.utils.cpp_extension as cpp_extension # noqa: F401

import benchmark_utils
from collections import namedtuple

@ -1,7 +1,7 @@
import time
import json
import torch
import cpp_extension # noqa
import cpp_extension # noqa: F401


"""PyTorch performance microbenchmarks.
@ -1,6 +1,6 @@
|
||||
import operator_benchmark as op_bench
|
||||
import benchmark_caffe2 as op_bench_c2
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
|
||||
from caffe2.python import core
|
||||
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
import benchmark_caffe2 as op_bench_c2
|
||||
import operator_benchmark as op_bench
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
|
||||
from caffe2.python import core
|
||||
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
import benchmark_caffe2 as op_bench_c2
|
||||
import operator_benchmark as op_bench
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
|
||||
from caffe2.python import core
|
||||
import numpy
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
import benchmark_caffe2 as op_bench_c2
|
||||
import operator_benchmark as op_bench
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
|
||||
from caffe2.python import core, dyndep
|
||||
|
||||
dyndep.InitOpsLibrary("@/caffe2/caffe2/fb/operators:clip_ranges_op")
|
||||
|
@ -1,7 +1,7 @@
|
||||
import operator_benchmark as op_bench
|
||||
import benchmark_caffe2 as op_bench_c2
|
||||
import random
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
|
||||
from caffe2.python import core
|
||||
|
||||
|
||||
@ -17,9 +17,9 @@ cross_product_configs = {
|
||||
cat_configs_short = op_bench.config_list(
|
||||
attr_names=['sizes', 'N', 'axis'],
|
||||
attrs=[
|
||||
[(1, 1, 1), 2, 0], # noqa
|
||||
[(512, 512, 2), 2, 1], # noqa
|
||||
[(128, 1024, 2), 2, 1], # noqa
|
||||
[(1, 1, 1), 2, 0], # noqa: E241
|
||||
[(512, 512, 2), 2, 1], # noqa: E241
|
||||
[(128, 1024, 2), 2, 1], # noqa: E241
|
||||
],
|
||||
cross_product_configs=cross_product_configs,
|
||||
tags=['short'],
|
||||
@ -29,14 +29,14 @@ cat_configs_short = op_bench.config_list(
|
||||
cat_configs_static_runtime = op_bench.config_list(
|
||||
attr_names=['sizes', 'N', 'axis', 'add_axis'],
|
||||
attrs=[
|
||||
[(1, 40), 5, 1, 1], # noqa
|
||||
[[(1, 160), (1, 14)], -1, 1, 0], # noqa
|
||||
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1, 0], # noqa
|
||||
[[(1, 580), (1, 174)], -1, 1, 0], # noqa
|
||||
[(20, 40), 5, 1, 1], # noqa
|
||||
[[(20, 160), (20, 14)], -1, 1, 0], # noqa
|
||||
[[(20, 20, 40), (20, 4, 40), (20, 5, 40)], -1, 1, 0], # noqa
|
||||
[[(20, 580), (20, 174)], -1, 1, 0], # noqa
|
||||
[(1, 40), 5, 1, 1],
|
||||
[[(1, 160), (1, 14)], -1, 1, 0],
|
||||
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1, 0],
|
||||
[[(1, 580), (1, 174)], -1, 1, 0],
|
||||
[(20, 40), 5, 1, 1],
|
||||
[[(20, 160), (20, 14)], -1, 1, 0],
|
||||
[[(20, 20, 40), (20, 4, 40), (20, 5, 40)], -1, 1, 0],
|
||||
[[(20, 580), (20, 174)], -1, 1, 0],
|
||||
],
|
||||
cross_product_configs=cross_product_configs,
|
||||
tags=['static_runtime'],
|
||||
@ -45,22 +45,22 @@ cat_configs_static_runtime = op_bench.config_list(
|
||||
cat_configs_long = op_bench.config_list(
|
||||
attr_names=['sizes', 'N', 'axis'],
|
||||
attrs=[
|
||||
[(2**10, 2**10, 2), 2, 0], # noqa
|
||||
[(2**10+1, 2**10-1, 2), 2, 1], # noqa
|
||||
[(2**10, 2**10, 2), 2, 2], # noqa
|
||||
[(2**10, 2**10, 2), 2, 0], # noqa: E241
|
||||
[(2**10+1, 2**10-1, 2), 2, 1], # noqa: E226,E241
|
||||
[(2**10, 2**10, 2), 2, 2], # noqa: E241
|
||||
|
||||
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa
|
||||
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa: E201,E226,E241
|
||||
5, 0],
|
||||
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa
|
||||
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa: E201,E226,E241,E272
|
||||
5, 1],
|
||||
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa
|
||||
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa: E201,E241,E272
|
||||
5, 2],
|
||||
|
||||
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa
|
||||
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa: E241
|
||||
50, 0],
|
||||
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa
|
||||
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
|
||||
50, 1],
|
||||
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa
|
||||
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa: E226,E241,E272
|
||||
50, 2],
|
||||
],
|
||||
cross_product_configs=cross_product_configs,
|
||||
@ -71,9 +71,9 @@ cat_configs_long = op_bench.config_list(
|
||||
cat_configs_multidim = op_bench.config_list(
|
||||
attr_names=['sizes', 'N', 'axis', 'dtype'],
|
||||
attrs=[
|
||||
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa
|
||||
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa
|
||||
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa
|
||||
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
|
||||
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
|
||||
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa: E226,E241
|
||||
],
|
||||
cross_product_configs=cross_product_configs,
|
||||
tags=['multidim'],
|
||||
|
@ -1,7 +1,7 @@
|
||||
|
||||
import operator_benchmark as op_bench
|
||||
import benchmark_caffe2 as op_bench_c2
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
|
||||
from caffe2.python import core
|
||||
|
||||
"""Microbenchmarks for MatMul operator"""
|
||||
|
@ -1,6 +1,6 @@
|
||||
import benchmark_caffe2 as op_bench_c2
|
||||
import operator_benchmark as op_bench
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
|
||||
from caffe2.python import core
|
||||
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
import benchmark_caffe2 as op_bench_c2
|
||||
import operator_benchmark as op_bench
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
|
||||
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
|
||||
from caffe2.python import core
|
||||
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
# TODO (mingzhe09088): get rid of noqa
|
||||
import benchmark_runner # noqa
|
||||
from benchmark_pytorch import TorchBenchmarkBase # noqa
|
||||
from benchmark_test_generator import * # noqa
|
||||
from benchmark_utils import * # noqa
|
||||
import benchmark_runner # noqa: F401
|
||||
from benchmark_pytorch import TorchBenchmarkBase # noqa: F401
|
||||
from benchmark_test_generator import * # noqa: F401,F403
|
||||
from benchmark_utils import * # noqa: F401,F403
|
||||
|
@ -14,9 +14,9 @@ cross_product_configs = {
|
||||
cat_configs_short = op_bench.config_list(
|
||||
attr_names=['sizes', 'N', 'dim'],
|
||||
attrs=[
|
||||
[(1, 1, 1), 2, 0], # noqa
|
||||
[(512, 512, 2), 2, 1], # noqa
|
||||
[(128, 1024, 2), 2, 1], # noqa
|
||||
[(1, 1, 1), 2, 0], # noqa: E241
|
||||
[(512, 512, 2), 2, 1], # noqa: E241
|
||||
[(128, 1024, 2), 2, 1], # noqa: E241
|
||||
],
|
||||
cross_product_configs=cross_product_configs,
|
||||
tags=['short'],
|
||||
@ -26,12 +26,12 @@ cat_configs_short = op_bench.config_list(
|
||||
cat_configs_static_runtime = op_bench.config_list(
|
||||
attr_names=['sizes', 'N', 'dim'],
|
||||
attrs=[
|
||||
[[(1, 160), (1, 14)], -1, 1], # noqa
|
||||
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1], # noqa
|
||||
[[(1, 580), (1, 174)], -1, 1], # noqa
|
||||
[[(20, 160), (20, 14)], -1, 1], # noqa
|
||||
[[(20, 20, 40), (20, 4, 40), (20, 5, 40)], -1, 1], # noqa
|
||||
[[(20, 580), (20, 174)], -1, 1], # noqa
|
||||
[[(1, 160), (1, 14)], -1, 1],
|
||||
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1],
|
||||
[[(1, 580), (1, 174)], -1, 1],
|
||||
[[(20, 160), (20, 14)], -1, 1],
|
||||
[[(20, 20, 40), (20, 4, 40), (20, 5, 40)], -1, 1],
|
||||
[[(20, 580), (20, 174)], -1, 1],
|
||||
],
|
||||
cross_product_configs=cross_product_configs,
|
||||
tags=['static_runtime'],
|
||||
@ -40,22 +40,22 @@ cat_configs_static_runtime = op_bench.config_list(
|
||||
cat_configs_long = op_bench.config_list(
|
||||
attr_names=['sizes', 'N', 'dim'],
|
||||
attrs=[
|
||||
[(2**10, 2**10, 2), 2, 0], # noqa
|
||||
[(2**10+1, 2**10-1, 2), 2, 1], # noqa
|
||||
[(2**10, 2**10, 2), 2, 2], # noqa
|
||||
[(2**10, 2**10, 2), 2, 0], # noqa: E241
|
||||
[(2**10+1, 2**10-1, 2), 2, 1], # noqa: E226,E241
|
||||
[(2**10, 2**10, 2), 2, 2], # noqa: E241
|
||||
|
||||
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa
|
||||
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa: E201,E226,E241
|
||||
5, 0],
|
||||
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa
|
||||
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa: E201,E226,E241,E272
|
||||
5, 1],
|
||||
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa
|
||||
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa: E201,E241,E272
|
||||
5, 2],
|
||||
|
||||
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa
|
||||
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa: E241
|
||||
50, 0],
|
||||
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa
|
||||
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
|
||||
50, 1],
|
||||
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa
|
||||
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa: E226,E241,E272
|
||||
50, 2],
|
||||
],
|
||||
cross_product_configs=cross_product_configs,
|
||||
@ -66,9 +66,9 @@ cat_configs_long = op_bench.config_list(
|
||||
cat_configs_multidim = op_bench.config_list(
|
||||
attr_names=['sizes', 'N', 'dim'],
|
||||
attrs=[
|
||||
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa
|
||||
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa
|
||||
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa
|
||||
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
|
||||
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
|
||||
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa: E226,E241
|
||||
],
|
||||
cross_product_configs=cross_product_configs,
|
||||
tags=['multidim'],
|
||||
|
@ -8,17 +8,17 @@ r"""Microbenchmarks for the quantized activations."""
|
||||
qactivation_long_configs = op_bench.cross_product_configs(
|
||||
dims=(
|
||||
# VGG-16 relu's with original shape: (-1, 3, 224, 224)
|
||||
( 64, 224, 224), # ReLU-1 # noqa
|
||||
(128, 112, 112), # ReLU-6 # noqa
|
||||
(256, 56, 56), # ReLU-11 # noqa
|
||||
(512, 28, 28), # ReLU-18 # noqa
|
||||
(512, 14, 14), # ReLU-25 # noqa
|
||||
( 64, 224, 224), # ReLU-1 # noqa: E201
|
||||
(128, 112, 112), # ReLU-6
|
||||
(256, 56, 56), # ReLU-11 # noqa: E241
|
||||
(512, 28, 28), # ReLU-18 # noqa: E241
|
||||
(512, 14, 14), # ReLU-25 # noqa: E241
|
||||
# Batch = 16
|
||||
(16, 64, 224, 224), # ReLU-1 # noqa
|
||||
(16, 128, 112, 112), # ReLU-6 # noqa
|
||||
(16, 256, 56, 56), # ReLU-11 # noqa
|
||||
(16, 512, 28, 28), # ReLU-18 # noqa
|
||||
(16, 512, 14, 14), # ReLU-25 # noqa
|
||||
(16, 64, 224, 224), # ReLU-1 # noqa: E241
|
||||
(16, 128, 112, 112), # ReLU-6
|
||||
(16, 256, 56, 56), # ReLU-11 # noqa: E241
|
||||
(16, 512, 28, 28), # ReLU-18 # noqa: E241
|
||||
(16, 512, 14, 14), # ReLU-25 # noqa: E241
|
||||
),
|
||||
contig=(False, True),
|
||||
inplace=(False, True),
|
||||
|
@ -43,7 +43,7 @@ qobserver_per_tensor_configs_short = op_bench.config_list(
|
||||
cross_product_configs={
|
||||
'qscheme': (torch.per_tensor_affine, torch.per_tensor_symmetric)
|
||||
},
|
||||
**qobserver_short_configs_dict, # noqa
|
||||
**qobserver_short_configs_dict,
|
||||
)
|
||||
|
||||
qobserver_per_tensor_configs_long = op_bench.cross_product_configs(
|
||||
@ -67,7 +67,7 @@ q_hist_observer_per_tensor_configs_short = op_bench.config_list(
|
||||
cross_product_configs={
|
||||
'qscheme': (torch.per_tensor_affine, torch.per_tensor_symmetric)
|
||||
},
|
||||
**q_hist_observer_short_configs_dict, # noqa
|
||||
**q_hist_observer_short_configs_dict,
|
||||
)
|
||||
|
||||
q_hist_observer_per_tensor_configs_long = op_bench.cross_product_configs(
|
||||
|
@ -6,11 +6,11 @@ import operator_benchmark as op_bench
|
||||
qpool2d_long_configs = op_bench.config_list(
|
||||
attrs=(
|
||||
# C H W k s p
|
||||
( 1, 3, 3, (3, 3), (1, 1), (0, 0)), # dummy # noqa
|
||||
( 3, 64, 64, (3, 3), (2, 2), (1, 1)), # dummy # noqa
|
||||
( 1, 3, 3, (3, 3), (1, 1), (0, 0)), # dummy # noqa: E201,E241
|
||||
( 3, 64, 64, (3, 3), (2, 2), (1, 1)), # dummy # noqa: E201,E241
|
||||
# VGG16 pools with original input shape: (-1, 3, 224, 224)
|
||||
( 64, 224, 224, (2, 2), (2, 2), (0, 0)), # MaxPool2d-4 # noqa
|
||||
(256, 56, 56, (2, 2), (2, 2), (0, 0)), # MaxPool2d-16 # noqa
|
||||
( 64, 224, 224, (2, 2), (2, 2), (0, 0)), # MaxPool2d-4 # noqa: E201
|
||||
(256, 56, 56, (2, 2), (2, 2), (0, 0)), # MaxPool2d-16 # noqa: E241
|
||||
),
|
||||
attr_names=('C', 'H', 'W', # Input layout
|
||||
'k', 's', 'p'), # Pooling parameters
|
||||
@ -23,7 +23,7 @@ qpool2d_long_configs = op_bench.config_list(
|
||||
)
|
||||
|
||||
qpool2d_short_configs = op_bench.config_list(
|
||||
attrs=((1, 3, 3, (3, 3), (1, 1), (0, 0)),), # dummy # noqa
|
||||
attrs=((1, 3, 3, (3, 3), (1, 1), (0, 0)),), # dummy
|
||||
attr_names=('C', 'H', 'W', # Input layout
|
||||
'k', 's', 'p'), # Pooling parameters
|
||||
cross_product_configs={
|
||||
@ -37,15 +37,15 @@ qpool2d_short_configs = op_bench.config_list(
|
||||
qadaptive_avgpool2d_long_configs = op_bench.cross_product_configs(
|
||||
input_size=(
|
||||
# VGG16 pools with original input shape: (-1, 3, 224, 224)
|
||||
(112, 112), # MaxPool2d-9 # noqa
|
||||
(112, 112), # MaxPool2d-9
|
||||
),
|
||||
output_size=(
|
||||
(448, 448),
|
||||
# VGG16 pools with original input shape: (-1, 3, 224, 224)
|
||||
(224, 224), # MaxPool2d-4 # noqa
|
||||
(112, 112), # MaxPool2d-9 # noqa
|
||||
( 56, 56), # MaxPool2d-16 # noqa
|
||||
( 14, 14), # MaxPool2d-30 # noqa
|
||||
(224, 224), # MaxPool2d-4
|
||||
(112, 112), # MaxPool2d-9
|
||||
( 56, 56), # MaxPool2d-16 # noqa: E201,E241
|
||||
( 14, 14), # MaxPool2d-30 # noqa: E201,E241
|
||||
),
|
||||
N=(1, 4),
|
||||
C=(1, 3, 64, 128),
|
||||
|
@ -10,8 +10,8 @@ from typing import List
|
||||
stack_configs_static_runtime = op_bench.config_list(
|
||||
attr_names=['sizes', 'N'],
|
||||
attrs=[
|
||||
[(20, 40), 5], # noqa
|
||||
[(1, 40), 5], # noqa
|
||||
[(20, 40), 5],
|
||||
[(1, 40), 5],
|
||||
],
|
||||
cross_product_configs={
|
||||
'device': ['cpu', 'cuda'],
|
||||
@ -23,9 +23,9 @@ stack_configs_static_runtime = op_bench.config_list(
|
||||
stack_configs_short = op_bench.config_list(
|
||||
attr_names=['sizes', 'N'],
|
||||
attrs=[
|
||||
[(1, 1, 1), 2], # noqa
|
||||
[(512, 512, 2), 2], # noqa
|
||||
[(128, 1024, 2), 2], # noqa
|
||||
[(1, 1, 1), 2], # noqa: E241
|
||||
[(512, 512, 2), 2], # noqa: E241
|
||||
[(128, 1024, 2), 2], # noqa: E241
|
||||
],
|
||||
cross_product_configs={
|
||||
'device': ['cpu', 'cuda'],
|
||||
@ -37,9 +37,9 @@ stack_configs_short = op_bench.config_list(
|
||||
stack_configs_long = op_bench.config_list(
|
||||
attr_names=['sizes', 'N'],
|
||||
attrs=[
|
||||
[(2**10, 2**10, 2), 2], # noqa
|
||||
[(2**10+1, 2**10-1, 2), 2], # noqa
|
||||
[(2**10, 2**10, 2), 2], # noqa
|
||||
[(2**10, 2**10, 2), 2], # noqa: E241
|
||||
[(2**10+1, 2**10-1, 2), 2], # noqa: E226,E241
|
||||
[(2**10, 2**10, 2), 2], # noqa: E241
|
||||
],
|
||||
cross_product_configs={
|
||||
'device': ['cpu', 'cuda'],
|
||||
@ -52,9 +52,9 @@ stack_configs_long = op_bench.config_list(
|
||||
stack_configs_multidim = op_bench.config_list(
|
||||
attr_names=['sizes', 'N'],
|
||||
attrs=[
|
||||
[(2**6, 2**5, 2**2, 2**4, 2**5), 2], # noqa
|
||||
[(2**4, 2**5, 2**2, 2**4, 2**5), 8], # noqa
|
||||
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17], # noqa
|
||||
[(2**6, 2**5, 2**2, 2**4, 2**5), 2], # noqa: E241
|
||||
[(2**4, 2**5, 2**2, 2**4, 2**5), 8], # noqa: E241
|
||||
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17], # noqa: E226,E241
|
||||
],
|
||||
cross_product_configs={
|
||||
'device': ['cpu', 'cuda'],
|
||||
|
@ -1,6 +1,6 @@
import unittest

import cpp_extension # noqa
import cpp_extension # noqa: F401
import torch


@ -86,7 +86,7 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) {
return Backend::QuantizedCPU;
} else if (t == DispatchKey::QuantizedCUDA) {
return Backend::QuantizedCUDA;
} else if (t == DispatchKey::XPU) {
} else if (t == DispatchKey::XPU || t == DispatchKey::AutogradXPU) {
return Backend::XPU;
} else if (t == DispatchKey::SparseXPU) {
return Backend::SparseXPU;

@ -146,6 +146,8 @@ DispatchKey getAutogradKeyFromBackend(DispatchKey t) {
switch (t) {
case DispatchKey::CPU:
return DispatchKey::AutogradCPU;
case DispatchKey::XPU:
return DispatchKey::AutogradXPU;
case DispatchKey::CUDA:
return DispatchKey::AutogradCUDA;
case DispatchKey::XLA:
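The two hunks above extend the dispatch-key plumbing so that AutogradXPU resolves to the same backend as XPU. A toy model of that invariant, using illustrative enums rather than the real c10::DispatchKey values:

#include <cassert>
#include <stdexcept>

// Both the plain backend key and its autograd counterpart should resolve to
// the same backend. Enum values here are illustrative only.
enum class Key { CPU, AutogradCPU, XPU, AutogradXPU };
enum class Backend { CPU, XPU };

Backend to_backend(Key k) {
  switch (k) {
    case Key::CPU:
    case Key::AutogradCPU:
      return Backend::CPU;
    case Key::XPU:
    case Key::AutogradXPU:  // the fix: AutogradXPU now maps to XPU too
      return Backend::XPU;
  }
  throw std::logic_error("unhandled key");
}

int main() {
  assert(to_backend(Key::AutogradXPU) == Backend::XPU);
  return 0;
}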
@ -19,6 +19,10 @@
#include <execinfo.h>
#endif

#ifdef FBCODE_CAFFE2
#include <common/process/StackTrace.h>
#endif

namespace c10 {

#if SUPPORTS_BACKTRACE
@ -167,7 +171,14 @@ std::string get_backtrace(
size_t frames_to_skip,
size_t maximum_number_of_frames,
bool skip_python_frames) {
#if SUPPORTS_BACKTRACE
#ifdef FBCODE_CAFFE2
// For some reason, the stacktrace implementation in fbcode is
// better than ours, see https://github.com/pytorch/pytorch/issues/56399
// When it's available, just use that.
facebook::process::StackTrace st;
return st.toString();

#elif SUPPORTS_BACKTRACE

// We always skip this frame (backtrace).
frames_to_skip += 1;
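A condensed sketch of the selection order the patch introduces in get_backtrace(): prefer the fbcode implementation when FBCODE_CAFFE2 is defined, fall back to the execinfo-based path when SUPPORTS_BACKTRACE is set, and otherwise return a stub. The string bodies are placeholders for the real implementations.

#include <string>

std::string get_backtrace_sketch() {
#ifdef FBCODE_CAFFE2
  // facebook::process::StackTrace in the real code
  return "fbcode StackTrace";
#elif SUPPORTS_BACKTRACE
  // backtrace()/backtrace_symbols() from <execinfo.h> in the real code
  return "execinfo backtrace";
#else
  return "(backtrace not supported on this platform)";
#endif
}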
@ -209,7 +209,6 @@ bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
float weight_sum = 0.0;
if (!label_prob_mode_) {
const int* label_data = T.data<int>();
const float* Xdata = X.data<float>();

for (int i = 0; i < N; ++i) {
CAFFE_ENFORCE(
@ -26,6 +26,15 @@ C10_EXPORT std::string DeviceTypeName(const int32_t& d) {
return at::DeviceTypeName(static_cast<at::DeviceType>(d));
}

void setTotalBytesLimit(::google::protobuf::io::CodedInputStream& stream, int bytes_limit, int warning_threshold) {
#if GOOGLE_PROTOBUF_VERSION >= 3011000
// Only take one parameter since protobuf 3.11
stream.SetTotalBytesLimit(bytes_limit);
#else
stream.SetTotalBytesLimit(bytes_limit, warning_threshold);
#endif
}

C10_EXPORT int DeviceId(const DeviceOption& option) {
switch (option.device_type()) {
case PROTO_CPU:
@ -136,7 +145,7 @@ C10_EXPORT bool ParseProtoFromLargeString(
::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size());
::google::protobuf::io::CodedInputStream coded_stream(&input_stream);
// Set PlanDef message size limit to 2G.
coded_stream.SetTotalBytesLimit(2147483647, 512LL << 20);
setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
return proto->ParseFromCodedStream(&coded_stream);
}

@ -149,7 +158,7 @@ C10_EXPORT bool ReadProtoFromBinaryFile(
// Total bytes hard limit / warning limit are set to 2GB and 512MB
// respectively.
::google::protobuf::io::CodedInputStream coded_stream(&stream);
coded_stream.SetTotalBytesLimit(2147483647, 512LL << 20);
setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
return proto->ParseFromCodedStream(&coded_stream);
}

@ -200,7 +209,7 @@ C10_EXPORT bool ParseProtoFromLargeString(const string& str, Message* proto) {
::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size());
::google::protobuf::io::CodedInputStream coded_stream(&input_stream);
// Set PlanDef message size limit to 2G.
coded_stream.SetTotalBytesLimit(2147483647, 512LL << 20);
setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
return proto->ParseFromCodedStream(&coded_stream);
}

@ -244,7 +253,13 @@ C10_EXPORT bool ReadProtoFromBinaryFile(
std::unique_ptr<CodedInputStream> coded_input(
new CodedInputStream(raw_input.get()));
// A hack to manually allow using very large protocol buffers.
coded_input->SetTotalBytesLimit(2147483647, 536870912);
#if GOOGLE_PROTOBUF_VERSION >= 3011000
// Only take one parameter since protobuf 3.11
coded_input->SetTotalBytesLimit(2147483647);
#else
// Total bytes hard limit / warning limit are set to 2GB and 512MB respectively.
coded_input->SetTotalBytesLimit(2147483647, 536870912);
#endif
bool success = proto->ParseFromCodedStream(coded_input.get());
coded_input.reset();
raw_input.reset();
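Usage sketch for the setTotalBytesLimit wrapper defined above, assuming it is visible at this point: build a coded stream over an in-memory string and apply the 2 GiB hard limit / 512 MiB warning threshold that the parsing helpers use.

#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
#include <string>

// Declared above; repeated here so the sketch is self-contained.
void setTotalBytesLimit(::google::protobuf::io::CodedInputStream& stream,
                        int bytes_limit, int warning_threshold);

void apply_limits(const std::string& str) {
  ::google::protobuf::io::ArrayInputStream input(
      str.data(), static_cast<int>(str.size()));
  ::google::protobuf::io::CodedInputStream coded(&input);
  // 2147483647 bytes is just under 2 GiB; 512 << 20 bytes is 512 MiB.
  setTotalBytesLimit(coded, 2147483647, 512 << 20);
}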
@ -1,5 +1,6 @@
#include "caffe2/utils/signal_handler.h"
#include "caffe2/core/logging.h"
#include <c10/util/Backtrace.h>

#if defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)

@ -149,63 +150,11 @@ const char* getSignalName(int signum) {
return nullptr;
}

_Unwind_Reason_Code unwinder(struct _Unwind_Context* context, void* userInfo) {
auto& pcs = *reinterpret_cast<std::vector<uintptr_t>*>(userInfo);
pcs.push_back(_Unwind_GetIP(context));
return _URC_NO_REASON;
}

std::vector<uintptr_t> getBacktrace() {
std::vector<uintptr_t> pcs;
_Unwind_Backtrace(unwinder, &pcs);
return pcs;
}

void printBlobSizes() {
::caffe2::Workspace::ForEach(
[&](::caffe2::Workspace* ws) { ws->PrintBlobSizes(); });
}

void printStacktrace() {
std::vector<uintptr_t> pcs = getBacktrace();
Dl_info info;
size_t i = 0;
for (uintptr_t pcAddr : pcs) {
const void* pc = reinterpret_cast<const void*>(pcAddr);
const char* path = nullptr;
const char* name = "???";
char* demangled = nullptr;
int offset = -1;

std::cerr << "[" << i << "] ";
if (dladdr(pc, &info)) {
path = info.dli_fname;
name = info.dli_sname ?: "???";
offset = reinterpret_cast<uintptr_t>(pc) -
reinterpret_cast<uintptr_t>(info.dli_saddr);

int status;
demangled = abi::__cxa_demangle(name, nullptr, nullptr, &status);
if (status == 0) {
name = demangled;
}
}
std::cerr << name;
if (offset >= 0) {
std::cerr << "+" << reinterpret_cast<void*>(offset);
}
std::cerr << "(" << pc << ")";
if (path) {
std::cerr << " in " << path;
}
std::cerr << std::endl;
if (demangled) {
free(demangled);
}
i += 1;
}
}

void callPreviousSignalHandler(
struct sigaction* action,
int signum,
@ -229,7 +178,7 @@ void stacktraceSignalHandler(bool needsLock) {
pid_t tid = syscall(SYS_gettid);
std::cerr << fatalSignalName << "(" << fatalSignum << "), PID: " << ::getpid()
<< ", Thread " << tid << ": " << std::endl;
printStacktrace();
std::cerr << c10::get_backtrace();
std::cerr << std::endl;
if (needsLock) {
pthread_mutex_unlock(&writingMutex);
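The net effect of the hunk above is that the hand-rolled _Unwind_Backtrace walker is gone and the fatal-signal path prints one preformatted string from c10::get_backtrace(). A hedged sketch of that shape, where get_backtrace_placeholder stands in for the real call; a production handler would also restrict itself to async-signal-safe calls, which the original code does not strictly do either.

#include <csignal>
#include <cstdio>
#include <cstdlib>
#include <string>

std::string get_backtrace_placeholder() {
  return "frame #0 ...\nframe #1 ...";  // c10::get_backtrace() in the real code
}

extern "C" void fatal_handler(int signum) {
  // Print the signal number and one preformatted backtrace string, then exit
  // without running atexit handlers.
  std::fprintf(stderr, "Fatal signal %d received:\n%s\n", signum,
               get_backtrace_placeholder().c_str());
  std::_Exit(1);
}

int main() {
  std::signal(SIGSEGV, fatal_handler);
  std::raise(SIGSEGV);  // triggers the handler above
  return 0;
}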
@ -18,7 +18,7 @@ Features described in this documentation are classified by release status:
breaking changes can happen and notice will be given one release ahead
of time).

*Beta:* Features are tagged as Beta because the API may change based on
*Beta:* These features are tagged as Beta because the API may change based on
user feedback, because the performance needs to improve, or because
coverage across operators is not yet complete. For Beta features, we are
committing to seeing the feature through to the Stable classification.

@ -19,7 +19,6 @@ TorchScript

.. toctree::
:maxdepth: 1
:caption: Language Reference

jit_language_reference_v2

@ -39,7 +39,7 @@ files =
benchmarks/instruction_counts,
tools/autograd/*.py,
tools/clang_tidy.py,
tools/codegen/*.py,
tools/codegen,
tools/extract_scripts.py,
tools/mypy_wrapper.py,
tools/print_test_stats.py,
@ -697,7 +697,7 @@ class TestBenchmarkUtils(TestCase):
|
||||
2000 /usr/include/c++/8/bits/atomic_base.h:at::Tensor at::detail::make_tensor ... t_null_type<c10::StorageImpl> >&&, c10::DispatchKey&&, caffe2::TypeMeta&)
|
||||
2000 /usr/include/c++/8/array:at::Tensor& c10::Dispatcher::callWithDispatchKe ... , c10::Scalar)> const&, c10::DispatchKey, at::Tensor&, c10::Scalar) const
|
||||
|
||||
Total: 8869966""" # noqa
|
||||
Total: 8869966""" # noqa: B950
|
||||
)
|
||||
|
||||
self.regularizeAndAssertExpectedInline(
|
||||
@ -935,7 +935,7 @@ class TestBenchmarkUtils(TestCase):
|
||||
compute_optimized | \x1b[2m\x1b[91m 3 \x1b[0m\x1b[0m | 4.0 | 11 | \x1b[92m\x1b[1m 2100 \x1b[0m\x1b[0m | 2100
|
||||
special_case (square) | \x1b[92m\x1b[1m 1 \x1b[0m\x1b[0m | | \x1b[92m\x1b[1m 8 \x1b[0m\x1b[0m | | \x1b[92m\x1b[1m 1700 \x1b[0m\x1b[0m
|
||||
|
||||
Times are in microseconds (us).""" # noqa
|
||||
Times are in microseconds (us).""" # noqa: B950
|
||||
)
|
||||
|
||||
compare.colorize(rowwise=True)
|
||||
@ -949,7 +949,7 @@ class TestBenchmarkUtils(TestCase):
|
||||
compute_optimized | \x1b[92m\x1b[1m 3 \x1b[0m\x1b[0m | 4.0 | \x1b[2m\x1b[91m 11 \x1b[0m\x1b[0m | \x1b[31m\x1b[1m 2100 \x1b[0m\x1b[0m | \x1b[31m\x1b[1m 2100 \x1b[0m\x1b[0m
|
||||
special_case (square) | \x1b[92m\x1b[1m 1 \x1b[0m\x1b[0m | | \x1b[31m\x1b[1m 8 \x1b[0m\x1b[0m | | \x1b[31m\x1b[1m 1700 \x1b[0m\x1b[0m
|
||||
|
||||
Times are in microseconds (us).""" # noqa
|
||||
Times are in microseconds (us).""" # noqa: B950
|
||||
)
|
||||
|
||||
def print_new_expected(s: str) -> None:
|
||||
|
@ -61,13 +61,14 @@ TEST(GradModeTest, TestRequiresGradViewOpExiting) {

if (requires_grad) {
ASSERT_THROWS_WITH(view_out.mul_(2), // go through kernels: VariableType, InplaceOrView, CPU
"A view was created in no_grad mode and is being modified inplace")
"a leaf Variable that requires grad is being used in an in-place operation")
} else {
view_out.mul_(2);
}

tmp = view_out.view({2, 3});
ASSERT_EQ(tmp.requires_grad(), requires_grad);
assert_tensor_creation_meta(tmp, torch::autograd::CreationMeta::NO_GRAD_MODE);
// TODO: update when above error is fixed
// assert_tensor_creation_meta(tmp, torch::autograd::CreationMeta::NO_GRAD_MODE);
}
}
@ -424,7 +424,6 @@ void test_TorchTensorCtorSingleDimFloatingType_expected_dtype(c10::ScalarType de
ASSERT_TRUE(almost_equal(tensor[2], 3.125));

tensor = torch::tensor({1.5f, 2.25f, 3.125f});
ASSERT_TRUE(tensor.is_variable());
ASSERT_EQ(tensor.numel(), 3);
ASSERT_EQ(tensor.sizes(), std::vector<int64_t>({3}));
ASSERT_EQ(tensor.dtype(), default_dtype);
@ -433,7 +432,6 @@ void test_TorchTensorCtorSingleDimFloatingType_expected_dtype(c10::ScalarType de
ASSERT_TRUE(almost_equal(tensor[2], 3.125f));

tensor = torch::tensor(at::ArrayRef<float>({1.5f, 2.25f, 3.125f}));
ASSERT_TRUE(tensor.is_variable());
ASSERT_EQ(tensor.numel(), 3);
ASSERT_EQ(tensor.dtype(), default_dtype);
ASSERT_TRUE(almost_equal(tensor[0], 1.5));
@ -441,7 +439,6 @@ void test_TorchTensorCtorSingleDimFloatingType_expected_dtype(c10::ScalarType de
ASSERT_TRUE(almost_equal(tensor[2], 3.125));

tensor = torch::tensor(std::vector<float>({1.5f, 2.25f, 3.125f}));
ASSERT_TRUE(tensor.is_variable());
ASSERT_EQ(tensor.numel(), 3);
ASSERT_EQ(tensor.sizes(), std::vector<int64_t>({3}));
ASSERT_EQ(tensor.dtype(), default_dtype);

@ -46,6 +46,7 @@ TEST(TensorpipeSerialize, Base) {
tensorpipe::Descriptor::Tensor t;
t.length = tpTensor.length;
t.sourceDevice = tpTensor.buffer.device();
t.targetDevice = tpTensor.targetDevice;
t.metadata = tpTensor.metadata;
recvingTpDescriptor.tensors.push_back(std::move(t));
}
@ -4324,7 +4324,8 @@ TEST(LoopNest, fuseLoopsSimple) {
|
||||
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
|
||||
auto forK = For::make(k, 0, 100, Store::make(b_buf, {k}, Mul::make(20, k)));
|
||||
auto par = Block::make({forJ, forK});
|
||||
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
|
||||
For* fused_loop;
|
||||
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << *par;
|
||||
@ -4364,7 +4365,8 @@ TEST(LoopNest, fuseLoopsMultiple) {
|
||||
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
|
||||
auto forK = For::make(k, 0, 100, Store::make(b_buf, {k}, Mul::make(20, k)));
|
||||
auto par = Block::make({forI, forJ, forK});
|
||||
auto fused_loop = LoopNest::fuseLoops({forI, forJ, forK});
|
||||
For* fused_loop;
|
||||
ASSERT_TRUE(LoopNest::fuseLoops({forI, forJ, forK}, &fused_loop));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << *par;
|
||||
@ -4421,7 +4423,8 @@ TEST(LoopNest, fuseLoopsNested) {
|
||||
auto forM = For::make(m, 0, 20, Block::make({initA, forJ}));
|
||||
auto forN = For::make(n, 0, 20, Block::make({initB, forK}));
|
||||
auto par = Block::make({forM, forN});
|
||||
auto fused_loop = LoopNest::fuseLoops({forM, forN});
|
||||
For* fused_loop;
|
||||
ASSERT_TRUE(LoopNest::fuseLoops({forM, forN}, &fused_loop));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << *par;
|
||||
@ -4481,7 +4484,8 @@ TEST(LoopNest, fuseLoopsNested2D) {
|
||||
50,
|
||||
Store::make(b_buf, {m, n}, Add::make(m, Mul::make(n, 100)))));
|
||||
auto par = Block::make({forI, forM});
|
||||
auto fused_loop = LoopNest::fuseLoops({forI, forM});
|
||||
For* fused_loop;
|
||||
ASSERT_TRUE(LoopNest::fuseLoops({forI, forM}, &fused_loop));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << *par;
|
||||
@ -4522,7 +4526,8 @@ TEST(LoopNest, fuseLoopsNested2DInner) {
|
||||
auto forN = For::make(
|
||||
n, 0, 100, Store::make(b_buf, {i, n}, Add::make(i, Mul::make(n, 100))));
|
||||
auto forI = For::make(i, 0, 20, Block::make({forJ, forN}));
|
||||
auto fused_loop = LoopNest::fuseLoops({forJ, forN});
|
||||
For* fused_loop;
|
||||
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forN}, &fused_loop));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << *forI;
|
||||
@ -4557,8 +4562,8 @@ TEST(LoopNest, fuseLoopsDifferentStopBounds) {
|
||||
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
|
||||
auto forK = For::make(k, 0, 50, Store::make(b_buf, {j}, Mul::make(20, k)));
|
||||
auto par = Block::make({forJ, forK});
|
||||
ASSERT_THROWS_WITH(
|
||||
LoopNest::fuseLoops({forJ, forK}), "Loops with different stop bounds");
|
||||
For* fused_loop;
|
||||
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
|
||||
}
|
||||
|
||||
TEST(LoopNest, fuseLoopsDifferentStartBounds) {
|
||||
@ -4578,8 +4583,8 @@ TEST(LoopNest, fuseLoopsDifferentStartBounds) {
|
||||
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
|
||||
auto forK = For::make(k, 50, 100, Store::make(b_buf, {j}, Mul::make(20, k)));
|
||||
auto par = Block::make({forJ, forK});
|
||||
ASSERT_THROWS_WITH(
|
||||
LoopNest::fuseLoops({forJ, forK}), "Loops with different start bounds");
|
||||
For* fused_loop;
|
||||
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
|
||||
}
|
||||
|
||||
TEST(LoopNest, fuseLoopsNotContiguous) {
|
||||
@ -4601,8 +4606,8 @@ TEST(LoopNest, fuseLoopsNotContiguous) {
|
||||
auto initB = Store::make(b_buf, {0}, 0);
|
||||
auto forK = For::make(k, 50, 100, Store::make(b_buf, {j}, Mul::make(20, k)));
|
||||
auto par = Block::make({forJ, initB, forK});
|
||||
ASSERT_THROWS_WITH(
|
||||
LoopNest::fuseLoops({forJ, forK}), "Only contiguous loops can be fused");
|
||||
For* fused_loop;
|
||||
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
|
||||
}
|
||||
|
||||
TEST(LoopNest, fuseLoopsWithDifferentParents) {
|
||||
@ -4628,8 +4633,8 @@ TEST(LoopNest, fuseLoopsWithDifferentParents) {
|
||||
auto initB = Store::make(b_buf, {0}, 0);
|
||||
auto forK = For::make(k, 50, 100, Store::make(b_buf, {j}, Mul::make(20, k)));
|
||||
auto par = Block::make({forI, initB, forK});
|
||||
ASSERT_THROWS_WITH(
|
||||
LoopNest::fuseLoops({forJ, forK}), "loops with different parents");
|
||||
For* fused_loop;
|
||||
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
|
||||
}
|
||||
|
||||
TEST(LoopNest, fuseLoopsWithVariableBounds) {
|
||||
@ -4650,7 +4655,8 @@ TEST(LoopNest, fuseLoopsWithVariableBounds) {
|
||||
auto forJ = For::make(j, 0, N, Store::make(a_buf, {j}, Mul::make(10, j)));
|
||||
auto forK = For::make(k, 0, N, Store::make(b_buf, {j}, Mul::make(20, k)));
|
||||
auto par = Block::make({forJ, forK});
|
||||
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
|
||||
For* fused_loop;
|
||||
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << *par;
|
||||
@ -4686,7 +4692,8 @@ TEST(LoopNest, fuseLoopsWithExprBounds) {
|
||||
auto forJ = For::make(j, 0, M + N, Store::make(a_buf, {j}, Mul::make(10, j)));
|
||||
auto forK = For::make(k, 0, M + N, Store::make(b_buf, {j}, Mul::make(20, k)));
|
||||
auto par = Block::make({forJ, forK});
|
||||
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
|
||||
For* fused_loop;
|
||||
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << *par;
|
||||
@ -4722,7 +4729,8 @@ TEST(LoopNest, fuseLoopsWithDifferentExprBounds) {
|
||||
auto forJ = For::make(j, M, N * 2, Store::make(a_buf, {j}, Mul::make(10, j)));
|
||||
auto forK = For::make(k, M, N + N, Store::make(b_buf, {j}, Mul::make(20, k)));
|
||||
auto par = Block::make({forJ, forK});
|
||||
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
|
||||
For* fused_loop;
|
||||
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << *par;
|
||||
@ -4757,7 +4765,8 @@ TEST(LoopNest, fuseLoopsWithNonOverlappingBufferAccesses) {
|
||||
For::make(k, 10, 100, Store::make(a_buf, {k + 100}, Mul::make(30, k)));
|
||||
auto par = Block::make({forJ, forK});
|
||||
|
||||
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
|
||||
For* fused_loop;
|
||||
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << *par;
|
||||
@ -4803,7 +4812,8 @@ TEST(LoopNest, fuseLoopsWithNonOverlapping2DBufferAccesses) {
|
||||
auto forM = For::make(m, 0, 20, forN);
|
||||
auto par = Block::make({forI, forM});
|
||||
|
||||
auto fused_loop = LoopNest::fuseLoops({forI, forM});
|
||||
For* fused_loop;
|
||||
ASSERT_TRUE(LoopNest::fuseLoops({forI, forM}, &fused_loop));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << *par;
|
||||
@ -4839,9 +4849,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies1) {
|
||||
auto forK =
|
||||
For::make(k, 10, 100, Store::make(a_buf, {k - 1}, Mul::make(20, k)));
|
||||
auto par = Block::make({forJ, forK});
|
||||
ASSERT_THROWS_WITH(
|
||||
LoopNest::fuseLoops({forJ, forK}),
|
||||
"not valid since it results in a loop carried dependence");
|
||||
For* fused_loop;
|
||||
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
|
||||
}
|
||||
|
||||
TEST(LoopNest, fuseLoopsThatViolateDependencies2) {
|
||||
@ -4861,9 +4870,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies2) {
|
||||
auto forK =
|
||||
For::make(k, 10, 100, Store::make(a_buf, {k + 50}, Mul::make(20, k)));
|
||||
auto par = Block::make({forJ, forK});
|
||||
ASSERT_THROWS_WITH(
|
||||
LoopNest::fuseLoops({forJ, forK}),
|
||||
"not valid since it results in a loop carried dependence");
|
||||
For* fused_loop;
|
||||
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
|
||||
}
|
||||
|
||||
TEST(LoopNest, fuseLoopsThatViolateDependencies3) {
|
||||
@ -4905,9 +4913,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies3) {
|
||||
auto forM = For::make(m, 0, 20, Block::make({initA, forJ}));
|
||||
auto forN = For::make(n, 0, 20, Block::make({initB, forK}));
|
||||
auto par = Block::make({forM, forN});
|
||||
ASSERT_THROWS_WITH(
|
||||
LoopNest::fuseLoops({forM, forN}),
|
||||
"not valid since it results in a loop carried dependence");
|
||||
For* fused_loop;
|
||||
ASSERT_FALSE(LoopNest::fuseLoops({forM, forN}, &fused_loop));
|
||||
}
|
||||
|
||||
TEST(LoopNest, fuseLoopsThatViolateDependencies4) {
|
||||
@ -4948,9 +4955,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies4) {
|
||||
50,
|
||||
Store::make(a_buf, {m + 1, n}, Add::make(m, Mul::make(n, 100)))));
|
||||
auto par = Block::make({forI, forM});
|
||||
ASSERT_THROWS_WITH(
|
||||
LoopNest::fuseLoops({forI, forM}),
|
||||
"not valid since it results in a loop carried dependence");
|
||||
For* fused_loop;
|
||||
ASSERT_FALSE(LoopNest::fuseLoops({forI, forM}, &fused_loop));
|
||||
}
|
||||
|
||||
TEST(LoopNest, fuseLoopsThatViolateDependencies5) {
|
||||
@ -4977,9 +4983,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies5) {
|
||||
100,
|
||||
Store::make(a_buf, {i, n + 1}, Add::make(i, Mul::make(n, 100))));
|
||||
auto forI = For::make(i, 0, 20, Block::make({forJ, forN}));
|
||||
ASSERT_THROWS_WITH(
|
||||
LoopNest::fuseLoops({forJ, forN}),
|
||||
"not valid since it results in a loop carried dependence");
|
||||
For* fused_loop;
|
||||
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forN}, &fused_loop));
|
||||
}
|
||||
|
||||
TEST(LoopNest, fuseLoopsThatViolateDependencies6) {
|
||||
@ -5004,9 +5009,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies6) {
|
||||
Store::make(
|
||||
b_buf, {k}, Mul::make(20, Load::make(a_buf, {ExprHandle(99) - k}))));
|
||||
auto par = Block::make({forJ, forK});
|
||||
ASSERT_THROWS_WITH(
|
||||
LoopNest::fuseLoops({forJ, forK}),
|
||||
"not valid since it results in a loop carried dependence");
|
||||
For* fused_loop;
|
||||
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
|
||||
}
|
||||
|
||||
TEST(LoopNest, fuseLoopsThatViolateDependencies7) {
|
||||
@ -5031,9 +5035,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies7) {
|
||||
b_buf, {k}, Mul::make(20, Load::make(a_buf, {ExprHandle(99) - k}))));
|
||||
auto forJ = For::make(j, 10, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
|
||||
auto par = Block::make({forK, forJ});
|
||||
ASSERT_THROWS_WITH(
|
||||
LoopNest::fuseLoops({forK, forJ}),
|
||||
"not valid since it results in a loop carried dependence");
|
||||
For* fused_loop;
|
||||
ASSERT_FALSE(LoopNest::fuseLoops({forK, forJ}, &fused_loop));
|
||||
}
|
||||
|
||||
TEST(LoopNest, areLoopsPerfectlyNested) {
|
||||
|
15
test/distributed/bin/test_script.py
Executable file
@ -0,0 +1,15 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


def main():
    print("Success, smoke test")


if __name__ == "__main__":
    main()
@ -32,7 +32,7 @@ def test_skippable_repr():

def forward(self, x):
yield stash("hello", x)
return self.conv(x) # noqa
return self.conv(x) # noqa: B901

m = Hello()
assert (
@ -30,7 +30,7 @@ def test_1to3(balance, checkpoint, setup_rpc):
|
||||
def forward(self, input):
|
||||
yield stash("1to3", input)
|
||||
output = self.conv(input)
|
||||
return output # noqa
|
||||
return output # noqa: B901
|
||||
|
||||
class Layer2(nn.Module):
|
||||
def __init__(self):
|
||||
@ -73,7 +73,7 @@ def test_none_skip(setup_rpc):
|
||||
class Stash(nn.Module):
|
||||
def forward(self, input):
|
||||
yield stash("none", None)
|
||||
return input # noqa
|
||||
return input # noqa: B901
|
||||
|
||||
@skippable(pop=["none"])
|
||||
class Pop(nn.Module):
|
||||
|
@ -19,7 +19,7 @@ class Pass(nn.Module):
|
||||
class StashFoo(nn.Module):
|
||||
def forward(self, input):
|
||||
yield stash("foo", input)
|
||||
return input # noqa
|
||||
return input # noqa: B901
|
||||
|
||||
|
||||
@skippable(pop=["foo"])
|
||||
@ -33,7 +33,7 @@ class PopFoo(nn.Module):
|
||||
class StashBar(nn.Module):
|
||||
def forward(self, input):
|
||||
yield stash("bar", input)
|
||||
return input # noqa
|
||||
return input # noqa: B901
|
||||
|
||||
|
||||
@skippable(pop=["bar"])
|
||||
|
@ -17,7 +17,7 @@ from torch.distributed.pipeline.sync.skip.tracker import current_skip_tracker
|
||||
class Stash(nn.Module):
|
||||
def forward(self, input):
|
||||
yield stash("skip", input)
|
||||
return input # noqa
|
||||
return input # noqa: B901
|
||||
|
||||
|
||||
@skippable(pop=["skip"])
|
||||
|
@ -24,7 +24,7 @@ def test_stash(skip_tracker):
|
||||
class Stash(nn.Module):
|
||||
def forward(self, input):
|
||||
yield stash("foo", input)
|
||||
return input * 2 # noqa
|
||||
return input * 2 # noqa: B901
|
||||
|
||||
l1 = Stash()
|
||||
|
||||
@ -41,13 +41,13 @@ def test_pop():
|
||||
class Stash(nn.Module):
|
||||
def forward(self, input):
|
||||
yield stash("foo", input)
|
||||
return input * 2 # noqa
|
||||
return input * 2 # noqa: B901
|
||||
|
||||
@skippable(pop=["foo"])
|
||||
class Pop(nn.Module):
|
||||
def forward(self, input):
|
||||
foo = yield pop("foo")
|
||||
return foo # noqa
|
||||
return foo
|
||||
|
||||
l1 = Stash()
|
||||
l2 = Pop()
|
||||
@ -83,7 +83,7 @@ def test_stash_not_declared():
|
||||
class Stash(nn.Module):
|
||||
def forward(self, input):
|
||||
yield stash("foo", input)
|
||||
return input * 2 # noqa
|
||||
return input * 2 # noqa: B901
|
||||
|
||||
l1 = Stash()
|
||||
|
||||
@ -96,13 +96,13 @@ def test_pop_not_declared():
|
||||
class Stash(nn.Module):
|
||||
def forward(self, input):
|
||||
yield stash("foo", input)
|
||||
return input * 2 # noqa
|
||||
return input * 2 # noqa: B901
|
||||
|
||||
@skippable()
|
||||
class Pop(nn.Module):
|
||||
def forward(self, input):
|
||||
foo = yield pop("foo")
|
||||
return foo # noqa
|
||||
return foo
|
||||
|
||||
l1 = Stash()
|
||||
l2 = Pop()
|
||||
@ -130,7 +130,7 @@ def test_stash_none():
|
||||
class Stash(nn.Module):
|
||||
def forward(self, input):
|
||||
yield stash("foo", None)
|
||||
return input * 2 # noqa
|
||||
return input * 2 # noqa: B901
|
||||
|
||||
l1 = Stash()
|
||||
l1(torch.tensor(42))
|
||||
|
@ -40,7 +40,7 @@ def test_default_skip_tracker_by_data_parallel():
|
||||
class Stash(nn.Module):
|
||||
def forward(self, input):
|
||||
yield stash("foo", input)
|
||||
return input * 2 # noqa
|
||||
return input * 2 # noqa: B901
|
||||
|
||||
@skippable(pop=["foo"])
|
||||
class Pop(nn.Module):
|
||||
|
@ -12,7 +12,7 @@ def test_clock_cycles():
|
||||
assert list(_clock_cycles(1, 3)) == [[(0, 0)], [(0, 1)], [(0, 2)]]
|
||||
assert list(_clock_cycles(3, 1)) == [[(0, 0)], [(1, 0)], [(2, 0)]]
|
||||
|
||||
assert list(_clock_cycles(3, 3)) == [ # noqa
|
||||
assert list(_clock_cycles(3, 3)) == [
|
||||
[(0, 0)],
|
||||
[(1, 0), (0, 1)],
|
||||
[(2, 0), (1, 1), (0, 2)],
|
||||
@ -20,7 +20,7 @@ def test_clock_cycles():
|
||||
[(2, 2)],
|
||||
]
|
||||
|
||||
assert list(_clock_cycles(4, 2)) == [ # noqa
|
||||
assert list(_clock_cycles(4, 2)) == [
|
||||
[(0, 0)],
|
||||
[(1, 0), (0, 1)],
|
||||
[(2, 0), (1, 1)],
|
||||
|
@ -49,6 +49,7 @@ from torch.testing._internal.common_distributed import (
create_device,
with_dist_debug_levels,
with_nccl_blocking_wait,
create_tcp_store,
)
from torch.testing._internal.common_utils import (
TestCase,
@ -299,27 +300,9 @@ class PrefixFileStoreTest(TestCase, StoreTestBase):
def _create_store(self):
return c10d.PrefixStore(self.prefix, self.filestore)


def create_tcp_store(addr, world_size=1, wait_for_workers=True):
"""
Creates a TCP store. Retries if the chosen port is already in use.
"""
ports = []
for _ in range(10):
try:
port = common.find_free_port()
ports.append(port)
return c10d.TCPStore(addr, port, world_size, True, wait_for_workers=wait_for_workers)
except RuntimeError as error:
if str(error) == "Address already in use":
continue
raise
raise RuntimeError("Unable to find free port (tried %s)" % ", ".join(ports))


class TCPStoreTest(TestCase, StoreTestBase):
def _create_store(self):
store = create_tcp_store("localhost")
store = create_tcp_store()
store.set_timeout(timedelta(seconds=300))
return store

@ -329,7 +312,7 @@ class TCPStoreTest(TestCase, StoreTestBase):
else:
err_msg_reg = "^Address already in use$"
with self.assertRaisesRegex(RuntimeError, err_msg_reg):
addr = "localhost"
addr = DEFAULT_HOSTNAME
port = common.find_free_port()

# Use noqa to silence flake8.
@ -418,7 +401,7 @@ class TCPStoreTest(TestCase, StoreTestBase):
class PrefixTCPStoreTest(TestCase, StoreTestBase):
def setUp(self):
super(PrefixTCPStoreTest, self).setUp()
self.tcpstore = create_tcp_store("localhost")
self.tcpstore = create_tcp_store()
self.prefix = "test_prefix"
self.tcpstore.set_timeout(timedelta(seconds=300))

@ -652,7 +635,7 @@ class RendezvousFileTest(TestCase):
@skip_if_win32()
class RendezvousTCPTest(TestCase):
def create_tcp_url(self):
addr = "localhost"
addr = DEFAULT_HOSTNAME
port = common.find_free_port()
url = "tcp://%s:%d?world_size=%d" % (addr, port, 1)
return url
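The removed create_tcp_store helper now lives in torch.testing._internal.common_distributed; its retry loop generalizes to any bind-until-free pattern. A sketch in C++ to match the other examples in this section, where both callbacks are caller-supplied assumptions standing in for TCPStore construction and common.find_free_port().

#include <functional>
#include <stdexcept>
#include <string>
#include <vector>

// Try a candidate port and retry on "Address already in use", up to a fixed
// attempt budget, mirroring the removed Python helper.
int bind_with_retry(const std::function<void(int)>& bind_port,
                    const std::function<int()>& pick_free_port,
                    int attempts = 10) {
  std::vector<int> tried;
  for (int i = 0; i < attempts; ++i) {
    const int port = pick_free_port();
    tried.push_back(port);
    try {
      bind_port(port);  // expected to throw on a port collision
      return port;
    } catch (const std::runtime_error& e) {
      if (std::string(e.what()) != "Address already in use") {
        throw;  // unrelated failures propagate immediately
      }
    }
  }
  throw std::runtime_error("Unable to find free port");
}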
Some files were not shown because too many files have changed in this diff