2021-04-21 nightly release (1e03a2505f9cee92587bf45fbbbbfedede5cb9ec)

chronos_secgrp_pytorch_oss_ci_oncall
2021-04-21 00:01:48 -07:00
parent 24ee5bc666
commit aca2009f6b
194 changed files with 4365 additions and 2228 deletions

View File

@ -125,7 +125,7 @@ def FalsePred(_):
def TruePred(_):
return True
_VC2019 = VcSpec(2019, ["14", "28", "29333"], hide_version=True)
_VC2019 = VcSpec(2019)
WORKFLOW_DATA = [
# VS2019 CUDA-10.1

View File

@ -6786,7 +6786,7 @@ workflows:
python_version: "3.6"
use_cuda: "1"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
@ -6799,7 +6799,7 @@ workflows:
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
@ -6812,7 +6812,7 @@ workflows:
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
@ -6821,7 +6821,7 @@ workflows:
python_version: "3.6"
use_cuda: "1"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
@ -6840,7 +6840,7 @@ workflows:
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
@ -6859,7 +6859,7 @@ workflows:
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cpu-py3
@ -6868,7 +6868,7 @@ workflows:
python_version: "3.6"
use_cuda: "0"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cpu-py3
@ -6886,7 +6886,7 @@ workflows:
test_name: pytorch-windows-test1
use_cuda: "0"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cpu-py3
@ -6904,7 +6904,7 @@ workflows:
test_name: pytorch-windows-test2
use_cuda: "0"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
@ -6922,7 +6922,7 @@ workflows:
test_name: pytorch-windows-test1
use_cuda: "0"
vc_product: BuildTools
vc_version: "14.28.29333"
vc_version: ""
vc_year: "2019"
- update_s3_htmls:
context: org-member

.github/scripts/generate_linux_ci_workflows.py (vendored executable file, 164 lines added)
View File

@ -0,0 +1,164 @@
#!/usr/bin/env python
from pathlib import Path
import jinja2
DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com"
GITHUB_DIR = Path(__file__).parent.parent
CPU_TEST_RUNNER = "linux.2xlarge"
CUDA_TEST_RUNNER = "linux.8xlarge.nvidia.gpu"
class PyTorchLinuxWorkflow:
def __init__(self, build_environment: str, docker_image_base: str):
self.build_environment = build_environment
self.docker_image_base = docker_image_base
self.test_runner_type = CPU_TEST_RUNNER
if "cuda" in build_environment:
self.test_runner_type = CUDA_TEST_RUNNER
def generate_workflow_file(
self, workflow_template: jinja2.Template, jinja_env: jinja2.Environment
) -> Path:
output_file_path = GITHUB_DIR.joinpath(
f"workflows/{self.build_environment}.yml"
)
with open(output_file_path, "w") as output_file:
output_file.write(
workflow_template.render(
build_environment=self.build_environment,
docker_image_base=self.docker_image_base,
test_runner_type=self.test_runner_type
)
)
output_file.write('\n')
return output_file_path
WORKFLOWS = [
PyTorchLinuxWorkflow(
build_environment="pytorch-linux-xenial-py3.6-gcc5.4",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-parallelnative-linux-xenial-py3.6-gcc5.4",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-pure_torch-linux-xenial-py3.6-gcc5.4",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3.6-gcc7",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-asan",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang7-onnx",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-libtorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-bionic-py3.6-clang9-noarch",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-xla-linux-bionic-py3.6-clang9",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-vulkan-linux-bionic-py3.6-clang9",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-bionic-py3.8-gcc9-coverage",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.8-gcc9",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-bionic-rocm3.9-py3.6",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm3.9-py3.6",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-mobile",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-mobile-custom-dynamic",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-mobile-custom-static",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-mobile-code-analysis",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
# ),
]
if __name__ == "__main__":
jinja_env = jinja2.Environment(
variable_start_string="!{{",
loader=jinja2.FileSystemLoader(str(GITHUB_DIR.joinpath("templates"))),
)
workflow_template = jinja_env.get_template("linux_ci_workflow.yml.in")
for workflow in WORKFLOWS:
print(
workflow.generate_workflow_file(
workflow_template=workflow_template,
jinja_env=jinja_env
)
)
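A minimal sketch (not part of the commit) of why the generator configures Jinja with `variable_start_string="!{{"`: generator-time placeholders are written as `!{{ ... }}` in the template, so GitHub Actions' own `${{ ... }}` expressions pass through the render untouched.

```python
# Hypothetical standalone example; only the "!{{" setting is taken from the script above.
import jinja2

env = jinja2.Environment(variable_start_string="!{{")
template = env.from_string(
    "env:\n"
    "  BUILD_ENVIRONMENT: !{{ build_environment }}\n"
    "  DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}\n"
)
print(template.render(build_environment="pytorch-linux-xenial-py3.6-gcc5.4"))
# env:
#   BUILD_ENVIRONMENT: pytorch-linux-xenial-py3.6-gcc5.4
#   DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
```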

.github/scripts/install_nvidia_utils_linux.sh (vendored executable file, 43 lines added)
View File

@ -0,0 +1,43 @@
#!/usr/bin/env bash
set -eou pipefail
DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) \
DRIVER_FN="NVIDIA-Linux-x86_64-460.39.run"
YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
install_nvidia_docker2_amzn2() {
(
set -x
# Needed for yum-config-manager
sudo yum install -y yum-utils
sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
sudo yum install -y nvidia-docker2
sudo systemctl restart docker
)
}
install_nvidia_driver() {
(
set -x
sudo yum groupinstall -y "Development Tools"
curl -fsL -o nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
sudo /bin/bash nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
nvidia-smi
)
}
# Install container toolkit based on distribution
echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
case "${DISTRIBUTION}" in
amzn*)
install_nvidia_docker2_amzn2
;;
*)
echo "ERROR: Unknown distribution ${DISTRIBUTION}"
exit 1
;;
esac
echo "== Installing nvidia driver ${DRIVER_FN} =="
install_nvidia_driver

.github/scripts/report_git_status.sh (vendored executable file, 5 lines added)
View File

@ -0,0 +1,5 @@
#!/usr/bin/env bash
CHANGES=$(git status --porcelain)
echo "$CHANGES"
git diff
[ -z "$CHANGES" ]
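The final `[ -z "$CHANGES" ]` is what gives this helper a non-zero exit status when the working tree is dirty, which is how the "Assert that regenerating ... didn't change them" steps below fail. A rough Python equivalent, for illustration only:

```python
# Illustrative Python rendering of report_git_status.sh (not part of the commit).
import subprocess
import sys

changes = subprocess.run(
    ["git", "status", "--porcelain"], capture_output=True, text=True, check=True
).stdout
print(changes)
subprocess.run(["git", "diff"], check=False)
# Fail (non-zero exit) when there are pending changes, succeed when clean.
sys.exit(1 if changes.strip() else 0)
```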

View File

@ -0,0 +1,193 @@
# @generated by .github/scripts/generate_linux_ci_workflows.py, Do not update manually
#
# Template is at: .github/templates/linux_ci_workflow.yml
# Generation script: .github/scripts/generate_linux_ci_workflows.py
name: Linux CI (!{{ build_environment }})
on:
# TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
# pull_request:
push:
branches:
- master
- release/*
workflow_dispatch:
env:
BUILD_ENVIRONMENT: !{{ build_environment }}
DOCKER_IMAGE_BASE: !{{ docker_image_base }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
jobs:
calculate-docker-image:
runs-on: ubuntu-18.04
outputs:
docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Calculate docker image tag
id: calculate-tag
run: |
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
build:
runs-on: linux.2xlarge
needs: calculate-docker-image
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
submodules: recursive
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Create test binary build directories
run: |
mkdir -pv ../custom-op-build
mkdir -pv ../custom-backend-build
mkdir -pv ../jit-hook-build
- name: Build PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
# Why the three volume mounts here? So test binaries are put in the correct spot
# NOTE: You cannot volume mount ${GITHUB_WORKSPACE}../:/var/lib/jenkins since sccache connection will hang
# See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
# TODO: Stop building test binaries as part of the build phase
docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS \
-e SCCACHE_BUCKET \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && .jenkins/pytorch/build.sh'
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)/../":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Archive artifacts into zip
run: |
(cd "${GITHUB_WORKSPACE}/../" && zip -r pytorch/artifacts.zip pytorch/dist pytorch/build custom-op-build/ custom-backend-build/ jit-hook-build/)
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
retention-days: 30
if-no-files-found: error
path:
artifacts.zip
- name: Clean up docker images
if: always()
run: |
# Prune all of the docker images
docker system prune -af
test:
runs-on: !{{ test_runner_type }}
needs:
- calculate-docker-image
- build
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }}
run: |
bash .github/scripts/install_nvidia_utils_linux.sh
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Determine shm-size
run: |
shm_size="1g"
case "${BUILD_ENVIRONMENT}" in
*cuda*)
shm_size="2g"
;;
*rocm*)
shm_size="8g"
;;
esac
echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
- uses: actions/download-artifact@v2
name: Download PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
- name: Unzip artifacts
run: |
(cd "${GITHUB_WORKSPACE}/../" && unzip -q pytorch/artifacts.zip)
- name: Output disk space left
run: |
sudo df -H
- name: Test PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
# Why the three volume mounts here? So test binaries are put in the correct spot
# NOTE: You cannot volume mount ${GITHUB_WORKSPACE}../:/var/lib/jenkins since sccache connection will hang
# See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086
docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e IN_CI \
-e MAX_JOBS \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && pip install dist/*.whl && .jenkins/pytorch/test.sh'
- name: Clean up docker images
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
# Prune all of the docker images
docker system prune -af
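The `calculate-docker-image` job above keys the image tag off `git rev-parse HEAD:.circleci/docker`, i.e. the Git tree hash of the `.circleci/docker` directory, so the tag only changes when the Docker build context changes. A small sketch of the same calculation (illustrative, not part of the commit):

```python
# Assumes it runs inside a PyTorch checkout; mirrors the "Calculate docker image tag" step.
import subprocess

DOCKER_IMAGE_BASE = "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"

docker_tag = subprocess.check_output(
    ["git", "rev-parse", "HEAD:.circleci/docker"], text=True
).strip()
print(f"{DOCKER_IMAGE_BASE}:{docker_tag}")
```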

View File

@ -25,7 +25,7 @@ jobs:
ISSUE_NUMBER="${PR_NUMBER}"
else
TITLE="${ISSUE_TITLE}"
ISSUE_NUMBER="${ISSUE_NUMBER}"
# ISSUE_NUMBER is already set
fi
echo ::set-output name=TITLE::"${TITLE}"
echo ::set-output name=ISSUE_NUMBER::"${ISSUE_NUMBER}"

View File

@ -34,10 +34,10 @@ jobs:
- name: Extract scripts from GitHub Actions workflows
run: tools/extract_scripts.py --out=.extracted_scripts
- name: ShellCheck
# https://github.com/koalaman/shellcheck/tree/v0.7.1#installing-a-pre-compiled-binary
# https://github.com/koalaman/shellcheck/tree/v0.7.2#installing-a-pre-compiled-binary
run: |
set -x
scversion="v0.7.1"
scversion="v0.7.2"
wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv
sudo cp "shellcheck-${scversion}/shellcheck" /usr/bin/
rm -r "shellcheck-${scversion}"
@ -45,19 +45,23 @@ jobs:
tools/run_shellcheck.sh .jenkins/pytorch .extracted_scripts
- name: Ensure correct trailing newlines
run: |
(! git grep -Il '' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude)**.expect' ':(exclude)tools/clang_format_hash' | tools/trailing_newlines.py || (echo "The above files do not have correct trailing newlines; please normalize them"; false))
(! git --no-pager grep -Il '' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude)**.expect' ':(exclude)tools/clang_format_hash' | tools/trailing_newlines.py || (echo "The above files do not have correct trailing newlines; please normalize them"; false))
- name: Ensure no trailing spaces
run: |
(! git grep -In '[[:blank:]]$' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' || (echo "The above lines have trailing spaces; please remove them"; false))
(! git --no-pager grep -In '[[:blank:]]$' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' || (echo "The above lines have trailing spaces; please remove them"; false))
- name: Ensure no tabs
run: |
(! git grep -In $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above lines have tabs; please convert them to spaces"; false))
(! git --no-pager grep -In $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above lines have tabs; please convert them to spaces"; false))
- name: Ensure no non-breaking spaces
run: |
(! git grep -In $'\u00a0' -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
(! git --no-pager grep -In $'\u00a0' -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
- name: Ensure canonical include
run: |
(! git grep -In $'#include "' -- ./c10 ./aten ./torch/csrc ':(exclude)aten/src/ATen/native/quantized/cpu/qnnpack/**' || (echo "The above lines have include with quotes; please convert them to #include <xxxx>"; false))
(! git --no-pager grep -In $'#include "' -- ./c10 ./aten ./torch/csrc ':(exclude)aten/src/ATen/native/quantized/cpu/qnnpack/**' || (echo "The above lines have include with quotes; please convert them to #include <xxxx>"; false))
- name: Ensure no unqualified noqa
run: |
# shellcheck disable=SC2016
(! git --no-pager grep -InP '# noqa(?!: [A-Z]+\d{3})' -- '**.py' ':(exclude)caffe2' || (echo 'The above lines have unqualified `noqa`; please convert them to `noqa: XXXX`'; false))
# note that this next step depends on a clean checkout;
# if you run it locally then it will likely complain
# about all the generated files in torch/test
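The new "Ensure no unqualified noqa" step above relies on a negative lookahead: a bare `# noqa` is flagged, while a qualified `# noqa: XXX123` is allowed. A quick illustration of that regex (not part of the commit):

```python
# Same pattern as the grep -P expression in the lint step above.
import re

pattern = re.compile(r"# noqa(?!: [A-Z]+\d{3})")
print(bool(pattern.search("x = compute()  # noqa")))       # True  -> flagged
print(bool(pattern.search("import os  # noqa: F401")))     # False -> allowed
```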
@ -75,7 +79,7 @@ jobs:
python torch/testing/check_kernel_launches.py |& tee "${GITHUB_WORKSPACE}"/cuda_kernel_launch_checks.txt
- name: Ensure no direct cub include
run: |
(! git grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false))
(! git --no-pager grep -I -no $'#include <cub/' -- ./aten ':(exclude)aten/src/ATen/cuda/cub.cuh' || (echo "The above files have direct cub include; please include ATen/cuda/cub.cuh instead and wrap your cub calls in at::native namespace if necessary"; false))
python2-setup-compat:
runs-on: ubuntu-18.04
@ -91,6 +95,23 @@ jobs:
run: |
python2 setup.py | grep "Python 2 has reached end-of-life and is no longer supported by PyTorch."
templates:
runs-on: ubuntu-18.04
steps:
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.x
architecture: x64
- name: Install Jinja2
run: pip install Jinja2
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Regenerate workflows
run: .github/scripts/generate_linux_ci_workflows.py
- name: Assert that regenerating the workflows didn't change them
run: .github/scripts/report_git_status.sh
toc:
runs-on: ubuntu-18.04
# https://github.com/actions/virtual-environments/issues/599#issuecomment-602754687
@ -107,16 +128,11 @@ jobs:
run: |
set -eux
export PATH=~/.npm-global/bin:"$PATH"
for FILE in {CONTRIBUTING,README}.md; do
for FILE in $(git grep -Il '<!-- toc -->' -- '**.md'); do
markdown-toc --bullets='-' -i "$FILE"
done
- name: Assert that regenerating the ToCs didn't change them
run: |
set -eux
CHANGES=$(git status --porcelain)
echo "$CHANGES"
git diff
[ -z "$CHANGES" ]
run: .github/scripts/report_git_status.sh
flake8-py3:
runs-on: ubuntu-18.04
@ -137,21 +153,23 @@ jobs:
mkdir flake8-output
cd flake8-output
echo "$HEAD_SHA" > commit-sha.txt
- name: Run flake8
- name: Install dependencies
run: |
set -eux
pip install typing-extensions # for tools/translate_annotations.py
pip install -r requirements-flake8.txt
flake8 --version
- name: Run flake8
run: |
set -eux
flake8 | tee "${GITHUB_WORKSPACE}"/flake8-output.txt
cp flake8-output.txt flake8-output/annotations.json
- name: Translate annotations
if: github.event_name == 'pull_request'
env:
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
run: |
tools/translate_annotations.py \
--file=flake8-output.txt \
--file="${GITHUB_WORKSPACE}"/flake8-output.txt \
--regex='^(?P<filename>.*?):(?P<lineNumber>\d+):(?P<columnNumber>\d+): (?P<errorCode>\w+\d+) (?P<errorDesc>.*)' \
--commit="$HEAD_SHA" \
> flake8-output/annotations.json
@ -202,10 +220,7 @@ jobs:
sudo apt-get update
sudo apt-get install -y clang-tidy-11
sudo update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-11 1000
- name: Run clang-tidy
env:
BASE_SHA: ${{ github.event.pull_request.base.sha }}
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
- name: Generate build files
run: |
set -eux
git remote add upstream https://github.com/pytorch/pytorch
@ -229,6 +244,12 @@ jobs:
--native-functions-path aten/src/ATen/native/native_functions.yaml \
--nn-path aten/src
fi
- name: Run clang-tidy
env:
BASE_SHA: ${{ github.event.pull_request.base.sha }}
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
run: |
set -eux
# Run Clang-Tidy
# The negative filters below are to exclude files that include onnx_pb.h or
@ -283,13 +304,16 @@ jobs:
architecture: x64
- name: Fetch PyTorch
uses: actions/checkout@v2
- name: Run cmakelint
- name: Install dependencies
run: |
set -eux
pip install cmakelint
cmakelint --version
- name: Run cmakelint
run: |
set -eux
git ls-files -z -- bootstrap '*.cmake' '*.cmake.in' '*CMakeLists.txt' | \
grep -E -z -v '^(cmake/Modules/|cmake/Modules_CUDA_fix/)' | \
grep -E -z -v '^(cmake/Modules/|cmake/Modules_CUDA_fix/|cmake/Caffe2Config.cmake.in|aten/src/ATen/ATenConfig.cmake.in|cmake/Caffe2ConfigVersion.cmake.in|cmake/TorchConfig.cmake.in|cmake/TorchConfigVersion.cmake.in|cmake/cmake_uninstall.cmake.in)' | \
xargs -0 cmakelint --config=.cmakelintrc --spaces=2 --quiet
mypy:

View File

@ -0,0 +1,193 @@
# @generated by .github/scripts/generate_linux_ci_workflows.py, Do not update manually
#
# Template is at: .github/templates/linux_ci_workflow.yml
# Generation script: .github/scripts/generate_linux_ci_workflows.py
name: Linux CI (pytorch-linux-xenial-py3.6-gcc5.4)
on:
# TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
# pull_request:
push:
branches:
- master
- release/*
workflow_dispatch:
env:
BUILD_ENVIRONMENT: pytorch-linux-xenial-py3.6-gcc5.4
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
jobs:
calculate-docker-image:
runs-on: ubuntu-18.04
outputs:
docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Calculate docker image tag
id: calculate-tag
run: |
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
build:
runs-on: linux.2xlarge
needs: calculate-docker-image
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
submodules: recursive
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Create test binary build directories
run: |
mkdir -pv ../custom-op-build
mkdir -pv ../custom-backend-build
mkdir -pv ../jit-hook-build
- name: Build PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
# Why the three volume mounts here? So test binaries are put in the correct spot
# NOTE: You cannot volume mount ${GITHUB_WORKSPACE}../:/var/lib/jenkins since sccache connection will hang
# See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
# TODO: Stop building test binaries as part of the build phase
docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS \
-e SCCACHE_BUCKET \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && .jenkins/pytorch/build.sh'
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)/../":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Archive artifacts into zip
run: |
(cd "${GITHUB_WORKSPACE}/../" && zip -r pytorch/artifacts.zip pytorch/dist pytorch/build custom-op-build/ custom-backend-build/ jit-hook-build/)
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
retention-days: 30
if-no-files-found: error
path:
artifacts.zip
- name: Clean up docker images
if: always()
run: |
# Prune all of the docker images
docker system prune -af
test:
runs-on: linux.2xlarge
needs:
- calculate-docker-image
- build
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
steps:
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Log in to ECR
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Pull docker image
run: |
docker pull "${DOCKER_IMAGE}"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }}
run: |
bash .github/scripts/install_nvidia_utils_linux.sh
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Determine shm-size
run: |
shm_size="1g"
case "${BUILD_ENVIRONMENT}" in
*cuda*)
shm_size="2g"
;;
*rocm*)
shm_size="8g"
;;
esac
echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
- uses: actions/download-artifact@v2
name: Download PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
- name: Unzip artifacts
run: |
(cd "${GITHUB_WORKSPACE}/../" && unzip -q pytorch/artifacts.zip)
- name: Output disk space left
run: |
sudo df -H
- name: Test PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs, if we use all of them we'll OOM
export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
# Why the three volume mounts here? So test binaries are put in the correct spot
# NOTE: You cannot volume mount ${GITHUB_WORKSPACE}../:/var/lib/jenkins since sccache connection will hang
# See CUSTOM_OP_BUILD, JIT_HOOK_BUILD, CUSTOM_BACKEND_BUILD
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086
docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e IN_CI \
-e MAX_JOBS \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && pip install dist/*.whl && .jenkins/pytorch/test.sh'
- name: Clean up docker images
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
# Prune all of the docker images
docker system prune -af
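Both the template and this generated workflow cap build/test parallelism with the same shell arithmetic: `MAX_JOBS` is the smaller of `nproc - 1` (leave a core for sccache) and a hard limit of 8, since the 16-vCPU `linux.2xlarge` runners OOM otherwise. The same computation in Python, for illustration only:

```python
# Hypothetical helper mirroring the MAX_JOBS arithmetic in the build/test steps.
import os

def max_jobs(memory_limit_max_jobs: int = 8) -> int:
    sccache_max_jobs = (os.cpu_count() or 2) - 1
    return min(sccache_max_jobs, memory_limit_max_jobs)

print(max_jobs())  # 8 on a 16-vCPU linux.2xlarge runner
```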

.gitignore (vendored, 9 lines added)
View File

@ -292,3 +292,12 @@ bazel-*
# direnv, posh-direnv
.envrc
.psenvrc
# generated shellcheck directories
.shellcheck_generated*/
# zip archives
*.zip
# core dump files
core.*

View File

@ -59,6 +59,17 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
export BUILD_SPLIT_CUDA=ON
fi
if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then
export BUILD_CAFFE2=OFF
fi
if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
export ATEN_THREADING=TBB
export USE_TBB=1
elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi
# TODO: Don't run this...
pip_install -r requirements.txt || true
@ -234,7 +245,7 @@ else
CUSTOM_OP_TEST="$PWD/test/custom_operator"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir "$CUSTOM_OP_BUILD"
mkdir -p "$CUSTOM_OP_BUILD"
pushd "$CUSTOM_OP_BUILD"
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
make VERBOSE=1
@ -246,7 +257,7 @@ else
JIT_HOOK_TEST="$PWD/test/jit_hooks"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir "$JIT_HOOK_BUILD"
mkdir -p "$JIT_HOOK_BUILD"
pushd "$JIT_HOOK_BUILD"
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
make VERBOSE=1
@ -257,7 +268,7 @@ else
CUSTOM_BACKEND_BUILD="$PWD/../custom-backend-build"
CUSTOM_BACKEND_TEST="$PWD/test/custom_backend"
python --version
mkdir "$CUSTOM_BACKEND_BUILD"
mkdir -p "$CUSTOM_BACKEND_BUILD"
pushd "$CUSTOM_BACKEND_BUILD"
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
make VERBOSE=1

View File

@ -72,7 +72,16 @@ if [[ "$BUILD_ENVIRONMENT" != *pytorch-win-* ]]; then
# Save sccache logs to file
sccache --stop-server || true
rm ~/sccache_error.log || true
if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
if [[ -n "${SKIP_SCCACHE_INITIALIZATION:-}" ]]; then
# sccache --start-server seems to hang forever on self hosted runners for GHA
# so let's just go ahead and skip the --start-server altogether since it seems
# as though sccache still gets used even when the sccache server isn't started
# explicitly
echo "Skipping sccache server initialization, setting environment variables"
export SCCACHE_IDLE_TIMEOUT=1200
export SCCACHE_ERROR_LOG=~/sccache_error.log
export RUST_LOG=sccache::server=error
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server
else
# increasing SCCACHE_IDLE_TIMEOUT so that extension_backend_test.cpp can build after this PR:

View File

@ -26,7 +26,7 @@ if [ ! -d "${WORKSPACE_DIR}/miniconda3" ]; then
retry bash "${WORKSPACE_DIR}"/miniconda3.sh -b -p "${WORKSPACE_DIR}"/miniconda3
fi
export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
# shellcheck disable=SC1090
# shellcheck disable=SC1091
source "${WORKSPACE_DIR}"/miniconda3/bin/activate
retry conda install -y mkl mkl-include numpy=1.18.5 pyyaml=5.3 setuptools=46.0.0 cmake cffi ninja typing_extensions dataclasses pip
# The torch.hub tests make requests to GitHub.

View File

@ -51,7 +51,11 @@ test_python_all() {
export GLOO_SOCKET_IFNAME=lo0
echo "Ninja version: $(ninja --version)"
if [ -n "$CIRCLE_PULL_REQUEST" ]; then
# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from Github Actions
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
if [ -n "$IN_PULL_REQUEST" ]; then
DETERMINE_FROM=$(mktemp)
file_diff_from_base "$DETERMINE_FROM"
fi

View File

@ -115,7 +115,11 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX2-* ]]; then
export ATEN_CPU_CAPABILITY=avx
fi
if [ -n "$CIRCLE_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from GitHub Actions
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
if [ -n "$IN_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
DETERMINE_FROM=$(mktemp)
file_diff_from_base "$DETERMINE_FROM"
fi

View File

@ -42,12 +42,16 @@ fi
export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers
if [ -n "$CIRCLE_PULL_REQUEST" ]; then
# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from GitHub Actions
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
if [ -n "$IN_PULL_REQUEST" ]; then
DETERMINE_FROM="${TMP_DIR}/determine_from"
file_diff_from_base "$DETERMINE_FROM"
fi
if [[ "${CIRCLE_JOB}" == *11* ]]; then
if [[ "${BUILD_ENVIRONMENT}" == *cuda11* ]]; then
export BUILD_SPLIT_CUDA=ON
fi

View File

@ -10,6 +10,7 @@
- [Unit testing](#unit-testing)
- [Python Unit Testing](#python-unit-testing)
- [Better local unit tests with `pytest`](#better-local-unit-tests-with-pytest)
- [Local linting](#local-linting)
- [Running `mypy`](#running-mypy)
- [C++ Unit Testing](#c-unit-testing)
- [Writing documentation](#writing-documentation)
@ -357,13 +358,44 @@ The above is an example of testing a change to all Loss functions: this
command runs tests such as `TestNN.test_BCELoss` and
`TestNN.test_MSELoss` and can be useful to save keystrokes.
### Local linting
You can run the same linting steps that are used in CI locally via `make`:
```bash
make lint -j 6 # run lint (using 6 parallel jobs)
```
These jobs may require extra dependencies that aren't dependencies of PyTorch
itself, so you can install them via this command, which you should only have to
run once:
```bash
make setup_lint
```
To run a specific linting step, use one of these targets or see the
[`Makefile`](Makefile) for a complete list of options.
```bash
# Check for tabs, trailing newlines, etc.
make quick_checks
make flake8
make mypy
make cmakelint
```
### Running `mypy`
`mypy` is an optional static type checker for Python. We have multiple `mypy`
configs for the PyTorch codebase, so you can run them all using this command:
```bash
for CONFIG in mypy*.ini; do mypy --config="$CONFIG"; done
make mypy
```
See [Guide for adding type annotations to

View File

@ -1,6 +1,7 @@
# PyTorch Glossary
- [PyTorch Glossary](#pytorch-glossary)
<!-- toc -->
- [Operation and Kernel](#operation-and-kernel)
- [ATen](#aten)
- [Operation](#operation)
@ -19,6 +20,8 @@
- [Tracing](#tracing)
- [Scripting](#scripting)
<!-- tocstop -->
# Operation and Kernel
## ATen

View File

@ -14,8 +14,63 @@ ios:
clean: # This will remove ALL build folders.
@rm -r build*/
@$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER)
linecount:
@cloc --read-lang-def=caffe.cloc caffe2 || \
echo "Cloc is not available on the machine. You can install cloc with " && \
echo " sudo apt-get install cloc"
SHELLCHECK_GHA_GENERATED_FOLDER=.shellcheck_generated_gha
shellcheck-gha:
@$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER)
tools/extract_scripts.py --out=$(SHELLCHECK_GHA_GENERATED_FOLDER)
tools/run_shellcheck.sh $(SHELLCHECK_GHA_GENERATED_FOLDER)
generate-gha-workflows:
./.github/scripts/generate_linux_ci_workflows.py
$(MAKE) shellcheck-gha
setup_lint:
python tools/actions_local_runner.py --file .github/workflows/lint.yml \
--job 'flake8-py3' --step 'Install dependencies'
python tools/actions_local_runner.py --file .github/workflows/lint.yml \
--job 'cmakelint' --step 'Install dependencies'
pip install jinja2
quick_checks:
# TODO: This is broken when 'git config submodule.recurse' is 'true'
@python tools/actions_local_runner.py \
--file .github/workflows/lint.yml \
--job 'quick-checks' \
--step 'Ensure no trailing spaces' \
--step 'Ensure no tabs' \
--step 'Ensure no non-breaking spaces' \
--step 'Ensure canonical include' \
--step 'Ensure no unqualified noqa' \
--step 'Ensure no direct cub include' \
--step 'Ensure correct trailing newlines'
flake8:
@python tools/actions_local_runner.py \
--file .github/workflows/lint.yml \
--job 'flake8-py3' \
--step 'Run flake8'
mypy:
@python tools/actions_local_runner.py \
--file .github/workflows/lint.yml \
--job 'mypy' \
--step 'Run mypy'
cmakelint:
@python tools/actions_local_runner.py \
--file .github/workflows/lint.yml \
--job 'cmakelint' \
--step 'Run cmakelint'
clang_tidy:
echo "clang-tidy local lint is not yet implemented"
exit 1
lint: flake8 mypy quick_checks cmakelint generate-gha-workflows

View File

@ -201,8 +201,7 @@ After that, you can use libtorch C++ API from your native code.
namespace pytorch_testapp_jni {
namespace {
struct JITCallGuard {
torch::autograd::AutoGradMode no_autograd_guard{false};
torch::AutoNonVariableTypeMode non_var_guard{true};
c10::InferenceMode guard;
torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard{false};
};
}

View File

@ -26,14 +26,8 @@ namespace pytorch_jni {
namespace {
struct JITCallGuard {
// AutoGrad is disabled for mobile by default.
torch::autograd::AutoGradMode no_autograd_guard{false};
// VariableType dispatch is not included in default mobile build. We need set
// this guard globally to avoid dispatch error (only for dynamic dispatch).
// Thanks to the unification of Variable class and Tensor class it's no longer
// required to toggle the NonVariableTypeMode per op - so it doesn't hurt to
// always set NonVariableTypeMode for inference only use case.
torch::AutoNonVariableTypeMode non_var_guard{true};
// Inference only workload.
c10::InferenceMode guard;
// Disable graph optimizer to ensure list of unused ops are not changed for
// custom mobile build.
torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard{false};

View File

@ -17,14 +17,8 @@ namespace pytorch_jni {
namespace {
struct LiteJITCallGuard {
// VariableType dispatch is not included in default mobile build. We need set
// this guard globally to avoid dispatch error (only for dynamic dispatch).
// Thanks to the unification of Variable class and Tensor class it's no longer
// required to toggle the NonVariableTypeMode per op - so it doesn't hurt to
// always set NonVariableTypeMode for inference only use case.
// TODO: avoid having to set this guard for custom mobile build with mobile
// interpreter.
torch::AutoNonVariableTypeMode non_var_guard{true};
// Inference only workload.
c10::InferenceMode guard;
};
} // namespace

View File

@ -24,8 +24,7 @@ void log(const char* m, T t) {
}
struct JITCallGuard {
torch::autograd::AutoGradMode no_autograd_guard{false};
torch::AutoNonVariableTypeMode non_var_guard{true};
c10::InferenceMode guard;
torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard{false};
};
} // namespace
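These JITCallGuard changes all swap the older `AutoGradMode(false)` plus `AutoNonVariableTypeMode` pair for a single `c10::InferenceMode` guard. For reference, recent PyTorch releases expose the same inference-only guard to Python as `torch.inference_mode()`; a small usage sketch (not part of this commit):

```python
# Illustrative only; assumes a PyTorch build that ships torch.inference_mode().
import torch

model = torch.nn.Linear(4, 2)
with torch.inference_mode():           # no autograd tracking for this block
    out = model(torch.randn(1, 4))
print(out.requires_grad)               # False
```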

View File

@ -40,6 +40,10 @@ TORCH_LIBRARY_IMPL(_, AutogradCPU, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
TORCH_LIBRARY_IMPL(_, AutogradXPU, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
TORCH_LIBRARY_IMPL(_, AutogradCUDA, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}

View File

@ -202,6 +202,17 @@ public:
}
return mask;
}
Vec256<T> isnan() const {
Vec256<T> vec;
for (int64_t i = 0; i != size(); i++) {
if (_isnan(values[i])) {
std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T));
} else {
std::memset(static_cast<void*>(vec.values + i), 0, sizeof(T));
}
}
return vec;
}
Vec256<T> map(T (*f)(T)) const {
Vec256<T> ret;
for (int64_t i = 0; i != size(); i++) {

View File

@ -96,6 +96,9 @@ public:
__m256d cmp = _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_EQ_OQ);
return _mm256_movemask_pd(cmp);
}
Vec256<double> isnan() const {
return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q);
}
Vec256<double> map(double (*f)(double)) const {
__at_align32__ double tmp[size()];
store(tmp);

View File

@ -103,6 +103,9 @@ public:
__m256 cmp = _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ);
return _mm256_movemask_ps(cmp);
}
Vec256<float> isnan() const {
return _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_UNORD_Q);
}
Vec256<float> map(float (*f)(float)) const {
__at_align32__ float tmp[size()];
store(tmp);
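The new `isnan()` members all build a per-lane mask that is all-ones where the element is NaN and all-zeros elsewhere; the AVX versions get this from `_CMP_UNORD_Q`, an "unordered" compare that is true iff at least one operand is NaN (comparing against 0.0, which is never NaN, therefore flags exactly the NaN lanes). A NumPy sketch of the intended mask semantics, purely illustrative:

```python
# Not part of the commit; shows the lane mask the vectorized isnan() produces.
import numpy as np

x = np.array([1.0, np.nan, -0.0, np.inf], dtype=np.float32)
mask = np.isnan(x)                                    # [False, True, False, False]
lanes = np.where(mask, np.uint32(0xFFFFFFFF), np.uint32(0))
print(mask, [hex(v) for v in lanes])                  # ['0x0', '0xffffffff', '0x0', '0x0']
```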

View File

@ -283,6 +283,19 @@ public:
}
return mask;
}
Vec256<float> isnan() const {
__at_align32__ float tmp[size()];
__at_align32__ float res[size()];
store(tmp);
for (int i = 0; i < size(); i++) {
if (_isnan(tmp[i])) {
std::memset(static_cast<void*>(&res[i]), 0xFF, sizeof(float));
} else {
std::memset(static_cast<void*>(&res[i]), 0, sizeof(float));
}
}
return loadu(res);
};
Vec256<float> map(float (*f)(float)) const {
__at_align32__ float tmp[size()];
store(tmp);

View File

@ -5,6 +5,55 @@
namespace at {
namespace meta {
TORCH_META_FUNC(adaptive_max_pool2d) (const Tensor& input, IntArrayRef output_size) {
for (int64_t i = 0; i < input.ndimension(); i++) {
TORCH_CHECK(
input.size(i) > 0,
"adaptive_max_pool2d: expected input to have non-empty spatial dimensions, "
"but input has sizes ",
input.sizes(),
" with dimension ",
i,
" being "
"empty");
}
TORCH_CHECK(
(input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
TORCH_CHECK(
output_size.size() == 2,
"adaptive_max_pool2d: internal error: output_size.size() must be 2");
int dimH = 1;
int64_t sizeB = 1;
int64_t sizeD = 0;
if (input.ndimension() == 4) {
sizeB = input.size(0);
dimH++;
}
sizeD = input.size(dimH - 1);
int64_t osizeH = output_size[0];
int64_t osizeW = output_size[1];
/* resize output */
if (input.ndimension() == 3) {
set_output(0, {sizeD, osizeH, osizeW}, input.options());
/* indices will contain i,j locations for each output point */
set_output(1, {sizeD, osizeH, osizeW}, input.options().dtype(kLong));
} else {
set_output(0, {sizeB, sizeD, osizeH, osizeW}, input.options());
/* indices will contain i,j locations for each output point */
set_output(1, {sizeB, sizeD, osizeH, osizeW}, input.options().dtype(kLong));
}
}
} // namespace meta
namespace native {
namespace {
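The new `TORCH_META_FUNC(adaptive_max_pool2d)` above moves the shape checks and output allocation into a meta function: for a 3D input it sizes both outputs to `(C, oH, oW)`, for a 4D input to `(N, C, oH, oW)`, with the indices tensor allocated as `kLong`. A usage sketch against the public Python API (illustrative; assumes a build containing this kernel):

```python
# The indices output records the argmax locations, hence the int64 dtype.
import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 9, 9)                                   # (N, C, H, W)
out, idx = F.adaptive_max_pool2d(x, (4, 4), return_indices=True)
print(out.shape, idx.shape, idx.dtype)                        # [2, 3, 4, 4] twice, torch.int64
```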
@ -115,102 +164,6 @@ static void adaptive_max_pool2d_out_frame(
});
}
void adaptive_max_pool2d_out_cpu_template(
Tensor& output,
Tensor& indices,
const Tensor& input,
IntArrayRef output_size)
{
int dimW = 2;
int dimH = 1;
int64_t sizeB = 1;
int64_t sizeD = 0;
int64_t isizeH = 0;
int64_t isizeW = 0;
int64_t istrideD = 0;
int64_t istrideH = 0;
int64_t istrideW = 0;
int64_t istrideB = 0;
for (int64_t i = 0; i < input.ndimension(); i++) {
TORCH_CHECK(input.size(i) > 0,
"adaptive_max_pool2d: expected input to have non-empty spatial dimensions, "
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
"empty");
}
TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
TORCH_CHECK(output_size.size() == 2,
"adaptive_max_pool2d: internal error: output_size.size() must be 2");
if (input.ndimension() == 4)
{
istrideB = input.stride(0);
sizeB = input.size(0);
dimW++;
dimH++;
}
/* sizes */
sizeD = input.size(dimH-1);
isizeH = input.size(dimH);
isizeW = input.size(dimW);
/* strides */
istrideD = input.stride(dimH-1);
istrideH = input.stride(dimH);
istrideW = input.stride(dimW);
int64_t osizeH = output_size[0];
int64_t osizeW = output_size[1];
/* resize output */
if (input.ndimension() == 3)
{
output.resize_({sizeD, osizeH, osizeW});
/* indices will contain i,j locations for each output point */
indices.resize_({sizeD, osizeH, osizeW});
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "adaptive_max_pool2d_cpu", [&] {
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
adaptive_max_pool2d_single_out_frame<scalar_t>(input_data, output_data,
indices_data,
sizeD,
isizeH, isizeW,
osizeH, osizeW,
istrideD,
istrideH, istrideW);
}
);
}
else
{
output.resize_({sizeB, sizeD, osizeH, osizeW});
/* indices will contain i,j locations for each output point */
indices.resize_({sizeB, sizeD, osizeH, osizeW});
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "adaptive_max_pool2d_cpu", [&] {
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
adaptive_max_pool2d_out_frame<scalar_t>(input_data, output_data,
indices_data,
sizeB, sizeD,
isizeH, isizeW,
osizeH, osizeW,
istrideB, istrideD,
istrideH, istrideW);
}
);
}
}
template <typename scalar_t>
static void adaptive_max_pool2d_backward_single_out_frame(
scalar_t *gradInput_p,
@ -346,31 +299,83 @@ Tensor& adaptive_max_pool2d_backward_out_cpu_template(
} // namespace
std::tuple<Tensor&, Tensor&> adaptive_max_pool2d_out_cpu(const Tensor& input,
IntArrayRef output_size,
Tensor& output,
Tensor& indices)
{
adaptive_max_pool2d_out_cpu_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor&, Tensor&>(output, indices);
}
TORCH_IMPL_FUNC(adaptive_max_pool2d_out_cpu)
(const Tensor& input, IntArrayRef output_size, const Tensor& output, const Tensor& indices) {
int dimW = 2;
int dimH = 1;
int64_t sizeB = 1;
int64_t sizeD = 0;
int64_t isizeH = 0;
int64_t isizeW = 0;
std::tuple<Tensor, Tensor> adaptive_max_pool2d_cpu(
const Tensor& input,
IntArrayRef output_size)
{
Tensor output = at::empty({0}, input.options());
Tensor indices = at::empty({0}, input.options().dtype(kLong));
adaptive_max_pool2d_out_cpu_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor, Tensor>(output, indices);
int64_t istrideD = 0;
int64_t istrideH = 0;
int64_t istrideW = 0;
int64_t istrideB = 0;
if (input.ndimension() == 4) {
istrideB = input.stride(0);
sizeB = input.size(0);
dimW++;
dimH++;
}
/* sizes */
sizeD = input.size(dimH - 1);
isizeH = input.size(dimH);
isizeW = input.size(dimW);
/* strides */
istrideD = input.stride(dimH - 1);
istrideH = input.stride(dimH);
istrideW = input.stride(dimW);
int64_t osizeH = output_size[0];
int64_t osizeW = output_size[1];
/* resize output */
if (input.ndimension() == 3) {
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "adaptive_max_pool2d_cpu", [&] {
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
adaptive_max_pool2d_single_out_frame<scalar_t>(
input_data,
output_data,
indices_data,
sizeD,
isizeH,
isizeW,
osizeH,
osizeW,
istrideD,
istrideH,
istrideW);
});
} else {
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "adaptive_max_pool2d_cpu", [&] {
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
adaptive_max_pool2d_out_frame<scalar_t>(
input_data,
output_data,
indices_data,
sizeB,
sizeD,
isizeH,
isizeW,
osizeH,
osizeW,
istrideB,
istrideD,
istrideH,
istrideW);
});
}
}
Tensor& adaptive_max_pool2d_backward_out_cpu(const Tensor& gradOutput_,

View File

@ -5,6 +5,57 @@
namespace at {
namespace meta {
TORCH_META_FUNC(adaptive_max_pool3d) (const Tensor& input, IntArrayRef output_size) {
for (int64_t i = 0; i < input.ndimension(); i++) {
TORCH_CHECK(
input.size(i) > 0,
"adaptive_max_pool3d: expected input to have non-empty spatial dimensions, "
"but input has sizes ",
input.sizes(),
" with dimension ",
i,
" being "
"empty");
}
TORCH_CHECK(
(input.ndimension() == 4 || input.ndimension() == 5),
"non-empty 4D or 5D (batch mode) tensor expected for input");
TORCH_CHECK(
output_size.size() == 3,
"adaptive_max_pool3d: internal error: output_size.size() must be 3");
int dimD = 0;
int64_t sizeB = 1;
int64_t sizeD = 0;
if (input.ndimension() == 5) {
sizeB = input.size(0);
dimD++;
}
/* sizes */
sizeD = input.size(dimD);
int64_t osizeT = output_size[0];
int64_t osizeH = output_size[1];
int64_t osizeW = output_size[2];
/* resize output */
if (input.ndimension() == 4) {
set_output(0, {sizeD, osizeT, osizeH, osizeW}, input.options());
/* indices will contain max input locations for each output point */
set_output(1, {sizeD, osizeT, osizeH, osizeW}, input.options().dtype(kLong));
} else {
set_output(0, {sizeB, sizeD, osizeT, osizeH, osizeW}, input.options());
/* indices will contain max input locations for each output point */
set_output(1, {sizeB, sizeD, osizeT, osizeH, osizeW}, input.options().dtype(kLong));
}
}
} // namespace meta
namespace native {
namespace {
@ -393,31 +444,97 @@ Tensor& adaptive_max_pool3d_backward_out_cpu_template(
} // namespace
std::tuple<Tensor&, Tensor&> adaptive_max_pool3d_out_cpu(const Tensor& input,
IntArrayRef output_size,
Tensor& output,
Tensor& indices)
{
adaptive_max_pool3d_out_cpu_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor&, Tensor&>(output, indices);
}
TORCH_IMPL_FUNC(adaptive_max_pool3d_out_cpu)
(const Tensor& input, IntArrayRef output_size, const Tensor& output, const Tensor& indices) {
int dimD = 0;
int dimT = 1;
int dimH = 2;
int dimW = 3;
int64_t sizeB = 1;
int64_t sizeD = 0;
int64_t isizeT = 0;
int64_t isizeH = 0;
int64_t isizeW = 0;
std::tuple<Tensor, Tensor> adaptive_max_pool3d_cpu(
const Tensor& input,
IntArrayRef output_size)
{
Tensor output = at::empty({0}, input.options());
Tensor indices = at::empty({0}, input.options().dtype(kLong));
adaptive_max_pool3d_out_cpu_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor, Tensor>(output, indices);
int64_t istrideB = 0;
int64_t istrideD = 0;
int64_t istrideT = 0;
int64_t istrideH = 0;
int64_t istrideW = 0;
if (input.ndimension() == 5) {
istrideB = input.stride(0);
sizeB = input.size(0);
dimD++;
dimT++;
dimH++;
dimW++;
}
/* sizes */
sizeD = input.size(dimD);
isizeT = input.size(dimT);
isizeH = input.size(dimH);
isizeW = input.size(dimW);
/* strides */
istrideD = input.stride(dimD);
istrideT = input.stride(dimT);
istrideH = input.stride(dimH);
istrideW = input.stride(dimW);
int64_t osizeT = output_size[0];
int64_t osizeH = output_size[1];
int64_t osizeW = output_size[2];
if (input.ndimension() == 4) {
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "adaptive_max_pool3d_cpu", [&] {
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
adaptive_max_pool3d_single_out_frame<scalar_t>(
input_data,
output_data,
indices_data,
sizeD,
isizeT,
isizeH,
isizeW,
osizeT,
osizeH,
osizeW,
istrideD,
istrideT,
istrideH,
istrideW);
});
} else {
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "adaptive_max_pool3d_cpu", [&] {
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
adaptive_max_pool3d_out_frame<scalar_t>(
input_data,
output_data,
indices_data,
sizeB,
sizeD,
isizeT,
isizeH,
isizeW,
osizeT,
osizeH,
osizeW,
istrideB,
istrideD,
istrideT,
istrideH,
istrideW);
});
}
}
Tensor& adaptive_max_pool3d_backward_out_cpu(const Tensor& gradOutput_,

View File

@ -918,8 +918,7 @@ static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor
// - 2-dimensional (2D) tensor or batch of 2D tensors (matrix case)
// original torch.solve supported only the matrix case, while NumPy works for both cases
// for the batched input we need to be able to distinguish them
auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim()-1); // input.shape[:-1]
bool vector_case = other.dim() == 1 || (input.dim()-1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
bool vector_case = linalg_solve_is_vector_rhs(input, other);
bool is_batched_column_major = false;
if (vector_case) {
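The inline expression removed here is factored out into a `linalg_solve_is_vector_rhs` helper; the check itself is unchanged. Roughly, in Python-like terms (illustrative, not the actual helper):

```python
# other is a vector RHS when it is 1-D, or when it is a batch of vectors whose
# shape matches input.shape[:-1] (one dimension fewer than the batched matrices).
def is_vector_rhs(input, other) -> bool:
    expected_batched_rhs_shape = tuple(input.shape[:-1])  # input.shape[:-1]
    return other.dim() == 1 or (
        input.dim() - 1 == other.dim()
        and tuple(other.shape) == expected_batched_rhs_shape
    )
```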
@ -929,7 +928,7 @@ static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor
}
// if 'other' is a batch of 2D tensors, then 'input' can be non-batched and will be broadcasted
auto expected_shape = expected_batched_rhs_shape;
auto expected_shape = IntArrayRef(input.sizes().data(), input.dim() - 1); // input.shape[:-1]
if (!vector_case && other.dim() > 2) {
expected_shape = other.sizes();
}
@ -1020,8 +1019,7 @@ Tensor& linalg_solve_out(const Tensor& input, const Tensor& other, Tensor& resul
// Now check LAPACK/MAGMA error codes
// batchCheckErrors(Tensor, char*) calls 'infos = infos.to(kCPU)'
auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim()-1); // input.shape[:-1]
bool vector_case = other.dim() == 1 || (input.dim()-1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
bool vector_case = linalg_solve_is_vector_rhs(input, other);
if (vector_case ? result.dim() > 1 : result.dim() > 2) {
batchCheckErrors(infos, "linalg_solve");
} else {
@ -1606,9 +1604,8 @@ std::tuple<Tensor&, Tensor&> triangular_solve_out(const Tensor& self, const Tens
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
template<typename scalar_t>
static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
std::vector<int64_t>& infos) {
template <typename scalar_t>
static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n) {
#ifndef USE_LAPACK
AT_ERROR("qr: LAPACK library not found in compilation");
#else
@ -1627,6 +1624,7 @@ static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
int lwork = -1;
scalar_t wkopt;
lapackGeqrf<scalar_t>(m, n, self_data, m, tau_data, &wkopt, lwork, &info);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0);
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
Tensor work = at::empty({lwork}, self.options());
@ -1636,10 +1634,10 @@ static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
// now compute the actual R and TAU
lapackGeqrf<scalar_t>(m, n, self_working_ptr, m, tau_working_ptr, work.data_ptr<scalar_t>(), lwork, &info);
infos[i] = info;
if (info != 0) {
return;
}
// info from lapackGeqrf only reports if the i-th parameter is wrong
// so we don't need to check it all the time
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0);
}
#endif
}
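For context, geqrf is the Householder step behind QR; a quick sketch of the reduced factorization it feeds, assuming torch.linalg.qr is available:
import torch

a = torch.randn(4, 6, 5, dtype=torch.float64)     # batch of 6x5 matrices
q, r = torch.linalg.qr(a, mode='reduced')         # CPU path goes through geqrf/orgqr
print(q.shape, r.shape)                           # torch.Size([4, 6, 5]) torch.Size([4, 5, 5])
print(torch.allclose(q @ r, a))                   # expected: True up to rounding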
@ -1647,7 +1645,6 @@ static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n,
std::tuple<Tensor, Tensor> _linalg_qr_helper_cpu(const Tensor& self, std::string mode) {
bool compute_q, reduced;
std::tie(compute_q, reduced) = _parse_qr_mode(mode);
std::vector<int64_t> infos(batchCount(self), 0);
int64_t m = self.size(-2), n = self.size(-1);
// Setup inputs for apply_geqrf
@ -1682,13 +1679,8 @@ std::tuple<Tensor, Tensor> _linalg_qr_helper_cpu(const Tensor& self, std::string
q_working_copy.narrow(-1, 0, n).copy_(self);
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "qr_cpu", [&]{
apply_geqrf<scalar_t>(q_working_copy, tau_working_copy, m, n, infos);
apply_geqrf<scalar_t>(q_working_copy, tau_working_copy, m, n);
});
if (self.dim() > 2) {
batchCheckErrors(infos, "qr_cpu");
} else {
singleCheckErrors(infos[0], "qr_cpu");
}
R = q_working_copy.slice(-2, 0, n_columns_q).slice(-1, 0, n).triu();
if (!compute_q) {
@ -2977,204 +2969,378 @@ Tensor& _lstsq_helper_cpu(
#endif
}
std::tuple<Tensor, Tensor, Tensor, Tensor> linalg_lstsq(
const Tensor& self, const Tensor& b,
c10::optional<double> cond,
c10::optional<std::string> driver) {
TORCH_CHECK(
self.device().type() == b.device().type(),
"torch.linalg.lstsq: input tensors should be on the same device"
);
TORCH_CHECK(
self.scalar_type() == b.scalar_type(),
"torch.linalg.lstsq: input tensors should be of the same dtype"
);
TORCH_CHECK(
self.dim() >= 2,
"torch.linalg.lstsq: input `self` Tensor should be at least 2D"
);
TORCH_CHECK(
b.dim() >= 1,
"torch.linalg.lstsq: input `b` Tensor should be at least 1D"
);
auto dim_diff = self.dim() - b.dim();
TORCH_CHECK(
0 <= dim_diff && dim_diff <= 1,
"torch.linalg.lstsq: self.dim() must be greater or equal to b.dim() and "
"(self.dim() - b.dim()) <= 1"
);
Tensor b_2d = dim_diff ? b.unsqueeze(-1) : b;
TORCH_CHECK(
self.size(-2) == b_2d.size(-2),
dim_diff ? "torch.linalg.lstsq: self.size(-2) should match b.size(-1)" :
"torch.linalg.lstsq: self.size(-2) should match b.size(-2)"
);
/*
Solves a least squares problem, that is, minimizes the squared Frobenius norm of |B - A X|.
// if `driver` is empty, we use `driver_opt` to be set to
// c10::nullopt if working with CUDA tensors,
Input args:
* 'input' - Tensor containing batches of m-by-n matrix A.
* 'other' - Tensor containing batches of max(m, n)-by-nrhs matrix B.
* 'cond' - relative tolerance for determining rank of A.
* 'driver' - the name of the LAPACK driver that is used to compute the solution.
Output args (modified in-place):
* 'solution' - Tensor to store the solution matrix X.
* 'residuals' - Tensor to store values of the residual sum of squares for each column of the solution.
* 'rank' - Tensor to store the rank of A.
* 'singular_values' - Tensor to store the singular values of A.
* 'infos' - Tensor to store error codes of linear algebra math library.
For further details, please see the LAPACK documentation for GELS/GELSY/GELSS/GELSD routines.
*/
static void linalg_lstsq_out_info(
Tensor& solution,
Tensor& residuals,
Tensor& rank,
Tensor& singular_values,
Tensor& infos,
const Tensor& input,
const Tensor& other,
double rcond,
std::string& driver) {
// These internal asserts make explicit the assumptions in the implementation
// Error checks with the actual error messages are done at a higher level of
// the hierarchy of calls
TORCH_INTERNAL_ASSERT(input.dim() >= 2);
TORCH_INTERNAL_ASSERT(other.dim() >= 1);
auto dim_diff = input.dim() - other.dim();
TORCH_INTERNAL_ASSERT(0 <= dim_diff && dim_diff <= 1);
TORCH_INTERNAL_ASSERT(input.scalar_type() == other.scalar_type());
TORCH_INTERNAL_ASSERT(input.device() == other.device());
TORCH_INTERNAL_ASSERT(solution.scalar_type() == input.scalar_type());
TORCH_INTERNAL_ASSERT(solution.device() == input.device());
TORCH_INTERNAL_ASSERT(residuals.device() == input.device());
TORCH_INTERNAL_ASSERT(rank.scalar_type() == at::kLong);
TORCH_INTERNAL_ASSERT(rank.device() == input.device());
auto real_dtype = toValueType(input.scalar_type());
TORCH_INTERNAL_ASSERT(singular_values.scalar_type() == real_dtype);
TORCH_INTERNAL_ASSERT(singular_values.device() == input.device());
TORCH_INTERNAL_ASSERT(infos.scalar_type() == at::kInt);
TORCH_INTERNAL_ASSERT(infos.device() == input.device());
TORCH_INTERNAL_ASSERT(infos.numel() == std::max<int64_t>(1, batchCount(input)));
TORCH_INTERNAL_ASSERT(infos.is_contiguous());
bool vector_case = linalg_solve_is_vector_rhs(input, other);
// we need to unsqueeze 'other' because 2-dimensional tensors are expected in the implementation
Tensor other_2d = vector_case ? other.unsqueeze(-1) : other;
TORCH_INTERNAL_ASSERT(input.size(-2) == other_2d.size(-2));
std::vector<int64_t> expected_solution_shape = broadcast_batch_size(input, other_2d, input.dim() - 2);
// the actual shape of the solution returned is (*, n,) or (*, n, nrhs)
// but LAPACK requires extra dimensions to store raw residuals
// so the expected shape is (*, max(m, n),) or (*, max(m, n), nrhs)
auto m = input.size(-2);
auto n = input.size(-1);
auto nrhs = other.size(-1);
expected_solution_shape.push_back(std::max(m, n));
if (!vector_case) {
expected_solution_shape.push_back(nrhs);
}
// if 'solution' has no elements we can modify it
if (solution.numel() == 0) {
if (vector_case) {
solution.resize_(expected_solution_shape, MemoryFormat::Contiguous);
} else {
auto shape_transposed = expected_solution_shape;
std::swap(shape_transposed.end()[-1], shape_transposed.end()[-2]);
solution.resize_(shape_transposed, MemoryFormat::Contiguous);
solution.transpose_(-2, -1);
}
}
// if 'solution' is non-empty it must have the expected shape
TORCH_INTERNAL_ASSERT(solution.sizes().equals(expected_solution_shape));
// 'solution' must be in batched column major order (Fortran contiguous) for 2D inputs
// or C contiguous for 1D input
if (vector_case) {
TORCH_INTERNAL_ASSERT(solution.is_contiguous());
} else {
TORCH_INTERNAL_ASSERT(solution.transpose(-2, -1).is_contiguous());
}
// for 1-dimensional 'other', we need to unsqueeze the 'solution' before passing to "apply_solve"
if (vector_case) {
solution = solution.unsqueeze_(-1);
}
// _linalg_lstsq_helper_ performs calculations in-place and 'solution' must be a copy of other_2d
solution.narrow(-2, 0, other_2d.size(-2)).copy_(other_2d);
// if 'rank' is empty we might resize it
auto input_batch_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2);
if (rank.numel() == 0 && driver != "gels") { // gels driver doesn't set 'rank'
rank.resize_(input_batch_shape, MemoryFormat::Contiguous);
}
// if 'rank' is non-empty it must have the expected shape and be contiguous
if (driver != "gels") {
TORCH_INTERNAL_ASSERT(rank.sizes().equals(input_batch_shape));
TORCH_INTERNAL_ASSERT(rank.is_contiguous());
}
// if 'singular_values' is empty we might resize it
auto singular_values_shape = input_batch_shape.vec();
singular_values_shape.push_back(std::min(m, n));
if (singular_values.numel() == 0 && (driver == "gelsd" || driver == "gelss")) {
singular_values.resize_(singular_values_shape, MemoryFormat::Contiguous);
}
// if 'singular_values' is non-empty it must have the expected shape and be contiguous
if (driver == "gelsd" || driver == "gelss") {
TORCH_INTERNAL_ASSERT(singular_values.sizes().equals(singular_values_shape));
TORCH_INTERNAL_ASSERT(singular_values.is_contiguous());
}
// 'input' is modified in-place so we need a column-major copy
auto input_working_copy = copyBatchedColumnMajor(input);
// now the actual call that computes the result in-place (apply_lstsq)
at::_lstsq_helper_(solution, rank, singular_values, infos, input_working_copy, rcond, driver);
if (m > n && driver != "gelsy") {
// LAPACK stores residuals data for postprocessing in rows n:m (the m - n trailing rows)
auto raw_residuals = solution.narrow(/*dim=*/-2, /*start=*/n, /*length*/m - n);
if (raw_residuals.is_complex()) {
raw_residuals.mul_(raw_residuals.conj());
raw_residuals = at::real(raw_residuals);
} else {
raw_residuals.pow_(2);
}
at::sum_out(residuals, raw_residuals, /*dim=*/-2, /*keepdim=*/false, /*dtype*/real_dtype);
}
solution = solution.narrow(/*dim=*/-2, /*start=*/0, /*length*/n);
if (m == 0) {
solution.zero_();
}
// for 1-dimensional 'other', we need to squeeze the solution after "apply_lstsq"
if (vector_case) {
solution = solution.squeeze_(-1);
}
}
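A Python-level sketch of this contract, assuming torch.linalg.lstsq is exposed: for an overdetermined, full-rank system solved with a driver other than 'gelsy', the residuals computed by the postprocessing above are the squared column norms of B - A X.
import torch

a = torch.randn(5, 3, dtype=torch.float64)             # m > n, full rank with probability 1
b = torch.randn(5, 2, dtype=torch.float64)
res = torch.linalg.lstsq(a, b, driver='gelsd')
x = res.solution                                        # shape (3, 2), i.e. (*, n, nrhs)
print(torch.allclose(res.residuals, (b - a @ x).pow(2).sum(dim=0)))   # expected: True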
static std::string get_default_lstsq_driver(c10::optional<std::string> driver, const Tensor& input) {
// if `driver` is empty, we set driver_str to "gels" when working with CUDA tensors,
// otherwise to the "gelsy" driver.
// CUDA tensors are treated specially because MAGMA
// supports only the 'gels' driver.
c10::optional<std::string> driver_opt = driver;
std::string driver_str;
// check whether the user provided name is a valid driver name
if (driver.has_value()) {
auto driver_str = driver.value();
driver_str = driver.value();
// convert `driver_str` to lower case inplace.
std::transform(driver_str.begin(), driver_str.end(), driver_str.begin(),
[](unsigned char c) { return std::tolower(c); });
static std::unordered_set<std::string> allowed_drivers = {
"gels", "gelsy", "gelsd", "gelss"
};
if (at::kCPU == self.device().type()) {
if (input.device() == at::kCPU) {
TORCH_CHECK(
allowed_drivers.find(driver_str) != allowed_drivers.end(),
"torch.linalg.lstsq: parameter `driver` should be one of "
"(gels, gelsy, gelsd, gelss)"
);
}
//else if (at::kCUDA == self.device().type()) {
else {
} else { // else if (input.is_cuda())
TORCH_CHECK(
driver_str == "gels",
"torch.linalg.lstsq: `driver` other than `gels` is not supported on CUDA"
);
}
} else {
// if driver name is not provided, set to default 'gelsy' if on CPU,
// or to `gels` if on CUDA.
driver_str = input.is_cuda() ? "gels" : "gelsy";
}
// if driver name is not provided, set to default 'gelsy' if on CPU,
// or to `gels` if on CUDA.
else {
driver_opt = (at::kCPU == self.device().type())
? c10::optional<std::string>("gelsy")
: c10::optional<std::string>("gels");
return driver_str;
}
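A sketch of the driver policy, assuming torch.linalg.lstsq: on CPU the default is 'gelsy' and any of gels/gelsy/gelsd/gelss may be requested, while on CUDA only 'gels' is accepted.
import torch

a = torch.randn(6, 4, dtype=torch.float64)
b = torch.randn(6, 3, dtype=torch.float64)
torch.linalg.lstsq(a, b)                           # CPU default driver: 'gelsy'
torch.linalg.lstsq(a, b, driver='gelss')           # any of the four LAPACK drivers works on CPU
if torch.cuda.is_available():
    torch.linalg.lstsq(a.cuda(), b.cuda(), driver='gels')   # the only driver accepted on CUDA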
std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> linalg_lstsq_out(
const Tensor& input,
const Tensor& other,
c10::optional<double> rcond,
c10::optional<std::string> driver,
Tensor& solution,
Tensor& residuals,
Tensor& rank,
Tensor& singular_values) {
TORCH_CHECK(input.dim() >= 2, "torch.linalg.lstsq: input must have at least 2 dimensions.");
TORCH_CHECK(other.dim() >= 1, "torch.linalg.lstsq: other must have at least 1 dimension.");
TORCH_CHECK(
input.scalar_type() == other.scalar_type(),
"torch.linalg.lstsq: Expected input and other to have the same dtype, but got input's dtype ",
input.scalar_type(),
" and other's dtype ",
other.scalar_type());
auto dim_diff = input.dim() - other.dim();
TORCH_CHECK(
0 <= dim_diff && dim_diff <= 1,
"torch.linalg.lstsq: input.dim() must be greater or equal to other.dim() and (input.dim() - other.dim()) <= 1");
Tensor other_2d = dim_diff ? other.unsqueeze(-1) : other;
TORCH_CHECK(
input.size(-2) == other_2d.size(-2),
dim_diff ? "torch.linalg.lstsq: input.size(-2) should match other.size(-1)"
: "torch.linalg.lstsq: input.size(-2) should match other.size(-2)");
checkSameDevice("torch.linalg.lstsq", other, input, "other");
checkSameDevice("torch.linalg.lstsq", solution, input, "solution");
checkSameDevice("torch.linalg.lstsq", residuals, input, "residuals");
checkSameDevice("torch.linalg.lstsq", rank, input, "rank");
checkSameDevice("torch.linalg.lstsq", singular_values, input, "singular_values");
// 'solution' is expected to have same dtype as input
checkLinalgCompatibleDtype("torch.linalg.lstsq", solution, input, "solution");
// 'residuals' is expected to have real float dtype
ScalarType real_dtype = c10::toValueType(input.scalar_type());
checkLinalgCompatibleDtype("torch.linalg.lstsq", residuals.scalar_type(), real_dtype, "solution");
// 'rank' is expected to have integer dtype
// actual LAPACK calls use int32_t type for rank, but we promote it to int64_t
// to be consistent with torch.linalg.matrix_rank output dtype
ScalarType rank_expected_type = ScalarType::Long;
checkLinalgCompatibleDtype("torch.linalg.lstsq", rank.scalar_type(), rank_expected_type, "rank");
// 'singular_values' is expected to have real float dtype
checkLinalgCompatibleDtype("torch.linalg.lstsq", singular_values.scalar_type(), real_dtype, "singular_values");
std::string driver_name = get_default_lstsq_driver(driver, input);
// set default rcond value
// TODO: Change this to match non-legacy NumPy behaviour
double rcond_value = rcond.has_value() && (rcond.value() > 0)
? rcond.value()
: _get_epsilon(c10::toValueType(input.scalar_type()));
auto infos = at::zeros({std::max<int64_t>(1, batchCount(input))}, input.options().dtype(kInt));
// now check whether the provided output tensors can be used directly
// Two types of 'other' tensors are supported:
// - 1-dimensional (1D) tensor or batch of 1D tensors (vector case)
// - 2-dimensional (2D) tensor or batch of 2D tensors (matrix case)
// original torch.lstsq supported only the matrix case, while NumPy works for both cases
// for the batched input we need to be able to distinguish them
// auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim() - 1); // input.shape[:-1]
// bool vector_case = other.dim() == 1 || (input.dim() - 1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
bool vector_case = linalg_solve_is_vector_rhs(input, other);
// provided output tensor can be used directly if:
// 1. the shape matches the expected shape
// 2. the dtype matches the expected dtype
// 3. the tensor is contiguous
// Checks for the 'solution' tensor
std::vector<int64_t> expected_solution_shape = broadcast_batch_size(input, other_2d, input.dim() - 2);
// the actual shape of the solution returned is (*, n,) or (*, n, nrhs)
// but LAPACK requires extra dimensions so the expected shape is (*, max(m, n),) or (*, max(m, n), nrhs)
expected_solution_shape.push_back(std::max(input.size(-1), input.size(-2)));
if (!vector_case && other.dim() > 2) {
expected_solution_shape.push_back(other.size(-1));
}
// CUDA has only `gels` driver now which ONLY works with overdetermined systems
if (at::kCUDA == self.device().type()) {
TORCH_CHECK(
self.size(-2) >= self.size(-1),
"torch.linalg.lstsq: only overdetermined systems (m >= n) are allowed on CUDA"
);
bool solution_equal_expected_shape = solution.sizes().equals(expected_solution_shape);
bool solution_input_same_type = (solution.scalar_type() == input.scalar_type());
bool is_solution_batched_column_major = false;
if (vector_case) {
is_solution_batched_column_major = solution.is_contiguous();
} else if (!vector_case && solution.dim() >= 2) {
is_solution_batched_column_major = solution.transpose(-2, -1).is_contiguous();
}
// LAPACK/MAGMA requires inputs to be in column-major order.
auto self_working_copy = copyBatchedColumnMajor(self);
// 'residuals' is not checked here because at::sum_out(residuals, ...) does that
// Tensor b must be of size (..., max(m, n), nrhs)
// and in the column-major order.
// We allow the batch dims of `self` to broadcast over the batch
// dims of `b` so that it is possible to solve multiple systems
// with the same lhs (encoded by `self`) / rhs (encoded by `b`).
// `b_working_copy` is modified in-place and the combination of
// batch broadcasting plus LAPACK/MAGMA requirements impose the following
// restrictions on sizes/strides of `b`:
// 1. b.size = (broadcasted_batch_size(self, b), max(m, n), nrhs).
// 2. b.stride should correspond to an almost contiguous Tensor in the column-major-order,
// i.e. b.stride = b.transpose(-2, -1).contiguous().transpose(-2, -1).strides()
auto m = self.size(-2);
auto n = self.size(-1);
auto b_working_copy = copyBatchedColumnMajor(b_2d,
/*nrows=*/std::max(m, n),
/*desired_batch_sizes=*/broadcast_batch_size(self, b_2d, self.dim() - 2));
auto input_batch_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2);
double rcond = cond.has_value() && (cond.value() > 0)
? cond.value()
: _get_epsilon(c10::toValueType(self.scalar_type()));
auto batch_shape = IntArrayRef(self.sizes().cbegin(), self.sizes().cend() - 2);
Tensor rank = at::empty({0}, self.options().dtype(at::kLong));
if (driver_opt.value() != "gels") {
rank.resize_(batch_shape, MemoryFormat::Contiguous);
// Checks for the 'rank' tensor
// rank is a scalar value for each matrix in the batch so
// rank's expected shape is equal to input.shape[0:input.ndim-2]
bool rank_equal_expected_shape = true;
bool rank_equal_expected_type = true;
bool rank_is_contiguous = true;
if (driver_name != "gels") { // gels driver doesn't set 'rank'
rank_equal_expected_shape = rank.sizes().equals(input_batch_shape);
rank_equal_expected_type = (rank.scalar_type() == at::kLong);
rank_is_contiguous = rank.is_contiguous();
}
auto singular_values_shape = batch_shape.vec();
singular_values_shape.push_back(std::min(m, n));
auto real_dtype = c10::toValueType(self.scalar_type());
Tensor singular_values = at::empty({0}, self.options().dtype(real_dtype));
if (driver_opt.value() == "gelsd" || driver_opt.value() == "gelss") {
singular_values.resize_(singular_values_shape, MemoryFormat::Contiguous);
// Checks for the 'singular_values' tensor
// singular values are computed only with "gelsd" and "gelss" drivers currently
bool singular_values_equal_expected_shape = true;
bool singular_values_equal_expected_type = true;
bool singular_values_is_contiguous = true;
if (driver_name == "gelsd" || driver_name == "gelss") {
auto singular_values_shape = input_batch_shape.vec();
singular_values_shape.push_back(std::min(input.size(-1), input.size(-2)));
singular_values_equal_expected_shape = singular_values.sizes().equals(singular_values_shape);
singular_values_equal_expected_type = (singular_values.scalar_type() == real_dtype);
singular_values_is_contiguous = singular_values.is_contiguous();
}
Tensor infos = at::zeros({std::max<int64_t>(1, batchCount(self))}, self.options().dtype(kInt).device(kCPU));
// if solution is not empty and not in batched column major format
bool copy_needed = (solution.numel() != 0 && !is_solution_batched_column_major);
copy_needed |= !solution_input_same_type; // or solution does not have the same dtype as input
copy_needed |= (solution.numel() != 0 && !solution_equal_expected_shape); // or solution does not have the expected shape
Tensor x, residuals;
copy_needed |= !rank_equal_expected_type;
copy_needed |= (rank.numel() != 0 && !rank_equal_expected_shape);
copy_needed |= (rank.numel() != 0 && !rank_is_contiguous);
// path if neither `self` nor `b` is empty
if (self.numel() && b.numel()) {
x = at::_lstsq_helper_(b_working_copy, rank, singular_values, infos, self_working_copy, rcond, driver_opt.value());
if (m > n && driver_opt.value() != "gelsy") {
residuals = x.narrow(-2, n, std::max(m, n) - n).abs().pow_(2).sum(-2);
}
x = x.narrow(-2, 0, n);
}
// if either `self` or `b` is empty, return an empty tensor or,
// if non-zero sizes, return a tensor of zeros.
else {
x = b_working_copy.zero_().narrow(-2, 0, n);
copy_needed |= !singular_values_equal_expected_type;
copy_needed |= (singular_values.numel() != 0 && !singular_values_equal_expected_shape);
copy_needed |= (singular_values.numel() != 0 && !singular_values_is_contiguous);
if (copy_needed) { // we have to allocate temporary tensors
Tensor solution_tmp = at::empty({0}, input.options());
Tensor residuals_tmp = at::empty({0}, input.options().dtype(real_dtype));
Tensor rank_tmp = at::empty({0}, input.options().dtype(at::kLong));
Tensor singular_values_tmp = at::empty({0}, input.options().dtype(real_dtype));
linalg_lstsq_out_info(solution_tmp, residuals_tmp, rank_tmp, singular_values_tmp, infos, input, other, rcond_value, driver_name);
at::native::resize_output(solution, solution_tmp.sizes());
solution.copy_(solution_tmp);
at::native::resize_output(residuals, residuals_tmp.sizes());
residuals.copy_(residuals_tmp);
at::native::resize_output(rank, rank_tmp.sizes());
rank.copy_(rank_tmp);
at::native::resize_output(singular_values, singular_values_tmp.sizes());
singular_values.copy_(singular_values_tmp);
} else {
// else use the provided output storage directly
linalg_lstsq_out_info(solution, residuals, rank, singular_values, infos, input, other, rcond_value, driver_name);
}
auto return_empty_if_undefined = [&self](Tensor& t,
c10::optional<at::ScalarType> dtype = c10::nullopt,
c10::optional<std::vector<int64_t>> shape = c10::nullopt) {
if (t.defined()) {
return t;
}
else {
auto output_dtype = dtype.has_value() ? dtype.value() : self.scalar_type();
if (shape.has_value()) {
return at::empty(shape.value(), self.options().dtype(output_dtype));
}
else {
return at::empty({0}, self.options().dtype(output_dtype));
}
}
};
// Some outputs stay undefined for some values of driver.
// Instead of returning undefined tensors which get exposed as
// Nones in the Python interface, we return empty tensors.
// This way we follow the convention of output types in the
// torch.linalg namespace.
// NOTE: we run drivers only if both inputs are non-empty!
// Hence the code below explicitly handles each and every output
// if `self` is empty.
// NumPy and SciPy always return ranks for empty matrices,
// even for drivers which are not rank-revealing.
if (self.numel()) {
rank = return_empty_if_undefined(rank, at::kLong);
}
else {
rank = at::zeros(batch_shape, self.options().dtype(at::kLong));
}
// undefined residuals could only be an empty Tensor of shape (0)
residuals = return_empty_if_undefined(residuals);
if (!self.numel()
&& (driver_opt.value() == "gelss" || driver_opt.value() == "gelsd")) {
// when `self` is empty, return singular_values of shape
// (*self.shape[:-2], 0) only if driver is in ('gelss', 'gelsd')
auto singular_values_empty_shape = batch_shape.vec();
singular_values_empty_shape.push_back(0);
singular_values = return_empty_if_undefined(
singular_values,
at::toValueType(self.scalar_type()),
singular_values_empty_shape);
}
else {
// otherwise return an empty tensor of shape (0)
singular_values = return_empty_if_undefined(
singular_values,
at::toValueType(self.scalar_type()));
}
if (self.dim() > 2) {
if (infos.numel() > 1) {
batchCheckErrors(infos, "torch.linalg.lstsq");
} else {
singleCheckErrors(infos.item().toInt(), "torch.linalg.lstsq");
singleCheckErrors(infos.item<int64_t>(), "torch.linalg.lstsq");
}
return std::make_tuple(x, residuals, rank, singular_values);
return std::tuple<Tensor&, Tensor&, Tensor&, Tensor&>(solution, residuals, rank, singular_values);
}
std::tuple<Tensor, Tensor, Tensor, Tensor> linalg_lstsq(
const Tensor& input, const Tensor& other,
c10::optional<double> rcond,
c10::optional<std::string> driver) {
Tensor solution = at::empty({0}, input.options());
Tensor residuals = at::empty({0}, input.options().dtype(toValueType(input.scalar_type())));
Tensor rank = at::empty({0}, input.options().dtype(at::kLong));
Tensor singular_values = at::empty({0}, input.options().dtype(toValueType(input.scalar_type())));
std::tie(solution, residuals, rank, singular_values) =
at::linalg_lstsq_outf(input, other, rcond, driver, solution, residuals, rank, singular_values);
return std::make_tuple(solution, residuals, rank, singular_values);
}
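A sketch of how rcond interacts with rank for the rank-revealing drivers, assuming torch.linalg.lstsq with 'gelsd': when no positive rcond is given, the machine epsilon of the value type is used as the relative cutoff below which singular values are dropped.
import torch

c = torch.randn(6, 2, dtype=torch.float64)
a = torch.cat([c, c], dim=-1)                      # 6x4 matrix of rank 2
b = torch.randn(6, 3, dtype=torch.float64)
res = torch.linalg.lstsq(a, b, rcond=1e-8, driver='gelsd')
print(res.rank)               # expected tensor(2): singular values below rcond * s_max are treated as zero
print(res.singular_values)    # min(m, n) = 4 values; the last two are numerically ~0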
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lu_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -1,9 +1,7 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/NativeFunctions.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/native/Pool.h>
#include <tuple>
namespace at {
@ -11,117 +9,10 @@ namespace native {
namespace {
template <typename scalar_t>
static void max_pool2d_with_indices_single_out_frame(
scalar_t *input_p,
scalar_t *output_p,
int64_t *ind_p,
int64_t nslices,
int64_t iwidth,
int64_t iheight,
int64_t owidth,
int64_t oheight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH
)
{
at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
for (auto k = start; k < end; k++)
{
/* loop over output */
int64_t i, j;
scalar_t *ip = input_p + k*iwidth*iheight;
for(i = 0; i < oheight; i++)
{
for(j = 0; j < owidth; j++)
{
int64_t hstart = i * dH - padH;
int64_t wstart = j * dW - padW;
int64_t hend = std::min(hstart + (kH - 1) * dilationH + 1, iheight);
int64_t wend = std::min(wstart + (kW - 1) * dilationW + 1, iwidth);
while(hstart < 0)
hstart += dilationH;
while(wstart < 0)
wstart += dilationW;
/* local pointers */
scalar_t *op = output_p + k*owidth*oheight + i*owidth + j;
int64_t *indp = ind_p + k*owidth*oheight + i*owidth + j;
/* compute local max: */
int64_t maxindex = hstart*iwidth + wstart;
scalar_t maxval = -std::numeric_limits<scalar_t>::infinity();
for(int64_t y = hstart; y < hend; y += dilationH)
{
for(int64_t x = wstart; x < wend; x += dilationW)
{
int64_t tcntr = y*iwidth + x;
scalar_t val = *(ip + tcntr);
if ((val > maxval) || std::isnan(val))
{
maxval = val;
maxindex = tcntr;
}
}
}
/* set output to local max */
*op = maxval;
/* store location of max */
*indp = maxindex;
}
}
}
});
}
template <typename scalar_t>
static void max_pool2d_with_indices_out_frame(
scalar_t *input_data,
scalar_t *output_data,
int64_t *indices_data,
int64_t nbatch,
int64_t nInputPlane,
int64_t inputWidth,
int64_t inputHeight,
int64_t outputWidth,
int64_t outputHeight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH)
{
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
for (auto p = start; p < end; p++) {
max_pool2d_with_indices_single_out_frame(
input_data+p*nInputPlane*inputWidth*inputHeight,
output_data+p*nInputPlane*outputWidth*outputHeight,
indices_data+p*nInputPlane*outputWidth*outputHeight,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH, dW, dH,
padW, padH,
dilationW, dilationH);
}
});
}
void max_pool2d_with_indices_out_cpu_template(
Tensor& output,
Tensor& indices,
const Tensor& input_,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
@ -152,152 +43,50 @@ void max_pool2d_with_indices_out_cpu_template(
const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);
TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4),
TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
TORCH_CHECK(input.dtype() == output.dtype(),
"expected dtype ", input.dtype(), " for `output` but got dtype ", output.dtype());
/* sizes */
const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
const int64_t nInputPlane = input_.size(-3);
const int64_t inputHeight = input_.size(-2);
const int64_t inputWidth = input_.size(-1);
const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1;
const int64_t nInputPlane = input.size(-3);
const int64_t inputHeight = input.size(-2);
const int64_t inputWidth = input.size(-1);
const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, dilationH, ceil_mode);
const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, dilationW, ceil_mode);
pool2d_shape_check(
input_,
input,
kH, kW, dH, dW, padH, padW, dilationH, dilationW,
nInputPlane,
inputHeight, inputWidth,
outputHeight, outputWidth, input_.suggest_memory_format());
outputHeight, outputWidth, input.suggest_memory_format());
/* get contiguous input */
Tensor input = input_.contiguous();
/* resize output */
if (input.ndimension() == 3)
{
/* resize output and indices */
if (input.ndimension() == 3) {
output.resize_({nInputPlane, outputHeight, outputWidth});
/* indices will contain the locations for each output point */
indices.resize_({nInputPlane, outputHeight, outputWidth});
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
"max_pool2d_with_indices_cpu",
[&] {
/* get raw pointers */
scalar_t *input_data = input.data_ptr<scalar_t>();
scalar_t *output_data = output.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
max_pool2d_with_indices_single_out_frame(
input_data, output_data,
indices_data,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH, dW, dH,
padW, padH,
dilationW, dilationH);
}
);
}
else
{
output.resize_({nbatch, nInputPlane, outputHeight, outputWidth});
} else {
output.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, input.suggest_memory_format());
/* indices will contain the locations for each output point */
indices.resize_({nbatch, nInputPlane, outputHeight, outputWidth});
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
"max_pool2d_with_indices_cpu",
[&] {
scalar_t *input_data = input.data_ptr<scalar_t>();
scalar_t *output_data = output.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
max_pool2d_with_indices_out_frame(
input_data,
output_data,
indices_data,
nbatch,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH, dW, dH,
padW, padH,
dilationW, dilationH); }
);
indices.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, input.suggest_memory_format());
}
}
template <typename scalar_t>
static void max_pool2d_with_indices_backward_single_out_frame(
scalar_t *gradInput_p,
scalar_t *gradOutput_p,
int64_t *ind_p,
int64_t nInputPlane,
int64_t inputWidth,
int64_t inputHeight,
int64_t outputWidth,
int64_t outputHeight,
int dW,
int dH)
{
at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) {
for (auto k = start; k < end; k++)
{
scalar_t *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight;
scalar_t *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight;
int64_t *ind_p_k = ind_p + k*outputWidth*outputHeight;
/* calculate max points */
int64_t i, j;
for(i = 0; i < outputHeight; i++)
{
for(j = 0; j < outputWidth; j++)
{
/* retrieve position of max */
int64_t maxp = ind_p_k[i*outputWidth + j];
if (maxp != -1) {
/* update gradient */
gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j];
}
}
}
}
});
}
template <typename scalar_t>
static void max_pool2d_with_indices_backward_out_frame(
scalar_t *gradInput_data,
scalar_t *gradOutput_data,
int64_t *indices_data,
int64_t nbatch,
int64_t nInputPlane,
int64_t inputWidth,
int64_t inputHeight,
int64_t outputWidth,
int64_t outputHeight,
int dW,
int dH)
{
at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) {
for (auto p = start; p < end; p++) {
max_pool2d_with_indices_backward_single_out_frame<scalar_t>(
gradInput_data+p*nInputPlane*inputWidth*inputHeight,
gradOutput_data+p*nInputPlane*outputWidth*outputHeight,
indices_data+p*nInputPlane*outputWidth*outputHeight,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
dW, dH);
}
});
max_pool2d_kernel(
kCPU, output, indices, input,
kW, kH,
dW, dH,
padW, padH,
dilationW, dilationH);
}
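For ceil_mode=false the output size computed by pooling_output_shape reduces to floor((in + 2*pad - dilation*(k - 1) - 1) / stride) + 1; a quick check from Python, assuming torch.nn.functional.max_pool2d:
import math
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 32, 32)
kH = kW = 3; dH = dW = 2; padH = padW = 1; dilH = dilW = 1
out, idx = F.max_pool2d(x, kernel_size=(kH, kW), stride=(dH, dW),
                        padding=(padH, padW), dilation=(dilH, dilW),
                        return_indices=True)
oH = math.floor((32 + 2 * padH - dilH * (kH - 1) - 1) / dH) + 1
print(out.shape, idx.shape, oH)    # torch.Size([1, 3, 16, 16]) twice, and 16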
Tensor& max_pool2d_with_indices_backward_out_cpu_template(
Tensor& gradInput,
const Tensor& gradOutput_,
const Tensor& gradOutput,
const Tensor& input,
const Tensor& indices,
IntArrayRef kernel_size,
@ -333,11 +122,13 @@ Tensor& max_pool2d_with_indices_backward_out_cpu_template(
TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
/* get contiguous gradOutput */
const Tensor gradOutput = gradOutput_.contiguous();
TORCH_CHECK(input.dtype() == gradOutput.dtype(),
"expected dtype ", input.dtype(), " for `gradOutput` but got dtype ", gradOutput.dtype());
TORCH_CHECK(input.dtype() == gradInput.dtype(),
"expected dtype ", input.dtype(), " for `gradInput` but got dtype ", gradInput.dtype());
/* resize */
gradInput.resize_as_(input);
gradInput.resize_(input.sizes(), input.suggest_memory_format());
gradInput.zero_();
/* sizes */
@ -354,7 +145,7 @@ Tensor& max_pool2d_with_indices_backward_out_cpu_template(
max_pool2d_backward_shape_check(
input,
gradOutput_,
gradOutput,
indices,
nbatch,
kH, kW, dH, dW, padH, padW, dilationH, dilationW,
@ -363,48 +154,7 @@ Tensor& max_pool2d_with_indices_backward_out_cpu_template(
outputHeight_for_shape_check, outputWidth_for_shape_check,
input.suggest_memory_format());
/* backprop */
if (input.ndimension() == 3)
{
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
"max_pool2d_with_indices_backward",
[&] {
/* get raw pointers */
scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
max_pool2d_with_indices_backward_single_out_frame(
gradInput_data, gradOutput_data,
indices_data,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
dW, dH);
}
);
}
else
{
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
"max_pool2d_with_indices_backward",
[&] {
/* get raw pointers */
scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
max_pool2d_with_indices_backward_out_frame<scalar_t>(
gradInput_data, gradOutput_data,
indices_data,
nbatch,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
dW, dH);
}
);
}
max_pool2d_backward_kernel(kCPU, gradInput, gradOutput, indices);
return gradInput;
}
@ -461,7 +211,8 @@ std::tuple<Tensor, Tensor> max_pool2d_with_indices_cpu(
return std::tuple<Tensor, Tensor>(output, indices);
}
Tensor& max_pool2d_with_indices_backward_out_cpu(const Tensor& gradOutput_,
Tensor& max_pool2d_with_indices_backward_out_cpu(
const Tensor& gradOutput,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
@ -473,7 +224,7 @@ Tensor& max_pool2d_with_indices_backward_out_cpu(const Tensor& gradOutput_,
{
max_pool2d_with_indices_backward_out_cpu_template(
gradInput,
gradOutput_,
gradOutput,
input,
indices,
kernel_size,
@ -485,7 +236,7 @@ Tensor& max_pool2d_with_indices_backward_out_cpu(const Tensor& gradOutput_,
}
Tensor max_pool2d_with_indices_backward_cpu(
const Tensor& gradOutput_,
const Tensor& gradOutput,
const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
@ -494,10 +245,10 @@ Tensor max_pool2d_with_indices_backward_cpu(
bool ceil_mode,
const Tensor& indices)
{
auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto gradInput = at::empty({0}, input.options());
max_pool2d_with_indices_backward_out_cpu_template(
gradInput,
gradOutput_,
gradOutput,
input,
indices,
kernel_size,
@ -508,5 +259,8 @@ Tensor max_pool2d_with_indices_backward_cpu(
return gradInput;
}
DEFINE_DISPATCH(max_pool2d_kernel);
DEFINE_DISPATCH(max_pool2d_backward_kernel);
} // at::native
} // at

View File

@ -514,4 +514,20 @@ static inline void checkLinalgCompatibleDtype(const std::string& fn_name, Scalar
out_name, " with dtype ", out_type);
}
/*
Two types of 'other' tensors are supported when solving
a system of linear equations matmul(input, x) = other:
* 1-dimensional (1D) tensor or batch of 1D tensors (vector case)
* 2-dimensional (2D) tensor or batch of 2D tensors (matrix case).
The original torch.solve supported only the matrix case, while NumPy works for both cases.
For the batched input we need to be able to distinguish them.
Let input.shape = (batch_dimensions, m, n), then 'other' is of vector type if other.shape == (batch_dimensions, m).
This rule is compatible with NumPy, see https://github.com/numpy/numpy/blob/v1.20.0/numpy/linalg/linalg.py#L384-L389
*/
static inline bool linalg_solve_is_vector_rhs(const Tensor& input, const Tensor& other) {
auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim() - 1); // input.shape[:-1]
bool vector_case = other.dim() == 1 || (input.dim() - 1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
return vector_case;
}
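A concrete illustration of this rule, assuming torch.linalg.solve: with input of shape (2, 3, 3), a right-hand side of shape (2, 3) is the vector case and (2, 3, 1) is the matrix case, mirroring NumPy.
import torch

a = torch.randn(2, 3, 3, dtype=torch.float64)
b_vec = torch.randn(2, 3, dtype=torch.float64)        # other.shape == input.shape[:-1] -> vector case
b_mat = torch.randn(2, 3, 1, dtype=torch.float64)     # batch of 3x1 matrices -> matrix case
print(torch.linalg.solve(a, b_vec).shape)             # torch.Size([2, 3])
print(torch.linalg.solve(a, b_mat).shape)             # torch.Size([2, 3, 1])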
}} // namespace at::native

View File

@ -1,14 +1,20 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/NativeFunctions.h>
#include <ATen/div_rtn.h>
#include <tuple>
#include <ATen/native/DispatchStub.h>
#pragma once
namespace at {
namespace native {
using max_pool2d_fn = void(*)(Tensor& output, Tensor& indices, const Tensor& input,
int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH);
using max_pool2d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output, const Tensor& indices);
DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel);
DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel);
namespace {
template <typename dest_t, typename src_t>

View File

@ -0,0 +1,359 @@
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/cpu/vec256/vec256.h>
#include <ATen/native/Pool.h>
#include <ATen/native/cpu/utils.h>
namespace at { namespace native {
namespace {
template <typename scalar_t>
void cpu_max_pool(
Tensor& output_,
Tensor indices_,
const Tensor& input_,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH) {
auto input = input_.contiguous();
auto output = output_.contiguous();
auto indices = indices_.contiguous();
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
int64_t numel = output.numel();
int64_t ndim = input.ndimension();
// treat batch size and channels as one dimension
int64_t channels = ndim == 3 ? input.size(0) : input.size(0) * input.size(1);
int64_t input_height = input.size(-2);
int64_t input_width = input.size(-1);
int64_t output_height = output.size(-2);
int64_t output_width = output.size(-1);
// parallel on dim N, C, H, W
at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
int64_t c = 0;
int64_t oh = 0;
int64_t ow = 0;
data_index_init(begin, c, channels, oh, output_height, ow, output_width);
for (int64_t i = begin; i < end; i++) {
int64_t ih0 = oh * dH - padH;
int64_t iw0 = ow * dW - padW;
int64_t ih1 = std::min(ih0 + (kH - 1) * dilationH + 1, input_height);
int64_t iw1 = std::min(iw0 + (kW - 1) * dilationW + 1, input_width);
while(ih0 < 0) { ih0 += dilationH; }
while(iw0 < 0) { iw0 += dilationW; }
// local pointers
scalar_t* input_ptr = input_data + c * input_height * input_width;
// compute local max
int64_t maxindex = ih0 * input_width + iw0;
scalar_t maxval = -std::numeric_limits<scalar_t>::infinity();
for (int64_t ih = ih0; ih < ih1; ih += dilationH) {
for (int64_t iw = iw0; iw < iw1; iw += dilationW) {
int64_t index = ih * input_width + iw;
scalar_t val = input_ptr[index];
if ((val > maxval) || std::isnan(val)) {
maxval = val;
maxindex = index;
}
}
}
// set output to local max and store location of max
output_data[i] = maxval;
indices_data[i] = maxindex;
// move on to next output index
data_index_step(c, channels, oh, output_height, ow, output_width);
}
});
if (!output_.is_contiguous()) {
output_.copy_(output);
}
if (!indices_.is_contiguous()) {
indices_.copy_(indices);
}
}
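The flat output index i above is decomposed into (c, oh, ow) and advanced like a nested counter; a hedged Python sketch that only mirrors the arithmetic of data_index_init/data_index_step (the real helpers live in ATen/native/cpu/utils.h and take the counters by reference):
def flat_index_to_counters(begin, *dims):
    # dims = (channels, output_height, output_width); mimics what data_index_init computes
    counters = []
    for d in reversed(dims):
        counters.append(begin % d)
        begin //= d
    return list(reversed(counters))

print(flat_index_to_counters(5, 2, 3, 4))   # flat index 5 in a (2, 3, 4) output -> [0, 1, 1]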
template <typename scalar_t>
void cpu_max_pool_channels_last(
Tensor& output_,
Tensor indices_,
const Tensor& input_,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH) {
TORCH_CHECK(input_.ndimension() == 4,
"max pooling with channels last format supports tensors with 4 dims");
auto memory_format = at::MemoryFormat::ChannelsLast;
auto input = input_.contiguous(memory_format);
auto output = output_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);
auto input_data = input.data_ptr<scalar_t>();
auto output_data = output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
int64_t nbatch = input.size(0);
int64_t channels = input.size(1);
int64_t input_height = input.size(2);
int64_t input_width = input.size(3);
int64_t output_height = output.size(2);
int64_t output_width = output.size(3);
using Vec = vec256::Vec256<scalar_t>;
using integer_t = vec256::int_same_size_t<scalar_t>;
using iVec = vec256::Vec256<integer_t>;
// for the convenience of vectorization, use an integer type of the same size as scalar_t,
// e.g. int32_t for float, int64_t for double;
// need to make sure the flattened index doesn't overflow
TORCH_CHECK(input_height <= std::ceil((double)std::numeric_limits<integer_t>::max() / (double)input_width));
// parallel on dim N, H, W
at::parallel_for(0, nbatch * output_height * output_width, 0, [&](int64_t begin, int64_t end) {
int64_t n = 0;
int64_t oh = 0;
int64_t ow = 0;
data_index_init(begin, n, nbatch, oh, output_height, ow, output_width);
int64_t size = channels;
int64_t len = size - (size % Vec::size());
// temp buffer holding index with integer_t
std::unique_ptr<integer_t []> index_buffer(new integer_t[len]);
for (int64_t i = begin; i < end; i++) {
int64_t ih0 = oh * dH - padH;
int64_t iw0 = ow * dW - padW;
int64_t ih1 = std::min(ih0 + (kH - 1) * dilationH + 1, input_height);
int64_t iw1 = std::min(iw0 + (kW - 1) * dilationW + 1, input_width);
while(ih0 < 0) { ih0 += dilationH; }
while(iw0 < 0) { iw0 += dilationW; }
scalar_t* out = output_data + i * channels;
int64_t* ind = indices_data + i * channels;
// Pass I: init out lane
iVec index0_vec = iVec(ih0 * input_width + iw0);
Vec out_vec = Vec(-std::numeric_limits<scalar_t>::infinity());
int64_t d1 = 0;
for (; d1 < len; d1 += Vec::size()) {
index0_vec.store(index_buffer.get() + d1);
out_vec.store(out + d1);
}
for (; d1 < size; d1++) {
ind[d1] = ih0 * input_width + iw0;
out[d1] = -std::numeric_limits<scalar_t>::infinity();
}
// Pass II: compute local max
for (int64_t ih = ih0; ih < ih1; ih += dilationH) {
for (int64_t iw = iw0; iw < iw1; iw += dilationW) {
scalar_t* in = input_data + n * input_height * input_width * channels +
ih * input_width * channels + iw * channels;
int64_t d2 = 0;
for (; d2 < len; d2 += Vec::size()) {
iVec index_vec = iVec(ih * input_width + iw);
Vec val_vec = Vec::loadu(in + d2);
iVec maxindex_vec = iVec::loadu(index_buffer.get() + d2);
Vec maxval_vec = Vec::loadu(out + d2);
// true = all ones, false = all zeros
Vec mask = (val_vec > maxval_vec) | val_vec.isnan();
iVec imask = vec256::cast<integer_t>(mask);
Vec out_vec = Vec::blendv(maxval_vec, val_vec, mask);
iVec ind_vec = iVec::blendv(maxindex_vec, index_vec, imask);
out_vec.store(out + d2);
ind_vec.store(index_buffer.get() + d2);
}
for (; d2 < size; d2++) {
int64_t index = ih * input_width + iw;
scalar_t val = in[d2];
int64_t maxindex = ind[d2];
scalar_t maxval = out[d2];
bool mask = (val > maxval) || std::isnan(val);
out[d2] = mask ? val : maxval;
ind[d2] = mask ? index : maxindex;
}
}
}
// convert index data type
vec256::convert<integer_t, int64_t>(index_buffer.get(), ind, len);
// move on to next output index
data_index_step(n, nbatch, oh, output_height, ow, output_width);
}
});
if (!output_.is_contiguous(memory_format)) {
output_.copy_(output);
}
if (!indices_.is_contiguous(memory_format)) {
indices_.copy_(indices);
}
}
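The channels-last path vectorizes across C but must agree elementwise with the contiguous path; a sketch of that check, assuming max_pool2d with return_indices dispatches to these kernels:
import torch
import torch.nn.functional as F

x = torch.randn(2, 8, 16, 16)
x_cl = x.contiguous(memory_format=torch.channels_last)       # NHWC strides, same logical values
out_a, idx_a = F.max_pool2d(x, 3, stride=2, padding=1, return_indices=True)
out_b, idx_b = F.max_pool2d(x_cl, 3, stride=2, padding=1, return_indices=True)
print(torch.equal(out_a, out_b), torch.equal(idx_a, idx_b))   # expected: True True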
template <typename scalar_t>
void cpu_max_pool_backward(
Tensor& grad_input_,
const Tensor& grad_output_,
const Tensor& indices_) {
auto grad_output = grad_output_.contiguous();
auto indices = indices_.contiguous();
auto grad_input = grad_input_.contiguous();
auto grad_output_data = grad_output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
auto grad_input_data = grad_input.data_ptr<scalar_t>();
int64_t ndim = grad_output.ndimension();
// treat batch size and channels as one dimension
int64_t channels = ndim == 3 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
int64_t input_height = grad_input.size(-2);
int64_t input_width = grad_input.size(-1);
int64_t output_height = grad_output.size(-2);
int64_t output_width = grad_output.size(-1);
// parallel on dim of N, C
at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) {
for (int64_t c = begin; c < end; c++) {
scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width;
scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width;
int64_t * indices_ptr = indices_data + c * output_height * output_width;
for (int64_t oh = 0; oh < output_height; oh++) {
for (int64_t ow = 0; ow < output_width; ow++) {
// retrieve position of max
int64_t index = oh * output_width + ow;
int64_t maxindex = indices_ptr[index];
if (maxindex != -1) {
// update gradient
grad_input_ptr[maxindex] += grad_output_ptr[index];
}
}
}
}
});
if (!grad_input_.is_contiguous()) {
grad_input_.copy_(grad_input);
}
}
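Backward scatters each output gradient into the input slot recorded in indices, accumulating when several outputs picked the same maximum; a hedged sketch of the same bookkeeping for a single 4x4 plane using plain tensor ops:
import torch

grad_output = torch.tensor([[1.0, 2.0], [3.0, 4.0]])   # 2x2 pooled gradient for one (n, c) slice
indices = torch.tensor([[0, 3], [8, 15]])              # flat positions of the maxima in the 4x4 input
grad_input = torch.zeros(16)
grad_input.index_add_(0, indices.flatten(), grad_output.flatten())
print(grad_input.view(4, 4))                            # gradients land only at the recorded maxima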
template <typename scalar_t>
void cpu_max_pool_backward_channels_last(
Tensor& grad_input_,
const Tensor& grad_output_,
const Tensor& indices_) {
TORCH_CHECK(grad_output_.ndimension() == 4,
"max pooling backward with channels last format supports tensors with 4 dims.");
auto memory_format = at::MemoryFormat::ChannelsLast;
auto grad_input = grad_input_.contiguous(memory_format);
auto grad_output = grad_output_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);
auto grad_input_data = grad_input.data_ptr<scalar_t>();
auto grad_output_data = grad_output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
int64_t nbatch = grad_input.size(0);
int64_t channels = grad_input.size(1);
int64_t input_height = grad_input.size(2);
int64_t input_width = grad_input.size(3);
int64_t output_height = grad_output.size(2);
int64_t output_width = grad_output.size(3);
// parallel on dim N
at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) {
for (int64_t n = begin; n < end; n++) {
scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels;
scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels;
int64_t* indices_ptr = indices_data + n * output_height * output_width * channels;
for (int64_t oh = 0; oh < output_height; oh++) {
for (int64_t ow = 0; ow < output_width; ow++) {
scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels;
int64_t* ind = indices_ptr + oh * output_width * channels + ow * channels;
// TODO: gcc vectorization
for (int64_t c = 0; c < channels; c++) {
int64_t maxindex = ind[c];
if (maxindex != -1) {
grad_input_ptr[maxindex * channels + c] += gout[c];
}
}
}
}
}
});
if (!grad_input_.is_contiguous(memory_format)) {
grad_input_.copy_(grad_input);
}
}
void max_pool2d_kernel_impl(
Tensor& output,
Tensor& indices,
const Tensor& input,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH) {
switch (input.suggest_memory_format()) {
case at::MemoryFormat::Contiguous: {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_pool2d", [&] {
cpu_max_pool<scalar_t>(output, indices, input, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
});
break;
}
case at::MemoryFormat::ChannelsLast: {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_pool2d_channels_last", [&] {
cpu_max_pool_channels_last<scalar_t>(output, indices, input, kW, kH, dW, dH, padW, padH, dilationW, dilationH);
});
break;
}
default:
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
}
void max_pool2d_backward_kernel_impl(
Tensor& grad_input,
const Tensor& grad_output,
const Tensor& indices) {
switch (grad_output.suggest_memory_format()) {
case at::MemoryFormat::Contiguous: {
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_pool2d_backward", [&] {
cpu_max_pool_backward<scalar_t>(grad_input, grad_output, indices);
});
break;
}
case at::MemoryFormat::ChannelsLast: {
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_pool2d_backward_channels_last", [&] {
cpu_max_pool_backward_channels_last<scalar_t>(grad_input, grad_output, indices);
});
break;
}
default:
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
}
} // anonymous namespace
REGISTER_DISPATCH(max_pool2d_kernel, &max_pool2d_kernel_impl);
REGISTER_DISPATCH(max_pool2d_backward_kernel, &max_pool2d_backward_kernel_impl);
}} // at::native

View File

@ -190,109 +190,6 @@ __global__ void atomicadaptivemaxgradinput(
}
}
// 4d tensor B x D x H x W
void adaptive_max_pool2d_out_cuda_template(
Tensor& output,
Tensor& indices,
const Tensor& input,
IntArrayRef output_size)
{
TensorArg output_arg{ output, "output", 1 };
TensorArg indices_arg{ indices, "indices", 2 };
TensorArg input_arg{ input, "input", 3 };
checkAllSameGPU("adaptive_max_pool2d_cuda", {output_arg, indices_arg, input_arg});
for (int64_t i = 0; i < input.ndimension(); i++) {
TORCH_CHECK(input.size(i) > 0,
"adaptive_max_pool2d_cuda(): expected input to have non-empty spatial dimensions, "
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
"empty");
}
TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
TORCH_CHECK(output_size.size() == 2,
"adaptive_max_pool2d: internal error: output_size.size() must be 2");
int64_t osizeH = output_size[0];
int64_t osizeW = output_size[1];
if (input.ndimension() == 3) {
int64_t sizeD = input.size(0);
int64_t isizeH = input.size(1);
int64_t isizeW = input.size(2);
int64_t istrideD = input.stride(0);
int64_t istrideH = input.stride(1);
int64_t istrideW = input.stride(2);
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
"adaptive_max_pool2d_cuda",
[&] {
output.resize_({sizeD, osizeH, osizeW});
indices.resize_({sizeD, osizeH, osizeW});
scalar_t *input_data = input.data_ptr<scalar_t>();
scalar_t *output_data = output.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
// cuda blocks & threads:
int blocksH = (int)(16L / sizeD);
blocksH = blocksH < 1 ? 1 : blocksH;
dim3 blocks(sizeD, blocksH);
dim3 threads(32, 8);
// run maxpool kernel
adaptivemaxpool <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>> (
input_data, output_data,
indices_data,
isizeH, isizeW, osizeH, osizeW,
istrideD, istrideH, istrideW);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
);
} else {
Tensor input_ = input.contiguous();
int64_t sizeB = input_.size(0);
int64_t sizeD = input_.size(1);
int64_t isizeH = input_.size(2);
int64_t isizeW = input_.size(3);
int64_t istrideD = input_.stride(1);
int64_t istrideH = input_.stride(2);
int64_t istrideW = input_.stride(3);
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input_.scalar_type(),
"adaptive_max_pool2d_cuda",
[&] {
output.resize_({sizeB, sizeD, osizeH, osizeW});
indices.resize_({sizeB, sizeD, osizeH, osizeW});
scalar_t *input_data = input_.data_ptr<scalar_t>();
scalar_t *output_data = output.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
// cuda blocks & threads:
int blocksH = (int)(16L / sizeD);
blocksH = blocksH < 1 ? 1 : blocksH;
dim3 blocks(sizeB*sizeD, blocksH);
dim3 threads(32, 8);
// run maxpool kernel
adaptivemaxpool <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>> (
input_data, output_data,
indices_data,
isizeH, isizeW, osizeH, osizeW,
istrideD, istrideH, istrideW);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
);
}
}
void adaptive_max_pool2d_backward_out_cuda_template(
Tensor& gradInput,
const Tensor& gradOutput_,
@ -409,31 +306,108 @@ void adaptive_max_pool2d_backward_out_cuda_template(
} // namespace
std::tuple<Tensor&, Tensor&> adaptive_max_pool2d_out_cuda(const Tensor& input,
IntArrayRef output_size,
Tensor& output,
Tensor& indices)
{
adaptive_max_pool2d_out_cuda_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor&, Tensor&>(output, indices);
}
// 4d tensor B x D x H x W
std::tuple<Tensor, Tensor> adaptive_max_pool2d_cuda(
const Tensor& input,
IntArrayRef output_size)
{
Tensor output = at::empty({0}, input.options());
Tensor indices = at::empty({0}, input.options().dtype(kLong));
adaptive_max_pool2d_out_cuda_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor, Tensor>(output, indices);
TORCH_IMPL_FUNC(adaptive_max_pool2d_out_cuda)
(const Tensor& input,
IntArrayRef output_size,
const Tensor& output,
const Tensor& indices) {
TensorArg output_arg{output, "output", 1};
TensorArg indices_arg{indices, "indices", 2};
TensorArg input_arg{input, "input", 3};
checkAllSameGPU(
"adaptive_max_pool2d_cuda", {output_arg, indices_arg, input_arg});
int64_t osizeH = output_size[0];
int64_t osizeW = output_size[1];
if (input.ndimension() == 3) {
int64_t sizeD = input.size(0);
int64_t isizeH = input.size(1);
int64_t isizeW = input.size(2);
int64_t istrideD = input.stride(0);
int64_t istrideH = input.stride(1);
int64_t istrideW = input.stride(2);
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input.scalar_type(), "adaptive_max_pool2d_cuda", [&] {
scalar_t* input_data = input.data_ptr<scalar_t>();
scalar_t* output_data = output.data_ptr<scalar_t>();
int64_t* indices_data = indices.data_ptr<int64_t>();
// cuda blocks & threads:
int blocksH = (int)(16L / sizeD);
blocksH = blocksH < 1 ? 1 : blocksH;
dim3 blocks(sizeD, blocksH);
dim3 threads(32, 8);
// run maxpool kernel
adaptivemaxpool<<<
blocks,
threads,
0,
at::cuda::getCurrentCUDAStream()>>>(
input_data,
output_data,
indices_data,
isizeH,
isizeW,
osizeH,
osizeW,
istrideD,
istrideH,
istrideW);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
} else {
Tensor input_ = input.contiguous();
int64_t sizeB = input_.size(0);
int64_t sizeD = input_.size(1);
int64_t isizeH = input_.size(2);
int64_t isizeW = input_.size(3);
int64_t istrideD = input_.stride(1);
int64_t istrideH = input_.stride(2);
int64_t istrideW = input_.stride(3);
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf,
kBFloat16,
input_.scalar_type(),
"adaptive_max_pool2d_cuda",
[&] {
scalar_t* input_data = input_.data_ptr<scalar_t>();
scalar_t* output_data = output.data_ptr<scalar_t>();
int64_t* indices_data = indices.data_ptr<int64_t>();
// cuda blocks & threads:
int blocksH = (int)(16L / sizeD);
blocksH = blocksH < 1 ? 1 : blocksH;
dim3 blocks(sizeB * sizeD, blocksH);
dim3 threads(32, 8);
// run maxpool kernel
adaptivemaxpool<<<
blocks,
threads,
0,
at::cuda::getCurrentCUDAStream()>>>(
input_data,
output_data,
indices_data,
isizeH,
isizeW,
osizeH,
osizeW,
istrideD,
istrideH,
istrideW);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
}
}
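Adaptive pooling reads, for output cell (oh, ow), input rows floor(oh * isizeH / osizeH) through ceil((oh + 1) * isizeH / osizeH) - 1, and likewise for columns; the CPU path computes the same windows, so a worked example (assuming torch.nn.functional.adaptive_max_pool2d) can be checked without a GPU:
import torch
import torch.nn.functional as F

x = torch.arange(25, dtype=torch.float32).view(1, 1, 5, 5)
out, idx = F.adaptive_max_pool2d(x, output_size=(2, 2), return_indices=True)
# each 2x2 output cell covers a 3x3 corner of the 5x5 input (rows/cols 0-2 or 2-4)
print(out)   # expected values: [[12., 14.], [22., 24.]]
print(idx)   # expected flat indices into the 5x5 plane: [[12, 14], [22, 24]]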
Tensor& adaptive_max_pool2d_backward_out_cuda(const Tensor& gradOutput_,

View File

@ -291,90 +291,6 @@ void atomicadaptivemaxgradinput_loop(
}
}
// 5d tensor B x D x T x H x W
void adaptive_max_pool3d_out_cuda_template(
Tensor& output,
Tensor& indices,
const Tensor& input_,
IntArrayRef output_size)
{
TensorArg output_arg{ output, "output", 1 };
TensorArg indices_arg{ indices, "indices", 2 };
TensorArg input_arg{ input_, "input_", 3 };
checkAllSameGPU("adaptive_max_pool3d_cuda", {output_arg, indices_arg, input_arg});
for (int64_t i = 0; i < input_.ndimension(); i++) {
TORCH_CHECK(input_.size(i) > 0,
"adaptive_max_pool3d_cuda(): expected input to have non-empty spatial dimensions, "
"but input has sizes ", input_.sizes(), " with dimension ", i, " being "
"empty");
}
TORCH_CHECK((input_.ndimension() == 4 || input_.ndimension() == 5),
"non-empty 4D or 5D (batch mode) tensor expected for input");
TORCH_CHECK(output_size.size() == 3,
"adaptive_max_pool3d: internal error: output_size.size() must be 3");
int64_t osizeT = output_size[0];
int64_t osizeH = output_size[1];
int64_t osizeW = output_size[2];
int64_t sizeD, isizeT, isizeH, isizeW;
int64_t istrideD, istrideT, istrideH, istrideW;
int64_t totalZ;
const Tensor& input = input_.ndimension() == 4 ? input_ : input_.contiguous();
if (input.ndimension() == 4) {
sizeD = input.size(0);
isizeT = input.size(1);
isizeH = input.size(2);
isizeW = input.size(3);
istrideD = input.stride(0);
istrideT = input.stride(1);
istrideH = input.stride(2);
istrideW = input.stride(3);
output.resize_({sizeD, osizeT, osizeH, osizeW});
indices.resize_({sizeD, osizeT, osizeH, osizeW});
totalZ = sizeD * osizeT;
} else {
int64_t sizeB = input.size(0);
sizeD = input.size(1);
isizeT = input.size(2);
isizeH = input.size(3);
isizeW = input.size(4);
istrideD = input.stride(1);
istrideT = input.stride(2);
istrideH = input.stride(3);
istrideW = input.stride(4);
output.resize_({sizeB, sizeD, osizeT, osizeH, osizeW});
indices.resize_({sizeB, sizeD, osizeT, osizeH, osizeW});
totalZ = sizeB * sizeD * osizeT;
}
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
"adaptive_max_pool3d_cuda",
[&] {
scalar_t *input_data = input.data_ptr<scalar_t>();
scalar_t *output_data = output.data_ptr<scalar_t>();
int64_t *indices_data = indices.data_ptr<int64_t>();
adaptivemaxpool_loop(
input_data, output_data, indices_data, totalZ, isizeT, isizeH, isizeW,
osizeT, osizeH, osizeW, istrideD, istrideT, istrideH, istrideW);
}
);
}
void adaptive_max_pool3d_backward_out_cuda_template(
Tensor& gradInput,
const Tensor& gradOutput_,
@ -460,31 +376,79 @@ void adaptive_max_pool3d_backward_out_cuda_template(
} // namespace
std::tuple<Tensor&, Tensor&> adaptive_max_pool3d_out_cuda(const Tensor& input,
IntArrayRef output_size,
Tensor& output,
Tensor& indices)
{
adaptive_max_pool3d_out_cuda_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor&, Tensor&>(output, indices);
}
// 5d tensor B x D x T x H x W
std::tuple<Tensor, Tensor> adaptive_max_pool3d_cuda(
const Tensor& input,
IntArrayRef output_size)
{
Tensor output = at::empty({0}, input.options());
Tensor indices = at::empty({0}, input.options().dtype(kLong));
adaptive_max_pool3d_out_cuda_template(
output,
indices,
input,
output_size);
return std::tuple<Tensor, Tensor>(output, indices);
TORCH_IMPL_FUNC(adaptive_max_pool3d_out_cuda)
(const Tensor& input,
IntArrayRef output_size,
const Tensor& output,
const Tensor& indices) {
TensorArg output_arg{output, "output", 1};
TensorArg indices_arg{indices, "indices", 2};
TensorArg input_arg{input, "input", 3};
checkAllSameGPU(
"adaptive_max_pool3d_cuda", {output_arg, indices_arg, input_arg});
int64_t osizeT = output_size[0];
int64_t osizeH = output_size[1];
int64_t osizeW = output_size[2];
int64_t sizeD, isizeT, isizeH, isizeW;
int64_t istrideD, istrideT, istrideH, istrideW;
int64_t totalZ;
const Tensor& input_ = input.ndimension() == 4 ? input : input.contiguous();
if (input_.ndimension() == 4) {
sizeD = input_.size(0);
isizeT = input_.size(1);
isizeH = input_.size(2);
isizeW = input_.size(3);
istrideD = input_.stride(0);
istrideT = input_.stride(1);
istrideH = input_.stride(2);
istrideW = input_.stride(3);
totalZ = sizeD * osizeT;
} else {
int64_t sizeB = input_.size(0);
sizeD = input_.size(1);
isizeT = input_.size(2);
isizeH = input_.size(3);
isizeW = input_.size(4);
istrideD = input_.stride(1);
istrideT = input_.stride(2);
istrideH = input_.stride(3);
istrideW = input_.stride(4);
totalZ = sizeB * sizeD * osizeT;
}
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input_.scalar_type(), "adaptive_max_pool3d_cuda", [&] {
scalar_t* input_data = input_.data_ptr<scalar_t>();
scalar_t* output_data = output.data_ptr<scalar_t>();
int64_t* indices_data = indices.data_ptr<int64_t>();
adaptivemaxpool_loop(
input_data,
output_data,
indices_data,
totalZ,
isizeT,
isizeH,
isizeW,
osizeT,
osizeH,
osizeW,
istrideD,
istrideT,
istrideH,
istrideW);
});
}
Tensor& adaptive_max_pool3d_backward_out_cuda(const Tensor& gradOutput_,

View File

@ -1192,7 +1192,41 @@ void magmaGels<c10::complex<double>>(
reinterpret_cast<magmaDoubleComplex*>(hwork), lwork, info);
AT_CUDA_CHECK(cudaGetLastError());
}
#endif
namespace {
/*
MAGMA can return errors both as a return value and in the info argument.
The return value and info should always be identical.
In general, the meaning is as given in this table.
Predefined error codes are large negative numbers. Using the symbolic
constants below is preferred, but the numeric values can be found in
include/magma_types.h.
Info | Description
----------- | -----------
info = 0 (MAGMA_SUCCESS) | Successful exit
info < 0, but small | For info = -i, the i-th argument had an illegal value
info > 0 | Function-specific error such as singular matrix
MAGMA_ERR_DEVICE_ALLOC | Could not allocate GPU device memory
MAGMA_ERR_HOST_ALLOC | Could not allocate CPU host memory
MAGMA_ERR_ILLEGAL_VALUE | An argument had an illegal value (deprecated; instead it should return -i to say the i-th argument was bad)
MAGMA_ERR_INVALID_PTR | Can't free pointer
MAGMA_ERR_NOT_IMPLEMENTED | Function or option not implemented
MAGMA_ERR_NOT_SUPPORTED | Function or option not supported on the current architecture
*/
void checkMagmaInternalError(magma_int_t info, const std::string& magma_function_name) {
// if info > 0 the error is function-specific, do nothing in this case
TORCH_CHECK(info >= 0,
"MAGMA error: ",
magma_strerror(info),
", info = ", info,
", when calling ", magma_function_name);
}
} // anonymous namespace
#endif // USE_MAGMA
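A minimal sketch of how the helper above is meant to be consumed, following the Info table in the comment (the wrapper function below and the literal info values are illustrative only; real callers such as apply_qr later in this file pass the info written by a MAGMA routine):
#ifdef USE_MAGMA
// Illustrative only: exercises the three cases from the Info table above.
void checkMagmaInternalErrorSketch() {
  checkMagmaInternalError(0, "geqrf");     // info == 0 (MAGMA_SUCCESS): check passes
  checkMagmaInternalError(5, "geqrf");     // info > 0: function-specific condition (e.g. singular matrix); passes, caller decides how to handle it
  // checkMagmaInternalError(-2, "geqrf"); // info < 0: the 2nd argument was illegal -> TORCH_CHECK throws with magma_strerror(info)
}
#endif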
#define ALLOCATE_ARRAY(name, type, size) \
auto storage_##name = pin_memory<type>(size); \
@ -1968,7 +2002,7 @@ Tensor& orgqr_kernel_impl(Tensor& result, const Tensor& tau, int64_t n_columns)
template <typename scalar_t>
static void apply_qr(Tensor& Q, Tensor& R, int64_t q_size_minus_2, int64_t r_size_minus_1, int64_t n_columns,
bool compute_q, std::vector<int64_t>& infos) {
bool compute_q) {
#ifndef USE_MAGMA
AT_ERROR("qr: MAGMA library not found in "
"compilation. Please rebuild with MAGMA.");
@ -1996,10 +2030,7 @@ AT_ERROR("qr: MAGMA library not found in "
for (int64_t i = 0; i < batch_size; i++) {
scalar_t* r_working_ptr = &r_data[i * r_matrix_stride];
magmaGeqrf<scalar_t>(m, n, r_working_ptr, m, tau_data, work_data, &info, /*is_v2=*/true);
infos[i] = info;
if (info != 0) {
return;
}
checkMagmaInternalError(info, "geqrf");
}
if (!compute_q) {
// this is for mode='r'
@ -2017,15 +2048,10 @@ AT_ERROR("qr: MAGMA library not found in "
for (int64_t i = 0; i < batch_size; i++) {
scalar_t* q_working_ptr = &q_data[i * q_matrix_stride];
magmaGeqrf<scalar_t>(m, n, q_working_ptr, m, tau_data, work_data, &info, /*is_v2=*/false);
infos[i] = info;
if (info != 0) {
return;
}
checkMagmaInternalError(info, "geqrf");
magmaOrgqr<scalar_t>(m, n_columns, k, q_working_ptr, m, tau_data, work_data, nb, &info);
infos[i] = info;
if (info != 0) {
return;
}
checkMagmaInternalError(info, "orgqr");
}
#endif
}
@ -2033,7 +2059,6 @@ AT_ERROR("qr: MAGMA library not found in "
std::tuple<Tensor,Tensor> _linalg_qr_helper_cuda(const Tensor& self, std::string mode) {
bool compute_q, reduced;
std::tie(compute_q, reduced) = _parse_qr_mode(mode);
std::vector<int64_t> infos(batchCount(self), 0);
// Setup input geometry and inputs for apply_qr
std::vector<int64_t> q_sizes, q_strides;
@ -2066,13 +2091,8 @@ std::tuple<Tensor,Tensor> _linalg_qr_helper_cuda(const Tensor& self, std::string
int64_t n = r_working_copy.size(-1);
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "qr_cuda", [&]{
apply_qr<scalar_t>(q_working_copy, r_working_copy, m, n, n_columns_q, compute_q, infos);
apply_qr<scalar_t>(q_working_copy, r_working_copy, m, n, n_columns_q, compute_q);
});
if (self.dim() > 2) {
batchCheckErrors(infos, "qr_cuda");
} else {
singleCheckErrors(infos[0], "qr_cuda");
}
if (compute_q) {
q_working_copy = q_working_copy.narrow(-1, 0, n_columns_q);
@ -2647,6 +2667,11 @@ TORCH_CHECK(false, "torch.linalg.lstsq: MAGMA library not found in "
auto trans = MagmaNoTrans;
auto m = magma_int_cast(a.size(-2), "m");
auto n = magma_int_cast(a.size(-1), "n");
TORCH_CHECK(
m >= n,
"torch.linalg.lstsq: only overdetermined systems (input.size(-2) >= input.size(-1)) are allowed on CUDA");
auto nrhs = magma_int_cast(b.size(-1), "nrhs");
auto ldda = std::max<magma_int_t>(1, m);
auto lddb = std::max<magma_int_t>(1, std::max(m, n));

View File

@ -669,7 +669,7 @@ cunn_SoftMaxBackward(scalar_t *gradInput, outscalar_t *output, outscalar_t *grad
const int grad_output_shift = ((uint64_t)gradOutput) % ALIGN_BYTES / sizeof(outscalar_t);
accscalar_t threadSum = ilpReduce<AddFloat, ILP, outscalar_t, accscalar_t>(
shift, gradOutput, classes, AddFloat<outscalar_t, accscalar_t>(), accscalar_t(0));
grad_output_shift, gradOutput, classes, AddFloat<outscalar_t, accscalar_t>(), accscalar_t(0));
accscalar_t sum_k = blockReduce<Add, accscalar_t>(
sdata, threadSum, Add<accscalar_t>(), accscalar_t(0));

View File

@ -99,7 +99,7 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
indices.resize_as_(self);
indices.zero_();
}
return {values, indices};
return std::forward_as_tuple(values, indices);
}
Tensor self_;
@ -153,7 +153,7 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
}
if (numel == 0) {
return {values, indices};
return std::forward_as_tuple(values, indices);
}
int64_t numel_or_intmax = std::min(numel, static_cast<int64_t>(std::numeric_limits<int>::max()));
@ -206,7 +206,7 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
if (indices_tmp.defined()) {
indices.copy_(indices_tmp);
}
return {values, indices};
return std::forward_as_tuple(values, indices);
}
std::tuple<Tensor &,Tensor &> sort_out_cuda(const Tensor & self, int64_t dim, bool descending, Tensor & values, Tensor & indices) {

View File

@ -590,7 +590,7 @@ kernel void reshape(texture2d_array<half, access::read> in_arr[[texture(0), func
// we compute the "linear index" of the output element,
// and convert it to the equivalent "linear index" of the input element.
ushort offset = 4 * s2 + idx;
ushort linear_idx = n2 * C2 * H2 * W2 + offset * H2 * W2 + gid.y * W2 + gid.x;
int64_t linear_idx = n2 * C2 * H2 * W2 + offset * H2 * W2 + gid.y * W2 + gid.x;
if(linear_idx >= numel1){
value[idx] = 0;
continue;
@ -615,6 +615,98 @@ kernel void reshape(texture2d_array<half, access::read> in_arr[[texture(0), func
}
}
constant bool transpose_in_is_arr = (ushort_arg_3 > 1 || ushort_arg_4 > 4);
constant bool transpose_in_is_tex = !transpose_in_is_arr;
constant bool transpose_out_is_arr = (ushort_arg_5 > 1 || ushort_arg_6 > 4);
constant bool transpose_out_is_tex = !transpose_out_is_arr;
kernel void transpose(texture2d_array<half, access::read>in_arr[[texture(0),function_constant(transpose_in_is_arr)]],
texture2d<half, access::read> in_tex[[texture(0), function_constant(transpose_in_is_tex)]],
texture2d_array<half, access::write>out_arr[[texture(1),function_constant(transpose_out_is_arr)]],
texture2d<half, access::write> out_tex[[texture(1), function_constant(transpose_out_is_tex)]],
constant ushort* inSizeBuffer [[buffer(0)]],
constant ushort* outSizeBuffer [[buffer(1)]],
device ushort* indexBuffer [[buffer(2)]],
ushort3 gid[[thread_position_in_grid]]) {
const ushort dim0 = ushort_arg_0;
const ushort dim1 = ushort_arg_1;
const ushort dim = ushort_arg_2;
const ushort N1 = ushort_arg_3;
const ushort C1 = ushort_arg_4;
const ushort N2 = ushort_arg_5;
const ushort C2 = ushort_arg_6;
ushort W1,W2,H1,H2;
if(transpose_in_is_arr) {
W1 = in_arr.get_width();
H1 = in_arr.get_height();
} else {
W1 = in_tex.get_width();
H1 = in_tex.get_height();
}
if(transpose_out_is_arr) {
W2 = out_arr.get_width();
H2 = out_arr.get_height();
} else {
W2 = out_tex.get_width();
H2 = out_tex.get_height();
}
if (gid.x >= W2 || gid.y >= H2) {
return;
}
const int numel = H2 * W2 * C2 * N2;
const ushort slices2 = divRoundUp(C2, 4);
const ushort slices1 = divRoundUp(C1, 4);
const ushort n2 = gid.z / slices2;
const ushort s2 = gid.z - n2 * slices2;
half4 value;
for (int idx = 0; idx < 4; ++idx){
ushort offset = 4 * s2 + idx;
int64_t linear_idx2 = n2 * C2 * H2 * W2 + offset * H2 * W2 + gid.y * W2 + gid.x;
if(linear_idx2 >= numel) {
value[idx] = 0;
continue;
}
ushort d2 = 0;
for(int j = dim-1; j>=0; --j){
d2 = outSizeBuffer[j];
indexBuffer[j] = linear_idx2 % d2;
linear_idx2 /= d2;
}
// swap dims
ushort tmp = indexBuffer[dim0];
indexBuffer[dim0] = indexBuffer[dim1];
indexBuffer[dim1] = tmp;
int64_t linear_idx1 = 0;
ushort m = 1;
ushort d1 = 0;
for(int k = dim-1; k>=0; --k) {
d1 = indexBuffer[k];
linear_idx1 += d1 * m;
m *= inSizeBuffer[k];
}
auto x1 = linear_idx1 % W1;
auto y1 = ((int)(linear_idx1/W1)) % H1;
auto c1 = ((int)(linear_idx1/W1/H1) % C1);
auto n1 = ((int)(linear_idx1/W1/H1/C1) % N1);
auto z1 = (int)c1 / 4 + n1 * slices1;
auto pos = c1 % 4;
if(transpose_in_is_arr) {
value[idx] = in_arr.read(ushort2(x1, y1), z1)[pos];
} else {
value[idx] = in_tex.read(ushort2(x1, y1))[pos];
}
}
if(transpose_out_is_arr) {
out_arr.write(value, gid.xy, gid.z);
} else {
out_tex.write(value, gid.xy);
}
}
)PT_METAL_SHADERS";
#endif /* MPSCNNShaders_h */
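The custom transpose shader above does its bookkeeping with plain row-major index arithmetic: an output "linear index" is decomposed into per-dimension coordinates using the output sizes, the coordinates at dim0/dim1 are swapped back, and the result is re-linearized against the input sizes. The host-side C++ sketch below reproduces the same mapping for reference (the function name and signature are illustrative, not part of this commit).
#include <cstdint>
#include <utility>
#include <vector>
// Maps a linear index of the transposed (output) tensor to the linear index of
// the corresponding element in the original input tensor, mirroring the loop
// structure of the "transpose" Metal kernel above.
int64_t transposedSourceIndex(
    int64_t linear_idx2,                   // linear index into the transposed tensor
    const std::vector<int64_t>& outSizes,  // output sizes = input sizes with dim0/dim1 swapped
    const std::vector<int64_t>& inSizes,   // original input sizes
    int dim0,
    int dim1) {
  const int dim = static_cast<int>(outSizes.size());
  std::vector<int64_t> index(dim, 0);
  for (int j = dim - 1; j >= 0; --j) {     // linear index -> per-dim coordinates
    index[j] = linear_idx2 % outSizes[j];
    linear_idx2 /= outSizes[j];
  }
  std::swap(index[dim0], index[dim1]);     // undo the dimension swap
  int64_t linear_idx1 = 0;
  int64_t stride = 1;
  for (int k = dim - 1; k >= 0; --k) {     // coordinates -> linear index in the input
    linear_idx1 += index[k] * stride;
    stride *= inSizes[k];
  }
  return linear_idx1;
}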

View File

@ -1,15 +1,22 @@
#include <ATen/Tensor.h>
#include <ATen/native/metal/mpscnn/MPSCNNContext.h>
#include <ATen/native/metal/MetalCommandBuffer.h>
#include <ATen/native/metal/MetalTensorImpl.h>
#include <ATen/native/metal/MetalTensorImplStorage.h>
#include <vector>
#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
typedef float16_t fp16_t;
#else
typedef uint16_t fp16_t;
#endif
namespace at {
namespace native {
namespace metal {
std::vector<uint16_t> Fp32ToFp16(const std::vector<float>& src);
std::vector<float> Fp16ToFp32(const std::vector<uint16_t>& src);
std::vector<fp16_t> Fp32ToFp16(const std::vector<float>& src);
std::vector<float> Fp16ToFp32(const std::vector<fp16_t>& src);
std::vector<float> NCHWToNC4(
const float* src,
@ -67,6 +74,15 @@ static inline MetalCommandBuffer* getCommandBufferFromTensor(
return cmdBuffer;
}
template<typename T>
id<MTLBuffer>makeMTLBuffer(const std::vector<T>& src) {
id<MTLBuffer> buffer = [[MPSCNNContext sharedInstance].device
newBufferWithLength:src.size() * sizeof(T)
options:MTLResourceOptionCPUCacheModeWriteCombined];
memcpy(buffer.contents, src.data(), src.size() * sizeof(T));
return buffer;
}
} // namespace metal
} // namespace native
} // namespace at

View File

@ -1,28 +1,27 @@
#import <ATen/native/metal/MetalUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
#import <Accelerate/Accelerate.h>
namespace at {
namespace native {
namespace metal {
std::vector<uint16_t> Fp32ToFp16(const std::vector<float>& src) {
unsigned long count = src.size();
std::vector<uint16_t> output(count, 0);
vImage_Buffer float32{(void*)src.data(), 1, count, count * sizeof(float)};
vImage_Buffer float16{
(void*)output.data(), 1, count, count * sizeof(uint16_t)};
if (vImageConvert_PlanarFtoPlanar16F(&float32, &float16, 0) !=
kvImageNoError) {
TORCH_CHECK(false);
}
std::vector<fp16_t> Fp32ToFp16(const std::vector<float>& src) {
unsigned long count = src.size();
std::vector<fp16_t> output(count, 0);
vImage_Buffer float32{(void*)src.data(), 1, count, count * sizeof(float)};
vImage_Buffer float16{(void*)output.data(), 1, count, count * sizeof(fp16_t)};
if (vImageConvert_PlanarFtoPlanar16F(&float32, &float16, 0) !=
kvImageNoError) {
TORCH_CHECK(false);
}
return output;
}
std::vector<float> Fp16ToFp32(const std::vector<uint16_t>& src) {
std::vector<float> Fp16ToFp32(const std::vector<fp16_t>& src) {
unsigned long count = src.size();
std::vector<float> output(count, 0);
vImage_Buffer float16{(void*)src.data(), 1, count, count * sizeof(uint16_t)};
vImage_Buffer float16{(void*)src.data(), 1, count, count * sizeof(fp16_t)};
vImage_Buffer float32{(void*)output.data(), 1, count, count * sizeof(float)};
if (vImageConvert_Planar16FtoPlanarF(&float16, &float32, 0) !=
kvImageNoError) {

View File

@ -1,3 +1,4 @@
#import <ATen/native/metal/MetalUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNClampOp.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
@ -35,9 +36,9 @@
[encoder setTexture:[_X texture] atIndex:0];
[encoder setTexture:[_Y texture] atIndex:1];
id<MTLBuffer> clampBuffer = [[MPSCNNContext sharedInstance].device
newBufferWithLength:2 * sizeof(fp16)
newBufferWithLength:2 * sizeof(fp16_t)
options:MTLResourceOptionCPUCacheModeWriteCombined];
fp16* clampBufferPtr = (fp16*)[clampBuffer contents];
fp16_t* clampBufferPtr = (fp16_t*)[clampBuffer contents];
clampBufferPtr[0] = _min.floatValue;
clampBufferPtr[1] = _max.floatValue;
[encoder setBuffer:clampBuffer offset:0 atIndex:0];

View File

@ -1,12 +1,6 @@
#import <Metal/Metal.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
typedef float16_t fp16;
#else
typedef uint16_t fp16;
#endif
@protocol MPSCNNOp<NSObject>
@property(nonatomic, strong) MPSCNNKernel* kernel;

View File

@ -1,6 +1,7 @@
#import <ATen/Tensor.h>
#import <ATen/native/metal/MetalCommandBuffer.h>
#import <ATen/native/metal/MetalTensorImpl.h>
#import <ATen/native/metal/MetalUtils.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
@ -10,7 +11,7 @@ namespace metal {
MPSImage* createStaticImage(const std::vector<int64_t>& sizes);
MPSImage* createStaticImage(
const uint16_t* src,
const fp16_t* src,
const std::vector<int64_t>& sizes);
MPSImage* createStaticImage(
const float* src,
@ -35,7 +36,7 @@ MPSTemporaryImage* createTemporaryImage(
void copyToHost(float* dst, MPSImage* image);
std::vector<uint16_t> staticImageToFp16Array(MPSImage* image);
std::vector<fp16_t> staticImageToFp16Array(MPSImage* image);
at::Tensor staticImageToTensor(MPSImage* image);
static inline MPSImage* imageFromTensor(const Tensor& tensor) {

View File

@ -25,7 +25,7 @@ MPSImage* createStaticImage(const std::vector<int64_t>& sizes) {
}
MPSImage* createStaticImage(
const uint16_t* src,
const fp16_t* src,
const std::vector<int64_t>& sizes) {
int64_t N = sizes[0];
int64_t C = sizes[1];
@ -45,7 +45,7 @@ MPSImage* createStaticImage(
int64_t slices = (C + 3) / 4 * N;
int64_t numComponents = image.featureChannels < 3 ? image.featureChannels : 4;
int64_t bytesPerRow = W * numComponents * sizeof(uint16_t);
int64_t bytesPerRow = W * numComponents * sizeof(fp16_t);
uint8_t* ptr = (uint8_t*)src;
for (int i = 0; i < slices; ++i) {
[image.texture replaceRegion:MTLRegionMake2D(0, 0, W, H)
@ -256,7 +256,7 @@ void copyToHost(float* dst, MPSImage* image) {
memcpy(dst, buffer.contents, buffer.length);
}
std::vector<uint16_t> staticImageToFp16Array(MPSImage* image) {
std::vector<fp16_t> staticImageToFp16Array(MPSImage* image) {
if (image.pixelFormat == MTLPixelFormatR16Float ||
image.pixelFormat == MTLPixelFormatRG16Float ||
image.pixelFormat == MTLPixelFormatRGBA16Float) {
@ -265,8 +265,8 @@ std::vector<uint16_t> staticImageToFp16Array(MPSImage* image) {
int64_t numComponents =
image.featureChannels < 3 ? image.featureChannels : 4;
int64_t count = image.width * image.height * image.numberOfImages * C;
std::vector<uint16_t> output(count, 0);
int64_t bytesPerRow = image.width * numComponents * sizeof(uint16_t);
std::vector<fp16_t> output(count, 0);
int64_t bytesPerRow = image.width * numComponents * sizeof(fp16_t);
uint8_t* buffer = (uint8_t*)output.data();
for (int i = 0; i < slices * image.numberOfImages; ++i) {
[image.texture getBytes:buffer
@ -285,8 +285,8 @@ std::vector<uint16_t> staticImageToFp16Array(MPSImage* image) {
at::Tensor staticImageToTensor(MPSImage* image) {
auto outputSize = [image sizes];
std::vector<uint16_t> fp16 = staticImageToFp16Array(image);
auto fp32 = metal::Fp16ToFp32(fp16);
std::vector<fp16_t> fp16Array = staticImageToFp16Array(image);
auto fp32 = metal::Fp16ToFp32(fp16Array);
std::vector<float> fp32_nchw = metal::NC4ToNCHW(fp32.data(), outputSize);
auto tensor = at::empty(outputSize);
int64_t size_bytes = c10::multiply_integers(outputSize) * sizeof(float);

View File

@ -24,6 +24,9 @@ bool test_div();
bool test_div_broadcast();
bool test_div_broadcast2();
bool test_t();
bool test_transpose();
bool test_transpose2();
bool test_transpose3();
bool test_view();
bool test_view2();
bool test_view3();
@ -43,5 +46,8 @@ bool test_upsampling_nearest2d_vec();
bool test_adaptive_avg_pool2d();
bool test_hardtanh_();
bool test_reshape();
bool test_mean_dim();
bool test_mean_dim2();
bool test_mean_dim3();
#endif

View File

@ -4,7 +4,6 @@
#import <ATen/native/metal/mpscnn/MPSImageUtils.h>
#import <ATen/native/metal/mpscnn/tests/MPSCNNTests.h>
#import <ATen/native/metal/ops/MetalConvolution.h>
#import <ATen/native/metal/ops/MetalTranspose.h>
#import <Foundation/Foundation.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
@ -490,7 +489,7 @@ bool test_t() {
auto X1 = at::rand({H, W}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::t(X1).contiguous();
auto X2 = X1.metal();
auto Y2 = at::native::metal::t(X2).cpu();
auto Y2 = at::t(X2).cpu();
return almostEqual(Y1, Y2);
});
if (!b) {
@ -500,6 +499,39 @@ bool test_t() {
return result;
}
bool test_transpose() {
__block std::vector<int64_t> size {1, 2, 2, 5};
return TEST(size, __PRETTY_FUNCTION__, ^bool{
auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::transpose(X1, 1, 3).contiguous();
auto X2 = X1.metal();
auto Y2 = at::transpose(X2, 1, 3).cpu();
return almostEqual(Y1, Y2);
});
}
bool test_transpose2() {
__block std::vector<int64_t> size {1, 2, 58, 28, 28};
return TEST(size, __PRETTY_FUNCTION__, ^bool{
auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::transpose(X1, 1, 2).contiguous();
auto X2 = X1.metal();
auto Y2 = at::transpose(X2, 1, 2).cpu();
return almostEqual(Y1, Y2);
});
}
bool test_transpose3() {
__block std::vector<int64_t> size {4, 5, 6};
return TEST(size, __PRETTY_FUNCTION__, ^bool{
auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::transpose(X1, 2, 0).contiguous();
auto X2 = X1.metal();
auto Y2 = at::transpose(X2, 2, 0).cpu();
return almostEqual(Y1, Y2);
});
}
bool test_view() {
// array -> array
__block std::vector<int64_t> size{1, 10, 2, 2};
@ -775,3 +807,38 @@ bool test_hardtanh_() {
return true;
#endif
}
bool test_mean_dim() {
__block std::vector<int64_t> size{1, 5, 2, 2};
return TEST(size, __PRETTY_FUNCTION__, ^bool {
auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::mean(X1, {2,3}, true);
auto X2 = X1.metal();
auto Y2 = at::mean(X2, {2,3}, true).cpu();
return almostEqual(Y1, Y2);
});
}
bool test_mean_dim2() {
__block std::vector<int64_t> size{1, 5, 2, 2};
return TEST(size, __PRETTY_FUNCTION__, ^bool {
auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::mean(X1, {1,3}, false);
auto X2 = X1.metal();
auto Y2 = at::mean(X2, {1,3}, false).cpu();
return almostEqual(Y1, Y2);
});
}
bool test_mean_dim3() {
__block std::vector<int64_t> size{1, 5, 2, 2};
return TEST(size, __PRETTY_FUNCTION__, ^bool {
auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::mean(X1, {0,1,2,3});
PRINT_TENSOR("Y1", Y1);
auto X2 = X1.metal();
auto Y2 = at::mean(X2, {0,1,2,3}).cpu();
PRINT_TENSOR("Y2", Y2);
return almostEqual(Y1, Y2);
});
}

View File

@ -0,0 +1,84 @@
#include <ATen/Tensor.h>
#import <ATen/native/metal/MetalCommandBuffer.h>
#import <ATen/native/metal/MetalTensorImpl.h>
#import <ATen/native/metal/MetalTensorImplStorage.h>
#import <ATen/native/metal/MetalUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
#import <ATen/native/metal/mpscnn/MPSImage+Tensor.h>
#import <ATen/native/metal/mpscnn/MPSImageUtils.h>
#include <ATen/ATen.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <torch/library.h>
namespace at {
namespace native {
namespace metal {
API_AVAILABLE(ios(11.3), macos(10.13))
static inline MPSNNReduceUnary* kernelForReducedDim(int dim) {
id<MTLDevice> device = [MPSCNNContext sharedInstance].device;
if (dim == 3) {
return [[MPSNNReduceRowMean alloc] initWithDevice:device];
} else if (dim == 2) {
return [[MPSNNReduceColumnMean alloc] initWithDevice:device];
} else if (dim == 1) {
return [[MPSNNReduceFeatureChannelsMean alloc] initWithDevice:device];
}
return nil;
}
Tensor wrapper_mean_dim(
const Tensor& input,
IntArrayRef dims,
bool keepdim,
c10::optional<ScalarType> dtype) {
if (@available(iOS 11.3, *)) {
MPSImage* X = imageFromTensor(input);
auto textureSize = input.sizes().vec();
TORCH_CHECK(textureSize.size() == 4);
// TODO: [T87340633] Support reducing the batch dimension
TORCH_CHECK(textureSize[0] == 1);
auto mask = make_dim_mask(dims, input.dim());
MetalCommandBuffer* commandBuffer = getCommandBufferFromTensor(input);
MPSImage* Y = nil;
for (int dim : dims) {
textureSize[dim] = 1;
MPSNNReduceUnary* kernel = kernelForReducedDim(dim);
if (kernel) {
Y = createTemporaryImage(commandBuffer, textureSize);
[kernel encodeToCommandBuffer:commandBuffer.buffer
sourceImage:X
destinationImage:Y];
X = Y;
}
}
MetalTensorImplStorage mt{textureSize};
mt.texture()->setCommandBuffer(commandBuffer);
mt.texture()->copyFromTexture(Y);
auto shape = DimVector(input.sizes());
for (int dim = shape.size() - 1; dim >= 0; dim--) {
if (mask[dim]) {
if (keepdim) {
shape[dim] = 1;
} else {
shape.erase(shape.begin() + dim);
}
}
}
auto output = makeTensor(std::move(mt), input.options()).view(shape);
return output;
} else {
// TODO: [T87350528] Fallback to shader kernels for 10.0 users
TORCH_CHECK(
false, "MPSNNReduceUnary is only available on iOS 11.3 and above");
}
}
TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl("mean.dim", TORCH_FN(wrapper_mean_dim));
};
}
}
}
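A minimal usage sketch, mirroring the test_mean_dim cases added earlier in this commit (tensor sizes and variable names are illustrative): the mean.dim registration above lets at::mean run directly on a 4-D Metal tensor with N == 1, reducing H/W via MPSNNReduceRowMean/MPSNNReduceColumnMean and C via MPSNNReduceFeatureChannelsMean.
auto X1 = at::rand({1, 5, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::mean(X1, {2, 3}, /*keepdim=*/true);                 // CPU reference
auto Y2 = at::mean(X1.metal(), {2, 3}, /*keepdim=*/true).cpu();   // dispatches to wrapper_mean_dim above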

View File

@ -1,17 +0,0 @@
#ifndef MetalCopy_h
#define MetalCopy_h
#include <ATen/Tensor.h>
namespace at {
namespace native {
namespace metal {
// TODO: Remove the header once we are able to call it through dispatcher
Tensor t(const Tensor& input);
} // namespace metal
} // namespace native
} // namespace at
#endif

View File

@ -3,37 +3,94 @@
#import <ATen/native/metal/MetalTensorImplStorage.h>
#import <ATen/native/metal/MetalUtils.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
#import <ATen/native/metal/mpscnn/MPSCNNUtils.h>
#import <ATen/native/metal/mpscnn/MPSImage+Tensor.h>
#import <ATen/native/metal/mpscnn/MPSImageUtils.h>
#include <ATen/ATen.h>
#include <torch/library.h>
namespace at {
namespace native {
namespace metal {
Tensor transpose(const Tensor& input, int64_t dim0, int64_t dim1) {
TORCH_CHECK(input.is_metal());
auto ndims = input.dim();
dim0 = maybe_wrap_dim(dim0, ndims);
dim1 = maybe_wrap_dim(dim1, ndims);
if (dim0 == dim1) {
return input;
}
auto outputSizes = input.sizes().vec();
std::swap(outputSizes[dim0], outputSizes[dim1]);
MPSImage* X = imageFromTensor(input);
MetalCommandBuffer* commandBuffer = getCommandBufferFromTensor(input);
if (input.dim() == 2) {
MetalTensorImplStorage mt{outputSizes};
mt.texture()->allocateTemporaryTextureStorage(outputSizes, commandBuffer);
MPSImage* Y = mt.texture()->image();
MPSImageTranspose* transpose = [[MPSImageTranspose alloc]
initWithDevice:[MPSCNNContext sharedInstance].device];
[transpose encodeToCommandBuffer:commandBuffer.buffer
sourceImage:X
destinationImage:Y];
auto output = makeTensor(std::move(mt), input.options());
return output;
} else {
id<MTLBuffer> sizeBuf1 = makeMTLBuffer<ushort>(
std::vector<ushort>{input.sizes().begin(), input.sizes().end()});
id<MTLBuffer> sizeBuf2 = makeMTLBuffer<ushort>(
std::vector<ushort>{outputSizes.begin(), outputSizes.end()});
id<MTLBuffer> indexBuf = makeMTLBuffer(std::vector<ushort>(input.dim(), 1));
MetalTensorImplStorage mt{outputSizes};
mt.texture()->allocateTemporaryTextureStorage(outputSizes, commandBuffer);
MPSImage* Y = mt.texture()->image();
id<MTLComputeCommandEncoder> encoder =
[commandBuffer.buffer computeCommandEncoder];
id<MTLComputePipelineState> state =
[[MPSCNNContext sharedInstance] specializedPipelineState:@"transpose"
Constants:@[
@(dim0),
@(dim1),
@(input.dim()),
@(X.numberOfImages),
@(X.featureChannels),
@(Y.numberOfImages),
@(Y.featureChannels),
]];
[encoder setComputePipelineState:state];
[encoder setTexture:[X texture] atIndex:0];
[encoder setTexture:[Y texture] atIndex:1];
[encoder setBuffer:sizeBuf1 offset:0 atIndex:0];
[encoder setBuffer:sizeBuf2 offset:0 atIndex:1];
[encoder setBuffer:indexBuf offset:0 atIndex:2];
const auto& launchParams =
mpscnn::spatialPointwiseKernelLaunchParams(state, Y);
[encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
[encoder endEncoding];
[X markRead];
[Y markRead];
auto output = makeTensor(std::move(mt), input.options());
return output;
}
}
Tensor t(const Tensor& input) {
TORCH_CHECK(input.is_metal());
TORCH_CHECK(input.is_metal());
TORCH_CHECK(input.dim() == 2);
auto strides = input.strides().vec();
auto sizes = input.sizes().vec();
MPSImage* X = imageFromTensor(input);
TORCH_CHECK(X.numberOfImages == 1);
TORCH_CHECK(X.featureChannels == 1);
MetalTensorImplStorage mt({sizes[1], sizes[0]});
MetalCommandBuffer* commandBuffer = getCommandBufferFromTensor(input);
mt.texture()->allocateTemporaryTextureStorage(
{1, 1, sizes[1], sizes[0]}, commandBuffer);
MPSImage* Y = mt.texture()->image();
MPSImageTranspose* transpose = [[MPSImageTranspose alloc]
initWithDevice:[MPSCNNContext sharedInstance].device];
[transpose encodeToCommandBuffer:commandBuffer.buffer
sourceImage:X
destinationImage:Y];
auto output = makeTensor(std::move(mt), input.options());
return output;
return metal::transpose(input, 0, input.dim() < 2 ? 0 : 1);
}
TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl("t", TORCH_FN(t));
m.impl("transpose.int", TORCH_FN(transpose));
};
}
}
}
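A minimal usage sketch, mirroring test_transpose added earlier in this commit (sizes and names are illustrative): the transpose.int registration above routes 2-D inputs through the MPSImageTranspose path and higher-rank tensors through the custom transpose shader.
auto X1 = at::rand({1, 2, 2, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
auto Y1 = at::transpose(X1, 1, 3).contiguous();        // CPU reference
auto Y2 = at::transpose(X1.metal(), 1, 3).cpu();       // dispatches to metal::transpose above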

View File

@ -5034,9 +5034,6 @@
structured_delegate: digamma.out
variants: method
- func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!)
variants: method
- func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!)
variants: method
dispatch:
@ -5804,6 +5801,11 @@
dispatch:
CompositeExplicitAutograd: polygamma
- func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!)
variants: method
dispatch:
CompositeExplicitAutograd: polygamma_
- func: erfinv(Tensor self) -> Tensor
structured_delegate: erfinv.out
variants: method, function
@ -7439,6 +7441,7 @@
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
python_module: nn
structured: True
dispatch:
CPU: adaptive_max_pool2d_out_cpu
CUDA: adaptive_max_pool2d_out_cuda
@ -7446,9 +7449,7 @@
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)
python_module: nn
dispatch:
CPU: adaptive_max_pool2d_cpu
CUDA: adaptive_max_pool2d_cuda
structured_delegate: adaptive_max_pool2d.out
- func: adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
python_module: nn
@ -7465,6 +7466,7 @@
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
python_module: nn
structured: True
dispatch:
CPU: adaptive_max_pool3d_out_cpu
CUDA: adaptive_max_pool3d_out_cuda
@ -7472,9 +7474,7 @@
# Return: (Tensor output, Tensor indices)
- func: adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor)
python_module: nn
dispatch:
CPU: adaptive_max_pool3d_cpu
CUDA: adaptive_max_pool3d_cuda
structured_delegate: adaptive_max_pool3d.out
- func: adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
python_module: nn
@ -8637,6 +8637,12 @@
dispatch:
CompositeExplicitAutograd: linalg_lstsq
- func: linalg_lstsq.out(Tensor self, Tensor b, float? cond=None, *, str? driver=None, Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values) -> (Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values)
python_module: linalg
variants: function
dispatch:
CPU, CUDA: linalg_lstsq_out
- func: _lstsq_helper_(Tensor(a!) self, Tensor(b!) rank, Tensor(c!) singular_values, Tensor(d!) infos, Tensor a, float cond, str driver_name) -> Tensor(a!)
variants: function
dispatch:

View File

@ -12,6 +12,8 @@ namespace {
template <typename T>
class MinMax : public ::testing::Test {};
template <typename T>
class Nan : public ::testing::Test {};
template <typename T>
class Interleave : public ::testing::Test {};
template <typename T>
class SignManipulation : public ::testing::Test {};
@ -67,6 +69,7 @@ namespace {
TYPED_TEST_CASE(Comparison, RealFloatIntTestedTypes);
TYPED_TEST_CASE(Bitwise, FloatIntTestedTypes);
TYPED_TEST_CASE(MinMax, RealFloatIntTestedTypes);
TYPED_TEST_CASE(Nan, RealFloatTestedTypes);
TYPED_TEST_CASE(Interleave, RealFloatIntTestedTypes);
TYPED_TEST_CASE(SignManipulation, FloatIntTestedTypes);
TYPED_TEST_CASE(Rounding, RealFloatTestedTypes);
@ -435,9 +438,29 @@ namespace {
[](const vec& v) { return v.erfinv(); },
createDefaultUnaryTestCase<vec>(TestSeed(), false, true));
}
TYPED_TEST(Nan, IsNan) {
using vec = TypeParam;
using VT = ValueType<TypeParam>;
CACHE_ALIGN VT test_vals[vec::size()];
CACHE_ALIGN VT expected_vals[vec::size()];
auto vals = 1 << (vec::size());
for (int val = 0; val < vals; ++val) {
for (int i = 0; i < vec::size(); ++i) {
if (val & (1 << i)) {
test_vals[i] = std::numeric_limits<VT>::quiet_NaN();
// All bits are set to 1 if true, otherwise 0.
// same rule as at::Vec256<T>::binary_pred.
std::memset(static_cast<void*>(&expected_vals[i]), 0xFF, sizeof(VT));
} else {
test_vals[i] = (VT)0.123;
std::memset(static_cast<void*>(&expected_vals[i]), 0, sizeof(VT));
}
}
vec actual = vec::loadu(test_vals).isnan();
vec expected = vec::loadu(expected_vals);
AssertVec256<vec>(NAME_INFO(isnan), expected, actual).check();
}
}
TYPED_TEST(LGamma, LGamma) {
using vec = TypeParam;
using UVT = UvalueType<vec>;

View File

@ -32,8 +32,7 @@ static void FusedOverhead(benchmark::State& state) {
}
static void UnfusedOverhead(benchmark::State& state) {
torch::NoGradGuard ng;
torch::AutoNonVariableTypeMode nv;
c10::InferenceMode guard;
overrideCanFuseOnCPU(false);
Module m("m");

View File

@ -244,7 +244,7 @@ if __name__ == '__main__':
vlrnns = ['vl_cudnn', 'vl_jit', 'vl_py']
if args.print_json:
print_stderr = lambda *args, **kwargs: None # noqa
print_stderr = lambda *args, **kwargs: None # noqa: E731,F811
print_stderr(args)
bench_args = copy.deepcopy(vars(args))

View File

@ -239,7 +239,6 @@ def varlen_lstm_factory(cell, script):
def dynamic_rnn(sequences: List[Tensor], hiddens: Tuple[Tensor, Tensor], wih: Tensor,
whh: Tensor, bih: Tensor, bhh: Tensor
) -> Tuple[List[Tensor], Tuple[List[Tensor], List[Tensor]]]:
# noqa
hx, cx = hiddens
hxs = hx.unbind(1)
cxs = cx.unbind(1)

View File

@ -1,11 +1,11 @@
import operator_benchmark as op_bench
from pt import ( # noqa
add_test, as_strided_test, batchnorm_test, binary_test, cat_test, # noqa
channel_shuffle_test, chunk_test, conv_test, diag_test, embeddingbag_test, # noqa
fill_test, gather_test, linear_test, matmul_test, nan_to_num_test, pool_test, # noqa
softmax_test, hardsigmoid_test, hardswish_test, layernorm_test, # noqa
groupnorm_test, interpolate_test, instancenorm_test, remainder_test, softmax_test, # noqa
split_test, sum_test, tensor_to_test # noqa
from pt import ( # noqa: F401
add_test, as_strided_test, batchnorm_test, binary_test, cat_test,
channel_shuffle_test, chunk_test, conv_test, diag_test, embeddingbag_test,
fill_test, gather_test, linear_test, matmul_test, nan_to_num_test, pool_test,
softmax_test, hardsigmoid_test, hardswish_test, layernorm_test,
groupnorm_test, interpolate_test, instancenorm_test, remainder_test,
split_test, sum_test, tensor_to_test
)
if __name__ == "__main__":

View File

@ -1,5 +1,5 @@
import operator_benchmark as op_bench
from pt import ( # noqa
from pt import ( # noqa: F401
qactivation_test,
qarithmetic_test,
qbatchnorm_test,

View File

@ -1,9 +1,9 @@
import operator_benchmark as op_bench
from pt import ( # noqa
unary_test, # noqa
from pt import ( # noqa: F401
unary_test,
)
import benchmark_all_other_test # noqa
import benchmark_all_quantized_test # noqa
import benchmark_all_other_test # noqa: F401
import benchmark_all_quantized_test # noqa: F401
if __name__ == "__main__":
op_bench.benchmark_runner.main()

View File

@ -7,7 +7,7 @@ import copy
import ast
# needs to be imported after torch
import torch.utils.cpp_extension as cpp_extension # noqa
import torch.utils.cpp_extension as cpp_extension # noqa: F401
import benchmark_utils
from collections import namedtuple

View File

@ -1,7 +1,7 @@
import time
import json
import torch
import cpp_extension # noqa
import cpp_extension # noqa: F401
"""PyTorch performance microbenchmarks.

View File

@ -1,6 +1,6 @@
import operator_benchmark as op_bench
import benchmark_caffe2 as op_bench_c2
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core

View File

@ -1,6 +1,6 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core

View File

@ -1,6 +1,6 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
import numpy

View File

@ -1,6 +1,6 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core, dyndep
dyndep.InitOpsLibrary("@/caffe2/caffe2/fb/operators:clip_ranges_op")

View File

@ -1,7 +1,7 @@
import operator_benchmark as op_bench
import benchmark_caffe2 as op_bench_c2
import random
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
@ -17,9 +17,9 @@ cross_product_configs = {
cat_configs_short = op_bench.config_list(
attr_names=['sizes', 'N', 'axis'],
attrs=[
[(1, 1, 1), 2, 0], # noqa
[(512, 512, 2), 2, 1], # noqa
[(128, 1024, 2), 2, 1], # noqa
[(1, 1, 1), 2, 0], # noqa: E241
[(512, 512, 2), 2, 1], # noqa: E241
[(128, 1024, 2), 2, 1], # noqa: E241
],
cross_product_configs=cross_product_configs,
tags=['short'],
@ -29,14 +29,14 @@ cat_configs_short = op_bench.config_list(
cat_configs_static_runtime = op_bench.config_list(
attr_names=['sizes', 'N', 'axis', 'add_axis'],
attrs=[
[(1, 40), 5, 1, 1], # noqa
[[(1, 160), (1, 14)], -1, 1, 0], # noqa
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1, 0], # noqa
[[(1, 580), (1, 174)], -1, 1, 0], # noqa
[(20, 40), 5, 1, 1], # noqa
[[(20, 160), (20, 14)], -1, 1, 0], # noqa
[[(20, 20, 40), (20, 4, 40), (20, 5, 40)], -1, 1, 0], # noqa
[[(20, 580), (20, 174)], -1, 1, 0], # noqa
[(1, 40), 5, 1, 1],
[[(1, 160), (1, 14)], -1, 1, 0],
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1, 0],
[[(1, 580), (1, 174)], -1, 1, 0],
[(20, 40), 5, 1, 1],
[[(20, 160), (20, 14)], -1, 1, 0],
[[(20, 20, 40), (20, 4, 40), (20, 5, 40)], -1, 1, 0],
[[(20, 580), (20, 174)], -1, 1, 0],
],
cross_product_configs=cross_product_configs,
tags=['static_runtime'],
@ -45,22 +45,22 @@ cat_configs_static_runtime = op_bench.config_list(
cat_configs_long = op_bench.config_list(
attr_names=['sizes', 'N', 'axis'],
attrs=[
[(2**10, 2**10, 2), 2, 0], # noqa
[(2**10+1, 2**10-1, 2), 2, 1], # noqa
[(2**10, 2**10, 2), 2, 2], # noqa
[(2**10, 2**10, 2), 2, 0], # noqa: E241
[(2**10+1, 2**10-1, 2), 2, 1], # noqa: E226,E241
[(2**10, 2**10, 2), 2, 2], # noqa: E241
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa: E201,E226,E241
5, 0],
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa: E201,E226,E241,E272
5, 1],
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa: E201,E241,E272
5, 2],
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa: E241
50, 0],
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
50, 1],
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa: E226,E241,E272
50, 2],
],
cross_product_configs=cross_product_configs,
@ -71,9 +71,9 @@ cat_configs_long = op_bench.config_list(
cat_configs_multidim = op_bench.config_list(
attr_names=['sizes', 'N', 'axis', 'dtype'],
attrs=[
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa: E226,E241
],
cross_product_configs=cross_product_configs,
tags=['multidim'],

View File

@ -1,7 +1,7 @@
import operator_benchmark as op_bench
import benchmark_caffe2 as op_bench_c2
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core
"""Microbenchmarks for MatMul operator"""

View File

@ -1,6 +1,6 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core

View File

@ -1,6 +1,6 @@
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa
from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
from caffe2.python import core

View File

@ -1,5 +1,5 @@
# TODO (mingzhe09088): get rid of noqa
import benchmark_runner # noqa
from benchmark_pytorch import TorchBenchmarkBase # noqa
from benchmark_test_generator import * # noqa
from benchmark_utils import * # noqa
import benchmark_runner # noqa: F401
from benchmark_pytorch import TorchBenchmarkBase # noqa: F401
from benchmark_test_generator import * # noqa: F401,F403
from benchmark_utils import * # noqa: F401,F403

View File

@ -14,9 +14,9 @@ cross_product_configs = {
cat_configs_short = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attrs=[
[(1, 1, 1), 2, 0], # noqa
[(512, 512, 2), 2, 1], # noqa
[(128, 1024, 2), 2, 1], # noqa
[(1, 1, 1), 2, 0], # noqa: E241
[(512, 512, 2), 2, 1], # noqa: E241
[(128, 1024, 2), 2, 1], # noqa: E241
],
cross_product_configs=cross_product_configs,
tags=['short'],
@ -26,12 +26,12 @@ cat_configs_short = op_bench.config_list(
cat_configs_static_runtime = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attrs=[
[[(1, 160), (1, 14)], -1, 1], # noqa
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1], # noqa
[[(1, 580), (1, 174)], -1, 1], # noqa
[[(20, 160), (20, 14)], -1, 1], # noqa
[[(20, 20, 40), (20, 4, 40), (20, 5, 40)], -1, 1], # noqa
[[(20, 580), (20, 174)], -1, 1], # noqa
[[(1, 160), (1, 14)], -1, 1],
[[(1, 20, 40), (1, 4, 40), (1, 5, 40)], -1, 1],
[[(1, 580), (1, 174)], -1, 1],
[[(20, 160), (20, 14)], -1, 1],
[[(20, 20, 40), (20, 4, 40), (20, 5, 40)], -1, 1],
[[(20, 580), (20, 174)], -1, 1],
],
cross_product_configs=cross_product_configs,
tags=['static_runtime'],
@ -40,22 +40,22 @@ cat_configs_static_runtime = op_bench.config_list(
cat_configs_long = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attrs=[
[(2**10, 2**10, 2), 2, 0], # noqa
[(2**10+1, 2**10-1, 2), 2, 1], # noqa
[(2**10, 2**10, 2), 2, 2], # noqa
[(2**10, 2**10, 2), 2, 0], # noqa: E241
[(2**10+1, 2**10-1, 2), 2, 1], # noqa: E226,E241
[(2**10, 2**10, 2), 2, 2], # noqa: E241
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa
[[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa: E201,E226,E241
5, 0],
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa
[[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa: E201,E226,E241,E272
5, 1],
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa
[[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa: E201,E241,E272
5, 2],
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa
[[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa: E241
50, 0],
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa
[[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa: E241,E272
50, 1],
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa
[[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa: E226,E241,E272
50, 2],
],
cross_product_configs=cross_product_configs,
@ -66,9 +66,9 @@ cat_configs_long = op_bench.config_list(
cat_configs_multidim = op_bench.config_list(
attr_names=['sizes', 'N', 'dim'],
attrs=[
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa
[(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa: E241
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa: E226,E241
],
cross_product_configs=cross_product_configs,
tags=['multidim'],

View File

@ -8,17 +8,17 @@ r"""Microbenchmarks for the quantized activations."""
qactivation_long_configs = op_bench.cross_product_configs(
dims=(
# VGG-16 relu's with original shape: (-1, 3, 224, 224)
( 64, 224, 224), # ReLU-1 # noqa
(128, 112, 112), # ReLU-6 # noqa
(256, 56, 56), # ReLU-11 # noqa
(512, 28, 28), # ReLU-18 # noqa
(512, 14, 14), # ReLU-25 # noqa
( 64, 224, 224), # ReLU-1 # noqa: E201
(128, 112, 112), # ReLU-6
(256, 56, 56), # ReLU-11 # noqa: E241
(512, 28, 28), # ReLU-18 # noqa: E241
(512, 14, 14), # ReLU-25 # noqa: E241
# Batch = 16
(16, 64, 224, 224), # ReLU-1 # noqa
(16, 128, 112, 112), # ReLU-6 # noqa
(16, 256, 56, 56), # ReLU-11 # noqa
(16, 512, 28, 28), # ReLU-18 # noqa
(16, 512, 14, 14), # ReLU-25 # noqa
(16, 64, 224, 224), # ReLU-1 # noqa: E241
(16, 128, 112, 112), # ReLU-6
(16, 256, 56, 56), # ReLU-11 # noqa: E241
(16, 512, 28, 28), # ReLU-18 # noqa: E241
(16, 512, 14, 14), # ReLU-25 # noqa: E241
),
contig=(False, True),
inplace=(False, True),

View File

@ -43,7 +43,7 @@ qobserver_per_tensor_configs_short = op_bench.config_list(
cross_product_configs={
'qscheme': (torch.per_tensor_affine, torch.per_tensor_symmetric)
},
**qobserver_short_configs_dict, # noqa
**qobserver_short_configs_dict,
)
qobserver_per_tensor_configs_long = op_bench.cross_product_configs(
@ -67,7 +67,7 @@ q_hist_observer_per_tensor_configs_short = op_bench.config_list(
cross_product_configs={
'qscheme': (torch.per_tensor_affine, torch.per_tensor_symmetric)
},
**q_hist_observer_short_configs_dict, # noqa
**q_hist_observer_short_configs_dict,
)
q_hist_observer_per_tensor_configs_long = op_bench.cross_product_configs(

View File

@ -6,11 +6,11 @@ import operator_benchmark as op_bench
qpool2d_long_configs = op_bench.config_list(
attrs=(
# C H W k s p
( 1, 3, 3, (3, 3), (1, 1), (0, 0)), # dummy # noqa
( 3, 64, 64, (3, 3), (2, 2), (1, 1)), # dummy # noqa
( 1, 3, 3, (3, 3), (1, 1), (0, 0)), # dummy # noqa: E201,E241
( 3, 64, 64, (3, 3), (2, 2), (1, 1)), # dummy # noqa: E201,E241
# VGG16 pools with original input shape: (-1, 3, 224, 224)
( 64, 224, 224, (2, 2), (2, 2), (0, 0)), # MaxPool2d-4 # noqa
(256, 56, 56, (2, 2), (2, 2), (0, 0)), # MaxPool2d-16 # noqa
( 64, 224, 224, (2, 2), (2, 2), (0, 0)), # MaxPool2d-4 # noqa: E201
(256, 56, 56, (2, 2), (2, 2), (0, 0)), # MaxPool2d-16 # noqa: E241
),
attr_names=('C', 'H', 'W', # Input layout
'k', 's', 'p'), # Pooling parameters
@ -23,7 +23,7 @@ qpool2d_long_configs = op_bench.config_list(
)
qpool2d_short_configs = op_bench.config_list(
attrs=((1, 3, 3, (3, 3), (1, 1), (0, 0)),), # dummy # noqa
attrs=((1, 3, 3, (3, 3), (1, 1), (0, 0)),), # dummy
attr_names=('C', 'H', 'W', # Input layout
'k', 's', 'p'), # Pooling parameters
cross_product_configs={
@ -37,15 +37,15 @@ qpool2d_short_configs = op_bench.config_list(
qadaptive_avgpool2d_long_configs = op_bench.cross_product_configs(
input_size=(
# VGG16 pools with original input shape: (-1, 3, 224, 224)
(112, 112), # MaxPool2d-9 # noqa
(112, 112), # MaxPool2d-9
),
output_size=(
(448, 448),
# VGG16 pools with original input shape: (-1, 3, 224, 224)
(224, 224), # MaxPool2d-4 # noqa
(112, 112), # MaxPool2d-9 # noqa
( 56, 56), # MaxPool2d-16 # noqa
( 14, 14), # MaxPool2d-30 # noqa
(224, 224), # MaxPool2d-4
(112, 112), # MaxPool2d-9
( 56, 56), # MaxPool2d-16 # noqa: E201,E241
( 14, 14), # MaxPool2d-30 # noqa: E201,E241
),
N=(1, 4),
C=(1, 3, 64, 128),

View File

@ -10,8 +10,8 @@ from typing import List
stack_configs_static_runtime = op_bench.config_list(
attr_names=['sizes', 'N'],
attrs=[
[(20, 40), 5], # noqa
[(1, 40), 5], # noqa
[(20, 40), 5],
[(1, 40), 5],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
@ -23,9 +23,9 @@ stack_configs_static_runtime = op_bench.config_list(
stack_configs_short = op_bench.config_list(
attr_names=['sizes', 'N'],
attrs=[
[(1, 1, 1), 2], # noqa
[(512, 512, 2), 2], # noqa
[(128, 1024, 2), 2], # noqa
[(1, 1, 1), 2], # noqa: E241
[(512, 512, 2), 2], # noqa: E241
[(128, 1024, 2), 2], # noqa: E241
],
cross_product_configs={
'device': ['cpu', 'cuda'],
@ -37,9 +37,9 @@ stack_configs_short = op_bench.config_list(
stack_configs_long = op_bench.config_list(
attr_names=['sizes', 'N'],
attrs=[
[(2**10, 2**10, 2), 2], # noqa
[(2**10+1, 2**10-1, 2), 2], # noqa
[(2**10, 2**10, 2), 2], # noqa
[(2**10, 2**10, 2), 2], # noqa: E241
[(2**10+1, 2**10-1, 2), 2], # noqa: E226,E241
[(2**10, 2**10, 2), 2], # noqa: E241
],
cross_product_configs={
'device': ['cpu', 'cuda'],
@ -52,9 +52,9 @@ stack_configs_long = op_bench.config_list(
stack_configs_multidim = op_bench.config_list(
attr_names=['sizes', 'N'],
attrs=[
[(2**6, 2**5, 2**2, 2**4, 2**5), 2], # noqa
[(2**4, 2**5, 2**2, 2**4, 2**5), 8], # noqa
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17], # noqa
[(2**6, 2**5, 2**2, 2**4, 2**5), 2], # noqa: E241
[(2**4, 2**5, 2**2, 2**4, 2**5), 8], # noqa: E241
[(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17], # noqa: E226,E241
],
cross_product_configs={
'device': ['cpu', 'cuda'],

View File

@ -1,6 +1,6 @@
import unittest
import cpp_extension # noqa
import cpp_extension # noqa: F401
import torch

View File

@ -86,7 +86,7 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) {
return Backend::QuantizedCPU;
} else if (t == DispatchKey::QuantizedCUDA) {
return Backend::QuantizedCUDA;
} else if (t == DispatchKey::XPU) {
} else if (t == DispatchKey::XPU || t == DispatchKey::AutogradXPU) {
return Backend::XPU;
} else if (t == DispatchKey::SparseXPU) {
return Backend::SparseXPU;

View File

@ -146,6 +146,8 @@ DispatchKey getAutogradKeyFromBackend(DispatchKey t) {
switch (t) {
case DispatchKey::CPU:
return DispatchKey::AutogradCPU;
case DispatchKey::XPU:
return DispatchKey::AutogradXPU;
case DispatchKey::CUDA:
return DispatchKey::AutogradCUDA;
case DispatchKey::XLA:

View File

@ -19,6 +19,10 @@
#include <execinfo.h>
#endif
#ifdef FBCODE_CAFFE2
#include <common/process/StackTrace.h>
#endif
namespace c10 {
#if SUPPORTS_BACKTRACE
@ -167,7 +171,14 @@ std::string get_backtrace(
size_t frames_to_skip,
size_t maximum_number_of_frames,
bool skip_python_frames) {
#if SUPPORTS_BACKTRACE
#ifdef FBCODE_CAFFE2
// For some reason, the stacktrace implementation in fbcode is
// better than ours, see https://github.com/pytorch/pytorch/issues/56399
// When it's available, just use that.
facebook::process::StackTrace st;
return st.toString();
#elif SUPPORTS_BACKTRACE
// We always skip this frame (backtrace).
frames_to_skip += 1;

View File

@ -209,7 +209,6 @@ bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
float weight_sum = 0.0;
if (!label_prob_mode_) {
const int* label_data = T.data<int>();
const float* Xdata = X.data<float>();
for (int i = 0; i < N; ++i) {
CAFFE_ENFORCE(

View File

@ -26,6 +26,15 @@ C10_EXPORT std::string DeviceTypeName(const int32_t& d) {
return at::DeviceTypeName(static_cast<at::DeviceType>(d));
}
void setTotalBytesLimit(::google::protobuf::io::CodedInputStream& stream, int bytes_limit, int warning_threshold) {
#if GOOGLE_PROTOBUF_VERSION >= 3011000
// Only take one parameter since protobuf 3.11
stream.SetTotalBytesLimit(bytes_limit);
#else
stream.SetTotalBytesLimit(bytes_limit, warning_threshold);
#endif
}
C10_EXPORT int DeviceId(const DeviceOption& option) {
switch (option.device_type()) {
case PROTO_CPU:
@ -136,7 +145,7 @@ C10_EXPORT bool ParseProtoFromLargeString(
::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size());
::google::protobuf::io::CodedInputStream coded_stream(&input_stream);
// Set PlanDef message size limit to 2G.
coded_stream.SetTotalBytesLimit(2147483647, 512LL << 20);
setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
return proto->ParseFromCodedStream(&coded_stream);
}
@ -149,7 +158,7 @@ C10_EXPORT bool ReadProtoFromBinaryFile(
// Total bytes hard limit / warning limit are set to 2GB and 512MB
// respectively.
::google::protobuf::io::CodedInputStream coded_stream(&stream);
coded_stream.SetTotalBytesLimit(2147483647, 512LL << 20);
setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
return proto->ParseFromCodedStream(&coded_stream);
}
@ -200,7 +209,7 @@ C10_EXPORT bool ParseProtoFromLargeString(const string& str, Message* proto) {
::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size());
::google::protobuf::io::CodedInputStream coded_stream(&input_stream);
// Set PlanDef message size limit to 2G.
coded_stream.SetTotalBytesLimit(2147483647, 512LL << 20);
setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
return proto->ParseFromCodedStream(&coded_stream);
}
@ -244,7 +253,13 @@ C10_EXPORT bool ReadProtoFromBinaryFile(
std::unique_ptr<CodedInputStream> coded_input(
new CodedInputStream(raw_input.get()));
// A hack to manually allow using very large protocol buffers.
coded_input->SetTotalBytesLimit(2147483647, 536870912);
#if GOOGLE_PROTOBUF_VERSION >= 3011000
// Only take one parameter since protobuf 3.11
coded_input->SetTotalBytesLimit(2147483647);
#else
// Total bytes hard limit / warning limit are set to 2GB and 512MB respectively.
coded_input->SetTotalBytesLimit(2147483647, 536870912);
#endif
bool success = proto->ParseFromCodedStream(coded_input.get());
coded_input.reset();
raw_input.reset();

View File

@ -1,5 +1,6 @@
#include "caffe2/utils/signal_handler.h"
#include "caffe2/core/logging.h"
#include <c10/util/Backtrace.h>
#if defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
@ -149,63 +150,11 @@ const char* getSignalName(int signum) {
return nullptr;
}
_Unwind_Reason_Code unwinder(struct _Unwind_Context* context, void* userInfo) {
auto& pcs = *reinterpret_cast<std::vector<uintptr_t>*>(userInfo);
pcs.push_back(_Unwind_GetIP(context));
return _URC_NO_REASON;
}
std::vector<uintptr_t> getBacktrace() {
std::vector<uintptr_t> pcs;
_Unwind_Backtrace(unwinder, &pcs);
return pcs;
}
void printBlobSizes() {
::caffe2::Workspace::ForEach(
[&](::caffe2::Workspace* ws) { ws->PrintBlobSizes(); });
}
void printStacktrace() {
std::vector<uintptr_t> pcs = getBacktrace();
Dl_info info;
size_t i = 0;
for (uintptr_t pcAddr : pcs) {
const void* pc = reinterpret_cast<const void*>(pcAddr);
const char* path = nullptr;
const char* name = "???";
char* demangled = nullptr;
int offset = -1;
std::cerr << "[" << i << "] ";
if (dladdr(pc, &info)) {
path = info.dli_fname;
name = info.dli_sname ?: "???";
offset = reinterpret_cast<uintptr_t>(pc) -
reinterpret_cast<uintptr_t>(info.dli_saddr);
int status;
demangled = abi::__cxa_demangle(name, nullptr, nullptr, &status);
if (status == 0) {
name = demangled;
}
}
std::cerr << name;
if (offset >= 0) {
std::cerr << "+" << reinterpret_cast<void*>(offset);
}
std::cerr << "(" << pc << ")";
if (path) {
std::cerr << " in " << path;
}
std::cerr << std::endl;
if (demangled) {
free(demangled);
}
i += 1;
}
}
void callPreviousSignalHandler(
struct sigaction* action,
int signum,
@ -229,7 +178,7 @@ void stacktraceSignalHandler(bool needsLock) {
pid_t tid = syscall(SYS_gettid);
std::cerr << fatalSignalName << "(" << fatalSignum << "), PID: " << ::getpid()
<< ", Thread " << tid << ": " << std::endl;
printStacktrace();
std::cerr << c10::get_backtrace();
std::cerr << std::endl;
if (needsLock) {
pthread_mutex_unlock(&writingMutex);

View File

@ -18,7 +18,7 @@ Features described in this documentation are classified by release status:
breaking changes can happen and notice will be given one release ahead
of time).
*Beta:* Features are tagged as Beta because the API may change based on
*Beta:* These features are tagged as Beta because the API may change based on
user feedback, because the performance needs to improve, or because
coverage across operators is not yet complete. For Beta features, we are
committing to seeing the feature through to the Stable classification.

View File

@ -19,7 +19,6 @@ TorchScript
.. toctree::
:maxdepth: 1
:caption: Language Reference
jit_language_reference_v2

View File

@ -39,7 +39,7 @@ files =
benchmarks/instruction_counts,
tools/autograd/*.py,
tools/clang_tidy.py,
tools/codegen/*.py,
tools/codegen,
tools/extract_scripts.py,
tools/mypy_wrapper.py,
tools/print_test_stats.py,

View File

@ -697,7 +697,7 @@ class TestBenchmarkUtils(TestCase):
2000 /usr/include/c++/8/bits/atomic_base.h:at::Tensor at::detail::make_tensor ... t_null_type<c10::StorageImpl> >&&, c10::DispatchKey&&, caffe2::TypeMeta&)
2000 /usr/include/c++/8/array:at::Tensor& c10::Dispatcher::callWithDispatchKe ... , c10::Scalar)> const&, c10::DispatchKey, at::Tensor&, c10::Scalar) const
Total: 8869966""" # noqa
Total: 8869966""" # noqa: B950
)
self.regularizeAndAssertExpectedInline(
@ -935,7 +935,7 @@ class TestBenchmarkUtils(TestCase):
compute_optimized | \x1b[2m\x1b[91m 3 \x1b[0m\x1b[0m | 4.0 | 11 | \x1b[92m\x1b[1m 2100 \x1b[0m\x1b[0m | 2100
special_case (square) | \x1b[92m\x1b[1m 1 \x1b[0m\x1b[0m | | \x1b[92m\x1b[1m 8 \x1b[0m\x1b[0m | | \x1b[92m\x1b[1m 1700 \x1b[0m\x1b[0m
Times are in microseconds (us).""" # noqa
Times are in microseconds (us).""" # noqa: B950
)
compare.colorize(rowwise=True)
@ -949,7 +949,7 @@ class TestBenchmarkUtils(TestCase):
compute_optimized | \x1b[92m\x1b[1m 3 \x1b[0m\x1b[0m | 4.0 | \x1b[2m\x1b[91m 11 \x1b[0m\x1b[0m | \x1b[31m\x1b[1m 2100 \x1b[0m\x1b[0m | \x1b[31m\x1b[1m 2100 \x1b[0m\x1b[0m
special_case (square) | \x1b[92m\x1b[1m 1 \x1b[0m\x1b[0m | | \x1b[31m\x1b[1m 8 \x1b[0m\x1b[0m | | \x1b[31m\x1b[1m 1700 \x1b[0m\x1b[0m
Times are in microseconds (us).""" # noqa
Times are in microseconds (us).""" # noqa: B950
)
def print_new_expected(s: str) -> None:

View File

@ -61,13 +61,14 @@ TEST(GradModeTest, TestRequiresGradViewOpExiting) {
if (requires_grad) {
ASSERT_THROWS_WITH(view_out.mul_(2), // go through kernels: VariableType, InplaceOrView, CPU
"A view was created in no_grad mode and is being modified inplace")
"a leaf Variable that requires grad is being used in an in-place operation")
} else {
view_out.mul_(2);
}
tmp = view_out.view({2, 3});
ASSERT_EQ(tmp.requires_grad(), requires_grad);
assert_tensor_creation_meta(tmp, torch::autograd::CreationMeta::NO_GRAD_MODE);
// TODO: update when above error is fixed
// assert_tensor_creation_meta(tmp, torch::autograd::CreationMeta::NO_GRAD_MODE);
}
}
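A short libtorch sketch of the scenario this test exercises: a view created under NoGradGuard and then modified in place after grad mode is restored. The exact error text depends on the PyTorch build, which is what the changed expectation above reflects.

#include <torch/torch.h>
#include <iostream>

int main() {
  auto base = torch::ones({2, 3}, torch::requires_grad());
  torch::Tensor view_out;
  {
    torch::NoGradGuard no_grad;   // gradients disabled in this scope
    view_out = base.view({6});    // view created in no_grad mode
  }
  try {
    view_out.mul_(2);             // in-place update of that view
  } catch (const std::exception& e) {
    std::cerr << e.what() << std::endl;  // message varies across versions
  }
  return 0;
}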

View File

@ -424,7 +424,6 @@ void test_TorchTensorCtorSingleDimFloatingType_expected_dtype(c10::ScalarType de
ASSERT_TRUE(almost_equal(tensor[2], 3.125));
tensor = torch::tensor({1.5f, 2.25f, 3.125f});
ASSERT_TRUE(tensor.is_variable());
ASSERT_EQ(tensor.numel(), 3);
ASSERT_EQ(tensor.sizes(), std::vector<int64_t>({3}));
ASSERT_EQ(tensor.dtype(), default_dtype);
@ -433,7 +432,6 @@ void test_TorchTensorCtorSingleDimFloatingType_expected_dtype(c10::ScalarType de
ASSERT_TRUE(almost_equal(tensor[2], 3.125f));
tensor = torch::tensor(at::ArrayRef<float>({1.5f, 2.25f, 3.125f}));
ASSERT_TRUE(tensor.is_variable());
ASSERT_EQ(tensor.numel(), 3);
ASSERT_EQ(tensor.dtype(), default_dtype);
ASSERT_TRUE(almost_equal(tensor[0], 1.5));
@ -441,7 +439,6 @@ void test_TorchTensorCtorSingleDimFloatingType_expected_dtype(c10::ScalarType de
ASSERT_TRUE(almost_equal(tensor[2], 3.125));
tensor = torch::tensor(std::vector<float>({1.5f, 2.25f, 3.125f}));
ASSERT_TRUE(tensor.is_variable());
ASSERT_EQ(tensor.numel(), 3);
ASSERT_EQ(tensor.sizes(), std::vector<int64_t>({3}));
ASSERT_EQ(tensor.dtype(), default_dtype);

View File

@ -46,6 +46,7 @@ TEST(TensorpipeSerialize, Base) {
tensorpipe::Descriptor::Tensor t;
t.length = tpTensor.length;
t.sourceDevice = tpTensor.buffer.device();
t.targetDevice = tpTensor.targetDevice;
t.metadata = tpTensor.metadata;
recvingTpDescriptor.tensors.push_back(std::move(t));
}

View File

@ -4324,7 +4324,8 @@ TEST(LoopNest, fuseLoopsSimple) {
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 0, 100, Store::make(b_buf, {k}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
std::ostringstream oss;
oss << *par;
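All of the fuseLoops hunks in this file follow the same pattern: the old API returned the fused For* and threw on invalid input, while the new one reports success through a bool and writes the result into an out-parameter. A hedged sketch of the new calling convention, reusing the forJ/forK loops built in the test above (headers and loop construction are assumed, not a complete program):

// Thin wrapper showing the new bool-plus-out-parameter convention.
using namespace torch::jit::tensorexpr;

static For* fuseOrNull(For* forJ, For* forK) {
  For* fused_loop = nullptr;
  if (!LoopNest::fuseLoops({forJ, forK}, &fused_loop)) {
    // Rejected: different start/stop bounds, non-contiguous loops, loops
    // with different parents, or a fusion that would create a loop-carried
    // dependence. The original loops are left in place.
    return nullptr;
  }
  // fused_loop is the single loop that now replaces forJ and forK.
  return fused_loop;
}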
@ -4364,7 +4365,8 @@ TEST(LoopNest, fuseLoopsMultiple) {
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 0, 100, Store::make(b_buf, {k}, Mul::make(20, k)));
auto par = Block::make({forI, forJ, forK});
auto fused_loop = LoopNest::fuseLoops({forI, forJ, forK});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forI, forJ, forK}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4421,7 +4423,8 @@ TEST(LoopNest, fuseLoopsNested) {
auto forM = For::make(m, 0, 20, Block::make({initA, forJ}));
auto forN = For::make(n, 0, 20, Block::make({initB, forK}));
auto par = Block::make({forM, forN});
auto fused_loop = LoopNest::fuseLoops({forM, forN});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forM, forN}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4481,7 +4484,8 @@ TEST(LoopNest, fuseLoopsNested2D) {
50,
Store::make(b_buf, {m, n}, Add::make(m, Mul::make(n, 100)))));
auto par = Block::make({forI, forM});
auto fused_loop = LoopNest::fuseLoops({forI, forM});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forI, forM}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4522,7 +4526,8 @@ TEST(LoopNest, fuseLoopsNested2DInner) {
auto forN = For::make(
n, 0, 100, Store::make(b_buf, {i, n}, Add::make(i, Mul::make(n, 100))));
auto forI = For::make(i, 0, 20, Block::make({forJ, forN}));
auto fused_loop = LoopNest::fuseLoops({forJ, forN});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forN}, &fused_loop));
std::ostringstream oss;
oss << *forI;
@ -4557,8 +4562,8 @@ TEST(LoopNest, fuseLoopsDifferentStopBounds) {
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 0, 50, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}), "Loops with different stop bounds");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsDifferentStartBounds) {
@ -4578,8 +4583,8 @@ TEST(LoopNest, fuseLoopsDifferentStartBounds) {
auto forJ = For::make(j, 0, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 50, 100, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}), "Loops with different start bounds");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsNotContiguous) {
@ -4601,8 +4606,8 @@ TEST(LoopNest, fuseLoopsNotContiguous) {
auto initB = Store::make(b_buf, {0}, 0);
auto forK = For::make(k, 50, 100, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forJ, initB, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}), "Only contiguous loops can be fused");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsWithDifferentParents) {
@ -4628,8 +4633,8 @@ TEST(LoopNest, fuseLoopsWithDifferentParents) {
auto initB = Store::make(b_buf, {0}, 0);
auto forK = For::make(k, 50, 100, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forI, initB, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}), "loops with different parents");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsWithVariableBounds) {
@ -4650,7 +4655,8 @@ TEST(LoopNest, fuseLoopsWithVariableBounds) {
auto forJ = For::make(j, 0, N, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 0, N, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4686,7 +4692,8 @@ TEST(LoopNest, fuseLoopsWithExprBounds) {
auto forJ = For::make(j, 0, M + N, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, 0, M + N, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4722,7 +4729,8 @@ TEST(LoopNest, fuseLoopsWithDifferentExprBounds) {
auto forJ = For::make(j, M, N * 2, Store::make(a_buf, {j}, Mul::make(10, j)));
auto forK = For::make(k, M, N + N, Store::make(b_buf, {j}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4757,7 +4765,8 @@ TEST(LoopNest, fuseLoopsWithNonOverlappingBufferAccesses) {
For::make(k, 10, 100, Store::make(a_buf, {k + 100}, Mul::make(30, k)));
auto par = Block::make({forJ, forK});
auto fused_loop = LoopNest::fuseLoops({forJ, forK});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4803,7 +4812,8 @@ TEST(LoopNest, fuseLoopsWithNonOverlapping2DBufferAccesses) {
auto forM = For::make(m, 0, 20, forN);
auto par = Block::make({forI, forM});
auto fused_loop = LoopNest::fuseLoops({forI, forM});
For* fused_loop;
ASSERT_TRUE(LoopNest::fuseLoops({forI, forM}, &fused_loop));
std::ostringstream oss;
oss << *par;
@ -4839,9 +4849,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies1) {
auto forK =
For::make(k, 10, 100, Store::make(a_buf, {k - 1}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsThatViolateDependencies2) {
@ -4861,9 +4870,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies2) {
auto forK =
For::make(k, 10, 100, Store::make(a_buf, {k + 50}, Mul::make(20, k)));
auto par = Block::make({forJ, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsThatViolateDependencies3) {
@ -4905,9 +4913,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies3) {
auto forM = For::make(m, 0, 20, Block::make({initA, forJ}));
auto forN = For::make(n, 0, 20, Block::make({initB, forK}));
auto par = Block::make({forM, forN});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forM, forN}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forM, forN}, &fused_loop));
}
TEST(LoopNest, fuseLoopsThatViolateDependencies4) {
@ -4948,9 +4955,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies4) {
50,
Store::make(a_buf, {m + 1, n}, Add::make(m, Mul::make(n, 100)))));
auto par = Block::make({forI, forM});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forI, forM}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forI, forM}, &fused_loop));
}
TEST(LoopNest, fuseLoopsThatViolateDependencies5) {
@ -4977,9 +4983,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies5) {
100,
Store::make(a_buf, {i, n + 1}, Add::make(i, Mul::make(n, 100))));
auto forI = For::make(i, 0, 20, Block::make({forJ, forN}));
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forN}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forN}, &fused_loop));
}
TEST(LoopNest, fuseLoopsThatViolateDependencies6) {
@ -5004,9 +5009,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies6) {
Store::make(
b_buf, {k}, Mul::make(20, Load::make(a_buf, {ExprHandle(99) - k}))));
auto par = Block::make({forJ, forK});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forJ, forK}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forJ, forK}, &fused_loop));
}
TEST(LoopNest, fuseLoopsThatViolateDependencies7) {
@ -5031,9 +5035,8 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies7) {
b_buf, {k}, Mul::make(20, Load::make(a_buf, {ExprHandle(99) - k}))));
auto forJ = For::make(j, 10, 100, Store::make(a_buf, {j}, Mul::make(10, j)));
auto par = Block::make({forK, forJ});
ASSERT_THROWS_WITH(
LoopNest::fuseLoops({forK, forJ}),
"not valid since it results in a loop carried dependence");
For* fused_loop;
ASSERT_FALSE(LoopNest::fuseLoops({forK, forJ}, &fused_loop));
}
TEST(LoopNest, areLoopsPerfectlyNested) {

View File

@ -0,0 +1,15 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
def main():
print("Success, smoke test")
if __name__ == "__main__":
main()

View File

@ -32,7 +32,7 @@ def test_skippable_repr():
def forward(self, x):
yield stash("hello", x)
return self.conv(x) # noqa
return self.conv(x) # noqa: B901
m = Hello()
assert (

View File

@ -30,7 +30,7 @@ def test_1to3(balance, checkpoint, setup_rpc):
def forward(self, input):
yield stash("1to3", input)
output = self.conv(input)
return output # noqa
return output # noqa: B901
class Layer2(nn.Module):
def __init__(self):
@ -73,7 +73,7 @@ def test_none_skip(setup_rpc):
class Stash(nn.Module):
def forward(self, input):
yield stash("none", None)
return input # noqa
return input # noqa: B901
@skippable(pop=["none"])
class Pop(nn.Module):

View File

@ -19,7 +19,7 @@ class Pass(nn.Module):
class StashFoo(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input # noqa
return input # noqa: B901
@skippable(pop=["foo"])
@ -33,7 +33,7 @@ class PopFoo(nn.Module):
class StashBar(nn.Module):
def forward(self, input):
yield stash("bar", input)
return input # noqa
return input # noqa: B901
@skippable(pop=["bar"])

View File

@ -17,7 +17,7 @@ from torch.distributed.pipeline.sync.skip.tracker import current_skip_tracker
class Stash(nn.Module):
def forward(self, input):
yield stash("skip", input)
return input # noqa
return input # noqa: B901
@skippable(pop=["skip"])

View File

@ -24,7 +24,7 @@ def test_stash(skip_tracker):
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
return input * 2 # noqa: B901
l1 = Stash()
@ -41,13 +41,13 @@ def test_pop():
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
return input * 2 # noqa: B901
@skippable(pop=["foo"])
class Pop(nn.Module):
def forward(self, input):
foo = yield pop("foo")
return foo # noqa
return foo
l1 = Stash()
l2 = Pop()
@ -83,7 +83,7 @@ def test_stash_not_declared():
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
return input * 2 # noqa: B901
l1 = Stash()
@ -96,13 +96,13 @@ def test_pop_not_declared():
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
return input * 2 # noqa: B901
@skippable()
class Pop(nn.Module):
def forward(self, input):
foo = yield pop("foo")
return foo # noqa
return foo
l1 = Stash()
l2 = Pop()
@ -130,7 +130,7 @@ def test_stash_none():
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", None)
return input * 2 # noqa
return input * 2 # noqa: B901
l1 = Stash()
l1(torch.tensor(42))

View File

@ -40,7 +40,7 @@ def test_default_skip_tracker_by_data_parallel():
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
return input * 2 # noqa: B901
@skippable(pop=["foo"])
class Pop(nn.Module):

View File

@ -12,7 +12,7 @@ def test_clock_cycles():
assert list(_clock_cycles(1, 3)) == [[(0, 0)], [(0, 1)], [(0, 2)]]
assert list(_clock_cycles(3, 1)) == [[(0, 0)], [(1, 0)], [(2, 0)]]
assert list(_clock_cycles(3, 3)) == [ # noqa
assert list(_clock_cycles(3, 3)) == [
[(0, 0)],
[(1, 0), (0, 1)],
[(2, 0), (1, 1), (0, 2)],
@ -20,7 +20,7 @@ def test_clock_cycles():
[(2, 2)],
]
assert list(_clock_cycles(4, 2)) == [ # noqa
assert list(_clock_cycles(4, 2)) == [
[(0, 0)],
[(1, 0), (0, 1)],
[(2, 0), (1, 1)],

View File

@ -49,6 +49,7 @@ from torch.testing._internal.common_distributed import (
create_device,
with_dist_debug_levels,
with_nccl_blocking_wait,
create_tcp_store,
)
from torch.testing._internal.common_utils import (
TestCase,
@ -299,27 +300,9 @@ class PrefixFileStoreTest(TestCase, StoreTestBase):
def _create_store(self):
return c10d.PrefixStore(self.prefix, self.filestore)
def create_tcp_store(addr, world_size=1, wait_for_workers=True):
"""
Creates a TCP store. Retries if the chosen port is already in use.
"""
ports = []
for _ in range(10):
try:
port = common.find_free_port()
ports.append(port)
return c10d.TCPStore(addr, port, world_size, True, wait_for_workers=wait_for_workers)
except RuntimeError as error:
if str(error) == "Address already in use":
continue
raise
raise RuntimeError("Unable to find free port (tried %s)" % ", ".join(ports))
class TCPStoreTest(TestCase, StoreTestBase):
def _create_store(self):
store = create_tcp_store("localhost")
store = create_tcp_store()
store.set_timeout(timedelta(seconds=300))
return store
@ -329,7 +312,7 @@ class TCPStoreTest(TestCase, StoreTestBase):
else:
err_msg_reg = "^Address already in use$"
with self.assertRaisesRegex(RuntimeError, err_msg_reg):
addr = "localhost"
addr = DEFAULT_HOSTNAME
port = common.find_free_port()
# Use noqa to silence flake8.
@ -418,7 +401,7 @@ class TCPStoreTest(TestCase, StoreTestBase):
class PrefixTCPStoreTest(TestCase, StoreTestBase):
def setUp(self):
super(PrefixTCPStoreTest, self).setUp()
self.tcpstore = create_tcp_store("localhost")
self.tcpstore = create_tcp_store()
self.prefix = "test_prefix"
self.tcpstore.set_timeout(timedelta(seconds=300))
@ -652,7 +635,7 @@ class RendezvousFileTest(TestCase):
@skip_if_win32()
class RendezvousTCPTest(TestCase):
def create_tcp_url(self):
addr = "localhost"
addr = DEFAULT_HOSTNAME
port = common.find_free_port()
url = "tcp://%s:%d?world_size=%d" % (addr, port, 1)
return url

Some files were not shown because too many files have changed in this diff.