diff --git a/.bazelrc b/.bazelrc index 310eb293389d..67b5bfe76b92 100644 --- a/.bazelrc +++ b/.bazelrc @@ -1,6 +1,7 @@ build --copt=--std=c++14 build --copt=-I. build --copt=-isystem --copt bazel-out/k8-fastbuild/bin +build --experimental_ui_max_stdouterr_bytes=2048576 # Configuration to disable tty features for environments like CI build:no-tty --curses no @@ -11,3 +12,8 @@ build:no-tty --show_progress_rate_limit 10 build:gpu --define=cuda=true # define a separate build folder for faster switching between configs build:gpu --platform_suffix=-gpu +# rules_cuda configuration +build:gpu --@rules_cuda//cuda:enable_cuda +build:gpu --@rules_cuda//cuda:cuda_targets=sm_52 +build:gpu --@rules_cuda//cuda:compiler=nvcc +build:gpu --repo_env=CUDA_PATH=/usr/local/cuda diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 1899429e0bb4..7d428014cd79 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1 +1 @@ -Fixes #{issue number} +Fixes #ISSUE_NUMBER diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index f822adf6c4b5..6439e1c0416f 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -20,13 +20,13 @@ "linux-docs-push", "linux-vulkan-bionic-py3.6-clang9", "linux-xenial-cuda11.3-py3.6-gcc7", + "linux-xenial-cuda11.3-py3.6-gcc7-bazel-test", "linux-xenial-py3-clang5-mobile-build", "linux-xenial-py3-clang5-mobile-custom-build-static", "linux-xenial-py3.6-clang7-asan", "linux-xenial-py3.6-clang7-onnx", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7", - "linux-xenial-py3.6-gcc7-bazel-test", "macos-10-15-py3-arm64", "macos-10-15-py3-lite-interpreter-x86-64", "macos-11-py3-x86-64", @@ -48,7 +48,7 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" ], "ciflow/bazel": [ - "linux-xenial-py3.6-gcc7-bazel-test" + "linux-xenial-cuda11.3-py3.6-gcc7-bazel-test" ], "ciflow/cpu": [ 
"caffe2-linux-xenial-py3.6-gcc5.4", @@ -56,11 +56,11 @@ "linux-docs", "linux-docs-push", "linux-vulkan-bionic-py3.6-clang9", + "linux-xenial-cuda11.3-py3.6-gcc7-bazel-test", "linux-xenial-py3.6-clang7-asan", "linux-xenial-py3.6-clang7-onnx", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7", - "linux-xenial-py3.6-gcc7-bazel-test", "parallelnative-linux-xenial-py3.6-gcc5.4", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", @@ -85,13 +85,13 @@ "linux-docs", "linux-vulkan-bionic-py3.6-clang9", "linux-xenial-cuda11.3-py3.6-gcc7", + "linux-xenial-cuda11.3-py3.6-gcc7-bazel-test", "linux-xenial-py3-clang5-mobile-build", "linux-xenial-py3-clang5-mobile-custom-build-static", "linux-xenial-py3.6-clang7-asan", "linux-xenial-py3.6-clang7-onnx", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7", - "linux-xenial-py3.6-gcc7-bazel-test", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", "win-vs2019-cpu-py3", @@ -126,13 +126,13 @@ "linux-docs-push", "linux-vulkan-bionic-py3.6-clang9", "linux-xenial-cuda11.3-py3.6-gcc7", + "linux-xenial-cuda11.3-py3.6-gcc7-bazel-test", "linux-xenial-py3-clang5-mobile-build", "linux-xenial-py3-clang5-mobile-custom-build-static", "linux-xenial-py3.6-clang7-asan", "linux-xenial-py3.6-clang7-onnx", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7", - "linux-xenial-py3.6-gcc7-bazel-test", "parallelnative-linux-xenial-py3.6-gcc5.4", "periodic-libtorch-linux-bionic-cuda11.5-py3.6-gcc7", "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", @@ -185,6 +185,40 @@ "ciflow/slow-gradcheck": [ "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck" ], + "ciflow/trunk": [ + "caffe2-linux-xenial-py3.6-gcc5.4", + "docker-builds", + "ios-12-5-1-arm64", + "ios-12-5-1-arm64-coreml", + "ios-12-5-1-arm64-custom-ops", + 
"ios-12-5-1-arm64-full-jit", + "ios-12-5-1-arm64-metal", + "ios-12-5-1-x86-64", + "ios-12-5-1-x86-64-coreml", + "ios-12-5-1-x86-64-full-jit", + "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", + "libtorch-linux-xenial-cuda11.3-py3.6-gcc7", + "linux-bionic-cuda10.2-py3.9-gcc7", + "linux-bionic-py3.6-clang9", + "linux-docs", + "linux-vulkan-bionic-py3.6-clang9", + "linux-xenial-cuda11.3-py3.6-gcc7", + "linux-xenial-cuda11.3-py3.6-gcc7-bazel-test", + "linux-xenial-py3-clang5-mobile-build", + "linux-xenial-py3-clang5-mobile-custom-build-static", + "linux-xenial-py3.6-clang7-asan", + "linux-xenial-py3.6-clang7-onnx", + "linux-xenial-py3.6-gcc5.4", + "linux-xenial-py3.6-gcc7", + "macos-10-15-py3-arm64", + "macos-10-15-py3-lite-interpreter-x86-64", + "macos-11-py3-x86-64", + "parallelnative-linux-xenial-py3.6-gcc5.4", + "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", + "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", + "win-vs2019-cpu-py3", + "win-vs2019-cuda11.3-py3" + ], "ciflow/vulkan": [ "linux-vulkan-bionic-py3.6-clang9" ], diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index fe9633623547..0015c455c222 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -72,6 +72,7 @@ LABEL_CIFLOW_SLOW_GRADCHECK = "ciflow/slow-gradcheck" LABEL_CIFLOW_DOCKER = "ciflow/docker" LABEL_CIFLOW_IOS = "ciflow/ios" LABEL_CIFLOW_MACOS = "ciflow/macos" +LABEL_CIFLOW_TRUNK = "ciflow/trunk" @dataclass @@ -114,6 +115,8 @@ class CIFlowConfig: def __post_init__(self) -> None: self.labels.add(LABEL_CIFLOW_ALL) + if LABEL_CIFLOW_SCHEDULED not in self.labels: + self.labels.add(LABEL_CIFLOW_TRUNK) assert all(label.startswith(LABEL_CIFLOW_PREFIX) for label in self.labels) self.gen_root_job_condition() @@ -224,6 +227,7 @@ class CIWorkflow: assert LABEL_CIFLOW_CPU in self.ciflow_config.labels if self.is_scheduled: assert LABEL_CIFLOW_DEFAULT 
not in self.ciflow_config.labels + assert LABEL_CIFLOW_TRUNK not in self.ciflow_config.labels assert LABEL_CIFLOW_SCHEDULED in self.ciflow_config.labels if self.build_with_debug: assert self.build_environment.endswith("-debug") @@ -605,8 +609,8 @@ ANDROID_WORKFLOWS = [ BAZEL_WORKFLOWS = [ CIWorkflow( arch="linux", - build_environment="linux-xenial-py3.6-gcc7-bazel-test", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7", + build_environment="linux-xenial-cuda11.3-py3.6-gcc7-bazel-test", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", test_runner_type=LINUX_CPU_TEST_RUNNER, ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BAZEL, LABEL_CIFLOW_CPU, LABEL_CIFLOW_LINUX}, diff --git a/.github/workflows/generated-caffe2-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-caffe2-linux-xenial-py3.6-gcc5.4.yml index 81d7bdda452e..4427f56db6d1 100644 --- a/.github/workflows/generated-caffe2-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-caffe2-linux-xenial-py3.6-gcc5.4.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') 
|| - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml b/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml index 6b5140ebf9f4..d1b105b8f608 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml b/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml index 1fcb3f62548f..38a640b88700 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml 
b/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml index 743fb50d30d6..e96213f6859c 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-arm64-metal.yml b/.github/workflows/generated-ios-12-5-1-arm64-metal.yml index e3523bac9929..c9ebc182e5ca 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-metal.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-metal.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ 
(github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-arm64.yml b/.github/workflows/generated-ios-12-5-1-arm64.yml index 642edaef8e12..5b1ce93aa6e0 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ 
contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml b/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml index 3c797272bc63..7de34ee9f9fe 100644 --- a/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml +++ b/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ 
toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml b/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml index 49c5c5a9d89f..c9b77d9c187b 100644 --- a/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml +++ b/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-x86-64.yml b/.github/workflows/generated-ios-12-5-1-x86-64.yml index ab896312adf7..a32aae9b53ff 100644 --- a/.github/workflows/generated-ios-12-5-1-x86-64.yml +++ b/.github/workflows/generated-ios-12-5-1-x86-64.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || 
(false)) }} steps: diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml index 1d8a6d436bc9..cccbfe6aac48 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml index ca1eacb76d1a..e5b19d94f39b 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index ed6088c55a36..6182c692a4e8 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository_owner == 'pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/slow') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml index 24ea32c45df4..5817687c0d59 100644 --- a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml +++ b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/noarch') || contains(github.event.pull_request.labels.*.name, 'ciflow/xla') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/noarch') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/xla') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/noarch') || contains(github.event.pull_request.labels.*.name, 'ciflow/xla')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/noarch') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/xla')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-docs.yml b/.github/workflows/generated-linux-docs.yml index a9dcc1a5d009..0fa49205cc59 100644 --- a/.github/workflows/generated-linux-docs.yml +++ b/.github/workflows/generated-linux-docs.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/docs') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/docs') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/docs') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/docs') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml b/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml index a68668acffa8..178a3bad0af7 100644 --- a/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml +++ b/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/vulkan') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/vulkan') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/vulkan')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/vulkan')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7-bazel-test.yml similarity index 95% rename from .github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml rename to .github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7-bazel-test.yml index ac02841fd832..515dc274ab7e 
100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7-bazel-test.yml @@ -1,7 +1,7 @@ # @generated DO NOT EDIT MANUALLY # Template is at: .github/templates/bazel_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3.6-gcc7-bazel-test +name: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test on: pull_request: @@ -14,8 +14,8 @@ on: workflow_dispatch: env: - BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc7-bazel-test - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7 + BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 @@ -33,7 +33,7 @@ env: SHA1: ${{ github.event.pull_request.head.sha || github.sha }} PYTORCH_RETRY_TEST_CASES: 1 concurrency: - group: linux-xenial-py3.6-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + group: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} 
+ LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: @@ -62,7 +62,7 @@ jobs: runs-on: linux.2xlarge needs: [ciflow_should_run] env: - JOB_BASE_NAME: linux-xenial-py3.6-gcc7-bazel-test-build-and-test + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test-build-and-test NUM_TEST_SHARDS: 1 steps: - name: Display EC2 information @@ -302,7 +302,7 @@ jobs: env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.6-gcc7-bazel-test-test + 
JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test-test PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} TAG: ${{ steps.parse-ref.outputs.tag }} diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 5d875c6d8e71..4eb048f23e70 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml index 5ee190174e03..89fc5c17e984 100644 --- a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml +++ b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/mobile')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml index c4b46ddd24e7..6d7fcaed6b86 100644 --- a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml +++ b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 
'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml b/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml index cefe299d7562..c92271b2592c 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/sanitizers') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/sanitizers') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/sanitizers')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/sanitizers') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml b/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml index c3fa2c75dfbc..e4177958d3dd 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/onnx') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/onnx') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/onnx')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/onnx') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index a6680d0fd49e..375dcbcdbf24 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -43,12 +43,12 @@ 
jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository_owner == 'pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7.yml index 40a1b900d9f2..94d84a20b191 100644 --- 
a/.github/workflows/generated-linux-xenial-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository_owner == 'pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-macos-10-15-py3-arm64.yml 
b/.github/workflows/generated-macos-10-15-py3-arm64.yml index 3fa0f0cacf72..64108c2da72c 100644 --- a/.github/workflows/generated-macos-10-15-py3-arm64.yml +++ b/.github/workflows/generated-macos-10-15-py3-arm64.yml @@ -32,12 +32,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml b/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml index c54322afefa7..d98e6874da61 100644 --- a/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml +++ b/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml @@ -34,12 +34,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 
'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-macos-11-py3-x86-64.yml b/.github/workflows/generated-macos-11-py3-x86-64.yml index 1ef99b1f0935..23542a870ca4 100644 --- a/.github/workflows/generated-macos-11-py3-x86-64.yml +++ b/.github/workflows/generated-macos-11-py3-x86-64.yml @@ -34,12 +34,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + 
(contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml index a7faee91c0d9..4ba6bf59a999 100644 --- a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git 
a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml index b92817a2dfb8..93f08e024bc4 100644 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml +++ b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml index 010c1d8d3167..faa935a61b65 100644 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml +++ b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index d9f064b46bd4..5392555ff08e 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -47,12 +47,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win') }} + 
LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/win') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository_owner == 'pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index e38c713d0e54..3bb47d35fa94 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -48,12 +48,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/win') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository_owner == 'pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 813a9710873b..49c41a831bed 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -207,11 +207,10 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then get_bazel - # first build the whole torch for CPU-only + # first build torch for CPU-only tools/bazel build --config=no-tty :torch - # then build selected set of targets with GPU-support. 
- # TODO: eventually this should converge to building the whole :torch with GPU-support - tools/bazel build --config=no-tty --config=gpu //c10 + # then build everything with CUDA + tools/bazel build --config=no-tty --config=gpu :all else # check that setup.py would fail with bad arguments echo "The next three invocations are expected to fail with invalid command error messages." diff --git a/BUILD.bazel b/BUILD.bazel index 23fe73cef91e..b65d77b882b8 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -3,7 +3,7 @@ load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") load("@rules_proto//proto:defs.bzl", "proto_library") load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_proto_library", "cc_test") load("//third_party:substitution.bzl", "header_template_rule") -load("//:tools/build_variables.bzl", "torch_cpp_srcs", "libtorch_python_core_sources", "libtorch_core_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "jit_core_sources") +load("//:tools/build_variables.bzl", "jit_core_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_nvfuser_generated_headers", "libtorch_nvfuser_runtime_sources", "libtorch_python_core_sources", "torch_cpp_srcs") load("//tools/rules:cu.bzl", "cu_library") load("//tools/config:defs.bzl", "if_cuda") load("//:aten.bzl", "intern_build_aten_ops", "generate_aten") @@ -15,6 +15,7 @@ COMMON_COPTS = [ "-DHAVE_SHM_UNLINK=1", "-D_FILE_OFFSET_BITS=64", "-DHAVE_GCC_GET_CPUID", + "-DTH_BLAS_MKL", "-DUSE_GCC_GET_CPUID", "-DTH_HAVE_THREAD", "-DUSE_FBGEMM", @@ -37,11 +38,11 @@ py_binary( ], ) +aten_generation_srcs = ["aten/src/ATen/native/native_functions.yaml"] + glob(["aten/src/ATen/templates/**"]) + generate_aten( name = "generated_cpp", - srcs = [ - "aten/src/ATen/native/native_functions.yaml", - ] + glob(["aten/src/ATen/templates/**"]), + srcs = aten_generation_srcs, outs = [ "aten/src/ATen/Declarations.yaml", 
"aten/src/ATen/RegisterBackendSelect.cpp", @@ -62,8 +63,6 @@ generate_aten( "aten/src/ATen/RegisterSchema.cpp", "aten/src/ATen/CPUFunctions.h", "aten/src/ATen/CPUFunctions_inl.h", - "aten/src/ATen/CUDAFunctions.h", - "aten/src/ATen/CUDAFunctions_inl.h", "aten/src/ATen/CompositeExplicitAutogradFunctions.h", "aten/src/ATen/CompositeExplicitAutogradFunctions_inl.h", "aten/src/ATen/CompositeImplicitAutogradFunctions.h", @@ -82,6 +81,8 @@ generate_aten( "aten/src/ATen/MetaFunctions.h", "aten/src/ATen/MetaFunctions_inl.h", "aten/src/ATen/MethodOperators.h", + "aten/src/ATen/NativeMetaFunctions.h", + "aten/src/ATen/RegistrationDeclarations.h", "aten/src/ATen/core/TensorBody.h", "aten/src/ATen/core/TensorMethods.cpp", "aten/src/ATen/core/ATenOpList.cpp", @@ -89,6 +90,23 @@ generate_aten( generator=":gen", ) +# this hack is due to https://github.com/bazelbuild/bazel/issues/281 +# since `outs` cannot be configured with if_cuda, we rerun the same command and declare cuda related files separately here. +genrule( + name = "generated_cuda_cpp", + srcs = aten_generation_srcs, + outs = [ + "aten/src/ATen/CUDAFunctions.h", + "aten/src/ATen/CUDAFunctions_inl.h", + "aten/src/ATen/RegisterCUDA.cpp", + "aten/src/ATen/RegisterQuantizedCUDA.cpp", + "aten/src/ATen/RegisterSparseCUDA.cpp", + "aten/src/ATen/RegisterSparseCsrCUDA.cpp", + ], + cmd = "$(location :gen) --source-path `dirname $(location aten/src/ATen/native/native_functions.yaml)`/.. 
--install_dir `dirname $(location aten/src/ATen/RegisterCUDA.cpp)`", + tools = [":gen"], +) + py_library( name = "tools_codegen", srcs = glob(["tools/codegen/**/*.py"]), @@ -230,7 +248,7 @@ filegroup( filegroup( name = "aten_native_mkl_cpp", - srcs = glob(["aten/src/ATen/native/mkl/*.cpp"]), + srcs = glob(["aten/src/ATen/native/mkl/*.cpp", "aten/src/ATen/mkl/*.cpp"]), ) filegroup( @@ -266,135 +284,40 @@ filegroup( ) filegroup( - name = "aten_cuda_srcs", - srcs = [ - "aten/src/ATen/cuda/CUDABlas.cpp", - "aten/src/ATen/cuda/CUDASolver.cpp", - "aten/src/ATen/cuda/CUDAContext.cpp", - "aten/src/ATen/cuda/CUDAGeneratorImpl.cpp", - "aten/src/ATen/cuda/CUDAGraph.cpp", - "aten/src/ATen/cuda/CuSparseHandlePool.cpp", - "aten/src/ATen/cuda/CublasHandlePool.cpp", - "aten/src/ATen/cuda/CusolverDnHandlePool.cpp", - "aten/src/ATen/cuda/PinnedMemoryAllocator.cpp", - "aten/src/ATen/cuda/detail/CUDAHooks.cpp", - "aten/src/ATen/cudnn/AutocastRNN.cpp", - "aten/src/ATen/cudnn/Descriptors.cpp", - "aten/src/ATen/cudnn/Handle.cpp", - "aten/src/ATen/cudnn/Types.cpp", - "aten/src/ATen/native/cuda/CUDAUnaryOps.cpp", - "aten/src/ATen/native/cuda/TensorShapeCUDA.cpp", - "aten/src/ATen/native/cudnn/AffineGridGenerator.cpp", - "aten/src/ATen/native/cudnn/BatchNorm.cpp", - "aten/src/ATen/native/cudnn/Conv.cpp", - "aten/src/ATen/native/cudnn/GridSampler.cpp", - "aten/src/ATen/native/cudnn/LossCTC.cpp", - "aten/src/ATen/native/cudnn/RNN.cpp", - "aten/src/ATen/native/miopen/BatchNorm_miopen.cpp", - "aten/src/ATen/native/miopen/Conv_miopen.cpp", - "aten/src/ATen/native/miopen/RNN_miopen.cpp", - "aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp", - "aten/src/ATen/native/sparse/cuda/SparseBlas.cpp", - "aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp", - ], + name = "aten_cuda_cpp_srcs", + srcs = glob( + [ + "aten/src/ATen/cuda/*.cpp", + "aten/src/ATen/cuda/detail/*.cpp", + "aten/src/ATen/cudnn/*.cpp", + "aten/src/ATen/native/cuda/*.cpp", + "aten/src/ATen/native/cudnn/*.cpp", + 
"aten/src/ATen/native/miopen/*.cpp", + "aten/src/ATen/native/sparse/cuda/*.cpp", + "aten/src/THC/*.cpp", + ], + ), ) filegroup( - name = "aten_srcs_cu", - srcs = [ - "aten/src/ATen/cuda/cub.cu.cc", - "aten/src/ATen/cuda/detail/IndexUtils.cu.cc", - "aten/src/ATen/cuda/detail/CUDAGraphsUtils.cu.cc", - "aten/src/ATen/native/cuda/Activation.cu.cc", - "aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu.cc", - "aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu.cc", - "aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu.cc", - "aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu.cc", - "aten/src/ATen/native/cuda/AveragePool2d.cu.cc", - "aten/src/ATen/native/cuda/AveragePool3d.cu.cc", - "aten/src/ATen/native/cuda/BatchLinearAlgebra.cu.cc", - "aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu.cc", - "aten/src/ATen/native/cuda/BinaryArithmeticKernel.cu.cc", - "aten/src/ATen/native/cuda/BinaryCompareKernel.cu.cc", - "aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu.cc", - "aten/src/ATen/native/cuda/CUDAScalar.cu.cc", - "aten/src/ATen/native/cuda/Col2Im.cu.cc", - "aten/src/ATen/native/cuda/Copy.cu.cc", - "aten/src/ATen/native/cuda/CrossKernel.cu.cc", - "aten/src/ATen/native/cuda/DilatedMaxPool2d.cu.cc", - "aten/src/ATen/native/cuda/DilatedMaxPool3d.cu.cc", - "aten/src/ATen/native/cuda/DistanceKernel.cu.cc", - "aten/src/ATen/native/cuda/Distributions.cu.cc", - "aten/src/ATen/native/cuda/Dropout.cu.cc", - "aten/src/ATen/native/cuda/Embedding.cu.cc", - "aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu.cc", - "aten/src/ATen/native/cuda/EmbeddingBag.cu.cc", - "aten/src/ATen/native/cuda/FillKernel.cu.cc", - "aten/src/ATen/native/cuda/FractionalMaxPool2d.cu.cc", - "aten/src/ATen/native/cuda/FractionalMaxPool3d.cu.cc", - "aten/src/ATen/native/cuda/GridSampler.cu.cc", - "aten/src/ATen/native/cuda/Im2Col.cu.cc", - "aten/src/ATen/native/cuda/IndexKernel.cu.cc", - "aten/src/ATen/native/cuda/Indexing.cu.cc", - "aten/src/ATen/native/cuda/Lerp.cu.cc", - 
"aten/src/ATen/native/cuda/LinearAlgebra.cu.cc", - "aten/src/ATen/native/cuda/Loss.cu.cc", - "aten/src/ATen/native/cuda/LossCTC.cu.cc", - "aten/src/ATen/native/cuda/MaxUnpooling.cu.cc", - "aten/src/ATen/native/cuda/MultinomialKernel.cu.cc", - "aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu.cc", - "aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu.cc", - "aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu.cc", - "aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu.cc", - "aten/src/ATen/native/cuda/NLLLoss2d.cu.cc", - "aten/src/ATen/native/cuda/Normalization.cu.cc", - "aten/src/ATen/native/cuda/PointwiseOpsKernel.cu.cc", - "aten/src/ATen/native/cuda/PowKernel.cu.cc", - "aten/src/ATen/native/cuda/RNN.cu.cc", - "aten/src/ATen/native/cuda/RangeFactories.cu.cc", - "aten/src/ATen/native/cuda/Reduce.cu.cc", - "aten/src/ATen/native/cuda/ReduceOpsKernel.cu.cc", - "aten/src/ATen/native/cuda/ReflectionPad.cu.cc", - "aten/src/ATen/native/cuda/Repeat.cu.cc", - "aten/src/ATen/native/cuda/ReplicationPadding.cu.cc", - "aten/src/ATen/native/cuda/Resize.cu.cc", - "aten/src/ATen/native/cuda/SegmentReduce.cu.cc", - "aten/src/ATen/native/cuda/SoftMax.cu.cc", - "aten/src/ATen/native/cuda/SortingKthValue.cu.cc", - "aten/src/ATen/native/cuda/SparseMM.cu.cc", - "aten/src/ATen/native/cuda/SpectralOps.cu.cc", - "aten/src/ATen/native/cuda/SummaryOps.cu.cc", - "aten/src/ATen/native/cuda/TensorCompare.cu.cc", - "aten/src/ATen/native/cuda/TensorFactories.cu.cc", - "aten/src/ATen/native/cuda/TensorTopK.cu.cc", - "aten/src/ATen/native/cuda/TensorTransformations.cu.cc", - "aten/src/ATen/native/cuda/TriangularOps.cu.cc", - "aten/src/ATen/native/cuda/UnaryOpsKernel.cu.cc", - "aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu.cc", - "aten/src/ATen/native/cuda/Unique.cu.cc", - "aten/src/ATen/native/cuda/UpSampleBicubic2d.cu.cc", - "aten/src/ATen/native/cuda/UpSampleBilinear2d.cu.cc", - "aten/src/ATen/native/cuda/UpSampleLinear1d.cu.cc", - 
"aten/src/ATen/native/cuda/UpSampleNearest1d.cu.cc", - "aten/src/ATen/native/cuda/UpSampleNearest2d.cu.cc", - "aten/src/ATen/native/cuda/UpSampleNearest3d.cu.cc", - "aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu.cc", - "aten/src/ATen/native/cuda/WeightNorm.cu.cc", - "aten/src/ATen/native/cuda/layer_norm_kernel.cu.cc", - "aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu.cc", - "aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu.cc", - "aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu.cc", - "aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu.cc", - ], + name = "aten_cu_srcs", + srcs = glob([ + "aten/src/ATen/cuda/*.cu", + "aten/src/ATen/cuda/detail/*.cu", + "aten/src/ATen/native/cuda/*.cu", + "aten/src/ATen/native/quantized/cuda/*.cu", + "aten/src/ATen/native/sparse/cuda/*.cu", + ]), ) header_template_rule( name = "aten_src_ATen_config", src = "aten/src/ATen/Config.h.in", out = "aten/src/ATen/Config.h", + include = "aten/src", substitutions = { "@AT_MKLDNN_ENABLED@": "1", - "@AT_MKL_ENABLED@": "0", + "@AT_MKL_ENABLED@": "1", "@AT_FFTW_ENABLED@": "0", "@AT_POCKETFFT_ENABLED@": "0", "@AT_NNPACK_ENABLED@": "0", @@ -413,6 +336,7 @@ header_template_rule( name = "aten_src_ATen_cuda_config", src = "aten/src/ATen/cuda/CUDAConfig.h.in", out = "aten/src/ATen/cuda/CUDAConfig.h", + include = "aten/src", substitutions = { "@AT_CUDNN_ENABLED@": "1", "@AT_ROCM_ENABLED@": "0", @@ -429,18 +353,19 @@ cc_library( ] + glob([ "aten/src/**/*.h", "aten/src/**/*.hpp", + "aten/src/ATen/cuda/**/*.cuh", + "aten/src/ATen/native/**/*.cuh", "aten/src/TH/**/*.cpp", "aten/src/THC/*.cuh", + "aten/src/THC/generic/*.cu", ], - exclude = [ - "aten/src/ATen/Config.h", - ],) + [ - ":generated_cpp", + ) + [ ":aten_src_ATen_config", + ":generated_cpp", + ":generated_cuda_cpp", ], includes = [ "aten/src", - "aten/src/TH", ], deps = [ "//c10:headers", @@ -464,6 +389,7 @@ intern_build_aten_ops( ":aten_headers", "@sleef", "@fbgemm", + "@mkl", ], ) @@ -530,12 +456,17 @@ cc_binary( 
cc_library( name = "aten_cuda_cpp", - srcs = [":aten_cuda_srcs"], + srcs = [ + ":aten_cuda_cpp_srcs", + ":generated_cuda_cpp", + ], + hdrs = [":aten_src_ATen_cuda_config"], copts = ATEN_COPTS, visibility = ["//visibility:public"], deps = [ ":aten", "@cuda", + "@cuda//:cusolver", "@cuda//:nvrtc", "@cudnn", ], @@ -552,9 +483,7 @@ torch_cuda_half_options = [ cu_library( name = "aten_cuda", - srcs = [ - ":aten_srcs_cu", - ], + srcs = [":aten_cu_srcs"], copts = ATEN_COPTS + torch_cuda_half_options, visibility = ["//visibility:public"], deps = [ @@ -618,6 +547,7 @@ header_template_rule( filegroup( name = "caffe2_contrib_srcs", srcs = [ + "caffe2/contrib/aten/aten_op.cc", "caffe2/contrib/gloo/allgather_ops.cc", "caffe2/contrib/gloo/allreduce_ops.cc", "caffe2/contrib/gloo/barrier_ops.cc", @@ -787,6 +717,7 @@ filegroup( "caffe2/operators/conv_op_eigen.cc", "caffe2/operators/conv_op_shared.cc", "caffe2/operators/conv_transpose_gradient_op.cc", + "caffe2/operators/conv_transpose_op.cc", "caffe2/operators/conv_transpose_op_mobile.cc", "caffe2/operators/copy_op.cc", "caffe2/operators/copy_rows_to_tensor_op.cc", @@ -1182,7 +1113,7 @@ filegroup( ) filegroup( - name = "caffe2_cuda_srcs", + name = "caffe2_cuda_cpp_srcs", srcs = [ "caffe2/contrib/aten/aten_op_gpu.cc", "caffe2/contrib/gloo/allreduce_ops_gpu.cc", @@ -1251,155 +1182,155 @@ filegroup( filegroup( name = "caffe2_cu_srcs", srcs = [ - "caffe2/core/context_gpu.cu.cc", - "caffe2/operators/abs_op.cu.cc", - "caffe2/operators/accumulate_op.cu.cc", - "caffe2/operators/accuracy_op.cu.cc", - "caffe2/operators/acos_op.cu.cc", - "caffe2/operators/affine_channel_op.cu.cc", - "caffe2/operators/alias_with_name.cu.cc", - "caffe2/operators/arg_ops.cu.cc", - "caffe2/operators/asin_op.cu.cc", - "caffe2/operators/assert_op.cu.cc", - "caffe2/operators/atan_op.cu.cc", - "caffe2/operators/batch_gather_ops.cu.cc", - "caffe2/operators/batch_matmul_op.cu.cc", - "caffe2/operators/batch_moments_op.cu.cc", - 
"caffe2/operators/batch_permutation_op.cu.cc", - "caffe2/operators/batch_sparse_to_dense_op.cu.cc", - "caffe2/operators/boolean_mask_ops.cu.cc", - "caffe2/operators/boolean_unmask_ops.cu.cc", - "caffe2/operators/bucketize_op.cu.cc", - "caffe2/operators/cast_op.cu.cc", - "caffe2/operators/cbrt_op.cu.cc", - "caffe2/operators/ceil_op.cu.cc", - "caffe2/operators/channel_backprop_stats_op.cu.cc", - "caffe2/operators/channel_shuffle_op.cu.cc", - "caffe2/operators/channel_stats_op.cu.cc", - "caffe2/operators/channelwise_conv3d_op_cudnn.cu.cc", - "caffe2/operators/clip_op.cu.cc", - "caffe2/operators/copy_op.cu.cc", - "caffe2/operators/cos_op.cu.cc", - "caffe2/operators/cosh_op.cu.cc", - "caffe2/operators/cosine_embedding_criterion_op.cu.cc", - "caffe2/operators/cross_entropy_op.cu.cc", - "caffe2/operators/cube_op.cu.cc", - "caffe2/operators/data_couple_gpu.cu.cc", - "caffe2/operators/deform_conv_op.cu.cc", - "caffe2/operators/depthwise_3x3_conv_op_cudnn.cu.cc", - "caffe2/operators/distance_op.cu.cc", - "caffe2/operators/dropout_op.cu.cc", - "caffe2/operators/elementwise_div_op.cu.cc", - "caffe2/operators/elementwise_linear_op.cu.cc", - "caffe2/operators/elementwise_mul_op.cu.cc", - "caffe2/operators/elementwise_ops.cu.cc", - "caffe2/operators/elu_op.cu.cc", - "caffe2/operators/enforce_finite_op.cu.cc", - "caffe2/operators/ensure_cpu_output_op.cu.cc", - "caffe2/operators/erf_op.cu.cc", - "caffe2/operators/filler_op.cu.cc", - "caffe2/operators/find_op.cu.cc", - "caffe2/operators/floor_op.cu.cc", - "caffe2/operators/gather_op.cu.cc", - "caffe2/operators/gelu_op.cu.cc", - "caffe2/operators/generate_proposals_op.cu.cc", - "caffe2/operators/generate_proposals_op_util_nms_gpu.cu.cc", - "caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cu.cc", - "caffe2/operators/given_tensor_fill_op.cu.cc", - "caffe2/operators/glu_op.cu.cc", - "caffe2/operators/group_norm_op.cu.cc", - "caffe2/operators/gru_unit_op_gpu.cu.cc", - "caffe2/operators/half_float_ops.cu.cc", - 
"caffe2/operators/hard_sigmoid_op.cu.cc", - "caffe2/operators/instance_norm_op.cu.cc", - "caffe2/operators/integral_image_op.cu.cc", - "caffe2/operators/layer_norm_op.cu.cc", - "caffe2/operators/leaky_relu_op.cu.cc", - "caffe2/operators/lengths_pad_op.cu.cc", - "caffe2/operators/lengths_tile_op.cu.cc", - "caffe2/operators/local_response_normalization_op.cu.cc", - "caffe2/operators/logit_op.cu.cc", - "caffe2/operators/loss_op.cu.cc", - "caffe2/operators/lp_pool_op.cu.cc", - "caffe2/operators/lstm_unit_op_gpu.cu.cc", - "caffe2/operators/margin_ranking_criterion_op.cu.cc", - "caffe2/operators/max_pool_with_index.cu.cc", - "caffe2/operators/mean_op.cu.cc", - "caffe2/operators/mem_query_op.cu.cc", - "caffe2/operators/minmax_ops.cu.cc", - "caffe2/operators/moments_op.cu.cc", - "caffe2/operators/multi_class_accuracy_op.cu.cc", - "caffe2/operators/normalize_ops.cu.cc", - "caffe2/operators/one_hot_ops.cu.cc", - "caffe2/operators/pack_segments.cu.cc", - "caffe2/operators/pad_op_gpu.cu.cc", - "caffe2/operators/perplexity_op.cu.cc", - "caffe2/operators/piecewise_linear_transform_op.cu.cc", - "caffe2/operators/pool_op.cu.cc", - "caffe2/operators/pow_op.cu.cc", - "caffe2/operators/prelu_op.cu.cc", - "caffe2/operators/reciprocal_op.cu.cc", - "caffe2/operators/reduce_front_back_max_ops.cu.cc", - "caffe2/operators/reduce_front_back_sum_mean_ops.cu.cc", - "caffe2/operators/reduce_ops.cu.cc", - "caffe2/operators/reduction_ops.cu.cc", - "caffe2/operators/relu_n_op.cu.cc", - "caffe2/operators/relu_op.cu.cc", - "caffe2/operators/replace_nan_op.cu.cc", - "caffe2/operators/resize_3d_op.cu.cc", - "caffe2/operators/resize_op.cu.cc", - "caffe2/operators/reverse_packed_segs_op.cu.cc", - "caffe2/operators/rmac_regions_op.cu.cc", - "caffe2/operators/rnn/recurrent_network_op_gpu.cu.cc", - "caffe2/operators/roi_align_gradient_op.cu.cc", - "caffe2/operators/roi_align_op.cu.cc", - "caffe2/operators/roi_align_rotated_gradient_op.cu.cc", - "caffe2/operators/roi_align_rotated_op.cu.cc", - 
"caffe2/operators/roi_pool_op.cu.cc", - "caffe2/operators/rsqrt_op.cu.cc", - "caffe2/operators/scale_blobs_op.cu.cc", - "caffe2/operators/segment_reduction_op_gpu.cu.cc", - "caffe2/operators/selu_op.cu.cc", - "caffe2/operators/sequence_ops.cu.cc", - "caffe2/operators/sigmoid_op.cu.cc", - "caffe2/operators/sin_op.cu.cc", - "caffe2/operators/sinh_op.cu.cc", - "caffe2/operators/slice_op.cu.cc", - "caffe2/operators/softmax_ops.cu.cc", - "caffe2/operators/softplus_op.cu.cc", - "caffe2/operators/softsign_op.cu.cc", - "caffe2/operators/space_batch_op_gpu.cu.cc", - "caffe2/operators/sparse_normalize_op_gpu.cu.cc", - "caffe2/operators/sparse_to_dense_op.cu.cc", - "caffe2/operators/spatial_batch_norm_op.cu.cc", - "caffe2/operators/spatial_batch_norm_op_cudnn.cu.cc", - "caffe2/operators/stump_func_op.cu.cc", - "caffe2/operators/summarize_op.cu.cc", - "caffe2/operators/swish_op.cu.cc", - "caffe2/operators/tan_op.cu.cc", - "caffe2/operators/tanh_op.cu.cc", - "caffe2/operators/thresholded_relu_op.cu.cc", - "caffe2/operators/tile_op.cu.cc", - "caffe2/operators/top_k.cu.cc", - "caffe2/operators/transpose_op.cu.cc", - "caffe2/operators/unique_ops.cu.cc", - "caffe2/operators/upsample_op.cu.cc", - "caffe2/operators/utility_ops.cu.cc", - "caffe2/operators/weighted_sample_op.cu.cc", - "caffe2/sgd/adadelta_op_gpu.cu.cc", - "caffe2/sgd/adagrad_op_gpu.cu.cc", - "caffe2/sgd/adam_op_gpu.cu.cc", - "caffe2/sgd/fp16_momentum_sgd_op.cu.cc", - "caffe2/sgd/fp32_momentum_sgd_op.cu.cc", - "caffe2/sgd/lars_op_gpu.cu.cc", - "caffe2/sgd/momentum_sgd_op_gpu.cu.cc", - "caffe2/sgd/rmsprop_op_gpu.cu.cc", - "caffe2/sgd/yellowfin_op_gpu.cu.cc", - "caffe2/utils/math/broadcast.cu.cc", - "caffe2/utils/math/elementwise.cu.cc", - "caffe2/utils/math/reduce.cu.cc", - "caffe2/utils/math/transpose.cu.cc", - "caffe2/utils/math_gpu.cu.cc", + "caffe2/core/context_gpu.cu", + "caffe2/operators/abs_op.cu", + "caffe2/operators/accumulate_op.cu", + "caffe2/operators/accuracy_op.cu", + "caffe2/operators/acos_op.cu", + 
"caffe2/operators/affine_channel_op.cu", + "caffe2/operators/alias_with_name.cu", + "caffe2/operators/arg_ops.cu", + "caffe2/operators/asin_op.cu", + "caffe2/operators/assert_op.cu", + "caffe2/operators/atan_op.cu", + "caffe2/operators/batch_gather_ops.cu", + "caffe2/operators/batch_matmul_op.cu", + "caffe2/operators/batch_moments_op.cu", + "caffe2/operators/batch_permutation_op.cu", + "caffe2/operators/batch_sparse_to_dense_op.cu", + "caffe2/operators/boolean_mask_ops.cu", + "caffe2/operators/boolean_unmask_ops.cu", + "caffe2/operators/bucketize_op.cu", + "caffe2/operators/cast_op.cu", + "caffe2/operators/cbrt_op.cu", + "caffe2/operators/ceil_op.cu", + "caffe2/operators/channel_backprop_stats_op.cu", + "caffe2/operators/channel_shuffle_op.cu", + "caffe2/operators/channel_stats_op.cu", + "caffe2/operators/channelwise_conv3d_op_cudnn.cu", + "caffe2/operators/clip_op.cu", + "caffe2/operators/copy_op.cu", + "caffe2/operators/cos_op.cu", + "caffe2/operators/cosh_op.cu", + "caffe2/operators/cosine_embedding_criterion_op.cu", + "caffe2/operators/cross_entropy_op.cu", + "caffe2/operators/cube_op.cu", + "caffe2/operators/data_couple_gpu.cu", + "caffe2/operators/deform_conv_op.cu", + "caffe2/operators/depthwise_3x3_conv_op_cudnn.cu", + "caffe2/operators/distance_op.cu", + "caffe2/operators/dropout_op.cu", + "caffe2/operators/elementwise_div_op.cu", + "caffe2/operators/elementwise_linear_op.cu", + "caffe2/operators/elementwise_mul_op.cu", + "caffe2/operators/elementwise_ops.cu", + "caffe2/operators/elu_op.cu", + "caffe2/operators/enforce_finite_op.cu", + "caffe2/operators/ensure_cpu_output_op.cu", + "caffe2/operators/erf_op.cu", + "caffe2/operators/filler_op.cu", + "caffe2/operators/find_op.cu", + "caffe2/operators/floor_op.cu", + "caffe2/operators/gather_op.cu", + "caffe2/operators/gelu_op.cu", + "caffe2/operators/generate_proposals_op.cu", + "caffe2/operators/generate_proposals_op_util_nms_gpu.cu", + "caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cu", + 
"caffe2/operators/given_tensor_fill_op.cu", + "caffe2/operators/glu_op.cu", + "caffe2/operators/group_norm_op.cu", + "caffe2/operators/gru_unit_op_gpu.cu", + "caffe2/operators/half_float_ops.cu", + "caffe2/operators/hard_sigmoid_op.cu", + "caffe2/operators/instance_norm_op.cu", + "caffe2/operators/integral_image_op.cu", + "caffe2/operators/layer_norm_op.cu", + "caffe2/operators/leaky_relu_op.cu", + "caffe2/operators/lengths_pad_op.cu", + "caffe2/operators/lengths_tile_op.cu", + "caffe2/operators/local_response_normalization_op.cu", + "caffe2/operators/logit_op.cu", + "caffe2/operators/loss_op.cu", + "caffe2/operators/lp_pool_op.cu", + "caffe2/operators/lstm_unit_op_gpu.cu", + "caffe2/operators/margin_ranking_criterion_op.cu", + "caffe2/operators/max_pool_with_index.cu", + "caffe2/operators/mean_op.cu", + "caffe2/operators/mem_query_op.cu", + "caffe2/operators/minmax_ops.cu", + "caffe2/operators/moments_op.cu", + "caffe2/operators/multi_class_accuracy_op.cu", + "caffe2/operators/normalize_ops.cu", + "caffe2/operators/one_hot_ops.cu", + "caffe2/operators/pack_segments.cu", + "caffe2/operators/pad_op_gpu.cu", + "caffe2/operators/perplexity_op.cu", + "caffe2/operators/piecewise_linear_transform_op.cu", + "caffe2/operators/pool_op.cu", + "caffe2/operators/pow_op.cu", + "caffe2/operators/prelu_op.cu", + "caffe2/operators/reciprocal_op.cu", + "caffe2/operators/reduce_front_back_max_ops.cu", + "caffe2/operators/reduce_front_back_sum_mean_ops.cu", + "caffe2/operators/reduce_ops.cu", + "caffe2/operators/reduction_ops.cu", + "caffe2/operators/relu_n_op.cu", + "caffe2/operators/relu_op.cu", + "caffe2/operators/replace_nan_op.cu", + "caffe2/operators/resize_3d_op.cu", + "caffe2/operators/resize_op.cu", + "caffe2/operators/reverse_packed_segs_op.cu", + "caffe2/operators/rmac_regions_op.cu", + "caffe2/operators/rnn/recurrent_network_op_gpu.cu", + "caffe2/operators/roi_align_gradient_op.cu", + "caffe2/operators/roi_align_op.cu", + 
"caffe2/operators/roi_align_rotated_gradient_op.cu", + "caffe2/operators/roi_align_rotated_op.cu", + "caffe2/operators/roi_pool_op.cu", + "caffe2/operators/rsqrt_op.cu", + "caffe2/operators/scale_blobs_op.cu", + "caffe2/operators/segment_reduction_op_gpu.cu", + "caffe2/operators/selu_op.cu", + "caffe2/operators/sequence_ops.cu", + "caffe2/operators/sigmoid_op.cu", + "caffe2/operators/sin_op.cu", + "caffe2/operators/sinh_op.cu", + "caffe2/operators/slice_op.cu", + "caffe2/operators/softmax_ops.cu", + "caffe2/operators/softplus_op.cu", + "caffe2/operators/softsign_op.cu", + "caffe2/operators/space_batch_op_gpu.cu", + "caffe2/operators/sparse_normalize_op_gpu.cu", + "caffe2/operators/sparse_to_dense_op.cu", + "caffe2/operators/spatial_batch_norm_op.cu", + "caffe2/operators/spatial_batch_norm_op_cudnn.cu", + "caffe2/operators/stump_func_op.cu", + "caffe2/operators/summarize_op.cu", + "caffe2/operators/swish_op.cu", + "caffe2/operators/tan_op.cu", + "caffe2/operators/tanh_op.cu", + "caffe2/operators/thresholded_relu_op.cu", + "caffe2/operators/tile_op.cu", + "caffe2/operators/top_k.cu", + "caffe2/operators/transpose_op.cu", + "caffe2/operators/unique_ops.cu", + "caffe2/operators/upsample_op.cu", + "caffe2/operators/utility_ops.cu", + "caffe2/operators/weighted_sample_op.cu", + "caffe2/sgd/adadelta_op_gpu.cu", + "caffe2/sgd/adagrad_op_gpu.cu", + "caffe2/sgd/adam_op_gpu.cu", + "caffe2/sgd/fp16_momentum_sgd_op.cu", + "caffe2/sgd/fp32_momentum_sgd_op.cu", + "caffe2/sgd/lars_op_gpu.cu", + "caffe2/sgd/momentum_sgd_op_gpu.cu", + "caffe2/sgd/rmsprop_op_gpu.cu", + "caffe2/sgd/yellowfin_op_gpu.cu", + "caffe2/utils/math/broadcast.cu", + "caffe2/utils/math/elementwise.cu", + "caffe2/utils/math/reduce.cu", + "caffe2/utils/math/transpose.cu", + "caffe2/utils/math_gpu.cu", ], ) @@ -1432,6 +1363,29 @@ cc_library( ], ) +py_binary( + name = "gen_op", + srcs = ["caffe2/contrib/aten/gen_op.py"], + deps = [":tools_codegen"], +) + +genrule( + name = "generated_caffe2_aten_op_headers", + srcs 
= [ + "caffe2/contrib/aten/aten_op_template.h", + "aten/src/ATen/Declarations.yaml", + ], + outs = ["caffe2/caffe2/contrib/aten/gen_aten_op.h"], + cmd = """ + $(location :gen_op) \ + --output_prefix gen_ \ + --install_dir $(@D) \ + --aten_root `dirname $(location aten/src/ATen/Declarations.yaml)`/../.. \ + --template_dir `dirname $(location caffe2/contrib/aten/aten_op_template.h)` \ + --yaml_dir `dirname $(location aten/src/ATen/Declarations.yaml)`""", + tools = [":gen_op"], +) + cc_library( name = "caffe2_headers", hdrs = glob([ @@ -1472,7 +1426,7 @@ cc_library( ]) + if_cuda(glob([ "caffe2/**/*.cuh", "caffe2/image/*.h", - ])), + ])) + [":generated_caffe2_aten_op_headers"], copts = CAFFE2_COPTS, includes = [ "caffe2/contrib/aten", @@ -1554,7 +1508,7 @@ cc_library( "@fmt", ] + if_cuda( [ - ":caffe2_cpp_cuda", + ":caffe2_cuda_cpp", ":aten_cuda", "@tensorpipe//:tensorpipe_cuda", ], @@ -1567,8 +1521,8 @@ cc_library( ) cc_library( - name = "caffe2_cpp_cuda", - srcs = [":caffe2_cuda_srcs"], + name = "caffe2_cuda_cpp", + srcs = [":caffe2_cuda_cpp_srcs"], copts = CAFFE2_COPTS, visibility = ["//visibility:public"], deps = [ @@ -1586,7 +1540,6 @@ cu_library( deps = [ ":aten", ":caffe2_headers", - "@cub", "@cuda//:cublas", "@cuda//:curand", "@cudnn", @@ -1610,6 +1563,7 @@ PERF_COPTS = [ "-DHAVE_SHM_OPEN=1", "-DHAVE_SHM_UNLINK=1", "-DSLEEF_STATIC_LIBS=1", + "-DTH_BLAS_MKL", "-D_FILE_OFFSET_BITS=64", "-DUSE_FBGEMM", "-fvisibility-inlines-hidden", @@ -1693,10 +1647,29 @@ genrule( srcs = ["torch/csrc/api/include/torch/version.h.in", "version.txt"], outs = ["torch/csrc/api/include/torch/version.h"], cmd = "$(location :gen_version_header) --template-path $(location torch/csrc/api/include/torch/version.h.in) --version-path $(location version.txt) --output-path $@", - tools = [':gen_version_header'] + tools = [':gen_version_header'], ) -torch_cuda_headers = glob(["torch/csrc/cuda/*.h"]) +py_binary( + name = "stringify_file", + srcs = 
["torch/csrc/jit/codegen/cuda/tools/stringify_file.py"], +) + +generated_nvfuser_hdrs = ["generated_" + hdr for hdr in libtorch_nvfuser_generated_headers] + +[ + genrule( + name = name, + srcs = [src], + outs = ["nvfuser_resources/{}".format(hdr)], + cmd = "$(location :stringify_file) -i $< -o $@", + tools = [":stringify_file"], + ) + for name, src, hdr in zip(generated_nvfuser_hdrs, libtorch_nvfuser_runtime_sources, libtorch_nvfuser_generated_headers) +] + +torch_cuda_headers = glob(["torch/csrc/cuda/*.h"]) + generated_nvfuser_hdrs + cc_library( name = "torch_headers", hdrs = if_cuda( @@ -1707,6 +1680,7 @@ cc_library( "torch/csrc/**/*.h", "torch/csrc/distributed/c10d/*.hpp", "torch/lib/libshm/*.h", + "torch/csrc/generic/*.cpp", ], exclude = [ "torch/csrc/autograd/generated/VariableType.h", @@ -1743,21 +1717,25 @@ TORCH_COPTS = COMMON_COPTS + [ "-fno-trapping-math", ] +cu_library( + name = "torch_distributed_cuda", + srcs = ["torch/csrc/distributed/c10d/quantization/quantization_gpu.cu"], + deps = [":torch_headers"], +) + cc_library( name = "torch", srcs = if_cuda(glob( - [ - "torch/csrc/cuda/*.cpp", - "torch/csrc/autograd/functions/comm.cpp", - ], + libtorch_cuda_sources, exclude = [ "torch/csrc/cuda/python_nccl.cpp", "torch/csrc/cuda/nccl.cpp", + "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", ], )) + libtorch_core_sources + libtorch_distributed_sources + torch_cpp_srcs + libtorch_extra_sources + jit_core_sources + [ ":cpp_generated_code", ], - copts = TORCH_COPTS + if_cuda(["-DUSE_CUDA=1"]), + copts = TORCH_COPTS, defines = [ "CAFFE2_NIGHTLY_VERSION=20200115", ], @@ -1765,7 +1743,10 @@ cc_library( deps = [ ":caffe2", ":torch_headers", - ], + ] + if_cuda([ + ":torch_distributed_cuda", + "@cuda//:nvToolsExt", + ]), alwayslink = True, ) @@ -1783,10 +1764,9 @@ cc_library( "**/*.h", "**/*.cuh", ]) + [ - ":generated_code", + ":cpp_generated_code", ], includes = [ - ".", "torch/csrc/api/include", "torch/csrc/distributed", "torch/lib", @@ -1794,21 
+1774,17 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ - ":aten_headers", - ":caffe2_headers", - "//c10:headers", + ":torch_headers", ], ) cc_library( name = "torch_python", srcs = libtorch_python_core_sources + [":python_generated_code"], - hdrs = glob([ - "torch/csrc/generic/*.cpp", - ]), deps = [ ":torch", ":shm", + "@pybind11", ], ) @@ -1842,11 +1818,16 @@ cc_library( # Torch integration tests rely on a labeled data set from the MNIST database. # http://yann.lecun.com/exdb/mnist/ -cpp_api_tests = glob(["test/cpp/api/*.cpp"]) +# imethod.cpp is excluded since torch/csrc/deploy* build is not yet supported. +cpp_api_tests = glob( + ["test/cpp/api/*.cpp"], + exclude = ["test/cpp/api/imethod.cpp"], +) + [ cc_test( name = paths.split_extension(paths.basename(filename))[0].replace("-","_") + "_test", - size = "medium", + size = "medium", srcs = [filename], deps = [ ":test_support", diff --git a/WORKSPACE b/WORKSPACE index 9396a3451c36..0497bef41039 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,7 +1,22 @@ workspace(name = "pytorch") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") -load("//tools/rules:workspace.bzl", "new_patched_local_repository", "new_empty_repository") +load("//tools/rules:workspace.bzl", "new_patched_local_repository") + +http_archive( + name = "rules_cuda", + sha256 = "f80438bee9906e9ecb1a8a4ae2365374ac1e8a283897281a2db2fb7fcf746333", + strip_prefix = "runtime-b1c7cce21ba4661c17ac72421c6a0e2015e7bef3/third_party/rules_cuda", + urls = ["https://github.com/tensorflow/runtime/archive/b1c7cce21ba4661c17ac72421c6a0e2015e7bef3.tar.gz"], +) + +load("@rules_cuda//cuda:dependencies.bzl", "rules_cuda_dependencies") + +rules_cuda_dependencies() + +load("@rules_cc//cc:repositories.bzl", "rules_cc_toolchains") + +rules_cc_toolchains() http_archive( name = "bazel_skylib", @@ -171,13 +186,14 @@ load("@rules_python//python:repositories.bzl", "py_repositories") py_repositories() -local_repository( - name = "local_config_cuda", 
- path = "third_party/tensorflow_cuda_bazel_build", +new_local_repository( + name = "cuda", + build_file = "@//third_party:cuda.BUILD", + path = "/usr/local/cuda", ) -# Wrapper to expose local_config_cuda in an agnostic way -new_empty_repository( - name = "cuda", - build_file = "//third_party:cuda.BUILD", +new_local_repository( + name = "cudnn", + build_file = "@//third_party:cudnn.BUILD", + path = "/usr/", ) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index aa47ae43b318..f82aacee2381 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -254,6 +254,7 @@ _(aten, conv_tbc) \ _(aten, conv_tbc_backward) \ _(aten, conv_transpose1d) \ _(aten, convolution) \ +_(aten, convolution_backward) \ _(aten, copy_sparse_to_sparse) \ _(aten, corrcoef) \ _(aten, cos) \ @@ -474,18 +475,8 @@ _(aten, min_values) \ _(aten, miopen_batch_norm) \ _(aten, miopen_batch_norm_backward) \ _(aten, miopen_convolution) \ -_(aten, miopen_convolution_backward) \ -_(aten, miopen_convolution_backward_bias) \ -_(aten, miopen_convolution_backward_input) \ -_(aten, miopen_convolution_backward_weight) \ _(aten, miopen_convolution_transpose) \ -_(aten, miopen_convolution_transpose_backward) \ -_(aten, miopen_convolution_transpose_backward_input) \ -_(aten, miopen_convolution_transpose_backward_weight) \ _(aten, miopen_depthwise_convolution) \ -_(aten, miopen_depthwise_convolution_backward) \ -_(aten, miopen_depthwise_convolution_backward_input) \ -_(aten, miopen_depthwise_convolution_backward_weight) \ _(aten, miopen_rnn) \ _(aten, miopen_rnn_backward) \ _(aten, mish) \ @@ -683,6 +674,7 @@ _(aten, take_along_dim) \ _(aten, tan) \ _(aten, tanh) \ _(aten, tanh_) \ +_(aten, tanh_backward) \ _(aten, tensor) \ _(aten, tensordot) \ _(aten, tensor_split) \ diff --git a/aten/src/ATen/mkl/SparseBlas.cpp b/aten/src/ATen/mkl/SparseBlas.cpp index cc71ea6fbfef..67dcb30e5283 100644 --- 
a/aten/src/ATen/mkl/SparseBlas.cpp +++ b/aten/src/ATen/mkl/SparseBlas.cpp @@ -11,16 +11,16 @@ namespace sparse { namespace { - template - MKL_Complex to_mkl_complex(c10::complex scalar) { - MKL_Complex mkl_scalar; - mkl_scalar.real = scalar.real(); - mkl_scalar.imag = scalar.imag(); - return mkl_scalar; - } - +template +MKL_Complex to_mkl_complex(c10::complex scalar) { + MKL_Complex mkl_scalar; + mkl_scalar.real = scalar.real(); + mkl_scalar.imag = scalar.imag(); + return mkl_scalar; } +} // namespace + // There are link errors when compiling with create_csr functions on Windows. // See https://github.com/pytorch/pytorch/pull/50937#issuecomment-779272492 #if !defined(_WIN32) @@ -60,6 +60,65 @@ void create_csr>( col_indx, reinterpret_cast(values))); } + +template <> +void create_bsr(MKL_SPARSE_CREATE_BSR_ARGTYPES(float)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_s_create_bsr( + A, + indexing, + block_layout, + rows, + cols, + block_size, + rows_start, + rows_end, + col_indx, + values)); +} +template <> +void create_bsr(MKL_SPARSE_CREATE_BSR_ARGTYPES(double)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_d_create_bsr( + A, + indexing, + block_layout, + rows, + cols, + block_size, + rows_start, + rows_end, + col_indx, + values)); +} +template <> +void create_bsr>( + MKL_SPARSE_CREATE_BSR_ARGTYPES(c10::complex)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_c_create_bsr( + A, + indexing, + block_layout, + rows, + cols, + block_size, + rows_start, + rows_end, + col_indx, + reinterpret_cast(values))); +} +template <> +void create_bsr>( + MKL_SPARSE_CREATE_BSR_ARGTYPES(c10::complex)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_z_create_bsr( + A, + indexing, + block_layout, + rows, + cols, + block_size, + rows_start, + rows_end, + col_indx, + reinterpret_cast(values))); +} #endif // !defined(_WIN32) template <> diff --git a/aten/src/ATen/mkl/SparseBlas.h b/aten/src/ATen/mkl/SparseBlas.h index 140803b30f98..7281b6950611 100644 --- a/aten/src/ATen/mkl/SparseBlas.h +++ b/aten/src/ATen/mkl/SparseBlas.h @@ 
-42,6 +42,31 @@ template <> void create_csr>( MKL_SPARSE_CREATE_CSR_ARGTYPES(c10::complex)); +#define MKL_SPARSE_CREATE_BSR_ARGTYPES(scalar_t) \ + sparse_matrix_t *A, const sparse_index_base_t indexing, \ + const sparse_layout_t block_layout, const MKL_INT rows, \ + const MKL_INT cols, MKL_INT block_size, MKL_INT *rows_start, \ + MKL_INT *rows_end, MKL_INT *col_indx, scalar_t *values + +template +inline void create_bsr(MKL_SPARSE_CREATE_BSR_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::mkl::sparse::create_bsr: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void create_bsr(MKL_SPARSE_CREATE_BSR_ARGTYPES(float)); +template <> +void create_bsr(MKL_SPARSE_CREATE_BSR_ARGTYPES(double)); +template <> +void create_bsr>( + MKL_SPARSE_CREATE_BSR_ARGTYPES(c10::complex)); +template <> +void create_bsr>( + MKL_SPARSE_CREATE_BSR_ARGTYPES(c10::complex)); + #define MKL_SPARSE_MV_ARGTYPES(scalar_t) \ const sparse_operation_t operation, const scalar_t alpha, \ const sparse_matrix_t A, const struct matrix_descr descr, \ diff --git a/aten/src/ATen/mkl/SparseDescriptors.h b/aten/src/ATen/mkl/SparseDescriptors.h index 2f4f8731adf3..46d656898a8d 100644 --- a/aten/src/ATen/mkl/SparseDescriptors.h +++ b/aten/src/ATen/mkl/SparseDescriptors.h @@ -92,21 +92,42 @@ class MklSparseCsrDescriptor crow_indices_ = prepare_indices_for_mkl(crow_indices); col_indices_ = prepare_indices_for_mkl(col_indices); + values_ = values.expect_contiguous(); - auto values_ptr = values.data_ptr(); + auto values_ptr = values_->data_ptr(); auto crow_indices_ptr = crow_indices_->data_ptr(); auto col_indices_ptr = col_indices_->data_ptr(); sparse_matrix_t raw_descriptor; - create_csr( - &raw_descriptor, - SPARSE_INDEX_BASE_ZERO, - rows, - cols, - crow_indices_ptr, - crow_indices_ptr + 1, - col_indices_ptr, - values_ptr); + + // Assuming that the last two dimensions are block elements of the matrix + if (values.dim() == 3) { + TORCH_CHECK( + values.size(-1) == values.size(-2), + 
"MKL Sparse doesn't support matrices with non-square blocks."); + auto block_size = mkl_int_cast(values.size(-1), "block_size"); + create_bsr( + &raw_descriptor, + SPARSE_INDEX_BASE_ZERO, + SPARSE_LAYOUT_ROW_MAJOR, + rows / block_size, + cols / block_size, + block_size, + crow_indices_ptr, + crow_indices_ptr + 1, + col_indices_ptr, + values_ptr); + } else { + create_csr( + &raw_descriptor, + SPARSE_INDEX_BASE_ZERO, + rows, + cols, + crow_indices_ptr, + crow_indices_ptr + 1, + col_indices_ptr, + values_ptr); + } descriptor_.reset(raw_descriptor); } @@ -119,6 +140,7 @@ class MklSparseCsrDescriptor private: c10::MaybeOwned crow_indices_; c10::MaybeOwned col_indices_; + c10::MaybeOwned values_; }; } // namespace sparse diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 8e7b32a610ef..4a50ed392e70 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -9,11 +9,23 @@ namespace at { namespace native { using cudnn_convolution_backward_fn = std::tuple(*)( const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array); +DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub); using cudnn_convolution_transpose_backward_fn = std::tuple(*)( const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array); -DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub); DECLARE_DISPATCH(cudnn_convolution_transpose_backward_fn, cudnn_convolution_transpose_backward_stub); +using miopen_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_convolution_backward_fn, miopen_convolution_backward_stub); +using 
miopen_convolution_transpose_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_convolution_transpose_backward_fn, miopen_convolution_transpose_backward_stub); +using miopen_depthwise_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_depthwise_convolution_backward_fn, miopen_depthwise_convolution_backward_stub); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) struct ConvParams { diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index d3426b5984d2..ab4413893a73 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -25,8 +25,14 @@ namespace at { namespace native { DEFINE_DISPATCH(cudnn_convolution_backward_stub); DEFINE_DISPATCH(cudnn_convolution_transpose_backward_stub); DEFINE_DISPATCH(convolution_depthwise3x3_winograd_stub); +DEFINE_DISPATCH(miopen_convolution_backward_stub); +DEFINE_DISPATCH(miopen_convolution_transpose_backward_stub); +DEFINE_DISPATCH(miopen_depthwise_convolution_backward_stub); REGISTER_NO_CPU_DISPATCH(cudnn_convolution_backward_stub, cudnn_convolution_backward_fn); REGISTER_NO_CPU_DISPATCH(cudnn_convolution_transpose_backward_stub, cudnn_convolution_transpose_backward_fn); +REGISTER_NO_CPU_DISPATCH(miopen_convolution_backward_stub, miopen_convolution_backward_fn); +REGISTER_NO_CPU_DISPATCH(miopen_convolution_transpose_backward_stub, miopen_convolution_transpose_backward_fn); +REGISTER_NO_CPU_DISPATCH(miopen_depthwise_convolution_backward_stub, miopen_depthwise_convolution_backward_fn); std::ostream& operator<<(std::ostream & out, const ConvParams& params) { out << "ConvParams {" @@ -1614,20 +1620,23 @@ std::tuple 
convolution_backward( case ConvBackend::Miopen: check_input_same_type_as_parameters(input, weight); std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) = - at::miopen_convolution_backward( + miopen_convolution_backward_stub( + input.device().type(), input.contiguous(backend_memory_format), grad_output, weight, params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic, output_mask); break; case ConvBackend::MiopenDepthwise: std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) = - at::miopen_depthwise_convolution_backward( + miopen_depthwise_convolution_backward_stub( + input.device().type(), input.contiguous(backend_memory_format), grad_output, weight, params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic, output_mask); break; case ConvBackend::MiopenTranspose: check_input_same_type_as_parameters(input, weight); std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) = - at::miopen_convolution_transpose_backward( + miopen_convolution_transpose_backward_stub( + input.device().type(), input.contiguous(backend_memory_format), grad_output, weight, params.padding, params.output_padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic, output_mask); break; diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 66c6232ef2f9..5544f3661de7 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -156,7 +156,7 @@ Tensor _cdist_forward(const Tensor& x1, const Tensor& x2, const double p, c10::o return result; } -Tensor _cdist_backward(const Tensor& grad, const Tensor& _x1, const Tensor& _x2, const double p, const Tensor& cdist) { +Tensor _cdist_backward(const Tensor& _grad, const Tensor& _x1, const Tensor& _x2, const double p, const Tensor& _cdist) { // Broadcasting might generate non-contiguous Tensors, so handle it before doing checks 
int64_t c1 = _x1.size(-1); int64_t c2 = _x2.size(-1); @@ -182,17 +182,17 @@ Tensor _cdist_backward(const Tensor& grad, const Tensor& _x1, const Tensor& _x2, Tensor x1 = _x1; if (tensor1_expand_size != x1.sizes()) { - x1 = x1.expand(tensor1_expand_size).contiguous(); + x1 = x1.expand(tensor1_expand_size); } Tensor x2 = _x2; if (tensor2_expand_size != x2.sizes()) { - x2 = x2.expand(tensor2_expand_size).contiguous(); + x2 = x2.expand(tensor2_expand_size); } - TORCH_CHECK(x1.is_contiguous(), "_cdist_backward requires X1 to be contiguous"); - TORCH_CHECK(x2.is_contiguous(), "_cdist_backward requires X2 to be contiguous"); - TORCH_CHECK(cdist.is_contiguous(), "_cdist_backward requires dist to be contiguous"); - TORCH_CHECK(grad.is_contiguous(), "_cdist_backward requires grad to be contiguous"); + x1 = x1.contiguous(); + x2 = x2.contiguous(); + auto cdist = _cdist.contiguous(); + auto grad = _grad.contiguous(); int64_t n = x1.size(-2); int64_t m = x1.size(-1); auto device1 = x1.device().type(); diff --git a/aten/src/ATen/native/Itertools.cpp b/aten/src/ATen/native/Itertools.cpp index 1e4fc7b746ea..d1117b8c1d4d 100644 --- a/aten/src/ATen/native/Itertools.cpp +++ b/aten/src/ATen/native/Itertools.cpp @@ -37,7 +37,7 @@ Tensor cartesian_prod(TensorList tensors) { if (tensors.size() == 1) { return tensors[0]; } - std::vector grids = at::meshgrid(tensors); + std::vector grids = at::meshgrid(tensors, "ij"); for(Tensor &t : grids) { t = t.flatten(); } diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 34b45b2f793a..7eda0fe0988c 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -1,6 +1,7 @@ #include #include #include +#include // TODO: Remove the condition on AT_ROCM_ENABLED entirely, // don't build this file as part of CPU build. 
@@ -760,6 +761,228 @@ Tensor miopen_depthwise_convolution( return output_t; } +// --------------------------------------------------------------------- +// +// Convolution backward (bias) +// +// --------------------------------------------------------------------- + +Tensor miopen_convolution_backward_bias( + const Tensor& grad_output_t) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }; + + // TODO: Workaround since MIOpen does not support NHWC bias + // See #64426 + std::vector discard_dims; + for( int i = 0; i < grad_output_t.dim(); i++ ) { + if(i != output_channels_dim ) { + discard_dims.push_back(i); + } + } + + Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) ); + if( outputBias.dim() == 0 ) { + // always return a tensor of shape [_] + return outputBias.unsqueeze(0); + } + else { + return outputBias; + } + +/* MIOpen does not support NHWC bias. Activate once support is added. + auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); + + TensorArg grad_bias{ grad_bias_t, "result", 0 }; + + TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), + static_cast(grad_output->dim())}; + TensorDescriptor odesc{*grad_output}; + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*grad_bias); + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), + &zero, bdesc.desc(), grad_bias->data_ptr())); + return *grad_bias; +*/ +} + +// --------------------------------------------------------------------- +// +// Convolution backward (weight) +// +// --------------------------------------------------------------------- + +void raw_miopen_convolution_backward_weight_out( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) { 
+ + auto dataType = getMiopenDataType(input); + miopenConvolutionMode_t c_mode = miopenConvolution; + + ConvolutionArgs args{ input, grad_output, grad_weight }; + args.handle = getMiopenHandle(); + setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(input); + args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); + args.odesc.set(grad_output); + args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardWeights( + args.handle, + &one, args.odesc.desc(), grad_output.data_ptr(), + args.idesc.desc(), input.data_ptr(), + args.cdesc.desc(), bwdFilterAlg, &zero, + args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); +} + +//Depthwise backward weights. 
+void raw_miopen_depthwise_convolution_backward_weight_out( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getMiopenDataType(input); + miopenConvolutionMode_t c_mode = miopenDepthwise; + + ConvolutionArgs args{ input, grad_output, grad_weight }; + args.handle = getMiopenHandle(); + setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(input); + args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); + args.odesc.set(grad_output); + args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardWeights( + args.handle, + &one, args.odesc.desc(), grad_output.data_ptr(), + args.idesc.desc(), input.data_ptr(), + args.cdesc.desc(), bwdFilterAlg, &zero, + args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); +} + +Tensor miopen_depthwise_convolution_backward_weight( + CheckedFrom c, + IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + + checkAllSameType(c, {grad_output, input}); + checkAllSameGPU(c, {grad_output, input}); + + auto memory_format = at::MemoryFormat::Contiguous; + if (miopen_conv_use_channels_last(*input, *grad_output)) { + memory_format = (input->ndimension() == 5) ? 
/*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + } + + Tensor grad_output_contig_t = grad_output->contiguous(memory_format); + // Make sure that NC11 strides follow formula + grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); + TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; + + Tensor input_contig_t = input->contiguous(memory_format); + input_contig_t.resize_(input_contig_t.sizes(), memory_format); + TensorArg input_contig{ input_contig_t, "input", 2}; + + auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); + + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. + TensorArg grad_weight{ grad_weight_t, "result", 0 }; + convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); + + raw_miopen_depthwise_convolution_backward_weight_out( + *grad_weight, *grad_output_contig, *input_contig, + padding, stride, dilation, groups, benchmark, deterministic); + + return grad_weight_t; +} + +Tensor miopen_depthwise_convolution_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + return miopen_depthwise_convolution_backward_weight( + "miopen_depthwise_convolution_backward_weight", + weight_size, grad_output, input, + padding, stride, dilation, groups, benchmark, deterministic); +} + +Tensor miopen_convolution_backward_weight( + CheckedFrom c, + IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + + checkAllSameType(c, {grad_output, input}); + 
checkAllSameGPU(c, {grad_output, input}); + + auto memory_format = at::MemoryFormat::Contiguous; + if (miopen_conv_use_channels_last(*input, *grad_output)) { + memory_format = (input->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + } + + Tensor grad_output_contig_t = grad_output->contiguous(memory_format); + // Make sure that NC11 strides follow formula + grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); + TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; + + Tensor input_contig_t = input->contiguous(memory_format); + input_contig_t.resize_(input_contig_t.sizes(), memory_format); + TensorArg input_contig{ input_contig_t, "input", 2}; + + auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); + + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. + TensorArg grad_weight{ grad_weight_t, "result", 0 }; + convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); + + raw_miopen_convolution_backward_weight_out( + *grad_weight, *grad_output_contig, *input_contig, + padding, stride, dilation, groups, benchmark, deterministic); + + return grad_weight_t; +} + +Tensor miopen_convolution_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + return miopen_convolution_backward_weight( + "miopen_convolution_backward_weight", + weight_size, grad_output, input, + padding, stride, dilation, groups, benchmark, deterministic); +} + Tensor miopen_convolution_transpose_backward_input( const Tensor& grad_output_t, const Tensor& weight_t, IntArrayRef padding, IntArrayRef stride, 
IntArrayRef dilation, @@ -772,6 +995,21 @@ Tensor miopen_convolution_transpose_backward_input( grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); } +Tensor miopen_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + return miopen_convolution_backward_weight( + "miopen_convolution_backward_weight", + weight_size, input, grad_output, + padding, stride, dilation, groups, benchmark, deterministic); +} + std::tuple miopen_convolution_transpose_backward( const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, @@ -781,13 +1019,13 @@ std::tuple miopen_convolution_transpose_backwa Tensor grad_input, grad_weight, grad_bias; if (output_mask[0]) { - grad_input = at::miopen_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + grad_input = miopen_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); } if (output_mask[1]) { - grad_weight = at::miopen_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); + grad_weight = miopen_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); } if (output_mask[2]) { - grad_bias = at::miopen_convolution_backward_bias(grad_output); + grad_bias = miopen_convolution_backward_bias(grad_output); } return std::tuple{grad_input, grad_weight, grad_bias}; @@ -994,13 +1232,13 @@ std::tuple miopen_convolution_backward( 
Tensor grad_input, grad_weight, grad_bias; if (output_mask[0]) { - grad_input = at::miopen_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + grad_input = miopen_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); } if (output_mask[1]) { - grad_weight = at::miopen_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); + grad_weight = miopen_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); } if (output_mask[2]) { - grad_bias = at::miopen_convolution_backward_bias(grad_output); + grad_bias = miopen_convolution_backward_bias(grad_output); } return std::tuple{grad_input, grad_weight, grad_bias}; @@ -1015,13 +1253,13 @@ std::tuple miopen_depthwise_convolution_backwa Tensor grad_input, grad_weight, grad_bias; if (output_mask[0]) { - grad_input = at::miopen_depthwise_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + grad_input = miopen_depthwise_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); } if (output_mask[1]) { - grad_weight = at::miopen_depthwise_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); + grad_weight = miopen_depthwise_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); } if (output_mask[2]) { - grad_bias = at::miopen_convolution_backward_bias(grad_output); + grad_bias = miopen_convolution_backward_bias(grad_output); } return std::tuple{grad_input, grad_weight, grad_bias}; @@ -1048,243 +1286,9 @@ Tensor miopen_convolution_transpose( return output_t; } -// 
--------------------------------------------------------------------- -// -// Convolution backward (weight) -// -// --------------------------------------------------------------------- - -void raw_miopen_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenConvolution; - - ConvolutionArgs args{ input, grad_output, grad_weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); - - miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardWeights( - args.handle, - &one, args.odesc.desc(), grad_output.data_ptr(), - args.idesc.desc(), input.data_ptr(), - args.cdesc.desc(), bwdFilterAlg, &zero, - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); -} - -Tensor miopen_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - - checkAllSameType(c, {grad_output, input}); - checkAllSameGPU(c, {grad_output, input}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? 
/*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; - } - - Tensor grad_output_contig_t = grad_output->contiguous(memory_format); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input->contiguous(memory_format); - input_contig_t.resize_(input_contig_t.sizes(), memory_format); - TensorArg input_contig{ input_contig_t, "input", 2}; - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. - TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_miopen_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return grad_weight_t; -} - -//Depthwise backward weights. 
-void raw_miopen_depthwise_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenDepthwise; - - ConvolutionArgs args{ input, grad_output, grad_weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); - - miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardWeights( - args.handle, - &one, args.odesc.desc(), grad_output.data_ptr(), - args.idesc.desc(), input.data_ptr(), - args.cdesc.desc(), bwdFilterAlg, &zero, - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); -} - -Tensor miopen_depthwise_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - - checkAllSameType(c, {grad_output, input}); - checkAllSameGPU(c, {grad_output, input}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? 
/*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; - } - - Tensor grad_output_contig_t = grad_output->contiguous(memory_format); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input->contiguous(memory_format); - input_contig_t.resize_(input_contig_t.sizes(), memory_format); - TensorArg input_contig{ input_contig_t, "input", 2}; - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. - TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_miopen_depthwise_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return grad_weight_t; -} - -Tensor miopen_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_convolution_backward_weight( - "miopen_convolution_backward_weight", - weight_size, grad_output, input, - padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_transpose_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 
2 }; - return miopen_convolution_backward_weight( - "miopen_convolution_backward_weight", - weight_size, input, grad_output, - padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_depthwise_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_depthwise_convolution_backward_weight( - "miopen_depthwise_convolution_backward_weight", - weight_size, grad_output, input, - padding, stride, dilation, groups, benchmark, deterministic); -} - -// --------------------------------------------------------------------- -// -// Convolution backward (bias) -// -// --------------------------------------------------------------------- - -Tensor miopen_convolution_backward_bias( - const Tensor& grad_output_t) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }; - - // TODO: Workaround since MIOpen does not support NHWC bias - // See #64426 - std::vector discard_dims; - for( int i = 0; i < grad_output_t.dim(); i++ ) { - if(i != output_channels_dim ) { - discard_dims.push_back(i); - } - } - - Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) ); - if( outputBias.dim() == 0 ) { - // always return a tensor of shape [_] - return outputBias.unsqueeze(0); - } - else { - return outputBias; - } - -/* MIOpen does not support NHWC bias. Activate once support is added. 
- auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); - - TensorArg grad_bias{ grad_bias_t, "result", 0 }; - - TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), - static_cast(grad_output->dim())}; - TensorDescriptor odesc{*grad_output}; - - auto handle = getMiopenHandle(); - auto dataType = getMiopenDataType(*grad_bias); - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), - &zero, bdesc.desc(), grad_bias->data_ptr())); - return *grad_bias; -*/ -} - +REGISTER_CUDA_DISPATCH(miopen_convolution_backward_stub, &miopen_convolution_backward); +REGISTER_CUDA_DISPATCH(miopen_convolution_transpose_backward_stub, &miopen_convolution_transpose_backward); +REGISTER_CUDA_DISPATCH(miopen_depthwise_convolution_backward_stub, &miopen_depthwise_convolution_backward); }} // namespace diff --git a/aten/src/ATen/native/mkl/SparseBlasImpl.cpp b/aten/src/ATen/native/mkl/SparseBlasImpl.cpp index 87850473c9ba..79f52bd25609 100644 --- a/aten/src/ATen/native/mkl/SparseBlasImpl.cpp +++ b/aten/src/ATen/native/mkl/SparseBlasImpl.cpp @@ -27,7 +27,7 @@ c10::MaybeOwned prepare_dense_matrix_for_mkl( if (tensor.is_non_overlapping_and_dense() || is_blas_compatible_row_major_order(tensor) || is_blas_compatible_column_major_order(tensor)) { - return c10::MaybeOwned::borrowed(tensor); + return at::native::expect_resolved_conj(tensor); } else { return c10::MaybeOwned::owned( tensor.clone(at::MemoryFormat::Contiguous)); @@ -45,7 +45,7 @@ c10::MaybeOwned prepare_dense_matrix_for_mkl( const Tensor& tensor, bool row_major) { if (is_blas_compatible_row_major_order(tensor) && row_major) { - return c10::MaybeOwned::borrowed(tensor); + return at::native::expect_resolved_conj(tensor); } else { if (row_major) { return c10::MaybeOwned::owned( diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 
7ac1f6a06da1..7c01fe68bded 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3108,56 +3108,14 @@ dispatch: CUDA: miopen_convolution -- func: miopen_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - dispatch: - CUDA: miopen_convolution_backward_input - -- func: miopen_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - dispatch: - CUDA: miopen_convolution_backward - -- func: miopen_convolution_backward_bias(Tensor grad_output) -> Tensor - dispatch: - CUDA: miopen_convolution_backward_bias - -- func: miopen_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - dispatch: - CUDA: miopen_convolution_backward_weight - - func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? 
bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_convolution_transpose -# NB: output_padding not strictly needed here, but it's helpful for the float -# backwards -- func: miopen_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - dispatch: - CUDA: miopen_convolution_transpose_backward - -- func: miopen_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - dispatch: - CUDA: miopen_convolution_transpose_backward_input - -- func: miopen_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - dispatch: - CUDA: miopen_convolution_transpose_backward_weight - - func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? 
bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_depthwise_convolution -- func: miopen_depthwise_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - dispatch: - CUDA: miopen_depthwise_convolution_backward_input - -- func: miopen_depthwise_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - dispatch: - CUDA: miopen_depthwise_convolution_backward - -- func: miopen_depthwise_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - dispatch: - CUDA: miopen_depthwise_convolution_backward_weight - - func: miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? 
dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) dispatch: CUDA: miopen_rnn diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu index 42561155a8a8..8ee25426fea9 100644 --- a/aten/src/ATen/native/sparse/cuda/SoftMax.cu +++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/docs/source/conf.py b/docs/source/conf.py index a1633217fa8e..29387c27c270 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -208,7 +208,7 @@ coverage_missing_automodule = [ "torch.ao.ns.fx", "torch.ao.quantization", "torch.ao.quantization.fx", - "torch.ao.quantization.fx.backend_config_dict", + "torch.ao.quantization.fx.backend_config", "torch.ao.sparsity", "torch.ao.sparsity.experimental", "torch.ao.sparsity.experimental.pruner", diff --git a/docs/source/testing.rst b/docs/source/testing.rst index 9f1e2c3c53f8..213e82b9c4ca 100644 --- a/docs/source/testing.rst +++ b/docs/source/testing.rst @@ -3,8 +3,8 @@ torch.testing .. warning:: - This module is in a PROTOTYPE state. New functions are still being added, and the available functions may change in - future PyTorch releases. We are actively looking for feedback for UI/UX improvements or missing functionalities. + This module is a beta release, and its interfaces and functionality may change without warning in future + PyTorch releases. .. 
automodule:: torch.testing diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 46f1c3bb263f..167723f8f157 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -83,6 +83,16 @@ ALLOW_LIST = [ ("aten::hsplit", datetime.date(2021, 11, 20)), ("aten::dsplit", datetime.date(2021, 11, 20)), ("aten::_convolution_nogroup", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_backward", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_backward_bias", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_backward_input", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_backward_weight", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_transpose_backward", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_transpose_backward_input", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_transpose_backward_weight", datetime.date(9999, 1, 1)), + ("aten::miopen_depthwise_convolution_backward", datetime.date(9999, 1, 1)), + ("aten::miopen_depthwise_convolution_backward_input", datetime.date(9999, 1, 1)), + ("aten::miopen_depthwise_convolution_backward_weight", datetime.date(9999, 1, 1)), ("caffe2::", datetime.date(2021, 10, 23)), ("prepacked::unpack_prepacked_sizes_conv2d", datetime.date(9999, 1, 1)), ("prepacked::unpack_prepacked_sizes_linear", datetime.date(9999, 1, 1)), diff --git a/test/cpp/tensorexpr/CMakeLists.txt b/test/cpp/tensorexpr/CMakeLists.txt index 397b8b112aec..8fc5a0a18331 100644 --- a/test/cpp/tensorexpr/CMakeLists.txt +++ b/test/cpp/tensorexpr/CMakeLists.txt @@ -17,6 +17,7 @@ set(TENSOREXPR_TEST_SRCS ${TENSOREXPR_TEST_ROOT}/test_memdependency.cpp ${TENSOREXPR_TEST_ROOT}/test_ops.cpp ${TENSOREXPR_TEST_ROOT}/test_quantization.cpp + ${TENSOREXPR_TEST_ROOT}/test_memplanning.cpp ${TENSOREXPR_TEST_ROOT}/test_reductions.cpp 
${TENSOREXPR_TEST_ROOT}/test_registerizer.cpp ${TENSOREXPR_TEST_ROOT}/test_simplify.cpp diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index d0cfbe3a9051..dd8950e8efa1 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -1901,7 +1901,8 @@ TEST(LoopNest, LoopNestComputeAt_1) { std::vector loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); l.prepareForCodegen(); - StmtPtr s = l.root_stmt(); + SimpleIREvaluator cg(l.root_stmt(), {B, N}); + StmtPtr s = cg.stmt(); checkIR(s, R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[1] @@ -1913,7 +1914,6 @@ TEST(LoopNest, LoopNestComputeAt_1) { // Now check that the loop still produces the correct result. std::vector b_data(100, 0); - SimpleIREvaluator cg(s, {B, N}); cg.call({b_data, 100}); std::vector b_ref(100, 0); @@ -1967,7 +1967,8 @@ TEST(LoopNest, LoopNestComputeAt_2) { std::vector loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(p), loops[0]); l.prepareForCodegen(); - StmtPtr s = l.root_stmt(); + SimpleIREvaluator cg(l.root_stmt(), {c, W, H}); + StmtPtr s = cg.stmt(); // Check the IR we produced checkIR(s, R"IR( @@ -1982,7 +1983,6 @@ TEST(LoopNest, LoopNestComputeAt_2) { // Now check that the loop still produces the correct result. std::vector c_data(kW * kH, 0); - SimpleIREvaluator cg(s, {c, W, H}); cg.call({c_data, kW, kH}); assertAllEqual(c_data, c_ref); @@ -1993,7 +1993,8 @@ TEST(LoopNest, LoopNestComputeAt_2) { std::vector loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(p), loops[1]); l.prepareForCodegen(); - StmtPtr s = l.root_stmt(); + SimpleIREvaluator cg(l.root_stmt(), {c, W, H}); + StmtPtr s = cg.stmt(); // Check the IR we produced checkIR(s, R"IR( @@ -2008,7 +2009,6 @@ TEST(LoopNest, LoopNestComputeAt_2) { // Now check that the loop still produces the correct result. 
std::vector c_data(kW * kH, 0); - SimpleIREvaluator cg(s, {c, W, H}); cg.call({c_data, kW, kH}); assertAllEqual(c_data, c_ref); @@ -2063,7 +2063,8 @@ TEST(LoopNest, LoopNestComputeAt_3) { std::vector loops = l.getAllLoopNestsWritingToBuf(D.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); l.prepareForCodegen(); - StmtPtr s = l.root_stmt(); + SimpleIREvaluator cg(l.root_stmt(), {D, W, H}); + StmtPtr s = cg.stmt(); // Check the IR we produced checkIR(s, R"IR( @@ -2083,7 +2084,6 @@ TEST(LoopNest, LoopNestComputeAt_3) { // Now check that the loop still produces the correct result. std::vector c_data(kW * kH, 0); - SimpleIREvaluator cg(s, {D, W, H}); cg.call({c_data, kW, kH}); assertAllEqual(c_data, c_ref); @@ -2094,7 +2094,8 @@ TEST(LoopNest, LoopNestComputeAt_3) { std::vector loops = l.getAllLoopNestsWritingToBuf(D.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[1]); l.prepareForCodegen(); - StmtPtr s = l.root_stmt(); + SimpleIREvaluator cg(l.root_stmt(), {D, W, H}); + StmtPtr s = cg.stmt(); // Check the IR we produced checkIR(s, R"IR( @@ -2114,7 +2115,6 @@ TEST(LoopNest, LoopNestComputeAt_3) { // Now check that the loop still produces the correct result. std::vector c_data(kW * kH, 0); - SimpleIREvaluator cg(s, {D, W, H}); cg.call({c_data, kW, kH}); assertAllEqual(c_data, c_ref); @@ -2174,7 +2174,8 @@ TEST(LoopNest, Reduce2dComputeAt) { // l.simplify(); l.eliminateDeadStores(); l.prepareForCodegen(); - checkIR(l.root_stmt(), R"IR( + SimpleIREvaluator cg(l.root_stmt(), {c, W, H}); + checkIR(cg.stmt(), R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[2, W + 1] # CHECK: for (int cy = 0; cy < H; cy++) { # CHECK: for (int idx0 = 0; idx0 < 2; idx0++) { @@ -2193,11 +2194,9 @@ TEST(LoopNest, Reduce2dComputeAt) { # CHECK: } # CHECK: Free(temp); )IR"); - StmtPtr s = l.root_stmt(); // Now check that the loop still produces the correct result. 
std::vector c_data(kW * kH, 0); - SimpleIREvaluator cg(s, {c, W, H}); cg.call({c_data, kW, kH}); assertAllEqual(c_data, c_ref); } @@ -2209,7 +2208,8 @@ TEST(LoopNest, Reduce2dComputeAt) { l.simplify(); l.eliminateDeadStores(); l.prepareForCodegen(); - checkIR(l.root_stmt(), R"IR( + SimpleIREvaluator cg(l.root_stmt(), {c, W, H}); + checkIR(cg.stmt(), R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[2, 2] # CHECK: for (int cy = 0; cy < H; cy++) { # CHECK: for (int cx = 0; cx < W; cx++) { @@ -2228,11 +2228,9 @@ TEST(LoopNest, Reduce2dComputeAt) { # CHECK: } # CHECK: Free(temp); )IR"); - StmtPtr s = l.root_stmt(); // Now check that the loop still produces the correct result. std::vector c_data(kW * kH, 0); - SimpleIREvaluator cg(s, {c, W, H}); cg.call({c_data, kW, kH}); assertAllEqual(c_data, c_ref); } @@ -3737,11 +3735,13 @@ TEST(LoopNest, CacheReadsSimple) { l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + SimpleIREvaluator cg(result, {B, C}); + result = cg.stmt(); // just this once: verify the whole thing. 
checkIR(result, R"IR( -#CHECK: Allocate(A_local); // dtype=int, dims=[1, 10] #CHECK: Allocate(A); // dtype=int, dims=[64, 64] +#CHECK: Allocate(A_local); // dtype=int, dims=[1, 10] #CHECK: for (int i #CHECK: for (int j #CHECK: A[ @@ -3760,13 +3760,12 @@ TEST(LoopNest, CacheReadsSimple) { #CHECK: C[ #CHECK: } #CHECK: } -#CHECK: Free(A); #CHECK: Free(A_local); +#CHECK: Free(A); )IR"); std::vector b_data(200, 0); std::vector c_data(200, 0); - SimpleIREvaluator cg(l.root_stmt(), {B, C}); cg.call({b_data, c_data}); std::vector b_ref(200, 0); @@ -3803,6 +3802,8 @@ TEST(LoopNest, CacheReadsOuter) { l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + SimpleIREvaluator cg(result, {B, C}); + result = cg.stmt(); checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[21, 11] @@ -3812,7 +3813,6 @@ TEST(LoopNest, CacheReadsOuter) { std::vector b_data(200, 0); std::vector c_data(200, 0); - SimpleIREvaluator cg(l.root_stmt(), {B, C}); cg.call({b_data, c_data}); std::vector b_ref(200, 0); @@ -3848,6 +3848,8 @@ TEST(LoopNest, CacheReadsInternal) { LoopNest::cacheAccesses(A.buf(), "A_local", j_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + SimpleIREvaluator cg(result, {B, C}); + result = cg.stmt(); checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[2, 11] @@ -3857,7 +3859,6 @@ TEST(LoopNest, CacheReadsInternal) { std::vector b_data(200, 0); std::vector c_data(200, 0); - SimpleIREvaluator cg(l.root_stmt(), {B, C}); cg.call({b_data, c_data}); std::vector b_ref(200, 0); @@ -3894,6 +3895,8 @@ TEST(LoopNest, CacheReadsInner) { LoopNest::cacheAccesses(A.buf(), "A_local", body); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + SimpleIREvaluator cg(result, {B, C}); + result = cg.stmt(); checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[5, 2] @@ -3903,7 +3906,6 @@ TEST(LoopNest, CacheReadsInner) { std::vector b_data(200, 0); std::vector 
c_data(200, 0); - SimpleIREvaluator cg(l.root_stmt(), {B, C}); cg.call({b_data, c_data}); std::vector b_ref(200, 0); @@ -3940,6 +3942,8 @@ TEST(LoopNest, CacheWritesSimple) { l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + SimpleIREvaluator cg(result, {B, C}); + result = cg.stmt(); checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[1, 64] @@ -3953,7 +3957,6 @@ TEST(LoopNest, CacheWritesSimple) { std::vector b_data(200, 0); std::vector c_data(200, 0); - SimpleIREvaluator cg(l.root_stmt(), {B, C}); cg.call({b_data, c_data}); std::vector b_ref(200, 0); diff --git a/test/cpp/tensorexpr/test_memdependency.cpp b/test/cpp/tensorexpr/test_memdependency.cpp index d3ac6f4a5bd0..7019353937b7 100644 --- a/test/cpp/tensorexpr/test_memdependency.cpp +++ b/test/cpp/tensorexpr/test_memdependency.cpp @@ -3021,10 +3021,11 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) { } loop.prepareForCodegen(); + SimpleIREvaluator cg(loop.root_stmt(), {AP, BP, CT}); // now check lowered dependency graph. 
{ - StmtPtr stmt = IRSimplifier::simplify(loop.root_stmt()); + StmtPtr stmt = IRSimplifier::simplify(cg.stmt()); stmt->accept(&analyzer_lowered); // Lowering will change the dimensionality of all bounds due to index diff --git a/test/cpp/tensorexpr/test_memplanning.cpp b/test/cpp/tensorexpr/test_memplanning.cpp new file mode 100644 index 000000000000..ec58aa8f6668 --- /dev/null +++ b/test/cpp/tensorexpr/test_memplanning.cpp @@ -0,0 +1,510 @@ +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +using namespace torch::jit::tensorexpr; + +extern void checkIR(StmtPtr s, const std::string& pattern); + +TEST(BufLiveRange, SingleRangeLine) { + VarHandle i("i", kInt), j("j", kInt); + BufHandle a("a", {32}, kFloat); + BufHandle b("b", {32, 32}, kFloat); + + // Construct Stmt: + // { + // for (int i = 0; i < 32; i++) { + // a[i] = 0; + // for (int j = 0; j < 32; j++) { + // a[i] = (a[i]) + (b[i, j]); + // } + // } + // } + + StorePtr aInit = Store::make(a, {i}, 0); + ExprHandle reduce = a.load({i}) + b.load({i, j}); + StorePtr aReduce = Store::make(a, {i}, reduce); + StmtPtr loop = + For::make(i, 0, 32, Block::make({aInit, For::make(j, 0, 32, aReduce)})); + + StmtPtr stmt = Block::make({loop}); + + auto range = BufLiveRange::liveRange(stmt, a.node()); + ASSERT_TRUE(std::get<0>(range) == 0); + ASSERT_TRUE(std::get<1>(range) == 0); +} + +TEST(BufLiveRange, MulRangeLine) { + VarHandle i("i", kInt); + BufHandle a("a", {32}, kFloat); + BufHandle b("b", {32}, kFloat); + + // Construct Stmt: + // { + // for (int i = 0; i < 32; i++) { + // if (i<10 ? 1 : 0) { + // a[i] = i + i; + // b[i] = i * i; + // } + // } + // for (int i = 0; i < 32; i++) { + // if (i>10 ? 
1 : 0) { + // a[i] = i * i; + // b[i] = i + i; + // } + // } + // } + + StorePtr aStore_1 = Store::make(a, {i}, i + i); + StorePtr bStore_1 = Store::make(b, {i}, i * i); + StmtPtr loop_1 = For::make( + i, 0, 32, Cond::make(i < 10, Block::make({aStore_1, bStore_1}), NULL)); + + StorePtr aStore_2 = Store::make(a, {i}, i * i); + StorePtr bStore_2 = Store::make(b, {i}, i + i); + StmtPtr loop_2 = For::make( + i, 0, 32, Cond::make(i > 10, Block::make({aStore_2, bStore_2}), NULL)); + + StmtPtr stmt = Block::make({loop_1, loop_2}); + + auto range_a = BufLiveRange::liveRange(stmt, a.node()); + ASSERT_TRUE(std::get<0>(range_a) == 0); + ASSERT_TRUE(std::get<1>(range_a) == 1); + + auto range_b = BufLiveRange::liveRange(stmt, b.node()); + ASSERT_TRUE(std::get<0>(range_b) == 0); + ASSERT_TRUE(std::get<1>(range_b) == 1); +} + +TEST(MemPlanning, SameBufSizeMemReuse) { + int M = 1024; + int N = 1024; + int K = 2048; + + BufHandle AP("A", {M, K}, kFloat); + BufHandle BP("B", {K, N}, kFloat); + + Tensor CT = Reduce( + "gemm", + {{M, "M"}, {N, "N"}}, + Sum(), + [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { + return AP.load(m, k) * BP.load(k, n); + }, + {{K, "K"}}); + Tensor DT = Compute( + "relu", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + auto zero = Cast::make(CT.buf()->dtype(), 0); + return CompareSelect::make( + CT.load(m, n), zero, zero, CT.load(m, n), kLT); + }); + Tensor ET = Compute( + "add", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return DT.load(m, n) + DT.load(m, n); + }); + Tensor FT = Compute( + "mul", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return ET.load(m, n) * ET.load(m, n); + }); + auto stmt = Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()}); + + // Constructed stmt: + // Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2], + // add [2, 3] Buffer 'gemm' and 'add' are the same size; we'll reuse 'gemm' + // for 
'add'. + //{ + // for (int M = 0; M < 1024; M++) { + // for (int N = 0; N < 1024; N++) { + // gemm[M, N] = float(0); + // for (int K = 0; K < 2048; K++) { + // gemm[M, N] = ReduceOp((gemm[M, N]) + (A[M, K]) * (B[K, N]), + // reduce_args={K}); + // } + // } + // } + // for (int M_1 = 0; M_1 < 1024; M_1++) { + // for (int N_1 = 0; N_1 < 1024; N_1++) { + // relu[M_1, N_1] = (gemm[M_1, N_1])dtype(), 0); + return CompareSelect::make( + CT.load(m, n), zero, zero, CT.load(m, n), kLT); + }); + Tensor ET = Compute( + "add", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return DT.load(m, n) + DT.load(m, n); + }); + Tensor FT = Compute( + "mul", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return ET.load(m, n) * ET.load(m, n); + }); + Tensor GT = Compute( + "sub", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return FT.load(m, n) - ET.load(m, n); + }); + + auto stmt = + Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt(), GT.stmt()}); + + // Constructed stmt: + // Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2], + // add [2, 3], mul [3, 4] Buffer 'gemm', 'relu, ''add' and 'mul' are the same + // size; we'll reuse 'gemm' for 'add', and reuse 'relu' for 'mul' + //{ + // for (int M = 0; M < 1024; M++) { + // for (int N = 0; N < 1024; N++) { + // gemm[M, N] = float(0); + // for (int K = 0; K < 2048; K++) { + // gemm[M, N] = ReduceOp((gemm[M, N]) + (A[M, K]) * (B[K, N]), + // reduce_args={K}); + // } + // } + // } + // for (int M_1 = 0; M_1 < 1024; M_1++) { + // for (int N_1 = 0; N_1 < 1024; N_1++) { + // relu[M_1, N_1] = (gemm[M_1, N_1])dtype(), 0); + return CompareSelect::make( + CT.load(m, n), zero, zero, CT.load(m, n), kLT); + }); + Tensor ET = Compute( + "add", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return DT.load(m, n) + DT.load(m, n); + }); + Tensor FT = Compute( + "mul", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& 
m, const ExprHandle& n) { + return ET.load(m, n) * ET.load(m, n); + }); + Tensor GT = Compute( + "sub", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return FT.load(m, n) - 1; + }); + Tensor HT = Compute( + "div", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return GT.load(m, n) / 2; + }); + + auto stmt = Block::make( + {CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt(), GT.stmt(), HT.stmt()}); + + // Constructed stmt: + // Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2], + // add [2, 3], mul [3, 4], sub [4, 5] Buffer 'gemm', 'relu, ''add', 'mul' and + // 'sub' are the same size; we'll reuse 'gemm' for 'add', reuse 'relu' for + // 'mul', and reuse 'gemm' for 'sub'. + //{ + // for (int M = 0; M < 1024; M++) { + // for (int N = 0; N < 1024; N++) { + // gemm[M, N] = float(0); + // for (int K = 0; K < 2048; K++) { + // gemm[M, N] = ReduceOp((gemm[M, N]) + (A[M, K]) * (B[K, N]), + // reduce_args={K}); + // } + // } + // } + // for (int M_1 = 0; M_1 < 1024; M_1++) { + // for (int N_1 = 0; N_1 < 1024; N_1++) { + // relu[M_1, N_1] = (gemm[M_1, N_1])dtype(), 0); + return CompareSelect::make( + CT.load(m, n), zero, zero, CT.load(m, n), kLT); + }); + Tensor ET = Compute( + "add", + {{M * 2, "EM"}, {N * 2, "EN"}}, + [&](const ExprHandle& em, const ExprHandle& en) { + return DT.load(em / 2, en / 2) + DT.load(em / 2, en / 2); + }); + Tensor FT = Compute( + "mul", + {{M * 2, "FM"}, {N * 2, "FN"}}, + [&](const ExprHandle& fm, const ExprHandle& fn) { + return ET.load(fm, fn) * ET.load(fm, fn); + }); + auto stmt = Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()}); + + // Constructed stmt: + // Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2], + // add [2, 3] We do not reuse buffer 'gemm' for 'add' because the size of + // buffer 'gemm' is smaller. 
+ //{ + // for (int M = 0; M < 1024; M++) { + // for (int N = 0; N < 1024; N++) { + // gemm[M, N] = float(0); + // for (int K = 0; K < 2048; K++) { + // gemm[M, N] = ReduceOp((gemm[M, N]) + (A[M, K]) * (B[K, N]), + // reduce_args={K}); + // } + // } + // } + // for (int M_1 = 0; M_1 < 1024; M_1++) { + // for (int N_1 = 0; N_1 < 1024; N_1++) { + // relu[M_1, N_1] = (gemm[M_1, N_1])= 5 +def _fake_filter_fn_constant(constant, data): + return data >= constant + + def _worker_init_fn(worker_id): random.seed(123) @@ -669,10 +677,12 @@ class TestFunctionalIterDataPipe(TestCase): arr = range(10) picklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [ (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (), {}), - (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (_fake_fn, (0, ), {'test': True}), {}), + (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (_fake_fn, (0, )), {}), + (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (partial(_fake_add, 1), (0,)), {}), (dp.iter.Collator, dp.iter.IterableWrapper(arr), (), {}), - (dp.iter.Collator, dp.iter.IterableWrapper(arr), (_fake_fn, (0, ), {'test': True}), {}), - (dp.iter.Filter, dp.iter.IterableWrapper(arr), (_fake_filter_fn, (0, ), {'test': True}), {}), + (dp.iter.Collator, dp.iter.IterableWrapper(arr), (_fake_fn, (0, )), {}), + (dp.iter.Filter, dp.iter.IterableWrapper(arr), (_fake_filter_fn, (0, )), {}), + (dp.iter.Filter, dp.iter.IterableWrapper(arr), (partial(_fake_filter_fn, 5), (0,)), {}), ] for dpipe, input_dp, dp_args, dp_kwargs in picklable_datapipes: p = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] @@ -1035,9 +1045,6 @@ class TestFunctionalIterDataPipe(TestCase): pass traverse(dp2) # This should not raise any error either - - - @suppress_warnings # Suppress warning for lambda fn def test_map_datapipe(self): input_dp = dp.iter.IterableWrapper(range(10)) @@ -1050,12 +1057,6 @@ class TestFunctionalIterDataPipe(TestCase): for x, y in zip(map_dp, input_dp): 
self.assertEqual(x, torch.tensor(y, dtype=torch.float)) - map_dp = input_dp.map(fn=fn, fn_args=(torch.int, ), fn_kwargs={'sum': True}) - self.assertEqual(len(input_dp), len(map_dp)) - for x, y in zip(map_dp, input_dp): - self.assertEqual(x, torch.tensor(y, dtype=torch.int).sum()) - - from functools import partial map_dp = input_dp.map(partial(fn, dtype=torch.int, sum=True)) self.assertEqual(len(input_dp), len(map_dp)) for x, y in zip(map_dp, input_dp): @@ -1333,7 +1334,6 @@ class TestFunctionalIterDataPipe(TestCase): _helper(batch_size=3, drop_last=True, batch_num=2, sort_key=_sort_fn) _helper(batch_size=3, drop_last=True, batch_num=2, bucket_num=2, sort_key=_sort_fn) - def test_filter_datapipe(self): input_ds = dp.iter.IterableWrapper(range(10)) @@ -1342,11 +1342,11 @@ class TestFunctionalIterDataPipe(TestCase): return data >= val return True - filter_dp = input_ds.filter(filter_fn=_filter_fn, fn_args=(5, )) + filter_dp = input_ds.filter(partial(_filter_fn, val=5)) for data, exp in zip(filter_dp, range(10)): self.assertEqual(data, exp) - filter_dp = input_ds.filter(filter_fn=_filter_fn, fn_kwargs={'val': 5, 'clip': True}) + filter_dp = input_ds.filter(partial(_filter_fn, val=5, clip=True)) for data, exp in zip(filter_dp, range(5, 10)): self.assertEqual(data, exp) @@ -1427,7 +1427,8 @@ class TestFunctionalMapDataPipe(TestCase): Tuple[Type[MapDataPipe], MapDataPipe, Tuple, Dict[str, Any]] ] = [ (dp.map.Mapper, dp.map.SequenceWrapper(arr), (), {}), - (dp.map.Mapper, dp.map.SequenceWrapper(arr), (_fake_fn, (0,), {'test': True}), {}), + (dp.map.Mapper, dp.map.SequenceWrapper(arr), (_fake_fn, (0,)), {}), + (dp.map.Mapper, dp.map.SequenceWrapper(arr), (partial(_fake_add, 1), (0,)), {}), ] for dpipe, input_dp, dp_args, dp_kwargs in picklable_datapipes: p = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] @@ -1540,7 +1541,6 @@ class TestFunctionalMapDataPipe(TestCase): shuffler_dp = input_dp1.shuffle() self.assertEqual(10, len(shuffler_dp)) - 
def test_map_datapipe(self): arr = range(10) input_dp = dp.map.SequenceWrapper(arr) @@ -1556,15 +1556,6 @@ class TestFunctionalMapDataPipe(TestCase): map_dp[index], torch.tensor(input_dp[index], dtype=torch.float) ) - map_dp = input_dp.map(fn=fn, fn_args=(torch.int,), fn_kwargs={'sum': True}) - self.assertEqual(len(input_dp), len(map_dp)) - for index in arr: - self.assertEqual( - map_dp[index], torch.tensor(input_dp[index], dtype=torch.int).sum() - ) - - from functools import partial - map_dp = input_dp.map(partial(fn, dtype=torch.int, sum=True)) self.assertEqual(len(input_dp), len(map_dp)) for index in arr: diff --git a/test/test_nn.py b/test/test_nn.py index bbb686a843a2..94d0854f6499 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -1834,11 +1834,128 @@ class TestNN(NNTestCase): parameters.pop('p4') check() + # Check reverse works + forward = list(iter(parameter_dict)) + backward = list(reversed(parameter_dict)) + self.assertEqual(len(forward), len(backward)) + n = len(forward) + for i in range(n): + self.assertIs(forward[i], backward[n - i - 1]) + check() + + # Check copy works + copy = parameter_dict.copy() + + # Check all keys are present and have shallow copied values + for key in parameter_dict: + self.assertTrue(key in copy) + self.assertEqual(parameter_dict[key], copy[key]) + self.assertIs(parameter_dict[key], copy[key]) + check() + + parameter_dict["p20"] = Parameter(torch.randn(10, 10)) + copy["p21"] = Parameter(torch.randn(9, 10)) + + self.assertTrue("p20" in parameter_dict) + self.assertFalse("p20" in copy) + self.assertFalse("p21" in parameter_dict) + self.assertTrue("p21" in copy) + parameter_dict.pop("p20") + check() + + p = Parameter(torch.randn(10, 10)) + parameter_dict['p12'] = p + p_popitem = parameter_dict.popitem() + self.assertEqual(p_popitem[0], 'p12') + self.assertIs(p_popitem[1], p) + + # Unit test for set_default + # 1. 
Ensure parameter is correctly inserted when + # the key is not present in `ParameterDict` + assert 'p11' not in parameter_dict + parameters['p11'] = Parameter(torch.randn(10, 10)) + p_setdefault = parameter_dict.setdefault('p11', parameters['p11']) + self.assertIs(p_setdefault, parameters['p11']) + # 2. Ensure parameter is NOT inserted when the + # key is already present in `ParameterDict` + p = Parameter(torch.randn(10, 10)) + self.assertFalse(parameter_dict.setdefault('p11', p) is p) + # 3. Ensure `None` is inserted when the key is not + # present in `Parameter` and parameter is not specified + self.assertIs(parameter_dict.setdefault('p26'), None) + del parameter_dict['p26'] + check() + + parameters2 = OrderedDict([ + ('p13', Parameter(torch.randn(10, 10))), + ('p2', Parameter(torch.randn(10, 10))), + ('p3', Parameter(torch.randn(10, 10))), + ]) + parameter_dict2 = nn.ParameterDict(parameters2) + parameters.update(parameters2) + parameter_dict |= parameter_dict2 + check() + + parameters2 = OrderedDict() + parameter_dict2 = nn.ParameterDict(parameters2) + parameters.update(parameters2) + parameter_dict |= parameter_dict2 + check() + + parameters2 = OrderedDict([ + ('p14', Parameter(torch.randn(10, 10))), + ('p15', Parameter(torch.randn(10, 10))), + ('p13', Parameter(torch.randn(10, 10))), + ]) + parameter_dict2 = nn.ParameterDict(parameters2) + parameters.update(parameters2) + parameter_dict |= parameter_dict2 + check() + + # Check __or__ and __ror__ works + parameters2 = OrderedDict([ + ('p20', Parameter(torch.randn(10, 10))), + ('p21', Parameter(torch.randn(10, 10))), + ('p22', Parameter(torch.randn(10, 10))), + ]) + parameter_dict2 = nn.ParameterDict(parameters2) + parameters.update(parameters2) + parameter_dict = parameter_dict | parameter_dict2 + check() + + parameters2 = OrderedDict([ + ('p23', Parameter(torch.randn(10, 10))), + ('p24', Parameter(torch.randn(10, 10))), + ('p25', Parameter(torch.randn(10, 10))), + ]) + parameter_dict2 = 
nn.ParameterDict(parameters2) + parameters2.update(parameters) + parameters = parameters2 + parameter_dict = parameter_dict2 | parameter_dict + check() + + parameters['p17'] = Parameter(torch.randn(10, 10)) + parameter_dict['p17'] = parameters['p17'] + self.assertIs(parameters['p17'], parameter_dict.get('p17')) + temp_param = Parameter(torch.randn(10, 10)) + self.assertIs(parameters['p17'], parameter_dict.get('p17', temp_param)) + self.assertIs(None, parameter_dict.get('p18')) + self.assertIs(temp_param, parameter_dict.get('p18', temp_param)) + check() + parameter_dict.clear() self.assertEqual(len(parameter_dict), 0) parameters.clear() check() + parameter_dict2 = parameter_dict.fromkeys(['p19', 'p20']) + self.assertEqual({'p19': None, 'p20': None}, parameter_dict2) + check() + + parameter_dict2 = parameter_dict.fromkeys(['p19', 'p20'], temp_param) + self.assertEqual({'p19': temp_param, 'p20': temp_param}, parameter_dict2) + check() + def test_add_module(self): methods_to_test = ['add_module', 'register_module'] for fn in methods_to_test: diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py index b2ef98aba037..c78d713f003e 100644 --- a/test/test_sparse_csr.py +++ b/test/test_sparse_csr.py @@ -603,11 +603,11 @@ class TestSparseCSR(TestCase): ), shape=a.shape, ) - expected = alpha * (a_bsr * b.cpu().numpy()) + beta * c.cpu().numpy() + expected = alpha * (a_bsr * b.cpu().resolve_conj().numpy()) + beta * c.cpu().numpy() self.assertEqual(actual, out) self.assertEqual(actual, expected) - @onlyCUDA + @skipCPUIfNoMklSparse @unittest.skipIf(not TEST_SCIPY, "SciPy not found") @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_block_addmm(self, device, dtype): @@ -623,7 +623,7 @@ class TestSparseCSR(TestCase): for op_b, op_out in itertools.product([True, False], repeat=2): self.run_test_block_addmm_addmv(torch.addmm, c, a, b, op_b, op_out, dtype=dtype, device=device) - @onlyCUDA + @skipCPUIfNoMklSparse @unittest.skipIf(not TEST_SCIPY, 
"SciPy not found") @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_block_addmv(self, device, dtype): diff --git a/third_party/cuda.BUILD b/third_party/cuda.BUILD index 0c58b34a52e7..a948415f9138 100644 --- a/third_party/cuda.BUILD +++ b/third_party/cuda.BUILD @@ -1,43 +1,76 @@ -""" -Collect all the CUDA stuff from @local_config_cuda in a single target -for convenience. -""" +# Adopted from: https://github.com/tensorflow/runtime/blob/master/third_party/rules_cuda/private/BUILD.local_cuda +# Library targets are created corresponding to BUILD.bazel's needs. + +cc_library( + name = "cuda_headers", + hdrs = glob([ + "include/**", + "targets/x86_64-linux/include/**", + ]), + includes = [ + "include", + "targets/x86_64-linux/include", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "cuda_driver", + srcs = ["lib64/stubs/libcuda.so"], + visibility = ["//visibility:public"], +) cc_library( name = "cuda", + srcs = ["targets/x86_64-linux/lib/libcudart.so"], visibility = ["//visibility:public"], - deps = [ - "@local_config_cuda//cuda:cublas", - "@local_config_cuda//cuda:cuda_driver", - "@local_config_cuda//cuda:cuda_headers", - "@local_config_cuda//cuda:cudart", - "@local_config_cuda//cuda:cufft", - "@local_config_cuda//cuda:curand", - ], + deps = [":cuda_headers"], ) cc_library( - name = "cupti", - deps = [ - "@local_config_cuda//cuda:cupti_headers", - "@local_config_cuda//cuda:cupti_link", - ], + name = "cufft", + srcs = ["targets/x86_64-linux/lib/libcufft.so"], + visibility = ["//visibility:public"], ) -[ - alias( - name = lib, - actual = "@local_config_cuda//cuda:{}".format(lib), - visibility = ["//visibility:public"], - ) - for lib in [ - "cublas", - "cufft", - "cusolver", - "cusparse", - "curand", - "nvrtc", - "cuda_driver", - "nvToolsExt", - ] -] +cc_library( + name = "cublas", + srcs = [ + "targets/x86_64-linux/lib/libcublasLt.so", + "targets/x86_64-linux/lib/libcublas.so", + ], + visibility = 
["//visibility:public"], +) + +cc_library( + name = "curand", + srcs = ["targets/x86_64-linux/lib/libcurand.so"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "cusolver", + srcs = ["targets/x86_64-linux/lib/libcusolver.so"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "cusparse", + srcs = ["targets/x86_64-linux/lib/libcusparse.so"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "nvrtc", + srcs = [ + "targets/x86_64-linux/lib/libnvrtc.so", + "targets/x86_64-linux/lib/libnvrtc-builtins.so", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "nvToolsExt", + srcs = [ "lib64/libnvToolsExt.so"], + visibility = ["//visibility:public"], +) diff --git a/third_party/cudnn.BUILD b/third_party/cudnn.BUILD new file mode 100644 index 000000000000..03736508a68c --- /dev/null +++ b/third_party/cudnn.BUILD @@ -0,0 +1,26 @@ +# Adopted from: https://github.com/NVIDIA/TRTorch/blob/master/third_party/cudnn/local/BUILD + +cc_library( + name = "cudnn_headers", + hdrs = ["include/cudnn.h"] + glob([ + "include/cudnn+.h", + "include/cudnn_*.h", + ]), + includes = ["include/"], + visibility = ["//visibility:private"], +) + +cc_import( + name = "cudnn_lib", + shared_library = "lib/x86_64-linux-gnu/libcudnn.so", + visibility = ["//visibility:private"], +) + +cc_library( + name = "cudnn", + visibility = ["//visibility:public"], + deps = [ + "cudnn_headers", + "cudnn_lib", + ], +) diff --git a/third_party/gloo.BUILD b/third_party/gloo.BUILD index 5db68095fee3..3f623e54e6ad 100644 --- a/third_party/gloo.BUILD +++ b/third_party/gloo.BUILD @@ -48,8 +48,8 @@ cc_library( cu_library( name = "gloo_cuda", srcs = [ - "gloo/cuda.cu.cc", - "gloo/cuda_private.cu.cc", + "gloo/cuda.cu", + "gloo/cuda_private.cu", ], visibility = ["//visibility:public"], deps = [ @@ -72,8 +72,8 @@ cc_library( "gloo/cuda*.cc", "gloo/common/win.cc", "gloo/rendezvous/redis_store.cc", - ], - ), + ] + ) + if_cuda(glob(["gloo/cuda*.cc"])), copts = 
[ "-std=gnu++11", "-std=c++11", diff --git a/third_party/substitution.bzl b/third_party/substitution.bzl index db376ebfe52b..7b14b3c8a1c3 100644 --- a/third_party/substitution.bzl +++ b/third_party/substitution.bzl @@ -58,7 +58,7 @@ def header_template_rule_impl(ctx): CcInfo(compilation_context = cc_common.create_compilation_context( # pass out the include path for finding this header - includes = depset([ctx.outputs.out.dirname, ctx.bin_dir.path]), + system_includes = depset([ctx.attr.include, ctx.outputs.out.dirname, ctx.bin_dir.path]), # and the actual header here. headers = depset([ctx.outputs.out]), @@ -68,6 +68,7 @@ def header_template_rule_impl(ctx): header_template_rule = rule( attrs = { "out": attr.output(mandatory = True), + "include": attr.string(), "src": attr.label( mandatory = True, allow_single_file = True, diff --git a/third_party/tensorflow_cuda_bazel_build/BUILD b/third_party/tensorflow_cuda_bazel_build/BUILD deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/third_party/tensorflow_cuda_bazel_build/README.md b/third_party/tensorflow_cuda_bazel_build/README.md deleted file mode 100644 index 439e195d8e44..000000000000 --- a/third_party/tensorflow_cuda_bazel_build/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Config for CUDA - -This is a checked-in copy of the auto-generated config for building CUDA code with bazel. The content of this folder was generated from https://github.com/tensorflow/tensorflow `./configure` execution and then edited manually to fit the pytorch needs. - -The LICENSE for the TensorFlow project is APACHE 2. The full LICENSE file could be found here https://github.com/tensorflow/tensorflow/blob/master/LICENSE. 
diff --git a/third_party/tensorflow_cuda_bazel_build/WORKSPACE b/third_party/tensorflow_cuda_bazel_build/WORKSPACE deleted file mode 100644 index 59369ce679c1..000000000000 --- a/third_party/tensorflow_cuda_bazel_build/WORKSPACE +++ /dev/null @@ -1 +0,0 @@ -workspace(name = "local_config_cuda") diff --git a/third_party/tensorflow_cuda_bazel_build/cuda/BUILD b/third_party/tensorflow_cuda_bazel_build/cuda/BUILD deleted file mode 100755 index f7271af2750b..000000000000 --- a/third_party/tensorflow_cuda_bazel_build/cuda/BUILD +++ /dev/null @@ -1,451 +0,0 @@ -licenses([ - "restricted", - "reciprocal", - "notice", -]) # MPL2, portions GPL v3, LGPL v3, BSD-like - -package(default_visibility = ["//visibility:public"]) - -config_setting( - name = "using_nvcc", - values = { - "define": "using_cuda_nvcc=true", - }, -) - -config_setting( - name = "using_clang", - values = { - "define": "using_cuda_clang=true", - }, -) - -# Equivalent to using_clang && -c opt. -config_setting( - name = "using_clang_opt", - values = { - "define": "using_cuda_clang=true", - "compilation_mode": "opt", - }, -) - -config_setting( - name = "darwin", - values = {"cpu": "darwin"}, -) - -cc_library( - name = "cuda_headers", - hdrs = [ - ":cuda-include", - ":cudnn-include", - ], - includes = [ - ".", - "include", - ], -) - -cc_library( - name = "cudnn_headers", - hdrs = [ - ":cudnn-include", - ], - includes = [ - ".", - "include", - ], -) - -cc_library( - name = "cudart_static", - linkopts = [ - "-L/usr/local/cuda/lib64", - ], -) - -cc_library( - name = "cuda_driver", - linkopts = ["-lcuda"], - deps = [":linker_search_path"], -) - -# Provides the RPATH for Nvidia-less sytems to be able to run binaries linked to libcuda. 
-cc_library( - name = "driver_stub_runtime", - linkopts = [ - "-Wl,-rpath,/usr/local/cuda/lib64/stubs", - ], - deps = [":cuda_driver"], -) - -cc_library( - name = "linker_search_path", - linkopts = [ - "-L/usr/local/cuda/lib64", - "-L/usr/local/cuda/lib64/stubs", - "-Wl,-rpath-link,/usr/local/cuda/lib64", - "-Wl,-rpath-link,/usr/local/cuda/lib64/stubs", - ], -) - -[ - cc_library( - name = libname, - linkopts = ["-l" + libname] + (["-lgomp"] if (libname == "cusolver") else []), - linkstatic = True, - deps = [":linker_search_path"], - ) - for libname in [ - "cublas", - "cudart", - "cudnn", - "cufft", - "curand", - "cusolver", - "cusparse", - "nvrtc", - "nvToolsExt", - ] -] - -cc_library( - name = "cuda", - deps = [ - ":cublas", - ":cuda_headers", - ":cudart", - ":cudnn", - ":cufft", - ":curand", - ":nvToolsExt", - ], -) - -# NVIDIA Performance Primitives (http://docs.nvidia.com/cuda/npp/modules.html)) -# used by OpenCV -cc_library( - name = "nppi", - linkopts = [ - "-lnppc", - "-lnppial", - "-lnppicom", - "-lnppidei", - "-lnppif", - "-lnppig", - "-lnppim", - "-lnppist", - "-lnppitc", - "-lnpps", - ], - linkstatic = True, - deps = [":linker_search_path"], -) - -# NVIDIA Management Library -cc_library( - name = "nvml", - linkopts = [ - "-lnvidia-ml", - "-Wl,-rpath,/usr/lib/nvidia-410", - "-Wl,-rpath,/usr/lib/nvidia-390", - "-Wl,-rpath,/usr/lib/nvidia-387", - "-Wl,-rpath,/usr/lib/nvidia-384", - ], - deps = [":linker_search_path"], -) - -cc_library( - name = "cupti_headers", - hdrs = [ - ":cuda-extras", - ], - includes = [ - ".", - "extras/CUPTI/include/", - ], -) - -# cupti .so exposed at linktime -cc_library( - name = "cupti_link", - linkopts = [ - "-L/usr/local/cuda/extras/CUPTI/lib64", - "-lcupti", - ], -) - -cc_library( - name = "libdevice_root", - data = [":cuda-nvvm"], -) - -CUDA_INCLUDES_FILES = [ - "include/builtin_types.h", - "include/channel_descriptor.h", - "include/CL/cl_egl.h", - "include/CL/cl_ext.h", - "include/CL/cl_gl_ext.h", - "include/CL/cl_gl.h", - 
"include/CL/cl.h", - "include/CL/cl.hpp", - "include/CL/cl_platform.h", - "include/CL/opencl.h", - "include/common_functions.h", - "include/cooperative_groups.h", - "include/cooperative_groups_helpers.h", - "include/crt/common_functions.h", - "include/crt/device_double_functions.h", - "include/crt/device_double_functions.hpp", - "include/crt/device_functions.h", - "include/crt/device_functions.hpp", - "include/crt/func_macro.h", - "include/crt/host_config.h", - "include/crt/host_defines.h", - "include/crt/host_runtime.h", - "include/crt/math_functions.h", - "include/crt/math_functions.hpp", - "include/crt/mma.h", - "include/crt/mma.hpp", - "include/crt/nvfunctional", - "include/crt/sm_70_rt.h", - "include/crt/sm_70_rt.hpp", - "include/crt/storage_class.h", - # TODO: figure out why on a CI machine with CUDA 10.2 it's not present - # "include/cublas_api.h", - # "include/cublas.h", - # "include/cublas_v2.h", - # "include/cublasXt.h", - "include/cuComplex.h", - "include/cuda_device_runtime_api.h", - "include/cudaEGL.h", - "include/cuda_egl_interop.h", - "include/cuda_fp16.h", - "include/cuda_fp16.hpp", - "include/cudaGL.h", - "include/cuda_gl_interop.h", - "include/cuda.h", - "include/cudalibxt.h", - "include/cuda_occupancy.h", - "include/cuda_profiler_api.h", - "include/cudaProfiler.h", - "include/cudart_platform.h", - "include/cuda_runtime_api.h", - "include/cuda_runtime.h", - "include/cuda_surface_types.h", - "include/cuda_texture_types.h", - "include/cudaVDPAU.h", - "include/cuda_vdpau_interop.h", - "include/cufft.h", - "include/cufftw.h", - "include/cufftXt.h", - "include/curand_discrete2.h", - "include/curand_discrete.h", - "include/curand_globals.h", - "include/curand.h", - "include/curand_kernel.h", - "include/curand_lognormal.h", - "include/curand_mrg32k3a.h", - "include/curand_mtgp32dc_p_11213.h", - "include/curand_mtgp32.h", - "include/curand_mtgp32_host.h", - "include/curand_mtgp32_kernel.h", - "include/curand_normal.h", - "include/curand_normal_static.h", 
- "include/curand_philox4x32_x.h", - "include/curand_poisson.h", - "include/curand_precalc.h", - "include/curand_uniform.h", - "include/cusolver_common.h", - "include/cusolverDn.h", - "include/cusolverRf.h", - "include/cusolverSp.h", - "include/cusolverSp_LOWLEVEL_PREVIEW.h", - "include/cusparse.h", - "include/cusparse_v2.h", - "include/device_atomic_functions.h", - "include/device_atomic_functions.hpp", - "include/device_double_functions.h", - "include/device_functions.h", - "include/device_launch_parameters.h", - "include/device_types.h", - "include/driver_functions.h", - "include/driver_types.h", - "include/fatBinaryCtl.h", - "include/fatbinary.h", - "include/host_config.h", - "include/host_defines.h", - "include/library_types.h", - "include/math_constants.h", - "include/math_functions.h", - "include/mma.h", - "include/nppcore.h", - "include/nppdefs.h", - "include/npp.h", - "include/nppi_arithmetic_and_logical_operations.h", - "include/nppi_color_conversion.h", - "include/nppi_compression_functions.h", - "include/nppi_computer_vision.h", - "include/nppi_data_exchange_and_initialization.h", - "include/nppi_filtering_functions.h", - "include/nppi_geometry_transforms.h", - "include/nppi.h", - "include/nppi_linear_transforms.h", - "include/nppi_morphological_operations.h", - "include/nppi_statistics_functions.h", - "include/nppi_support_functions.h", - "include/nppi_threshold_and_compare_operations.h", - "include/npps_arithmetic_and_logical_operations.h", - "include/npps_conversion_functions.h", - "include/npps_filtering_functions.h", - "include/npps.h", - "include/npps_initialization.h", - "include/npps_statistics_functions.h", - "include/npps_support_functions.h", - # Note: CUDA 10.0 only - # "include/nppversion.h", - # TODO: figure out why on a CI machine with CUDA 10.2 it's not present - # "include/nvblas.h", - "include/nvfunctional", - "include/nvgraph.h", - "include/nvjpeg.h", - "include/nvml.h", - "include/nvrtc.h", - "include/nvToolsExtCuda.h", - 
"include/nvToolsExtCudaRt.h", - "include/nvToolsExt.h", - "include/nvToolsExtMeta.h", - "include/nvToolsExtSync.h", - "include/nvtx3/nvToolsExtCuda.h", - "include/nvtx3/nvToolsExtCudaRt.h", - "include/nvtx3/nvToolsExt.h", - "include/nvtx3/nvToolsExtOpenCL.h", - "include/nvtx3/nvToolsExtSync.h", - "include/nvtx3/nvtxDetail/nvtxImplCore.h", - "include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h", - "include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h", - "include/nvtx3/nvtxDetail/nvtxImpl.h", - "include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h", - "include/nvtx3/nvtxDetail/nvtxImplSync_v3.h", - "include/nvtx3/nvtxDetail/nvtxInitDecls.h", - "include/nvtx3/nvtxDetail/nvtxInitDefs.h", - "include/nvtx3/nvtxDetail/nvtxInit.h", - "include/nvtx3/nvtxDetail/nvtxLinkOnce.h", - "include/nvtx3/nvtxDetail/nvtxTypes.h", - "include/sm_20_atomic_functions.h", - "include/sm_20_atomic_functions.hpp", - "include/sm_20_intrinsics.h", - "include/sm_20_intrinsics.hpp", - "include/sm_30_intrinsics.h", - "include/sm_30_intrinsics.hpp", - "include/sm_32_atomic_functions.h", - "include/sm_32_atomic_functions.hpp", - "include/sm_32_intrinsics.h", - "include/sm_32_intrinsics.hpp", - "include/sm_35_atomic_functions.h", - "include/sm_35_intrinsics.h", - "include/sm_60_atomic_functions.h", - "include/sm_60_atomic_functions.hpp", - "include/sm_61_intrinsics.h", - "include/sm_61_intrinsics.hpp", - # CUDA 10.0 only - # "include/sobol_direction_vectors.h", - "include/surface_functions.h", - "include/surface_functions.hpp", - "include/surface_indirect_functions.h", - "include/surface_indirect_functions.hpp", - "include/surface_types.h", - "include/texture_fetch_functions.h", - "include/texture_fetch_functions.hpp", - "include/texture_indirect_functions.h", - "include/texture_indirect_functions.hpp", - "include/texture_types.h", - "include/vector_functions.h", - "include/vector_functions.hpp", - "include/vector_types.h", -] - -genrule( - name = "cuda-include", - outs = CUDA_INCLUDES_FILES, - cmd = " && ".join([ - "ln -s 
/usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p) - for p in CUDA_INCLUDES_FILES - ]), - local = True, - tags = ["no-cache"], -) - -CUDA_NVVM_FILES = [ - "nvvm/bin/cicc", - "nvvm/include/nvvm.h", - "nvvm/lib64/libnvvm.so", - "nvvm/lib64/libnvvm.so.3", - "nvvm/lib64/libnvvm.so.3.3.0", - "nvvm/libdevice/libdevice.10.bc", -] - -genrule( - name = "cuda-nvvm", - outs = CUDA_NVVM_FILES, - cmd = " && ".join([ - "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p) - for p in CUDA_NVVM_FILES - ]), - local = True, - tags = ["no-cache"], -) - -CUDA_EXTRAS_FILES = [ - "extras/CUPTI/include/cuda_stdint.h", - "extras/CUPTI/include/cupti.h", - "extras/CUPTI/include/cupti_activity.h", - "extras/CUPTI/include/cupti_callbacks.h", - "extras/CUPTI/include/cupti_driver_cbid.h", - "extras/CUPTI/include/cupti_events.h", - "extras/CUPTI/include/cupti_metrics.h", - "extras/CUPTI/include/cupti_nvtx_cbid.h", - "extras/CUPTI/include/cupti_result.h", - "extras/CUPTI/include/cupti_runtime_cbid.h", - "extras/CUPTI/include/cupti_version.h", - "extras/CUPTI/include/generated_cuda_gl_interop_meta.h", - "extras/CUPTI/include/generated_cuda_meta.h", - "extras/CUPTI/include/generated_cuda_runtime_api_meta.h", - "extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h", - "extras/CUPTI/include/generated_cudaGL_meta.h", - "extras/CUPTI/include/generated_cudaVDPAU_meta.h", - "extras/CUPTI/include/generated_nvtx_meta.h", - "extras/CUPTI/include/GL/gl.h", - "extras/CUPTI/include/GL/glew.h", - "extras/CUPTI/include/GL/glext.h", - "extras/CUPTI/include/GL/glu.h", - "extras/CUPTI/include/GL/glut.h", - "extras/CUPTI/include/GL/glx.h", - "extras/CUPTI/include/GL/glxext.h", - "extras/CUPTI/include/GL/wglew.h", - "extras/CUPTI/include/GL/wglext.h", - "extras/CUPTI/include/openacc/cupti_openacc.h", -] - -genrule( - name = "cuda-extras", - outs = CUDA_EXTRAS_FILES, - cmd = " && ".join([ - "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p) - for p in 
CUDA_EXTRAS_FILES - ]), - local = True, - tags = ["no-cache"], -) - -genrule( - name = "cudnn-include", - outs = [ - "include/cudnn.h", - ], - cmd = """ - ln -s /usr/include/cudnn.h $(@D)/cudnn.h""", - local = True, - tags = ["no-cache"], -) - diff --git a/third_party/tensorpipe.BUILD b/third_party/tensorpipe.BUILD index ae210f473933..66c6a795162a 100644 --- a/third_party/tensorpipe.BUILD +++ b/third_party/tensorpipe.BUILD @@ -162,8 +162,8 @@ cc_library( cc_library( name = "tensorpipe_cuda", - srcs = TENSORPIPE_CUDA_SOURCES, - hdrs = TENSORPIPE_CUDA_HEADERS + [":tensorpipe_cuda_config_header"], + srcs = glob(TENSORPIPE_CUDA_SOURCES), + hdrs = glob(TENSORPIPE_CUDA_HEADERS) + [":tensorpipe_cuda_config_header"], includes = [ ".", ], diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index afdc9cb3d6a4..bdfc20dfbc89 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2342,22 +2342,13 @@ # miopen - name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - self, weight, bias: "grad.defined() ? miopen_convolution_transpose_backward(self, grad, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, grad_input_mask) : std::tuple()" - -- name: miopen_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, true, output_padding, groups, benchmark, deterministic, true, false, grad_input_mask) + self, weight, bias: "grad.defined() ? 
convolution_backward(grad, self, weight, bias->sizes(), stride, padding, dilation, true, output_padding, groups, grad_input_mask) : std::tuple()" - name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - self, weight, bias: "grad.defined() ? miopen_convolution_backward(self, grad, weight, padding, stride, dilation, groups, benchmark, deterministic, grad_input_mask) : std::tuple()" - -- name: miopen_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, std::vector(padding.size(), 0), groups, benchmark, deterministic, true, false, grad_input_mask) + self, weight, bias: "grad.defined() ? convolution_backward(grad, self, weight, bias->sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" - name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - self, weight, bias: "grad.defined() ? 
miopen_depthwise_convolution_backward(self, grad, weight, padding, stride, dilation, groups, benchmark, deterministic, grad_input_mask) : std::tuple()" - -- name: miopen_depthwise_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, std::vector(padding.size(), 0), groups, benchmark, deterministic, true, false, grad_input_mask) + self, weight, bias: "grad.defined() ? convolution_backward(grad, self, weight, bias->sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) input, weight, bias: "grad.defined() ? (training ? 
miopen_batch_norm_backward(input, grad.contiguous(), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple()" diff --git a/tools/codegen/utils.py b/tools/codegen/utils.py index 406ae0d2ce89..3c3e3a42c768 100644 --- a/tools/codegen/utils.py +++ b/tools/codegen/utils.py @@ -1,11 +1,11 @@ -import re +import contextlib +import functools +import hashlib import os +import re +import textwrap from typing import Tuple, List, Iterable, Iterator, Callable, Sequence, TypeVar, Optional, Dict, Any, Union, Set, NoReturn from enum import Enum -import contextlib -import textwrap -import hashlib -import functools from tools.codegen.code_template import CodeTemplate @@ -139,6 +139,8 @@ class FileManager: except IOError: old_contents = None if contents != old_contents: + # Create output directory if it doesn't exist + os.makedirs(os.path.dirname(filename), exist_ok=True) with open(filename, 'w') as f: f.write(contents) diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 332e784f0b06..339a9f99707e 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -1,8 +1,7 @@ +import argparse import collections from pprint import pformat -import argparse - from tools.codegen.model import Variant from tools.codegen.api.python import (PythonSignatureGroup, PythonSignatureNativeFunctionPair) @@ -10,7 +9,7 @@ from tools.codegen.gen import parse_native_yaml from tools.codegen.utils import FileManager from typing import Sequence, List, Dict -from ..autograd.gen_python_functions import should_generate_py_binding, load_signatures, group_overloads +from tools.autograd.gen_python_functions import should_generate_py_binding, load_signatures, group_overloads """ This module implements generation of type stubs for PyTorch, diff --git a/tools/rules/cu.bzl b/tools/rules/cu.bzl index 726fadfe98bd..aec4a874cf00 100644 --- a/tools/rules/cu.bzl 
+++ b/tools/rules/cu.bzl @@ -1,3 +1,6 @@ -# gpu support is not available -def cu_library(**kwargs): - pass +load("@rules_cuda//cuda:defs.bzl", "cuda_library") + +NVCC_COPTS = ["--expt-relaxed-constexpr", "--expt-extended-lambda"] + +def cu_library(name, srcs, copts = [], **kwargs): + cuda_library(name, srcs = srcs, copts = NVCC_COPTS + copts, **kwargs) diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index 686b5c4a34f4..ff175771fd18 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -118,20 +118,30 @@ class CMake: cmake_command = 'cmake' if IS_WINDOWS: return cmake_command - cmake3 = which('cmake3') - cmake = which('cmake') - if cmake3 is not None and CMake._get_version(cmake3) >= LooseVersion("3.10.0"): - cmake_command = 'cmake3' - return cmake_command - elif cmake is not None and CMake._get_version(cmake) >= LooseVersion("3.10.0"): - return cmake_command - else: + cmake3_version = CMake._get_version(which('cmake3')) + cmake_version = CMake._get_version(which('cmake')) + + _cmake_min_version = LooseVersion("3.10.0") + if all((ver is None or ver < _cmake_min_version for ver in [cmake_version, cmake3_version])): raise RuntimeError('no cmake or cmake3 with version >= 3.10.0 found') + if cmake3_version is None: + cmake_command = 'cmake' + elif cmake_version is None: + cmake_command = 'cmake3' + else: + if cmake3_version >= cmake_version: + cmake_command = 'cmake3' + else: + cmake_command = 'cmake' + return cmake_command + @staticmethod - def _get_version(cmd: str) -> Any: + def _get_version(cmd: Optional[str]) -> Any: "Returns cmake version." 
+ if cmd is None: + return None for line in check_output([cmd, '--version']).decode('utf-8').split('\n'): if 'version' in line: return LooseVersion(line.strip().split(' ')[2]) diff --git a/torch/ao/quantization/_dbr/auto_trace.py b/torch/ao/quantization/_dbr/auto_trace.py index c37d8dc3715f..14d1d6a7b003 100644 --- a/torch/ao/quantization/_dbr/auto_trace.py +++ b/torch/ao/quantization/_dbr/auto_trace.py @@ -34,6 +34,7 @@ enable_logging = False def add_auto_observation( model : torch.nn.Module, + qconfig_dict: Dict[str, Any], example_inputs: Tuple[Any], input_dtypes: Any = (torch.float,), # must be same structure as model inputs output_dtypes: Any = (torch.float,), # must be same structure as model outputs @@ -204,9 +205,11 @@ def add_auto_observation( global_disable_torch_function_override global_disable_torch_function_override = True + # mypy ignore is used instead of assert because this + # runs on every forward and assert has a performance cost args, kwargs = parent_qstate.op_prepare_before_hook( cur_module, args, kwargs, first_call, qtensor_id, - fqn, cur_module) + fqn, cur_module) # type: ignore[arg-type] # original forward output = orig_module_call(self, *args, **kwargs) @@ -263,27 +266,29 @@ def add_auto_observation( # Create a list before iterating because we are adding new # named modules inside the loop. named_modules = list(self.named_modules()) - for k, v in named_modules: + for fqn, v in named_modules: - # k is the global FQN, i.e. 'foo.bar.baz' + # fqn is the global FQN, i.e. 
'foo.bar.baz' # v is the module instance # # we need to associate the global FQN with SeenOp # for modules, this is the module FQN # for functions, this is the parent module FQN - module_id_to_fqn[id(v)] = k + module_id_to_fqn[id(v)] = fqn - has_qconfig = hasattr(v, 'qconfig') and v.qconfig is not None - if has_qconfig and not is_leaf(v): - if v is self: - # for the top level module only, specify input - # and output dtypes - v._auto_quant_state = AutoQuantizationState( - v.qconfig, input_dtypes, output_dtypes) - pass - else: - v._auto_quant_state = AutoQuantizationState( - v.qconfig) + if is_leaf(v): + continue + + if v is self: + # for the top level module only, specify input + # and output dtypes + v._auto_quant_state = AutoQuantizationState( + qconfig_dict, fqn, + input_dtypes, output_dtypes) + pass + else: + v._auto_quant_state = AutoQuantizationState( + qconfig_dict, fqn) global_op_idx[0] = 0 diff --git a/torch/ao/quantization/_dbr/qconfig_dict_utils.py b/torch/ao/quantization/_dbr/qconfig_dict_utils.py new file mode 100644 index 000000000000..68314a8fa5be --- /dev/null +++ b/torch/ao/quantization/_dbr/qconfig_dict_utils.py @@ -0,0 +1,27 @@ +from typing import Dict, Any + +import torch + +TYPE_TO_REPLACEMENT_TYPE = { + torch.add: torch.Tensor.add, + torch.Tensor.add_: torch.Tensor.add, + torch.mul: torch.Tensor.mul, + torch.Tensor.mul_: torch.Tensor.mul, +} + +def normalize_object_types(qconfig_dict: Dict[str, Any]) -> None: + """ + This function looks for entries in `qconfig_dict['object_type']` + corresponding to PyTorch overrides of Python math functions + such as `torch.add` and `torch.mul`. If any of these functions are found, + it changes the type to the tensor variant of these functions. + This is needed because the tensor variant is what is expected + within the framework. 
+ """ + if 'object_type' not in qconfig_dict: + return + + for idx, (target_type, qconfig) in enumerate(qconfig_dict['object_type']): + replacement_type = TYPE_TO_REPLACEMENT_TYPE.get(target_type, None) + if replacement_type is not None: + qconfig_dict['object_type'][idx] = (replacement_type, qconfig) diff --git a/torch/ao/quantization/_dbr/quantization_state.py b/torch/ao/quantization/_dbr/quantization_state.py index 9515cb921571..a1b14af761f8 100644 --- a/torch/ao/quantization/_dbr/quantization_state.py +++ b/torch/ao/quantization/_dbr/quantization_state.py @@ -31,6 +31,7 @@ from .utils import ( get_producer_of_seen_op_info, clone_detach_tensor_without_dispatch, get_input_args_quant_dequant_info, + get_cur_qconfig, ) OpConvertInfo = Tuple[ @@ -64,16 +65,15 @@ class AutoQuantizationState(torch.nn.Module): def __init__( self, - qconfig, + qconfig_dict: Dict[str, Any], + fqn: str, input_dtypes: Any = None, output_dtypes: Any = None, ): super().__init__() self.idx = 0 - # TODO(future PR): change this to the subset of qconfig_dict - # relevant to the parent module - assert qconfig is not None - self.qconfig = qconfig + self.qconfig_dict = qconfig_dict + self.fqn = fqn # this is a ModuleDict in order to properly register observers # to be within the module hierarchy. 
self.tensor_id_to_observer = torch.nn.ModuleDict() @@ -247,7 +247,7 @@ class AutoQuantizationState(torch.nn.Module): kwargs: Dict[str, Any], first_call: bool, qtensor_id: List[int], - fqn: Optional[str], + fqn: str, root_module: torch.nn.Module, ) -> Tuple[Tuple[Any, ...], Dict[str, Any]]: """ @@ -613,6 +613,7 @@ class AutoQuantizationState(torch.nn.Module): arg_tensor_infos: List[Optional[QTensorInfo]], func_output_dtype_type: FuncOutputDTypeType, qtensor_id: List[int], + fqn: str, ) -> None: """ Runs the prepare hook during first_call for individual @@ -642,13 +643,17 @@ class AutoQuantizationState(torch.nn.Module): # which will be converted to a quant later # TODO(future PR): share these observers if multiple ops need # this quant. - # TODO(future PR): create from qconfig of op instead of global - # qconfig. - if arg._qtensor_info.inf_dtype != torch.quint8: # type: ignore[attr-defined] + qconfig = get_cur_qconfig(self.qconfig_dict, fqn, op) + if qconfig is None: + # If qconfig is None, we do not need any input observers + return + elif arg._qtensor_info.inf_dtype != torch.quint8: # type: ignore[attr-defined] + # TODO(future PR): currently this only handles float32 and + # quint8, we need to extend it to other dtypes tensor_id = arg._qtensor_info.id # type: ignore[attr-defined] weight_arg_idx = get_weight_arg_idx(op) - obs = self.qconfig.weight() if arg_idx == weight_arg_idx else \ - self.qconfig.activation() + obs = qconfig.weight() if arg_idx == weight_arg_idx else \ + qconfig.activation() self.tensor_id_to_observer[str(tensor_id)] = obs def _first_call_op_prepare_before_hook_create_subgraphs( @@ -658,7 +663,7 @@ class AutoQuantizationState(torch.nn.Module): kwargs: Dict[str, Any], first_call: bool, qtensor_id: List[int], - fqn: Optional[str], + fqn: str, root_module: torch.nn.Module, ) -> Tuple[Tuple[Any, ...], Dict[str, Any]]: """ @@ -679,13 +684,13 @@ class AutoQuantizationState(torch.nn.Module): 
self._first_call_op_prepare_before_hook_create_subgraphs_tensor( op, inner_arg, arg_idx, input_observed_arg_idxs, arg_tensor_infos, func_output_dtype_type, - qtensor_id) + qtensor_id, fqn) arg_idx += 1 else: self._first_call_op_prepare_before_hook_create_subgraphs_tensor( op, arg, arg_idx, input_observed_arg_idxs, arg_tensor_infos, func_output_dtype_type, - qtensor_id) + qtensor_id, fqn) arg_idx += 1 packable_tensor_idx_to_name = {} @@ -716,11 +721,12 @@ class AutoQuantizationState(torch.nn.Module): if self.idx not in self.idx_to_seen_op_infos: op_type_is_module = isinstance(op, torch.nn.Module) op_type = type(op) if op_type_is_module else op + qconfig = get_cur_qconfig(self.qconfig_dict, fqn, op) self.idx_to_seen_op_infos[self.idx] = SeenOpInfo( self.idx, op_type, op_type_is_module, fqn, arg_tensor_infos, [], packable_tensor_idx_to_name, packable_nontensor_idx_to_arg, packable_tensor_kwarg_name_to_name, - op_packing_only_uses_module_attributes) + op_packing_only_uses_module_attributes, qconfig) return args, kwargs @@ -742,8 +748,11 @@ class AutoQuantizationState(torch.nn.Module): does not exist in the "before" hook. 
""" if func_output_obs_type == FuncOutputObsType.NEW_OBS: + # TODO(future PR): check qconfig is None + qconfig = get_cur_qconfig(self.qconfig_dict, seen_op_info.fqn, op) + assert qconfig is not None self.tensor_id_to_observer[str(qtensor_id[0])] = \ - self.qconfig.activation() + qconfig.activation() elif func_output_obs_type == FuncOutputObsType.REUSES_FIRST_INPUT_OBS: first_input_tensor_id = seen_op_info.input_tensor_infos[0].id @@ -777,7 +786,10 @@ class AutoQuantizationState(torch.nn.Module): if first_input_mod and hasattr(first_input_mod, 'activation_post_process'): first_input_obs = first_input_mod.activation_post_process else: - first_input_obs = self.qconfig.activation() + # TODO(future PR): check qconfig is None + qconfig = get_cur_qconfig(self.qconfig_dict, seen_op_info.fqn, op) + assert qconfig is not None + first_input_obs = qconfig.activation() self.tensor_id_to_observer[str(qtensor_id[0])] = first_input_obs @@ -788,10 +800,24 @@ class AutoQuantizationState(torch.nn.Module): func_output_dtype_type = get_func_output_dtype_type( op, args, seen_op_info.op_packing_only_uses_module_attributes) if func_output_dtype_type == FuncOutputDTypeType.DTYPE_DEPENDS_ON_QCONFIG: - dtype_to_use = torch.quint8 + if isinstance(op, torch.nn.Module): + # For now, assume that eager mode convert has attached qconfig + # objects to any leaf module which needs quantization + if hasattr(op, 'activation_post_process'): + dtype_to_use = op.activation_post_process.dtype + else: + dtype_to_use = torch.float + else: + qconfig = get_cur_qconfig(self.qconfig_dict, seen_op_info.fqn, op) + if qconfig is None: + dtype_to_use = torch.float + else: + dtype_to_use = qconfig.activation().dtype + elif func_output_dtype_type == FuncOutputDTypeType.DTYPE_DEFAULT_BC_UNSUPPORTED_SYNTAX: dtype_to_use = torch.float else: + # TODO(future PR): respect qconfig for torch.cat if isinstance(args[0], (tuple, list)): # for torch.cat unique_arg_dtypes = [ arg._qtensor_info.inf_dtype for arg in args[0]] diff 
--git a/torch/ao/quantization/_dbr/utils.py b/torch/ao/quantization/_dbr/utils.py index 13f31a314b8e..39e41e63b362 100644 --- a/torch/ao/quantization/_dbr/utils.py +++ b/torch/ao/quantization/_dbr/utils.py @@ -15,11 +15,17 @@ from .mappings import ( add_and_mul_ops, ) +from ..qconfig import QConfigAny + from torch.quantization import ( ObserverBase, FakeQuantizeBase, ) +from ..qconfig_dict_utils import ( + maybe_adjust_qconfig_for_module_type_or_name, +) + def _raise_obs_not_found_error(func): raise RuntimeError( f'Encountered arithmetic operation {torch.typename(func)} but we have ' @@ -75,6 +81,8 @@ SeenOpInfo = collections.namedtuple( # This is False if some packable args are results of other functions. # bool 'op_packing_only_uses_module_attributes', + # QConfig for the op, can be None + 'qconfig', ], ) def seen_op_info_repr(self) -> str: @@ -113,6 +121,7 @@ class ObserverWrapper(torch.nn.Identity): def __init__(self, child): super().__init__() self.child = child + self.dtype = child.dtype def wrap_observers_in_placeholders(module: torch.nn.Module) -> None: """ @@ -182,6 +191,9 @@ def get_func_output_obs_type( if is_module: return FuncOutputObsType.NONE + if seen_op_info.qconfig is None: + return FuncOutputObsType.NONE + # check for ops which need packed weights but the weights are # coming from another function if not seen_op_info.op_packing_only_uses_module_attributes: @@ -214,6 +226,8 @@ def converted_func_needs_scale_zp(seen_op_info: SeenOpInfo) -> bool: is_module = isinstance(op_type, type(torch.nn.Module)) if is_module: return False + if seen_op_info.qconfig is None: + return False if op_type in add_and_mul_ops: # check if both arguments are tensors inputs = seen_op_info.input_tensor_infos @@ -623,3 +637,24 @@ def get_input_args_quant_dequant_info( quant_infos.append(None) dequant_infos.append(False) return quant_infos, dequant_infos, any_arg_quant_or_dequant_needed + +def get_cur_qconfig( + qconfig_dict: Dict[str, Any], + cur_fqn: str, + cur_op: 
Callable, +) -> Optional[QConfigAny]: + # precedence: global -> object_type -> module_name_regex -> module_name + # -> module_name_object_type_order + # (module_name_regex, module_name_object_type_order not implemented yet) + + # global + global_qconfig = qconfig_dict[''] + + # object_type + is_module = isinstance(cur_op, type(torch.nn.Module)) + cur_op_type = type(cur_op) if is_module else cur_op + + qconfig = maybe_adjust_qconfig_for_module_type_or_name( + qconfig_dict, cur_op_type, cur_fqn, global_qconfig) + + return qconfig diff --git a/torch/ao/quantization/_quantize_dbr.py b/torch/ao/quantization/_quantize_dbr.py index f35fdf0cfb45..6e96427c965c 100644 --- a/torch/ao/quantization/_quantize_dbr.py +++ b/torch/ao/quantization/_quantize_dbr.py @@ -2,27 +2,46 @@ import torch from ._dbr.auto_trace import add_auto_observation, add_auto_convert from ._dbr.fusion import get_module_fusion_fqns +from ._dbr.qconfig_dict_utils import normalize_object_types + +from .qconfig_dict_utils import ( + get_flattened_qconfig_dict, + convert_dict_to_ordered_dict, +) -def prepare(model, example_inputs, inplace=False, allow_list=None, +def prepare(model, qconfig_dict, example_inputs, inplace=False, allow_list=None, observer_non_leaf_module_list=None, prepare_custom_config_dict=None, fuse_modules=True): r"""A wrapper around `torch.quantization.prepare` which prepares the - model for quantization using dynamic tracing. Requires `example_inputs` to build + model for quantization using dynamic tracing. + + Requires `qconfig_dict` (same format as prepare_fx) to specify the + quantization settings. Not all functionality is supported yet. + + Requires `example_inputs` to build the graph before calibration or quantization aware training can proceed. 
TODO(future PR): better docblock """ assert example_inputs is not None, 'example_inputs must be specified' + for qconfig_dict_option in ('module_name_regex', 'module_name_object_type_order'): + assert qconfig_dict_option not in qconfig_dict, \ + f'{qconfig_dict_option} option of qconfig_dict is not ' + \ + 'implemented yet in define-by-run quantization' + + normalize_object_types(qconfig_dict) + convert_dict_to_ordered_dict(qconfig_dict) + flattened_qconfig_dict = get_flattened_qconfig_dict(qconfig_dict) + torch.quantization.propagate_qconfig_(model, flattened_qconfig_dict) + # TODO(future PR): QAT support + if fuse_modules: # automatically fuse modules old_class = model.__class__ - # For now, need to propagate qconfig before observing, because - # AutoQuantizationState needs a qconfig to work - torch.quantization.propagate_qconfig_(model) - model = add_auto_observation(model, example_inputs) + model = add_auto_observation(model, qconfig_dict, example_inputs) module_fusion_fqns = get_module_fusion_fqns(model) if len(module_fusion_fqns): model = torch.quantization.fuse_modules(model, module_fusion_fqns) @@ -55,7 +74,7 @@ def prepare(model, example_inputs, inplace=False, allow_list=None, model, inplace, allow_list, observer_non_leaf_module_list, prepare_custom_config_dict) assert not inplace - model = add_auto_observation(model, example_inputs) + model = add_auto_observation(model, qconfig_dict, example_inputs) return model diff --git a/torch/ao/quantization/fx/__init__.py b/torch/ao/quantization/fx/__init__.py index b374e083da23..08d613fae771 100644 --- a/torch/ao/quantization/fx/__init__.py +++ b/torch/ao/quantization/fx/__init__.py @@ -1,4 +1,4 @@ from .prepare import prepare from .convert import convert from .fuse import Fuser -from .backend_config_dict import get_tensorrt_backend_config_dict +from .backend_config import get_tensorrt_backend_config_dict diff --git a/torch/ao/quantization/fx/_convert_do_not_use.py b/torch/ao/quantization/fx/_convert_do_not_use.py 
index 9631fdb435dd..0a5267b4442c 100644 --- a/torch/ao/quantization/fx/_convert_do_not_use.py +++ b/torch/ao/quantization/fx/_convert_do_not_use.py @@ -12,8 +12,9 @@ from ..utils import ( activation_is_int8_quantized, weight_is_statically_quantized, get_qparam_dict, + _parent_name, ) -from .backend_config_dict.utils import get_quantized_reference_module_mapping +from .backend_config.utils import get_quantized_reference_module_mapping from .graph_module import ( QuantizedGraphModule, @@ -23,7 +24,6 @@ from .utils import ( get_custom_module_class_keys, get_quantize_node_info, create_getattr_from_value, - _parent_name, ) from torch.ao.quantization.quantize import ( diff --git a/torch/ao/quantization/fx/_equalize.py b/torch/ao/quantization/fx/_equalize.py index 546e41e5408c..bbebc628b580 100644 --- a/torch/ao/quantization/fx/_equalize.py +++ b/torch/ao/quantization/fx/_equalize.py @@ -9,14 +9,16 @@ from .utils import ( WEIGHT_INDEX_DICT, get_new_attr_name_with_prefix, maybe_get_next_module, - _parent_name, ) from ..observer import ( PerChannelMinMaxObserver, _with_args, ObserverBase, ) -from ..utils import check_min_max_valid +from ..utils import ( + check_min_max_valid, + _parent_name, +) from collections import namedtuple from typing import Dict, Any, List, Tuple, Optional diff --git a/torch/ao/quantization/fx/_lower_to_native_backend.py b/torch/ao/quantization/fx/_lower_to_native_backend.py index c8418bdc3dd0..79eda8747f91 100644 --- a/torch/ao/quantization/fx/_lower_to_native_backend.py +++ b/torch/ao/quantization/fx/_lower_to_native_backend.py @@ -5,7 +5,7 @@ from .graph_module import QuantizedGraphModule from .quantized_fusion_patterns_and_replacements import get_fbgemm_patterns_and_replacements from .match_utils import is_match from .match_utils import MatchAllNode -from .utils import _parent_name +from ..utils import _parent_name from typing import Dict, Type # Mapping from reference module class to the replacement quantized module class for lowering diff --git 
a/torch/ao/quantization/fx/backend_config_dict/__init__.py b/torch/ao/quantization/fx/backend_config/__init__.py similarity index 100% rename from torch/ao/quantization/fx/backend_config_dict/__init__.py rename to torch/ao/quantization/fx/backend_config/__init__.py diff --git a/torch/ao/quantization/fx/backend_config_dict/fuse_handler.py b/torch/ao/quantization/fx/backend_config/fuse_handler.py similarity index 100% rename from torch/ao/quantization/fx/backend_config_dict/fuse_handler.py rename to torch/ao/quantization/fx/backend_config/fuse_handler.py diff --git a/torch/ao/quantization/fx/backend_config_dict/observation_type.py b/torch/ao/quantization/fx/backend_config/observation_type.py similarity index 100% rename from torch/ao/quantization/fx/backend_config_dict/observation_type.py rename to torch/ao/quantization/fx/backend_config/observation_type.py diff --git a/torch/ao/quantization/fx/backend_config_dict/quantize_handler.py b/torch/ao/quantization/fx/backend_config/quantize_handler.py similarity index 100% rename from torch/ao/quantization/fx/backend_config_dict/quantize_handler.py rename to torch/ao/quantization/fx/backend_config/quantize_handler.py diff --git a/torch/ao/quantization/fx/backend_config_dict/tensorrt.py b/torch/ao/quantization/fx/backend_config/tensorrt.py similarity index 100% rename from torch/ao/quantization/fx/backend_config_dict/tensorrt.py rename to torch/ao/quantization/fx/backend_config/tensorrt.py diff --git a/torch/ao/quantization/fx/backend_config_dict/utils.py b/torch/ao/quantization/fx/backend_config/utils.py similarity index 100% rename from torch/ao/quantization/fx/backend_config_dict/utils.py rename to torch/ao/quantization/fx/backend_config/utils.py diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py index d1a0546c7c49..2c7606eeb4c3 100644 --- a/torch/ao/quantization/fx/convert.py +++ b/torch/ao/quantization/fx/convert.py @@ -25,12 +25,14 @@ from .graph_module import ( from 
.quantization_patterns import ( QuantizeHandler, ) -from .qconfig_utils import ( +from ..qconfig_dict_utils import ( convert_dict_to_ordered_dict, + update_qconfig_for_qat, +) +from .qconfig_utils import ( generate_qconfig_map, compare_prepare_convert_qconfig_dict, update_qconfig_for_fusion, - update_qconfig_for_qat, ) from ._equalize import update_obs_for_equalization, convert_eq_obs from .utils import ( diff --git a/torch/ao/quantization/fx/fuse.py b/torch/ao/quantization/fx/fuse.py index 5da008271323..02a43f996801 100644 --- a/torch/ao/quantization/fx/fuse.py +++ b/torch/ao/quantization/fx/fuse.py @@ -15,8 +15,8 @@ from .pattern_utils import ( get_default_fusion_patterns, ) -from .backend_config_dict.utils import get_fusion_pattern_to_fuse_handler_cls -from .backend_config_dict.utils import get_fuser_method_mapping +from .backend_config.utils import get_fusion_pattern_to_fuse_handler_cls +from .backend_config.utils import get_fuser_method_mapping from .fusion_patterns import * # noqa: F401,F403 diff --git a/torch/ao/quantization/fx/fusion_patterns.py b/torch/ao/quantization/fx/fusion_patterns.py index d86c1cd4e590..8471325333af 100644 --- a/torch/ao/quantization/fx/fusion_patterns.py +++ b/torch/ao/quantization/fx/fusion_patterns.py @@ -3,7 +3,7 @@ from torch.fx.graph import Node from .pattern_utils import ( register_fusion_pattern, ) -from .utils import _parent_name +from ..utils import _parent_name from .quantization_types import QuantizerCls, NodePattern, Pattern from ..fuser_method_mappings import get_fuser_method_new from abc import ABC, abstractmethod diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py index bc11fa666ae9..e94620cd9002 100644 --- a/torch/ao/quantization/fx/prepare.py +++ b/torch/ao/quantization/fx/prepare.py @@ -16,15 +16,16 @@ from ..quantize import ( from ..observer import ( ObserverBase, ) -from ..qconfig import QConfigAny -from ..qconfig import is_reuse_input_qconfig -from .qconfig_utils import ( - 
convert_dict_to_ordered_dict, - generate_qconfig_map, +from ..qconfig import QConfigAny, is_reuse_input_qconfig +from ..qconfig_dict_utils import ( get_flattened_qconfig_dict, - update_qconfig_for_fusion, + convert_dict_to_ordered_dict, update_qconfig_for_qat, ) +from .qconfig_utils import ( + generate_qconfig_map, + update_qconfig_for_fusion, +) from .quantization_patterns import ( QuantizeHandler, @@ -53,8 +54,8 @@ from .match_utils import ( find_matches, ) +from ..utils import _parent_name from .utils import ( - _parent_name, get_custom_module_class_keys, all_node_args_have_no_tensors, assert_and_get_unique_device, @@ -82,7 +83,7 @@ from ..utils import ( activation_is_int8_quantized, ) -from .backend_config_dict.utils import ( +from .backend_config.utils import ( get_pattern_to_quantize_handlers, get_pattern_to_dtype_configs, get_pattern_to_input_type_to_index, diff --git a/torch/ao/quantization/fx/qconfig_utils.py b/torch/ao/quantization/fx/qconfig_utils.py index b8b754bf81e4..5738637456bc 100644 --- a/torch/ao/quantization/fx/qconfig_utils.py +++ b/torch/ao/quantization/fx/qconfig_utils.py @@ -1,11 +1,10 @@ import torch -from collections import OrderedDict, defaultdict -from typing import Union, Callable, Any, Dict, Tuple, Set, Optional +from collections import defaultdict +from typing import Callable, Any, Dict, Tuple, Set, Optional from torch.ao.quantization.qconfig import add_module_to_qconfig_obs_ctr, QConfigAny, qconfig_equals from torch.ao.quantization.quantize import ( is_activation_post_process, ) -import re from torch.fx import ( GraphModule, ) @@ -13,88 +12,14 @@ from torch.fx.graph import ( Graph, ) -from .utils import _parent_name -from ..utils import ( - get_combined_dict, -) +from ..utils import _parent_name from ..fuser_method_mappings import DEFAULT_OP_LIST_TO_FUSER_METHOD -from ..quantization_mappings import ( - get_default_qat_module_mappings, +from ..qconfig_dict_utils import ( + get_object_type_qconfig, + 
maybe_adjust_qconfig_for_module_type_or_name, ) -def get_flattened_qconfig_dict(qconfig_dict): - """ flatten the global, object_type and module_name qconfig - to the same qconfig_dict so that it can be used by - propagate_qconfig_ function. - "module_name_regex" is ignored for now since it's not supported - in propagate_qconfig_, but it can be fixed later. - - For example: - Input: { - "": qconfig, - "object_type": [ - (torch.add, qconfig) - ], - "module_name": [ - ("conv", qconfig) - ] - } - - Output: { - "": qconfig, - torch.add: qconfig, - "conv": qconfig - } - """ - flattened = dict() - if '' in qconfig_dict: - flattened[''] = qconfig_dict[''] - - def flatten_key(key): - if key in qconfig_dict: - for (obj, qconfig) in qconfig_dict[key].items(): - flattened[obj] = qconfig - - flatten_key('object_type') - flatten_key('module_name') - return flattened - - -def convert_dict_to_ordered_dict(qconfig_dict: Any) -> Dict[str, Dict[Any, Any]]: - """ Convert dict in qconfig_dict to ordered dict - """ - # convert a qconfig list for a type to OrderedDict - def _convert_to_ordered_dict(key, qconfig_dict): - qconfig_dict[key] = OrderedDict(qconfig_dict.get(key, [])) - - _convert_to_ordered_dict('object_type', qconfig_dict) - _convert_to_ordered_dict('module_name_regex', qconfig_dict) - _convert_to_ordered_dict('module_name', qconfig_dict) - return qconfig_dict - - -def get_object_type_qconfig( - qconfig_dict: Any, - object_type: Union[Callable, str], - fallback_qconfig: QConfigAny) -> QConfigAny: - # object_type can be - # 1. module type (call_module) - # 2. function (call_function) - # 3. 
string (call_method) - return qconfig_dict['object_type'].get( - object_type, fallback_qconfig) - - -def get_module_name_regex_qconfig(qconfig_dict, module_name, fallback_qconfig): - for regex_pattern, qconfig in \ - qconfig_dict['module_name_regex'].items(): - if re.match(regex_pattern, module_name): - # first match wins - return qconfig - return fallback_qconfig - - def maybe_adjust_qconfig_for_module_name_object_type_order( qconfig_dict: Any, cur_module_path: str, @@ -116,48 +41,6 @@ def maybe_adjust_qconfig_for_module_name_object_type_order( return fallback_qconfig -def get_module_name_qconfig(qconfig_dict, module_name, fallback_qconfig): - if module_name == '': - # module name qconfig not found - return fallback_qconfig - if module_name in qconfig_dict['module_name']: - return qconfig_dict['module_name'][module_name] - else: - parent, _ = _parent_name(module_name) - return get_module_name_qconfig(qconfig_dict, parent, fallback_qconfig) - -# get qconfig for module_name, -# fallback to module_name_regex_qconfig, module_type_qconfig, -# global_qconfig if necessary - - -def maybe_adjust_qconfig_for_module_type_or_name(qconfig_dict, module_type, module_name, global_qconfig): - module_type_qconfig = get_object_type_qconfig( - qconfig_dict, module_type, global_qconfig) - module_name_regex_qconfig = get_module_name_regex_qconfig( - qconfig_dict, module_name, module_type_qconfig) - module_name_qconfig = get_module_name_qconfig( - qconfig_dict, module_name, module_name_regex_qconfig) - return module_name_qconfig - - -def update_qconfig_for_qat( - qconfig_dict: Any, - additional_qat_module_mapping: Dict[Callable, Callable] -) -> Any: - """ - Update the qconfig_dict to account for module swaps during QAT. - During QAT we perform a module swap on the nn.Module types to the corresponding nn.qat.modules types. 
- """ - all_qat_mappings = get_combined_dict( - get_default_qat_module_mappings(), additional_qat_module_mapping) - object_type_dict = qconfig_dict.get("object_type", None) - new_object_type_dict = object_type_dict.copy() - for k, v in new_object_type_dict.items(): - if k in all_qat_mappings: - object_type_dict[all_qat_mappings[k]] = v - return qconfig_dict - def update_qconfig_for_fusion( model: GraphModule, qconfig_dict: Any, diff --git a/torch/ao/quantization/fx/quantization_patterns.py b/torch/ao/quantization/fx/quantization_patterns.py index 22218c7ae93c..8a9d419c6b6b 100644 --- a/torch/ao/quantization/fx/quantization_patterns.py +++ b/torch/ao/quantization/fx/quantization_patterns.py @@ -33,9 +33,8 @@ from .pattern_utils import ( get_default_output_activation_post_process_map, Pattern, ) - +from ..utils import _parent_name from .utils import ( - _parent_name, all_node_args_have_no_tensors, quantize_node, get_per_tensor_qparams, diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py index 6f49ba442b70..5e3594772c52 100644 --- a/torch/ao/quantization/fx/utils.py +++ b/torch/ao/quantization/fx/utils.py @@ -37,14 +37,6 @@ BIAS_INDEX_DICT = { torch.nn.functional.instance_norm : [4], } -# turn foo.bar -> ['foo', 'bar'] -def _parent_name(target): - r = target.rsplit('.', 1) - if len(r) == 1: - return '', r[0] - else: - return r[0], r[1] - def graph_pretty_str(g, shorten=True) -> str: """Returns a printable representation of the ops in the graph of g. If shorten is True, tries to abbreviate fields. 
diff --git a/torch/ao/quantization/qconfig_dict_utils.py b/torch/ao/quantization/qconfig_dict_utils.py new file mode 100644 index 000000000000..33a1e9a4d624 --- /dev/null +++ b/torch/ao/quantization/qconfig_dict_utils.py @@ -0,0 +1,126 @@ +from collections import OrderedDict +import re +from typing import Any, Dict, Callable, Union + +from .utils import ( + get_combined_dict, + _parent_name, +) +from .quantization_mappings import ( + get_default_qat_module_mappings, +) +from torch.ao.quantization.qconfig import QConfigAny + + +def get_object_type_qconfig( + qconfig_dict: Any, + object_type: Union[Callable, str], + fallback_qconfig: QConfigAny) -> QConfigAny: + # object_type can be + # 1. module type (call_module) + # 2. function (call_function) + # 3. string (call_method) + return qconfig_dict['object_type'].get( + object_type, fallback_qconfig) + + +def get_module_name_regex_qconfig(qconfig_dict, module_name, fallback_qconfig): + for regex_pattern, qconfig in \ + qconfig_dict['module_name_regex'].items(): + if re.match(regex_pattern, module_name): + # first match wins + return qconfig + return fallback_qconfig + + +def get_module_name_qconfig(qconfig_dict, module_name, fallback_qconfig): + if module_name == '': + # module name qconfig not found + return fallback_qconfig + if module_name in qconfig_dict['module_name']: + return qconfig_dict['module_name'][module_name] + else: + parent, _ = _parent_name(module_name) + return get_module_name_qconfig(qconfig_dict, parent, fallback_qconfig) + + +def maybe_adjust_qconfig_for_module_type_or_name(qconfig_dict, module_type, module_name, global_qconfig): + # get qconfig for module_name, + # fallback to module_name_regex_qconfig, module_type_qconfig, + # global_qconfig if necessary + module_type_qconfig = get_object_type_qconfig( + qconfig_dict, module_type, global_qconfig) + module_name_regex_qconfig = get_module_name_regex_qconfig( + qconfig_dict, module_name, module_type_qconfig) + module_name_qconfig = 
get_module_name_qconfig( + qconfig_dict, module_name, module_name_regex_qconfig) + return module_name_qconfig + + +def get_flattened_qconfig_dict(qconfig_dict): + """ flatten the global, object_type and module_name qconfig + to the same qconfig_dict so that it can be used by + propagate_qconfig_ function. + "module_name_regex" is ignored for now since it's not supported + in propagate_qconfig_, but it can be fixed later. + + For example: + Input: { + "": qconfig, + "object_type": [ + (torch.add, qconfig) + ], + "module_name": [ + ("conv", qconfig) + ] + } + + Output: { + "": qconfig, + torch.add: qconfig, + "conv": qconfig + } + """ + flattened = dict() + if '' in qconfig_dict: + flattened[''] = qconfig_dict[''] + + def flatten_key(key): + if key in qconfig_dict: + for (obj, qconfig) in qconfig_dict[key].items(): + flattened[obj] = qconfig + + flatten_key('object_type') + flatten_key('module_name') + return flattened + + +def convert_dict_to_ordered_dict(qconfig_dict: Any) -> Dict[str, Dict[Any, Any]]: + """ Convert dict in qconfig_dict to ordered dict + """ + # convert a qconfig list for a type to OrderedDict + def _convert_to_ordered_dict(key, qconfig_dict): + qconfig_dict[key] = OrderedDict(qconfig_dict.get(key, [])) + + _convert_to_ordered_dict('object_type', qconfig_dict) + _convert_to_ordered_dict('module_name_regex', qconfig_dict) + _convert_to_ordered_dict('module_name', qconfig_dict) + return qconfig_dict + + +def update_qconfig_for_qat( + qconfig_dict: Any, + additional_qat_module_mapping: Dict[Callable, Callable] +) -> Any: + """ + Update the qconfig_dict to account for module swaps during QAT. + During QAT we perform a module swap on the nn.Module types to the corresponding nn.qat.modules types. 
+ """ + all_qat_mappings = get_combined_dict( + get_default_qat_module_mappings(), additional_qat_module_mapping) + object_type_dict = qconfig_dict.get("object_type", None) + new_object_type_dict = object_type_dict.copy() + for k, v in new_object_type_dict.items(): + if k in all_qat_mappings: + object_type_dict[all_qat_mappings[k]] = v + return qconfig_dict diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py index 522df17e87bc..62a4df162b48 100644 --- a/torch/ao/quantization/utils.py +++ b/torch/ao/quantization/utils.py @@ -220,3 +220,14 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b else: quant_min, quant_max = 0, 15 return quant_min, quant_max + + +def _parent_name(target): + """ + Turn 'foo.bar' into ['foo', 'bar'] + """ + r = target.rsplit('.', 1) + if len(r) == 1: + return '', r[0] + else: + return r[0], r[1] diff --git a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp index 7876a1e9491d..0cd301e4348f 100644 --- a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp +++ b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp @@ -330,8 +330,8 @@ struct CudaGraphFuser { } if ((consumer->inputs().size() + consumer->outputs().size() + - producer->inputs().size() + - producer->outputs().size()) > subgraph_arg_limit_) { + producer->inputs().size() + producer->outputs().size()) > + subgraph_arg_limit_) { return at::nullopt; } @@ -762,12 +762,14 @@ struct CudaGraphFuser { // fusing nodes sharing inputs, this could save memory bandwidth by // reducing number of tensor read. for (const auto& u : producer->uses()) { - // only merge nodes before consumer, since any sibling after consumer - // has already considered merging this consumer to them already. + // only merge nodes before consumer, since any sibling after + // consumer has already considered merging this consumer to them + // already. 
if (u.user->isBefore(consumer)) { auto fusion_group = tryFuse(consumer, u.user); if (fusion_group) { - return std::make_pair(fusion_group.value()->reverseIterator(), true); + return std::make_pair( + fusion_group.value()->reverseIterator(), true); } } } diff --git a/torch/csrc/jit/codegen/cuda/parser.cpp b/torch/csrc/jit/codegen/cuda/parser.cpp index 5ca70fa64efe..a33b33895c5b 100644 --- a/torch/csrc/jit/codegen/cuda/parser.cpp +++ b/torch/csrc/jit/codegen/cuda/parser.cpp @@ -1077,7 +1077,8 @@ class IrParser { auto mask = castOp(input->getDataType().value(), comparison); auto out = mul(grad_output, mask); - value_map.emplace(node->output()->unique(), ValueHolder(out, format)); + value_map.emplace( + node->output()->unique(), ValueHolder(out, format)); }, nullptr, nullptr); @@ -1232,7 +1233,6 @@ class IrParser { node->output(1)->unique(), ValueHolder(TensorViewBuilder().build(), format)); } - }, nullptr, nullptr); @@ -1895,8 +1895,9 @@ class IrParser { } else { const auto half_to_float = constant_as(node->input(2)); TORCH_INTERNAL_ASSERT( - half_to_float.has_value(), "Bool half_to_float is not valid"); - auto input_tensor_type = node->input(0)->type()->cast(); + half_to_float.has_value(), "Bool half_to_float is not valid"); + auto input_tensor_type = + node->input(0)->type()->cast(); if (half_to_float.value() && input_tensor_type->scalarType() != at::ScalarType::Half) { return false; @@ -2272,8 +2273,7 @@ class IrParser { } { - auto ptr_op = getOperatorForLiteral( - "aten::gelu(Tensor self) -> Tensor"); + auto ptr_op = getOperatorForLiteral("aten::gelu(Tensor self) -> Tensor"); REGISTER_PARSE_RULE( ptr_op, { diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp index 929b83b801c8..ab54b7776fa0 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp +++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp @@ -34,15 +34,14 @@ static const std::string getTempPath() { static const std::string temp_dir = 
getTempPath(); static const std::string so_template = temp_dir + "pytorch_fuserXXXXXX.dll"; static const std::string cpp_template = temp_dir + "pytorch_fuserXXXXXX.cpp"; -static const std::string check_exists_string = - "where \"${program}\" > nul 2> nul"; +static const std::string check_exists_string = "where ${program} > nul 2> nul"; static std::vector env_list; constexpr int so_suffix_len = 4; constexpr int cpp_suffix_len = 4; #else static const std::string so_template = "/tmp/pytorch_fuserXXXXXX.so"; static const std::string cpp_template = "/tmp/pytorch_fuserXXXXXX.cpp"; -static const std::string check_exists_string = "which '${program}' > /dev/null"; +static const std::string check_exists_string = "which ${program} > /dev/null"; constexpr int so_suffix_len = 3; constexpr int cpp_suffix_len = 4; #endif @@ -50,8 +49,10 @@ constexpr int cpp_suffix_len = 4; intptr_t run(const std::string& cmd); static bool programExists(const std::string& program) { + std::stringstream ss; + c10::printQuotedString(ss, program); TemplateEnv env; - env.s("program", program); + env.s("program", ss.str()); std::string cmd = format(check_exists_string, env); #ifdef _MSC_VER return (run(cmd.c_str()) == 0); @@ -188,6 +189,7 @@ struct CompilerConfig { #endif if (!programExists(cxx)) { + TORCH_WARN("Compiler passed via CXX envvar does not exist!"); cxx = ""; } } @@ -205,7 +207,7 @@ struct CompilerConfig { const std::string openmp_flags = "-fopenmp"; #endif // Set openmp to true only if PyTorch is compiled with OpenMP support -// OpenMP is typically not availabel on MacOS platform +// OpenMP is typically not available on MacOS platform #if defined(_OPENMP) bool openmp = true; #else @@ -267,6 +269,7 @@ static void runCompiler( const std::string& cpp_file, const std::string& so_file) { auto& config = getConfig(); + TORCH_CHECK(!config.cxx.empty(), "Failed to compile a fused CPU kernel: Compiler not found"); TemplateEnv env; env.s("cxx", config.cxx); env.s("fopenmp", config.openmp ? 
config.openmp_flags : ""); diff --git a/torch/csrc/jit/tensorexpr/IRSpecification.md b/torch/csrc/jit/tensorexpr/IRSpecification.md index 4fa08c8f47a7..d9c37a4dd15e 100644 --- a/torch/csrc/jit/tensorexpr/IRSpecification.md +++ b/torch/csrc/jit/tensorexpr/IRSpecification.md @@ -4,6 +4,7 @@ Stmt | Store(buf_ = Buf, indices = [Expr], value_ = Expr, mask_ = Expr) | Allocate(buf_ = Buf) | Free(buf_ = Buf) +| PlacementAllocate(buf_ = Buf, buf_to_reuse_ = Buf) | Let(var_ = Var, val_ = Expr) | Cond(condition_ = Expr, true_stmt_ = Block, false_stmt_ = Block) | For(var_ = Var, start_ = Expr, stop_ = Expr, body_ = Block, loopOptions = LoopOptions) diff --git a/torch/csrc/jit/tensorexpr/analysis.h b/torch/csrc/jit/tensorexpr/analysis.h index 6f021448c25f..82e7b7f62afd 100644 --- a/torch/csrc/jit/tensorexpr/analysis.h +++ b/torch/csrc/jit/tensorexpr/analysis.h @@ -248,6 +248,87 @@ class ModifiesVarChecker : public IRVisitor { bool found_{false}; }; +// Traverse the Block stmt to identify the live range of the specified buf. The +// live range, indicated by a pair of integers, specifies the first and last +// stmt in block stmts that access to the buf. +class BufLiveRange : public IRVisitor { + public: + BufLiveRange(BufPtr b) : buf_(b) {} + + static std::tuple liveRange(StmtPtr s, BufPtr b) { + BlockPtr block = to(s); + // We Only analze buffer live ranges for block stmts. 
+ if (!block) { + return std::make_tuple(0, 0); + } + + BufLiveRange analyzer(b); + block->accept(&analyzer); + return analyzer.getLiveRange(); + } + + private: + std::tuple getLiveRange() { + return std::make_tuple(begin_, end_); + } + + bool hasBufReads(StmtPtr s) { + auto loads1 = NodeFinder::find(s); + for (auto l : loads1) { + if (l->buf() == buf_) { + return true; + } + } + auto loads2 = NodeFinder::find(s); + for (auto l : loads2) { + for (auto lb : l->buf_args()) { + if (lb == buf_) { + return true; + } + } + } + return false; + } + + bool hasBufWrites(StmtPtr s) { + auto writes1 = NodeFinder::find(s); + for (auto w : writes1) { + if (w->buf() == buf_) { + return true; + } + } + auto writes2 = NodeFinder::find(s); + for (auto w : writes2) { + if (w->buf() == buf_) { + return true; + } + } + return false; + } + + void findAccAndUpdateLiveRange(StmtPtr s) { + bool has_reads = hasBufReads(s), has_writes = hasBufWrites(s); + if (has_reads || has_writes) { + if (begin_ == -1) { + begin_ = curr_index_; + }; + end_ = curr_index_; + } + } + + void visit(BlockPtr v) { + for (StmtPtr s : *v) { + curr_index_ += 1; + findAccAndUpdateLiveRange(s); + } + } + + BufPtr buf_; + int32_t begin_ = -1; + int32_t end_ = -1; + int32_t curr_index_ = -1; +}; + // A class that analyzes the given program relevant for Block backend // It creates a map of multi dim buffers and their flat verions class CreateBufferMap : public IRVisitor { diff --git a/torch/csrc/jit/tensorexpr/codegen.cpp b/torch/csrc/jit/tensorexpr/codegen.cpp index 0a1051f9dd5f..59786cb980c6 100644 --- a/torch/csrc/jit/tensorexpr/codegen.cpp +++ b/torch/csrc/jit/tensorexpr/codegen.cpp @@ -1,3 +1,5 @@ +#include +#include #include #include @@ -80,6 +82,174 @@ void CodeGen::call_with_numel(void** args, int64_t numel) { TORCH_INTERNAL_ASSERT( false, "This codegen backend does not implement call_with_numel"); } + +c10::optional bufSize(BufPtr buf) { + size_t size = elementSize(buf->dtype().scalar_type()) * 
buf->dtype().lanes(); + for (auto& d : buf->dims()) { + if (!d->isConstant()) { + return c10::nullopt; + } + size = size * (*intValue(d)); + } + return size; +} + +// This algorithm takes the list of intermediate buffers and their liveness +// ranges, and returns the allocations of these buffers. A buffer 'A' can be +// allocated in the memory (appears as a pair of 'A's in the allocation results) +// or reuse another buffer such as 'B' (appears as ('A', 'B')). Specifically, we +// linearly scan the intermediate buffers by the time they appear, and try to +// assign it an existing non-occupied memory allocation. If there are no such +// allocations available, we'll create memory for it. Once we are beyond the +// liveness range of this buffer, we'll mark its corresponding memory allocation +// as "up for grabs" for future reuse. +std::vector> AllocBufsWithMemReuse( + const std::unordered_set& bufs, + const std::unordered_map>& + buf_ranges) { + // Sort buffers by the time they appear. + std::vector bufs_sorted(bufs.begin(), bufs.end()); + auto sorting_function_by_start_time = [&buf_ranges]( + BufPtr b1, BufPtr b2) -> bool { + return std::get<0>(buf_ranges.at(b1)) < std::get<0>(buf_ranges.at(b2)); + }; + std::sort( + bufs_sorted.begin(), bufs_sorted.end(), sorting_function_by_start_time); + + // Map intermediate buffers to the most recently used memory if any. + std::list mem_up_for_grabs; + std::unordered_map buf_mem_map; + std::vector> buf_allocs; + + auto sorting_function_by_end_time = [&buf_ranges]( + BufPtr b1, BufPtr b2) -> bool { + return std::get<1>(buf_ranges.at(b1)) < std::get<1>(buf_ranges.at(b2)); + }; + for (auto buf : bufs_sorted) { + // If the buf has dynamic shapes, we'll skip it (i.e., allocate memory for + // it, and there are no future reuses on its memory). 
+ // TODO: reuse memory for bufs with dynamic shapes + if (!bufSize(buf)) { + buf_allocs.emplace_back(std::make_pair(buf, buf)); + continue; + } + + auto start = std::get<0>(buf_ranges.at(buf)); + auto end = std::get<1>(buf_ranges.at(buf)); + + // Release memory for buffers whose liveness range ends before the creation + // time of this buf. + // TODO: optimize in-place opererations and copy operations + std::vector buf_to_release; + for (auto& mapped : buf_mem_map) { + auto buf_mapped = mapped.first; + auto end_buf_mapped = std::get<1>(buf_ranges.at(buf_mapped)); + if (end_buf_mapped < start) { + buf_to_release.push_back(buf_mapped); + } + } + + // Sort the buffers in the order of used time so the head of the release + // list contains the most recently used buf. + std::sort( + buf_to_release.begin(), + buf_to_release.end(), + sorting_function_by_end_time); + for (auto& buf_rl : buf_to_release) { + mem_up_for_grabs.push_front(buf_mem_map.at(buf_rl)); + buf_mem_map.erase(buf_rl); + } + + bool allocated = false; + // Check whether there are free memories that this buf can reuse. + for (auto it = mem_up_for_grabs.begin(); it != mem_up_for_grabs.end(); + it++) { + auto m = *it; + if (bufSize(m) >= bufSize(buf)) { + buf_mem_map[buf] = m; + buf_allocs.emplace_back(std::make_pair(buf, m)); + allocated = true; + mem_up_for_grabs.erase(it); + break; + } + } + + // If there are no memories to reuse, we'll have to allocate new memory for + // it. + if (!allocated) { + buf_mem_map[buf] = buf; + buf_allocs.emplace_back(std::make_pair(buf, buf)); + } + } + + return buf_allocs; +} + +StmtPtr insertAllocFree( + std::vector>& buf_allocs, + StmtPtr stmt) { + BlockPtr b = to(stmt); + if (!b) { + b = alloc(std::vector({stmt})); + } + + // Insert allocations and frees for temporary buffers at global scope. 
+ for (auto rit = buf_allocs.rbegin(); rit != buf_allocs.rend(); ++rit) { + if (rit->first == rit->second) { + BufPtr buf = rit->first; + b->prepend_stmt(alloc(buf)); + b->append_stmt(alloc(buf)); + } else { + b->prepend_stmt(alloc(rit->first, rit->second)); + } + } + + return b; +} + +// We allocate intermediate buffers by inserting Allocate/Free or +// PlacementAllocate stmts. Allocate/Free stmts will allocate memory at runtime, +// and PlacementAllocate stmt reuses the memory of one buffer for another +// buffer. In current implementation, we use linear scan for memory reuses. +// TODO: try more memory reuse algorithms and compare their memory efficiency. +void CodeGen::allocIntermediateBufs() { + // Identify intermediate buffers that are not allocated yet. + auto bufs = NodeFinder::find(stmt_); + std::unordered_set bufs_allocated; + for (auto b : buffer_args_) { + bufs_allocated.insert(b.buf()); + } + auto allocs = NodeFinder::find(stmt_); + for (auto a : allocs) { + bufs_allocated.insert(a->buf()); + } + + std::unordered_set interm_bufs; + std::unordered_map> interm_buf_ranges; + for (auto buf : bufs) { + if (!bufs_allocated.count(buf) && !interm_bufs.count(buf)) { + interm_bufs.insert(buf); + + // Identify the access stmts to each unallocated intermeiate buffer. + auto range = BufLiveRange::liveRange(stmt_, buf); + interm_buf_ranges.emplace(buf, range); + } + } + + // For each intermediate buffer, we reuse the memory of an old buffer whose + // liveness range does not overlap with the current buffer, or allocate memory + // if reusing buffer is impossible. + auto buf_allocs = AllocBufsWithMemReuse(interm_bufs, interm_buf_ranges); + + // Insert memory allocation/mapping nodes. 
+ if (buf_allocs.size() > 0) { + auto stmt_new = insertAllocFree(buf_allocs, stmt_); + set_stmt(stmt_new); + } + + GRAPH_DEBUG("\nMemory Allocation:\n\n", *stmt(), "\n"); +} + } // namespace tensorexpr } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h index 5665f658bf7e..8b2041215fb8 100644 --- a/torch/csrc/jit/tensorexpr/codegen.h +++ b/torch/csrc/jit/tensorexpr/codegen.h @@ -30,7 +30,9 @@ class TORCH_API CodeGen { : stmt_(stmt), buffer_args_(std::move(buffer_args)), device_(device), - kernel_func_name_(std::move(kernel_func_name)) {} + kernel_func_name_(std::move(kernel_func_name)) { + allocIntermediateBufs(); + } virtual ~CodeGen() = default; @@ -99,6 +101,8 @@ class TORCH_API CodeGen { return kernel_func_name_; } + void allocIntermediateBufs(); + protected: static void* argToPtr(const BufferArg& bufferArg, const CallArg& callArg); diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index c8793fd06892..c867e16a34b7 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -105,6 +105,10 @@ void CudaAnalysis::visit(AllocatePtr v) { throw std::runtime_error("Global alloc not supported yet"); } +void CudaAnalysis::visit(PlacementAllocatePtr v) { + throw std::runtime_error("Memory reuse not supported yet"); +} + void CudaAnalysis::visit(ForPtr v) { // Recurse first. 
v->body()->accept(this); diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.h b/torch/csrc/jit/tensorexpr/cuda_codegen.h index 912f4e9a9949..30af7a42929b 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.h +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.h @@ -54,6 +54,7 @@ class CudaAnalysis : public IRVisitor { void visit(AllocatePtr v) override; void visit(FreePtr v) override; + void visit(PlacementAllocatePtr v) override; void visit(ForPtr v) override; std::unordered_set store_targets_; diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp index 73178384571b..51c21f85401a 100644 --- a/torch/csrc/jit/tensorexpr/eval.cpp +++ b/torch/csrc/jit/tensorexpr/eval.cpp @@ -981,6 +981,10 @@ class SimpleIREvaluatorImpl : public IRVisitor { internal_buffers_.insert(std::make_pair(b, std::move(buffer))); } + void visit(PlacementAllocatePtr v) override { + buffer_mapping_[v->buf()] = buffer_mapping_[v->buf_to_reuse()]; + } + void visit(FreePtr v) override { BufPtr b = v->buf(); GRAPH_DEBUG("FREE: buf=", v->buf()->name_hint()); diff --git a/torch/csrc/jit/tensorexpr/fwd_decls.h b/torch/csrc/jit/tensorexpr/fwd_decls.h index 2054a56ff41c..f8efd30b510d 100644 --- a/torch/csrc/jit/tensorexpr/fwd_decls.h +++ b/torch/csrc/jit/tensorexpr/fwd_decls.h @@ -102,6 +102,7 @@ class Cond; class ExternalCall; class For; class Free; +class PlacementAllocate; class SyncThreads; using AllocatePtr = NodePtr; using AtomicAddPtr = NodePtr; @@ -110,6 +111,7 @@ using CondPtr = NodePtr; using ExternalCallPtr = NodePtr; using ForPtr = NodePtr; using FreePtr = NodePtr; +using PlacementAllocatePtr = NodePtr; using SyncThreadsPtr = NodePtr; #define IMM_DECLARE(Type, Name) \ diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.cpp b/torch/csrc/jit/tensorexpr/ir_mutator.cpp index 44ef4fbccb9f..4fb8cd451d63 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.cpp +++ b/torch/csrc/jit/tensorexpr/ir_mutator.cpp @@ -497,6 +497,22 @@ StmtPtr IRMutator::mutate(FreePtr v) { return v; } 
+StmtPtr IRMutator::mutate(PlacementAllocatePtr v) { + BufPtr buf = v->buf(); + BufPtr buf_new = to(buf->accept_mutator(this)); + TORCH_INTERNAL_ASSERT( + buf_new, buildErrorMessage("IRMutator produced null for Buf.")); + v->set_buf(buf_new); + + BufPtr buf_to_reuse = v->buf_to_reuse(); + BufPtr buf_to_reuse_new = to(buf_to_reuse->accept_mutator(this)); + TORCH_INTERNAL_ASSERT( + buf_to_reuse_new, buildErrorMessage("IRMutator produced null for Buf.")); + v->set_buf_to_reuse(buf_to_reuse_new); + + return v; +} + StmtPtr IRMutator::mutate(LetPtr v) { VarPtr var_old = v->var(); VarPtr var_new = to(var_old->accept_mutator(this)); diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.h b/torch/csrc/jit/tensorexpr/ir_mutator.h index 27f41185e75f..2d37d49ba60c 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.h +++ b/torch/csrc/jit/tensorexpr/ir_mutator.h @@ -54,6 +54,7 @@ class TORCH_API IRMutator { virtual StmtPtr mutate(AllocatePtr v); virtual StmtPtr mutate(FreePtr v); + virtual StmtPtr mutate(PlacementAllocatePtr v); virtual StmtPtr mutate(LetPtr v); virtual StmtPtr mutate(CondPtr v); }; diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index 53efe5b1ed2c..35d481cdd8d2 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -483,6 +483,11 @@ void IRPrinter::visit(FreePtr v) { os() << "Free(" << *v->buffer_var() << ");"; } +void IRPrinter::visit(PlacementAllocatePtr v) { + os() << "Alias(" << *v->buf()->base_handle() << "," + << *v->buf_to_reuse()->base_handle() << ");"; +} + void IRPrinter::visit(LetPtr v) { os() << dtypeToCppString(v->var()->dtype()) << " " << *v->var(); os() << " = " << *v->value(); diff --git a/torch/csrc/jit/tensorexpr/ir_printer.h b/torch/csrc/jit/tensorexpr/ir_printer.h index db2ecc062ae7..c58012e8a1b8 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.h +++ b/torch/csrc/jit/tensorexpr/ir_printer.h @@ -61,6 +61,7 @@ class TORCH_API IRPrinter : public 
IRVisitor { void visit(BlockPtr v) override; void visit(AllocatePtr v) override; void visit(FreePtr v) override; + void visit(PlacementAllocatePtr v) override; void visit(LetPtr v) override; // A child class may have a difference rule for generating dtype diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.cpp b/torch/csrc/jit/tensorexpr/ir_visitor.cpp index 44db863b8205..649a51ee4577 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.cpp +++ b/torch/csrc/jit/tensorexpr/ir_visitor.cpp @@ -183,6 +183,11 @@ void IRVisitor::visit(FreePtr v) { v->buffer_var()->accept(this); } +void IRVisitor::visit(PlacementAllocatePtr v) { + v->buf()->accept(this); + v->buf_to_reuse()->accept(this); +} + void IRVisitor::visit(LetPtr v) { v->var()->accept(this); v->value()->accept(this); diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.h b/torch/csrc/jit/tensorexpr/ir_visitor.h index 26101a2c8fde..2bb48088d89f 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.h +++ b/torch/csrc/jit/tensorexpr/ir_visitor.h @@ -43,6 +43,7 @@ class TORCH_API IRVisitor { virtual void visit(IntrinsicsPtr v); virtual void visit(AllocatePtr v); virtual void visit(FreePtr v); + virtual void visit(PlacementAllocatePtr v); virtual void visit(LetPtr v); virtual void visit(CondPtr v); virtual void visit(TermPtr v); diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index ae049c39a0ac..0445636c5c4f 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -781,12 +781,11 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) { if (pre_alloc_) { auto interm_bufs = l.getIntermediateBufs(); - interm_bufs = preAllocIntermediateBufs(interm_bufs); - l.prepareForCodegen(interm_bufs); - } else { - l.prepareForCodegen(); + preAllocIntermediateBufs(interm_bufs); } + l.prepareForCodegen(); + GRAPH_DEBUG("after prepareForCodegen", *l.root_stmt()); l.simplify(); GRAPH_DEBUG("after simplification", *l.root_stmt()); diff --git 
a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index 650d4c45c8f2..109cf80a55b1 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -290,6 +290,7 @@ class LLVMCodeGenImpl : public IRVisitor { void visit(IntrinsicsPtr v) override; void visit(AllocatePtr v) override; void visit(FreePtr v) override; + void visit(PlacementAllocatePtr v) override; void visit(LetPtr v) override; void visit(CondPtr v) override; void visit(ExternalCallPtr v) override; @@ -355,9 +356,9 @@ LLVMCodeGen::LLVMCodeGen( c10::optional triple, c10::optional cpu, c10::optional attrs) - : CodeGen(stmt, args, device, kernel_func_name), - impl_(std::make_unique< - LLVMCodeGenImpl>(stmt, args, device, dtype, triple, cpu, attrs)) { + : CodeGen(stmt, args, device, kernel_func_name) { + impl_ = std::make_unique( + this->stmt(), args, device, dtype, triple, cpu, attrs); callee_ = std::make_unique( impl_->releaseJIT(), (void*)impl_->getKernelAddress()); } @@ -2049,6 +2050,11 @@ void LLVMCodeGenImpl::visit(AllocatePtr v) { varToVal_[v->buffer_var()] = malloc; } +void LLVMCodeGenImpl::visit(PlacementAllocatePtr v) { + llvm::Value* ptr = varToVal_.at(v->buf_to_reuse()->base_handle()); + varToVal_[v->buf()->base_handle()] = ptr; +} + void LLVMCodeGenImpl::visit(FreePtr v) { value_ = llvm::ConstantInt::get(IntTy_, 0); llvm::Value* ptr = varToVal_.at(v->buffer_var()); diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 428c145388a8..b6a333bf1e58 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -1133,36 +1133,6 @@ BlockPtr findLowestContainingBlock(const std::vector& uses) { return b; } -StmtPtr LoopNest::insertAllocFree( - StmtPtr stmt, - const c10::optional>& interm_bufs /* = c10::nullopt*/) { - std::vector intermediate_bufs; - if (interm_bufs) { - intermediate_bufs = *interm_bufs; - } else { - intermediate_bufs = 
getIntermediateBufs(); - } - - if (intermediate_bufs.size() == 0ULL) { - return stmt; - } - - BlockPtr b = to(stmt); - if (!b) { - b = alloc(std::vector({stmt})); - } - - std::unordered_map> uses = - findLoadOrStoreUses(stmt); - // Insert allocations and frees for temporary buffers at global scope. - for (BufPtr buf : intermediate_bufs) { - b->prepend_stmt(alloc(buf)); - b->append_stmt(alloc(buf)); - } - - return b; -} - class StmtDeleter : public IRMutator { public: StmtDeleter(const std::unordered_set& targets) : targets_(targets) {} @@ -1219,16 +1189,12 @@ void LoopNest::eliminateDeadStores() { root_stmt_ = root_stmt_->accept_mutator(&deleter); } -void LoopNest::prepareForCodegen( - const c10::optional>& interm_bufs /*= c10::nullopt*/) { +void LoopNest::prepareForCodegen() { // Expand reduction ops. ReductionExpander reduceExpander; root_stmt_ = reduceExpander.expand(root_stmt_); root_stmt_ = FlattenIndexes(root_stmt_); - - // Add allocs and frees for intermediate buffers at the global level. - root_stmt_ = insertAllocFree(root_stmt_, interm_bufs); } namespace { diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index e7e29e699b18..3bf23abc1f84 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -550,16 +550,7 @@ class TORCH_API LoopNest { void eliminateDeadStores(); - // Make the stmt ready for codegen. The optional argument 'interm_bufs' allows - // users to specify intermediate buffers that need runtime allocation. In - // default, we will insert 'Alloc/Free' stmts to allocate all intermediate - // buffers at runtime but users may have pre-allocated some of them at compile - // time, and in that case the user can specify what buffers to insert - // 'Alloc/Free' stmts for using 'interm_bufs'. - // TODO: refactor function 'prepareForCodegen' to remove argument - // 'interm_bufs'. 
- void prepareForCodegen( - const c10::optional>& interm_bufs = c10::nullopt); + void prepareForCodegen(); const std::unordered_set getInputBufs() const; const std::unordered_set getOutputBufs() const { @@ -571,9 +562,6 @@ class TORCH_API LoopNest { void initialize( const std::vector& output_tensors, const std::vector& tensors_to_compute); - StmtPtr insertAllocFree( - StmtPtr stmt, - const c10::optional>& interm_bufs = c10::nullopt); StmtPtr root_stmt_; diff --git a/torch/csrc/jit/tensorexpr/stmt.h b/torch/csrc/jit/tensorexpr/stmt.h index 22efa47fd1b0..b39976443e48 100644 --- a/torch/csrc/jit/tensorexpr/stmt.h +++ b/torch/csrc/jit/tensorexpr/stmt.h @@ -388,6 +388,42 @@ class TORCH_API Allocate : public StmtNode { // TODO: add memory types. }; +// PlacementAllocate is a variation of the Allocate operator in NNC IR. It does +// not allocate memory but reuse the memory of another buffer for the given +// buffer. +class TORCH_API PlacementAllocate : public StmtNode { + public: + static PlacementAllocatePtr make( + const BufHandle& buf_handle, + const BufHandle& buf_handle_to_reuse) { + return alloc( + buf_handle.node(), buf_handle_to_reuse.node()); + } + + BufPtr buf() const { + return buf_; + } + + BufPtr buf_to_reuse() const { + return buf_to_reuse_; + } + + void set_buf(BufPtr buf) { + buf_ = buf; + } + + void set_buf_to_reuse(BufPtr buf) { + buf_to_reuse_ = buf; + } + + explicit PlacementAllocate(BufPtr buf, BufPtr buf_to_reuse) + : buf_(buf), buf_to_reuse_(buf_to_reuse) {} + + private: + BufPtr buf_; + BufPtr buf_to_reuse_; +}; + // Free the specific buffer. It is an error. 
class TORCH_API Free : public StmtNode { public: diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py index e6c25bff72bd..09e26d0b7cce 100644 --- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py +++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py @@ -842,34 +842,19 @@ def acc_ops_sum( args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str, -) -> Union[TRTTensor, Sequence[TRTTensor]]: - input_val = kwargs["input"] - if not isinstance(input_val, TRTTensor): - raise RuntimeError( - f"sum received input {input_val} that is not part " - "of the TensorRT region!" - ) +) -> TRTTensor: + return add_reduce_layer(network, target, args, kwargs, trt.ReduceOperation.SUM, name) - # If dim is specified, then we are computing reduced sum over certain dimensions. - # Otherwise, we are dong summation over all elements, which is only supported in - # explicit batch dimension. - if "dim" not in kwargs: - assert ( - not network.has_implicit_batch_dimension - ), "Do not support sum all the elements for implicit batch." 
- dim = range(0, len(input_val.shape)) - else: - dim = kwargs["dim"] # type: ignore[assignment] - keepdim = False if "keepdim" not in kwargs else kwargs["keepdim"] - layer = network.add_reduce( - input_val, - trt.ReduceOperation.SUM, - get_axes_for_reduce_op(dim, network.has_implicit_batch_dimension), - keepdim, - ) - set_layer_name(layer, target, name) - return layer.get_output(0) +@tensorrt_converter(acc_ops.mean) +def acc_ops_mean( + network: TRTNetwork, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + name: str, +) -> TRTTensor: + return add_reduce_layer(network, target, args, kwargs, trt.ReduceOperation.AVG, name) def add_acc_ops_full_reduce(network, target, args, kwargs, name, reduce_op): diff --git a/torch/fx/experimental/fx2trt/converters/converter_utils.py b/torch/fx/experimental/fx2trt/converters/converter_utils.py index 25f71575523e..3d87a57fca96 100644 --- a/torch/fx/experimental/fx2trt/converters/converter_utils.py +++ b/torch/fx/experimental/fx2trt/converters/converter_utils.py @@ -1,9 +1,9 @@ -from typing import Any, Tuple, Sequence, Union, List, Optional +from typing import Any, Tuple, Sequence, Union, List, Optional, Dict import numpy as np import tensorrt as trt import torch -from torch.fx.node import Target +from torch.fx.node import Target, Argument from torch.fx.experimental.fx2trt.types import * # noqa: F403 from torch.fx.experimental.fx2trt.utils import torch_dtype_from_trt @@ -450,6 +450,58 @@ def add_activation_layer( return layer.get_output(0) +def add_reduce_layer( + network: TRTNetwork, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + operation_type: trt.ActivationType, + name: str, +) -> TRTTensor: + """ + Add a TensorRT Reduce layer to `network`. + + Args: + network (TRTNetwork): TensorRT network object. + target (Target): Target of fx node. + args (Tuple[Argument, ...]): Args of the fx node. + kwargs (Dict[str, Argument]): Kwargs of the fx node. 
+ operation_type (trt.ElementWiseOperation): Type of the TensorRT activation + operation. + name (str): The name we want to assign to the created TensorRT layer. + + Returns: + The output of TensorRT Reduce layer. + """ + input_val = kwargs["input"] + if not isinstance(input_val, TRTTensor): + raise RuntimeError( + f"{name} received input {input_val} that is not part " + "of the TensorRT region!" + ) + + # If dim is specified, then the op is reducing over certain dimensions. + # Otherwise, it's reducing over all elements, which is only supported in + # explicit batch dimension. + if "dim" not in kwargs: + assert ( + not network.has_implicit_batch_dimension + ), f"We don't support reduce({name}) over all the elements if batch dim is implicit." + dim = range(0, len(input_val.shape)) + else: + dim = kwargs["dim"] # type: ignore[assignment] + + keepdim = False if "keepdim" not in kwargs else kwargs["keepdim"] + layer = network.add_reduce( + input_val, + operation_type, + get_axes_for_reduce_op(dim, network.has_implicit_batch_dimension), + keepdim, + ) + set_layer_name(layer, target, name) + return layer.get_output(0) + + def get_dyn_range(scale, zero_point, dtype): """ Get the dynamic range of a tensor based on its scale, zero_point and dtype. 
diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py index 14662729e89e..416556db046c 100644 --- a/torch/fx/experimental/fx_acc/acc_ops.py +++ b/torch/fx/experimental/fx_acc/acc_ops.py @@ -30,7 +30,8 @@ def linear(*, input, weight, bias): @register_acc_op_properties(AccOpProperty.quantized) @register_acc_op -def quantized_linear(*, input, weight, bias, acc_out_ty): +def quantized_linear(*, input, weight, bias, acc_out_ty=None): + assert acc_out_ty is not None qparams = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "qparams") return nn.quantized.functional.linear( input, @@ -490,7 +491,8 @@ def hardswish_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node: ], ) @register_acc_op -def quantized_add(*, input, other, acc_out_ty): +def quantized_add(*, input, other, acc_out_ty=None): + assert acc_out_ty is not None qparams = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "qparams") return torch.ops.quantized.add( input, @@ -515,7 +517,8 @@ def quantized_add(*, input, other, acc_out_ty): ], ) @register_acc_op -def quantized_mul(*, input, other, acc_out_ty): +def quantized_mul(*, input, other, acc_out_ty=None): + assert acc_out_ty is not None qparams = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "qparams") return torch.ops.quantized.mul( input, @@ -542,7 +545,8 @@ def quantized_mul(*, input, other, acc_out_ty): ], ) @register_acc_op -def quantize_per_tensor(*, input, acc_out_ty): +def quantize_per_tensor(*, input, acc_out_ty=None): + assert acc_out_ty is not None qparams = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "qparams") dtype = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "dtype") return torch.quantize_per_tensor( @@ -568,7 +572,8 @@ def quantize_per_tensor(*, input, acc_out_ty): ], ) @register_acc_op -def quantize_per_channel(*, input, acc_out_ty): +def quantize_per_channel(*, input, acc_out_ty=None): + assert acc_out_ty is not None qparams = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "qparams") 
dtype = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "dtype") return torch.quantize_per_channel( @@ -590,13 +595,15 @@ def dequantize(*, input): @register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary, AccOpProperty.quantized) @register_acc_op -def rescale_quantize_per_tensor(*, input, acc_out_ty): +def rescale_quantize_per_tensor(*, input, acc_out_ty=None): + assert acc_out_ty is not None d = dequantize(input=input) return quantize_per_tensor(input=d, acc_out_ty=acc_out_ty) @register_acc_op_properties(AccOpProperty.unary, AccOpProperty.quantized) @register_acc_op -def rescale_quantize_per_channel(*, input, acc_out_ty): +def rescale_quantize_per_channel(*, input, acc_out_ty=None): + assert acc_out_ty is not None d = dequantize(input=input) return quantize_per_channel(input=d, acc_out_ty=acc_out_ty) @@ -710,7 +717,25 @@ def torch_log1p_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node return log_node +def reduce_op_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule, func) -> torch.fx.Node: + with node.graph.inserting_before(node): + kwargs = dict(node.kwargs) + if "dim" in kwargs and isinstance(kwargs["dim"], int): + kwargs["dim"] = (kwargs["dim"],) + new_node = node.graph.call_function(func, kwargs=kwargs) + new_node.meta = node.meta.copy() + return new_node + + @register_acc_op_properties(AccOpProperty.unary) +@register_acc_op +def sum(*, input, dim=None, keepdim=False, dtype=None): + if dim is not None: + return torch.sum(**locals()) + else: + return input.sum(dtype=dtype) + + @register_custom_acc_mapper_fn( op_and_target=("call_method", "sum"), arg_replacement_tuples=[ @@ -729,23 +754,39 @@ def torch_log1p_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node ("dtype", "dtype", this_arg_is_optional), ], ) -def add_sum_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule) -> torch.fx.Node: - with node.graph.inserting_before(node): - sum_kwargs = dict(node.kwargs) - if "dim" in sum_kwargs and 
isinstance(sum_kwargs["dim"], int): - sum_kwargs["dim"] = (sum_kwargs["dim"],) - sum_node = node.graph.call_function(sum, kwargs=sum_kwargs) - sum_node.meta = node.meta.copy() - return sum_node +def sum_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule) -> torch.fx.Node: + return reduce_op_mapper(node, mod, sum) @register_acc_op_properties(AccOpProperty.unary) @register_acc_op -def sum(*, input, dim=None, keepdim=False, dtype=None): +def mean(*, input, dim=None, keepdim=False, dtype=None): if dim is not None: - return torch.sum(**locals()) + return torch.mean(**locals()) else: - return input.sum(dtype=dtype) + return input.mean(dtype=dtype) + + +@register_custom_acc_mapper_fn( + op_and_target=("call_method", "mean"), + arg_replacement_tuples=[ + ("input", "input"), + ("dim", "dim", this_arg_is_optional), + ("keepdim", "keepdim", this_arg_is_optional), + ("dtype", "dtype", this_arg_is_optional), + ], +) +@register_custom_acc_mapper_fn( + op_and_target=("call_function", torch.mean), + arg_replacement_tuples=[ + ("input", "input"), + ("dim", "dim", this_arg_is_optional), + ("keepdim", "keepdim", this_arg_is_optional), + ("dtype", "dtype", this_arg_is_optional), + ], +) +def mean_mapper(node, mod): + return reduce_op_mapper(node, mod, mean) @register_custom_acc_mapper_fn( @@ -1374,7 +1415,8 @@ def custom_narrow_mapper(node: torch.fx.Node, mod: nn.Module) -> torch.fx.Node: kwargs_to_move_to_acc_out_ty=[("shape", "shape")], ) @register_acc_op -def reshape(*, input, acc_out_ty): +def reshape(*, input, acc_out_ty=None): + assert acc_out_ty is not None return torch.reshape( input, tuple(acc_utils.get_field_from_acc_out_ty(acc_out_ty, "shape")) ) @@ -1415,8 +1457,8 @@ def custom_tensor_reshape_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx. 
@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) @register_acc_op -def to_dtype(input, acc_out_ty): - assert acc_out_ty is not None, "valid acc_out_ty needed" +def to_dtype(input, acc_out_ty=None): + assert acc_out_ty is not None return input.to(dtype=acc_utils.get_field_from_acc_out_ty(acc_out_ty, "dtype")) diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index 5312de637742..87b4519638fb 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -603,9 +603,31 @@ class ParameterDict(Module): def __iter__(self) -> Iterator[str]: return iter(self._parameters.keys()) + def __reversed__(self) -> Iterator[str]: + return reversed(list(self._parameters.keys())) + + def copy(self) -> 'ParameterDict': + """Returns a copy of this :class:`~torch.nn.ParameterDict` instance. + """ + return ParameterDict(self._parameters.copy()) + def __contains__(self, key: str) -> bool: return key in self._parameters + def setdefault(self, key: str, default: Optional['Parameter'] = None) -> 'Parameter': + """If key is in the ParameterDict, return its parameter. + If not, insert `key` with a parameter `default` and return `default`. + `default` defaults to `None`. + + Args: + key (string): key to set default for + default (:class:`~torch.nn.Parameter`): the parameter set to the key + """ + if key in self._parameters: + return self._parameters[key] + self[key] = default # type: ignore[assignment] + return self._parameters[key] + def clear(self) -> None: """Remove all items from the ParameterDict. """ @@ -621,6 +643,31 @@ class ParameterDict(Module): del self[key] return v + def popitem(self) -> Tuple[str, 'Parameter']: + """Remove and return the last inserted `(key, parameter)` pair + from the ParameterDict + """ + return self._parameters.popitem() + + def get(self, key: str, default: Optional['Parameter'] = None) -> 'Parameter | None': + r"""Return the parameter associated with key if present. 
+ Otherwise return default if provided, None if not. + + Args: + key (string): key to get from the ParameterDict + default (Parameter, optional): value to return if key not present + """ + return self._parameters.get(key, default) + + def fromkeys(self, keys: Iterable['str'], default: Optional['Parameter'] = None) -> 'ParameterDict': + r"""Return a new ParameterDict with the keys provided + + Args: + keys (iterable, string): keys to make the new ParameterDict from + default (Parameter, optional): value to set for all keys + """ + return ParameterDict(self._parameters.fromkeys(keys, default)) # type: ignore[arg-type] + def keys(self) -> Iterable[str]: r"""Return an iterable of the ParameterDict keys. """ @@ -693,3 +740,17 @@ class ParameterDict(Module): "on each GPU except the original one.") return super(ParameterDict, self)._replicate_for_data_parallel() + + def __or__(self, other: 'ParameterDict') -> 'ParameterDict': + copy = self.copy() + copy.update(other._parameters) + return copy + + def __ror__(self, other: 'ParameterDict') -> 'ParameterDict': + copy = other.copy() + copy.update(self._parameters) + return copy + + def __ior__(self, other : 'ParameterDict') -> 'ParameterDict': + self.update(other._parameters) + return self diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 4b49b9d48cea..d891c667b70b 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -7,7 +7,6 @@ appropriate files under `torch/ao/quantization/fx/`, while adding an import stat here. 
""" from torch.ao.quantization.fx.utils import ( - _parent_name, graph_pretty_str, get_per_tensor_qparams, quantize_node, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 48d3b030c81e..49c18c620267 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1275,11 +1275,8 @@ def sample_inputs_tensor_split(op_info, device, dtype, requires_grad, **kwargs): (3, -1), ) - def generator(): - for args in args_cases: - yield SampleInput(make_input((S, S, S)), args=args) - - return list(generator()) + for args in args_cases: + yield SampleInput(make_input((S, S, S)), args=args) def sample_inputs_linalg_det(op_info, device, dtype, requires_grad): @@ -1575,15 +1572,12 @@ def sample_inputs_cosine_similarity(op_info, device, dtype, requires_grad, **kwa ((S, S), {}) ) - def generator(): - for input_shape, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=(make_arg(input_shape),), kwargs=kwargs) - # Test for Broadcasting - yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) - yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -2}) - yield SampleInput(make_arg((2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) - - return list(generator()) + for input_shape, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=(make_arg(input_shape),), kwargs=kwargs) + # Test for Broadcasting + yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) + yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -2}) + yield SampleInput(make_arg((2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) def sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -1601,48 +1595,45 @@ def 
sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs): ((2, 1), {}), ) - def generator(): - for input_shape, kwargs in cases: - # args: running mean, running var, weight and bias should necessarily be of shape: (channels,) - channels = input_shape[1] if len(input_shape) > 1 else 0 - weight = make_arg(channels) if channels > 0 else None - bias = make_arg(channels) if channels > 0 else None - running_mean = make_arg_without_requires_grad(channels, low=0) - running_var = make_arg_without_requires_grad(channels, low=0) + for input_shape, kwargs in cases: + # args: running mean, running var, weight and bias should necessarily be of shape: (channels,) + channels = input_shape[1] if len(input_shape) > 1 else 0 + weight = make_arg(channels) if channels > 0 else None + bias = make_arg(channels) if channels > 0 else None + running_mean = make_arg_without_requires_grad(channels, low=0) + running_var = make_arg_without_requires_grad(channels, low=0) - yield SampleInput( - make_arg(input_shape), - args=( - running_mean, - running_var, - weight, - bias - ), - kwargs=kwargs - ) + yield SampleInput( + make_arg(input_shape), + args=( + running_mean, + running_var, + weight, + bias + ), + kwargs=kwargs + ) - # Checking for permutations of weights and biases as `None` - weights = [channels, None, None] - biases = [None, channels, None] - is_training = [True, False, False] + # Checking for permutations of weights and biases as `None` + weights = [channels, None, None] + biases = [None, channels, None] + is_training = [True, False, False] - for weight, bias, training in zip(weights, biases, is_training): - yield SampleInput( - make_arg(input_shape), - args=( - running_mean, - running_var, - make_arg(channels), - make_arg(channels) - ), - kwargs={'training': training} - ) + for weight, bias, training in zip(weights, biases, is_training): + yield SampleInput( + make_arg(input_shape), + args=( + running_mean, + running_var, + make_arg(channels), + make_arg(channels) + 
), + kwargs={'training': training} + ) - # Test case for no optional kwargs - # running_mean and running_var are required in evaluation mode (training: False) but not in training mode - yield SampleInput(make_arg((1, 2, 3)), args=(None, None), kwargs={'training': True}) - - return list(generator()) + # Test case for no optional kwargs + # running_mean and running_var are required in evaluation mode (training: False) but not in training mode + yield SampleInput(make_arg((1, 2, 3)), args=(None, None), kwargs={'training': True}) def sample_inputs_nn_activation_relu(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -1654,11 +1645,8 @@ def sample_inputs_nn_activation_relu(op_info, device, dtype, requires_grad, **kw ((S, M, S)) ) - def generator(): - for shape in cases: - yield SampleInput(make_arg(shape)) - - return list(generator()) + for shape in cases: + yield SampleInput(make_arg(shape)) def sample_inputs_nn_functional_prelu(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -1670,17 +1658,14 @@ def sample_inputs_nn_functional_prelu(op_info, device, dtype, requires_grad, **k ((S, M, S)) ) - def generator(): - for shape in cases: - for weight in [-1., 0., 0.8, 1.]: - weight_tensor = torch.tensor(weight, device=device, dtype=dtype, requires_grad=requires_grad) - yield SampleInput(make_arg(shape), kwargs=dict(weight=weight_tensor)) + for shape in cases: + for weight in [-1., 0., 0.8, 1.]: + weight_tensor = torch.tensor(weight, device=device, dtype=dtype, requires_grad=requires_grad) + yield SampleInput(make_arg(shape), kwargs=dict(weight=weight_tensor)) - if len(shape) >= 2: - channel_size = shape[1] - yield SampleInput(make_arg(shape), kwargs=dict(weight=make_arg((channel_size,)))) - - return list(generator()) + if len(shape) >= 2: + channel_size = shape[1] + yield 
SampleInput(make_arg(shape), kwargs=dict(weight=make_arg((channel_size,)))) def sample_inputs_norm(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -1730,14 +1715,11 @@ def sample_inputs_norm(op_info, device, dtype, requires_grad, **kwargs): new_args[1] *= -1 cases_negdim.append((shape, tuple(new_args), name.replace("_dim", "_neg_dim"))) - def generator(): - for shape, args, name in itertools.chain(cases, cases_negdim): - yield SampleInput(make_arg(shape), args=args, name=name) + for shape, args, name in itertools.chain(cases, cases_negdim): + yield SampleInput(make_arg(shape), args=args, name=name) - for shape, args, name in cases_nonzero_input: - yield SampleInput(make_arg(shape, exclude_zero=True), args=args, name=name) - - return list(generator()) + for shape, args, name in cases_nonzero_input: + yield SampleInput(make_arg(shape, exclude_zero=True), args=args, name=name) def sample_inputs_norm_fro(op_info, device, dtype, requires_grad, **kwargs): @@ -1749,11 +1731,8 @@ def sample_inputs_norm_fro(op_info, device, dtype, requires_grad, **kwargs): ((S, S), ('fro', [0, 1],), 'fro'), ) - def generator(): - for shape, args, name in cases: - yield SampleInput(make_arg(shape), args=args, name=name) - - return list(generator()) + for shape, args, name in cases: + yield SampleInput(make_arg(shape), args=args, name=name) def sample_inputs_norm_nuc(op_info, device, dtype, requires_grad, **kwargs): @@ -1764,11 +1743,8 @@ def sample_inputs_norm_nuc(op_info, device, dtype, requires_grad, **kwargs): ((S, S, S), ('nuc', [1, 2]), 'nuc_batched'), ) - def generator(): - for shape, args, name in cases: - yield SampleInput(make_arg(shape), args=args, name=name) - - return list(generator()) + for shape, args, name in cases: + yield SampleInput(make_arg(shape), args=args, name=name) def sample_inputs_norm_inf(op_info, device, dtype, requires_grad, **kwargs): @@ -1781,11 +1757,8 @@ def 
sample_inputs_norm_inf(op_info, device, dtype, requires_grad, **kwargs): ((S, S), (inf, -1,), 'inf_2_neg_dim'), ) - def generator(): - for shape, args, name in cases: - yield SampleInput(make_arg(shape), args=args, name=name) - - return list(generator()) + for shape, args, name in cases: + yield SampleInput(make_arg(shape), args=args, name=name) def sample_inputs_linalg_vector_norm(op_info, device, dtype, requires_grad, **kwargs): @@ -2169,13 +2142,10 @@ def sample_inputs_addmv(op_info, device, dtype, requires_grad, **kwargs): cases = test_cases + test_cases_with_broadcast - def generator(): - # addmv performs: beta * M + alpha * (mat @ vec) - for M, mat, vec, beta, alpha, broadcasts_input in cases: - yield SampleInput(make_arg(M), args=(make_arg(mat), make_arg(vec)), - kwargs=dict(beta=beta, alpha=alpha), broadcasts_input=broadcasts_input) - - return list(generator()) + # addmv performs: beta * M + alpha * (mat @ vec) + for size, mat, vec, beta, alpha, broadcasts_input in cases: + yield SampleInput(make_arg(size), args=(make_arg(mat), make_arg(vec)), + kwargs=dict(beta=beta, alpha=alpha), broadcasts_input=broadcasts_input) def sample_inputs_addbmm(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -2189,16 +2159,13 @@ def sample_inputs_addbmm(op_info, device, dtype, requires_grad, **kwargs): ((), (S, S, S), (S, S, M), 0.6, 0.2, True), ] - def generator(): - for input_shape, batch1_shape, batch2_shape, beta, alpha, is_broadcasting in test_cases: - if dtype.is_complex: - beta_complex, alpha_complex = beta * (1 + 2j), alpha * (2 + 3j) - yield SampleInput(make_arg(input_shape), args=(make_arg(batch1_shape), make_arg(batch2_shape)), - kwargs=dict(beta=beta_complex, alpha=alpha_complex), broadcasts_input=is_broadcasting) + for input_shape, batch1_shape, batch2_shape, beta, alpha, is_broadcasting in test_cases: + if dtype.is_complex: + beta_complex, alpha_complex = beta * (1 + 2j), 
alpha * (2 + 3j) yield SampleInput(make_arg(input_shape), args=(make_arg(batch1_shape), make_arg(batch2_shape)), - kwargs=dict(beta=beta, alpha=alpha), broadcasts_input=is_broadcasting) - - return list(generator()) + kwargs=dict(beta=beta_complex, alpha=alpha_complex), broadcasts_input=is_broadcasting) + yield SampleInput(make_arg(input_shape), args=(make_arg(batch1_shape), make_arg(batch2_shape)), + kwargs=dict(beta=beta, alpha=alpha), broadcasts_input=is_broadcasting) def sample_inputs_addcmul_addcdiv(op_info, device, dtype, requires_grad, **kwargs): test_cases = [(((S, S), (S, S), (S, S)), False), @@ -2329,45 +2296,39 @@ def sample_inputs_xlogy(self, device, dtype, requires_grad, **kwargs): def sample_inputs_xlog1py(self, device, dtype, requires_grad): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) - def generator(): - # same shape - yield SampleInput(make_arg((S, S)), args=(make_arg((S, S), low=-1),)) - # rhs broadcast - yield SampleInput(make_arg((S, S)), args=(make_arg((S,), low=-1),)) - # all zero `x` - with torch.no_grad(): - x = make_arg((S, S)) - x.fill_(0) - yield SampleInput(x, args=(make_arg((S, S), low=-1),)) - - # randomly zero-masked `x` + # same shape + yield SampleInput(make_arg((S, S)), args=(make_arg((S, S), low=-1),)) + # rhs broadcast + yield SampleInput(make_arg((S, S)), args=(make_arg((S,), low=-1),)) + # all zero `x` + with torch.no_grad(): x = make_arg((S, S)) - y = make_arg((S, S), low=-1) - with torch.no_grad(): - x[torch.rand(x.shape) > 0.5] = 0 - yield SampleInput(x, args=(y,)) + x.fill_(0) + yield SampleInput(x, args=(make_arg((S, S), low=-1),)) - # Scalar x - # `input` has to be a tensor - # yield SampleInput(0, args=(make_arg((S, S), low=-1),)) - # yield SampleInput(2.1, args=(make_arg((S, S), low=-1),)) + # randomly zero-masked `x` + x = make_arg((S, S)) + y = make_arg((S, S), low=-1) + with torch.no_grad(): + x[torch.rand(x.shape) > 0.5] = 0 + yield SampleInput(x, args=(y,)) - # Scalar y - 
yield SampleInput(make_arg((S, S)), args=(-0.5,)) - yield SampleInput(make_arg((S, S)), args=(1.2,)) + # Scalar x + # `input` has to be a tensor + # yield SampleInput(0, args=(make_arg((S, S), low=-1),)) + # yield SampleInput(2.1, args=(make_arg((S, S), low=-1),)) - return list(generator()) + # Scalar y + yield SampleInput(make_arg((S, S)), args=(-0.5,)) + yield SampleInput(make_arg((S, S)), args=(1.2,)) def sample_inputs_zero_(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) cases = ((), (S, S, S), (S,)) - def generator(): - for shape in cases: - yield(SampleInput(make_arg(shape))) - - return list(generator()) + for shape in cases: + yield(SampleInput(make_arg(shape))) def sample_inputs_logsumexp(self, device, dtype, requires_grad): @@ -2597,11 +2558,8 @@ def sample_inputs_renorm(self, device, dtype, requires_grad, **kwargs): ((S, S, S), (float('inf'), 2, 0.5)), ) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_transpose_swapdims(self, device, dtype, requires_grad, **kwargs): @@ -2616,23 +2574,20 @@ def sample_inputs_transpose_swapdims(self, device, dtype, requires_grad, **kwarg ((M, M), (0, 1)), ((S, S, S), (2, 0)), ) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_adjoint(self, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) shapes = ((1, 2, 3), (), (M, M), (S, S, S), (S, M, S), (M, S, M, S)) - return list(SampleInput(make_arg(shape)) for shape in shapes) + return (SampleInput(make_arg(shape)) for shape in shapes) def sample_inputs_T(self, device, dtype, 
requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) shapes = ((), (M, M)) - return list(SampleInput(make_arg(shape)) for shape in shapes) + return (SampleInput(make_arg(shape)) for shape in shapes) def sample_inputs_linalg_invertible(op_info, device, dtype, requires_grad=False, **kwargs): @@ -2672,16 +2627,13 @@ def sample_inputs_linalg_pinv_singular(op_info, device, dtype, requires_grad=Fal # of the pinv's backward method, albeit it is slow. size = [0, 3, 50] - def generate_samples(): - for batch, m, n in product(batches, size, size): - for k in range(min(3, min(m, n))): - # Note that by making the columns of `a` and `b` orthonormal we make sure that - # the product matrix `a @ b.t()` has condition number 1 when restricted to its image - a = torch.rand(*batch, m, k, device=device, dtype=dtype).qr().Q.requires_grad_(requires_grad) - b = torch.rand(*batch, n, k, device=device, dtype=dtype).qr().Q.requires_grad_(requires_grad) - yield SampleInput(a, args=(b,)) - - return list(generate_samples()) + for batch, m, n in product(batches, size, size): + for k in range(min(3, min(m, n))): + # Note that by making the columns of `a` and `b` orthonormal we make sure that + # the product matrix `a @ b.t()` has condition number 1 when restricted to its image + a = torch.rand(*batch, m, k, device=device, dtype=dtype).qr().Q.requires_grad_(requires_grad) + b = torch.rand(*batch, n, k, device=device, dtype=dtype).qr().Q.requires_grad_(requires_grad) + yield SampleInput(a, args=(b,)) def sample_inputs_singular_matrix_factors(op_info, device, dtype, requires_grad=False, **kwargs): @@ -2791,11 +2743,8 @@ def sample_inputs_linalg_cond(op_info, device, dtype, requires_grad=False, **kwa (2, S, S), (2, 1, S, S), ) - def generator(): - for shape in shapes: - yield SampleInput(make_arg(shape)) - - return list(generator()) + for shape in shapes: + yield SampleInput(make_arg(shape)) def np_sinc_with_fp16_as_fp32(x): # Wraps 
numpy's sinc function so that fp16 values are promoted to fp32 @@ -2905,11 +2854,8 @@ def sample_inputs_fill_(op_info, device, dtype, requires_grad, **kwargs): # check https://github.com/pytorch/pytorch/issues/59137 ((S, S, S), (make_arg((), requires_grad=False),))) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_comparison_ops(self, device, dtype, requires_grad, **kwargs): @@ -2980,11 +2926,8 @@ def sample_inputs_cat_concat(op_info, device, dtype, requires_grad, **kwargs): ((1,), (1,), {}) # dim not passed, fallback to default ) - def generator(): - for input_shape1, input_shape2, kwargs in cases: - yield SampleInput([make_arg(input_shape1), make_arg(input_shape2)], kwargs=kwargs) - - return list(generator()) + for input_shape1, input_shape2, kwargs in cases: + yield SampleInput([make_arg(input_shape1), make_arg(input_shape2)], kwargs=kwargs) def sample_inputs_hstack_dstack_vstack(op_info, device, dtype, requires_grad, **kwargs): tensors = [ @@ -3438,13 +3381,10 @@ def sample_inputs_unique(op_info, device, dtype, requires_grad, **kwargs): return sample_inputs def sample_inputs_unique_consecutive(*args, **kwargs): - def generator(): - for sample_input in sample_inputs_unique(*args, **kwargs): - if not sample_input.kwargs["sorted"]: - sample_input.kwargs.pop("sorted") - yield sample_input - - return list(generator()) + for sample_input in sample_inputs_unique(*args, **kwargs): + if not sample_input.kwargs["sorted"]: + sample_input.kwargs.pop("sorted") + yield sample_input def sample_inputs_index_fill(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3509,11 +3449,8 @@ def sample_inputs_adaptive_avg_pool1d(op_info, device, dtype, requires_grad, **k ((3, 8, 8), 1) ) - def generator(): - for input_shape, 
output_size in cases: - yield SampleInput(make_arg(input_shape), args=(output_size,)) - - return list(generator()) + for input_shape, output_size in cases: + yield SampleInput(make_arg(input_shape), args=(output_size,)) def sample_inputs_adaptive_avg_pool2d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3527,11 +3464,8 @@ def sample_inputs_adaptive_avg_pool2d(op_info, device, dtype, requires_grad, **k ((1, 8, 4, 3), (5)), ) - def generator(): - for input_shape, output_size in cases: - yield SampleInput(make_arg(input_shape), args=(output_size,)) - - return list(generator()) + for input_shape, output_size in cases: + yield SampleInput(make_arg(input_shape), args=(output_size,)) def sample_inputs_adaptive_avg_pool3d(op_info, device, dtype, requires_grad, **kwargs): @@ -3547,11 +3481,8 @@ def sample_inputs_adaptive_avg_pool3d(op_info, device, dtype, requires_grad, **k ((3, 3, 8, 8, 6), (None, 3, 2)), ) - def generator(): - for input_shape, output_size in cases: - yield SampleInput(make_arg(input_shape), args=(output_size,)) - - return list(generator()) + for input_shape, output_size in cases: + yield SampleInput(make_arg(input_shape), args=(output_size,)) def sample_inputs_adaptive_max_pool1d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3564,11 +3495,8 @@ def sample_inputs_adaptive_max_pool1d(op_info, device, dtype, requires_grad, **k ((3, 4, 4), 1) ) - def generator(): - for shapes, return_idx in product(cases, (True, False)): - yield SampleInput(make_arg(shapes[0]), args=(shapes[1], return_idx)) - - return list(generator()) + for shapes, return_idx in product(cases, (True, False)): + yield SampleInput(make_arg(shapes[0]), args=(shapes[1], return_idx)) def sample_inputs_adaptive_max_pool2d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, 
device=device, dtype=dtype, requires_grad=requires_grad) @@ -3585,11 +3513,8 @@ def sample_inputs_adaptive_max_pool2d(op_info, device, dtype, requires_grad, **k ((1, 4, 4, 3), (3)), ) - def generator(): - for shapes, return_idx in product(cases, (True, False)): - yield SampleInput(make_arg(shapes[0]), args=(shapes[1], return_idx)) - - return list(generator()) + for shapes, return_idx in product(cases, (True, False)): + yield SampleInput(make_arg(shapes[0]), args=(shapes[1], return_idx)) def sample_inputs_adaptive_max_pool3d(op_info, device, dtype, requires_grad, **kwargs): @@ -3606,11 +3531,8 @@ def sample_inputs_adaptive_max_pool3d(op_info, device, dtype, requires_grad, **k ((3, 3, 4, 4, 6), (None, 3, 2)), ) - def generator(): - for shapes, return_idx in product(cases, (True, False)): - yield SampleInput(make_arg(shapes[0]), args=(shapes[1], return_idx)) - - return list(generator()) + for shapes, return_idx in product(cases, (True, False)): + yield SampleInput(make_arg(shapes[0]), args=(shapes[1], return_idx)) class _TestParamsMaxPoolBase(object): @@ -3689,13 +3611,10 @@ def sample_inputs_max_pool(op_info, device, dtype, requires_grad, **kwargs): 'nn.functional.max_pool3d': _TestParamsMaxPool3d, } - def generator(): - params_generator = params_generator_type_dict[op_info.name]() - for (shape, memory_format), kwargs in params_generator.gen_input_params(): - arg = make_arg(shape).to(memory_format=memory_format).requires_grad_(requires_grad) - yield SampleInput(arg, kwargs=kwargs) - - return list(generator()) + params_generator = params_generator_type_dict[op_info.name]() + for (shape, memory_format), kwargs in params_generator.gen_input_params(): + arg = make_arg(shape).to(memory_format=memory_format).requires_grad_(requires_grad) + yield SampleInput(arg, kwargs=kwargs) def sample_inputs_normalize(self, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, low=-1, high=1, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3710,11 +3629,8 
@@ def sample_inputs_normalize(self, device, dtype, requires_grad, **kwargs): ((2, 3, 4, 5), {}), ((2, 3, 4, 5), {'eps': 1e-4})) - def generator(): - for input_shape, kwargs in cases: - yield SampleInput(make_arg(input_shape), kwargs=kwargs) - - return list(generator()) + for input_shape, kwargs in cases: + yield SampleInput(make_arg(input_shape), kwargs=kwargs) def sample_inputs_conv_transpose1d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3761,14 +3677,11 @@ def sample_inputs_conv_transpose2d(op_info, device, dtype, requires_grad, **kwar {}) ) - def generator(): - for input_shape, weight, bias, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=( - make_arg(weight), - make_arg(bias) if bias is not None else bias - ), kwargs=kwargs) - - return list(generator()) + for input_shape, weight, bias, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=( + make_arg(weight), + make_arg(bias) if bias is not None else bias + ), kwargs=kwargs) def sample_inputs_conv_transpose3d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3816,14 +3729,11 @@ def sample_inputs_conv1d(op_info, device, dtype, requires_grad, **kwargs): # Should replace test_conv_modules_raise_error_on_incorrect_input_size and test_conv_shapecheck # in test/test_nn.py - def generator(): - for input_shape, weight, bias, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=( - make_arg(weight), - make_arg(bias) if bias is not None else bias - ), kwargs=kwargs) - - return list(generator()) + for input_shape, weight, bias, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=( + make_arg(weight), + make_arg(bias) if bias is not None else bias + ), kwargs=kwargs) def sample_inputs_conv2d(op_info, device, dtype, requires_grad, jit_fail_sample=False, **kwargs): @@ -3858,14 
+3768,11 @@ def sample_inputs_conv2d(op_info, device, dtype, requires_grad, jit_fail_sample= ((1, 4, 5, 5), (3, 4, 3, 3), None, {}), ) - def generator(): - for input_shape, weight, bias, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=( - make_arg(weight), - make_arg(bias) if bias is not None else bias - ), kwargs=kwargs) - - return list(generator()) + for input_shape, weight, bias, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=( + make_arg(weight), + make_arg(bias) if bias is not None else bias + ), kwargs=kwargs) def sample_inputs_group_norm(opinfo, device, dtype, requires_grad, **kwargs): @@ -3879,21 +3786,18 @@ def sample_inputs_group_norm(opinfo, device, dtype, requires_grad, **kwargs): ((0, 2), 1, None), ) - def generator(): - for input_shape, num_groups, eps in cases: - # Shape of weight and bias should be the same as num_channels - weight = make_arg(input_shape[1]) - bias = make_arg(input_shape[1]) - kwargs = {'weight': weight, 'bias': bias} if eps is None else {'weight': weight, 'bias': bias, 'eps': eps} - yield SampleInput( - make_arg(input_shape), - args=(num_groups,), - kwargs=kwargs - ) - # Without any optional args - yield SampleInput(make_arg((1, 2)), args=(1,)) - - return list(generator()) + for input_shape, num_groups, eps in cases: + # Shape of weight and bias should be the same as num_channels + weight = make_arg(input_shape[1]) + bias = make_arg(input_shape[1]) + kwargs = {'weight': weight, 'bias': bias} if eps is None else {'weight': weight, 'bias': bias, 'eps': eps} + yield SampleInput( + make_arg(input_shape), + args=(num_groups,), + kwargs=kwargs + ) + # Without any optional args + yield SampleInput(make_arg((1, 2)), args=(1,)) def sample_inputs_instance_norm(opinfo, device, dtype, requires_grad, **kwargs): @@ -3910,51 +3814,48 @@ def sample_inputs_instance_norm(opinfo, device, dtype, requires_grad, **kwargs): ((3, 2, 3, 4), {'momentum': -1.0, 'eps': 0.5}), ) - def generator(): - for input_shape, kwargs in 
cases: - # args: running mean, running var, weight and bias should necessarily be of shape: (channels,) - channels = input_shape[1] - weight = make_arg(channels) - bias = make_arg(channels) - running_mean = make_arg_without_requires_grad(channels, low=0) - running_var = make_arg_without_requires_grad(channels, low=0) - new_kwargs = { + for input_shape, kwargs in cases: + # args: running mean, running var, weight and bias should necessarily be of shape: (channels,) + channels = input_shape[1] + weight = make_arg(channels) + bias = make_arg(channels) + running_mean = make_arg_without_requires_grad(channels, low=0) + running_var = make_arg_without_requires_grad(channels, low=0) + new_kwargs = { + 'running_mean': running_mean, + 'running_var': running_var, + 'weight': weight, + 'bias': bias, + **kwargs + } + + yield SampleInput( + make_arg(input_shape), + args=(), + kwargs=new_kwargs + ) + + # Checking for permutations of weights and biases as `None` + # instance_norm assumes that if there's a bias, there's a weight + weights = [channels, None] + biases = [None, None] + + for weight_channels, bias_channels in zip(weights, biases): + running_mean = make_arg_without_requires_grad(channels, low=0) + running_var = make_arg_without_requires_grad(channels, low=0) + yield SampleInput( + make_arg(input_shape), + args=(), + kwargs={ 'running_mean': running_mean, 'running_var': running_var, - 'weight': weight, - 'bias': bias, - **kwargs + 'weight': make_arg(weight_channels) if weight_channels is not None else None, + 'bias': make_arg(bias_channels) if bias_channels is not None else None } + ) - yield SampleInput( - make_arg(input_shape), - args=(), - kwargs=new_kwargs - ) - - # Checking for permutations of weights and biases as `None` - # instance_norm assumes that if there's a bias, there's a weight - weights = [channels, None] - biases = [None, None] - - for weight_channels, bias_channels in zip(weights, biases): - running_mean = make_arg_without_requires_grad(channels, low=0) 
- running_var = make_arg_without_requires_grad(channels, low=0) - yield SampleInput( - make_arg(input_shape), - args=(), - kwargs={ - 'running_mean': running_mean, - 'running_var': running_var, - 'weight': make_arg(weight_channels) if weight_channels is not None else None, - 'bias': make_arg(bias_channels) if bias_channels is not None else None - } - ) - - # Test case for no optional kwargs - yield SampleInput(make_arg((1, 2, 3)), kwargs={}) - - return list(generator()) + # Test case for no optional kwargs + yield SampleInput(make_arg((1, 2, 3)), kwargs={}) def sample_inputs_layer_norm(opinfo, device, dtype, requires_grad, **kwargs): @@ -3969,29 +3870,26 @@ def sample_inputs_layer_norm(opinfo, device, dtype, requires_grad, **kwargs): ((0, 1), (1,), {}), ) - def generator(): - for input_shape, normalized_shape, kwargs in cases: - # Shape of weight and bias should be the same as normalized_shape - weight = make_arg(normalized_shape) - bias = make_arg(normalized_shape) - yield SampleInput( - make_arg(input_shape), - args=(normalized_shape, weight, bias), - kwargs=kwargs - ) - # Without any optional args - yield SampleInput(make_arg((1, 2)), args=((2,),)) + for input_shape, normalized_shape, kwargs in cases: + # Shape of weight and bias should be the same as normalized_shape + weight = make_arg(normalized_shape) + bias = make_arg(normalized_shape) + yield SampleInput( + make_arg(input_shape), + args=(normalized_shape, weight, bias), + kwargs=kwargs + ) + # Without any optional args + yield SampleInput(make_arg((1, 2)), args=((2,),)) - # TODO: @krshrimali, once to_numpy method in SampleInput class is modified to take None inputs, - # enable these inputs; see https://github.com/pytorch/pytorch/pull/63276#discussion_r691950400 + # TODO: @krshrimali, once to_numpy method in SampleInput class is modified to take None inputs, + # enable these inputs; see https://github.com/pytorch/pytorch/pull/63276#discussion_r691950400 - # With weight and a `None` bias - # yield 
SampleInput(make_arg((1, 2)), args=((2,), make_arg((2,)), None)) + # With weight and a `None` bias + # yield SampleInput(make_arg((1, 2)), args=((2,), make_arg((2,)), None)) - # With `None` weight and bias (tests failing for this, see the link above) - # yield SampleInput(make_arg((1, 2)), args=((2,), None, make_arg((2,)))) - - return list(generator()) + # With `None` weight and bias (tests failing for this, see the link above) + # yield SampleInput(make_arg((1, 2)), args=((2,), None, make_arg((2,)))) def sample_inputs_local_response_norm(opinfo, device, dtype, requires_grad, **kwargs): @@ -4012,11 +3910,8 @@ def sample_inputs_local_response_norm(opinfo, device, dtype, requires_grad, **kw ((0, 1, 2), 1, {'alpha': 3e-05, 'beta': 0.5, 'k': 1.25}), ) - def generator(): - for input_shape, size, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=(size,), kwargs=kwargs) - - return list(generator()) + for input_shape, size, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=(size,), kwargs=kwargs) def sample_inputs_hardswish(self, device, dtype, requires_grad): @@ -4351,14 +4246,11 @@ def sample_inputs_avgpool2d(op_info, device, dtype, requires_grad, **kwargs): ((1, 1, 4, 4), (2, 2), (), (0, ), False, True, -2), ((1, 2, 6, 6), (4, 4), (2, 2), (2, ), True, True, None)) - def generator(): - for input_shape, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override in cases: - yield SampleInput(make_arg(input_shape), - args=(kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)) - # Case with just input_shape and kernel_size - yield SampleInput(make_arg((1, 3, 9, 9)), args=((3, 3))) - - return list(generator()) + for input_shape, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override in cases: + yield SampleInput(make_arg(input_shape), + args=(kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)) + # Case with just input_shape and kernel_size + yield 
SampleInput(make_arg((1, 3, 9, 9)), args=((3, 3))) def sample_inputs_avgpool1d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -4376,11 +4268,8 @@ def sample_inputs_avgpool1d(op_info, device, dtype, requires_grad, **kwargs): ((1, 2, 9), (7,), dict(stride=(3,), ceil_mode=True)), ] - def generator(): - for input_shape, kernel_size, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=(kernel_size,), kwargs=kwargs) - - return list(generator()) + for input_shape, kernel_size, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=(kernel_size,), kwargs=kwargs) def sample_inputs_avgpool3d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -4403,11 +4292,8 @@ def sample_inputs_avgpool3d(op_info, device, dtype, requires_grad, **kwargs): count_include_pad=True, divisor_override=None)), ] - def generator(): - for input_shape, kernel_size, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=(kernel_size,), kwargs=kwargs) - - return list(generator()) + for input_shape, kernel_size, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=(kernel_size,), kwargs=kwargs) def sample_inputs_topk(op_info, device, dtype, requires_grad, **kwargs): def get_tensor_input(size): @@ -4447,13 +4333,10 @@ def sample_inputs_igamma_igammac(op_info, device, dtype, requires_grad, **kwargs ((S, ), (S, S), True), ((), (), False)) - def generator(): - for shape, other_shape, broadcasts_input in cases: - yield SampleInput(make_arg(shape, requires_grad=requires_grad), - args=(make_arg(other_shape, requires_grad=False),), - broadcasts_input=broadcasts_input) - - return list(generator()) + for shape, other_shape, broadcasts_input in cases: + yield SampleInput(make_arg(shape, requires_grad=requires_grad), + args=(make_arg(other_shape, requires_grad=False),), + 
broadcasts_input=broadcasts_input) def sample_inputs_dist(op_info, device, dtype, requires_grad): @@ -4461,11 +4344,8 @@ def sample_inputs_dist(op_info, device, dtype, requires_grad): sizes = ((S, S, S), (S,), (S, 1, S), (), (S, S)) ps = (2, 4) - def generate_samples(): - for size_x, size_y, p in product(sizes, sizes, ps): - yield SampleInput(make_arg(size_x), args=(make_arg(size_y), p)) - - return list(generate_samples()) + for size_x, size_y, p in product(sizes, sizes, ps): + yield SampleInput(make_arg(size_x), args=(make_arg(size_y), p)) # Missing to test the nondeterminism of the operation # https://github.com/pytorch/pytorch/issues/53352 @@ -4516,39 +4396,36 @@ def sample_inputs_put(op_info, device, dtype, requires_grad): S = 3 - def gen_inputs(): - # Generic inputs - idx = torch.randperm(S * S, device=device, dtype=torch.int64)[:S] - idx_list = [idx, -idx - 1] - for idx, acc in product(idx_list, (True, False)): - yield SampleInput(input=make_arg((S, S)), - args=(idx.detach().clone(), - make_arg((S,)), - acc)) + # Generic inputs + idx = torch.randperm(S * S, device=device, dtype=torch.int64)[:S] + idx_list = [idx, -idx - 1] + for idx, acc in product(idx_list, (True, False)): + yield SampleInput(input=make_arg((S, S)), + args=(idx.detach().clone(), + make_arg((S,)), + acc)) - # Scalar cases - scalar_sizes = [(), (1,)] - tgt_gen = (make_arg(size) for size in scalar_sizes) - idx_gen = (make_idx(size, high=1) for size in scalar_sizes) - src_gen = (make_arg(size) for size in scalar_sizes) - for tgt, idx, src, acc in product(tgt_gen, idx_gen, src_gen, (True, False)): - yield SampleInput(input=tgt.detach().clone().requires_grad_(requires_grad), - args=(idx.detach().clone(), - src.detach().clone().requires_grad_(requires_grad), - acc)) + # Scalar cases + scalar_sizes = [(), (1,)] + tgt_gen = (make_arg(size) for size in scalar_sizes) + idx_gen = (make_idx(size, high=1) for size in scalar_sizes) + src_gen = (make_arg(size) for size in scalar_sizes) + for tgt, idx, src, 
acc in product(tgt_gen, idx_gen, src_gen, (True, False)): + yield SampleInput(input=tgt.detach().clone().requires_grad_(requires_grad), + args=(idx.detach().clone(), + src.detach().clone().requires_grad_(requires_grad), + acc)) - # Empty cases - tgt_sizes = [(0,), (), (1,), (3, 2)] - tgt_gen = (make_arg(size) for size in tgt_sizes) - idx = make_idx((0,), high=1) - src = make_arg((0,)) - for tgt, acc in product(tgt, (True, False)): - yield SampleInput(input=tgt.detach().clone().requires_grad_(requires_grad), - args=(idx.detach().clone(), - src.detach().clone().requires_grad_(requires_grad), - acc)) - - return list(gen_inputs()) + # Empty cases + tgt_sizes = [(0,), (), (1,), (3, 2)] + tgt_gen = (make_arg(size) for size in tgt_sizes) + idx = make_idx((0,), high=1) + src = make_arg((0,)) + for tgt, acc in product(tgt, (True, False)): + yield SampleInput(input=tgt.detach().clone().requires_grad_(requires_grad), + args=(idx.detach().clone(), + src.detach().clone().requires_grad_(requires_grad), + acc)) def sample_inputs_take(op_info, device, dtype, requires_grad): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) @@ -4556,29 +4433,26 @@ def sample_inputs_take(op_info, device, dtype, requires_grad): S = 3 - def gen_inputs(): - # Generic inputs: take S elements out of S * S - index = make_idx((S,), high=(S * S)) - for idx in (index, -index - 1): - yield SampleInput(input=make_arg((S, S)), args=(idx,)) + # Generic inputs: take S elements out of S * S + index = make_idx((S,), high=(S * S)) + for idx in (index, -index - 1): + yield SampleInput(input=make_arg((S, S)), args=(idx,)) - # Scalar cases - scalar_sizes = [(), (1,)] - src_gen = (make_arg(size) for size in scalar_sizes) - idx_gen = (make_idx(size, high=1) for size in scalar_sizes) - for src, idx in product(src_gen, idx_gen): - yield SampleInput(input=src.detach().clone().requires_grad_(requires_grad), - args=(idx.detach().clone(),)) + # Scalar cases + scalar_sizes = [(), (1,)] + 
src_gen = (make_arg(size) for size in scalar_sizes) + idx_gen = (make_idx(size, high=1) for size in scalar_sizes) + for src, idx in product(src_gen, idx_gen): + yield SampleInput(input=src.detach().clone().requires_grad_(requires_grad), + args=(idx.detach().clone(),)) - # Empty cases - src_sizes = [(0,), (), (1,), (3, 2)] - src_gen = (make_arg(size) for size in src_sizes) - idx = make_idx((0,), high=1) - for src in src_gen: - yield SampleInput(input=src.detach().clone().requires_grad_(requires_grad), - args=(idx.detach().clone(),)) - - return list(gen_inputs()) + # Empty cases + src_sizes = [(0,), (), (1,), (3, 2)] + src_gen = (make_arg(size) for size in src_sizes) + idx = make_idx((0,), high=1) + for src in src_gen: + yield SampleInput(input=src.detach().clone().requires_grad_(requires_grad), + args=(idx.detach().clone(),)) def sample_movedim_moveaxis(op_info, device, dtype, requires_grad): return ( @@ -4622,13 +4496,10 @@ def sample_inputs_narrow(op_info, device, dtype, requires_grad, **kwargs): ((S, S, S), (-1, 0, 0)), ) - def generator(): - for shape, args in shapes_and_args: - tensor = make_tensor(shape, device, dtype, low=None, high=None, - requires_grad=requires_grad) - yield SampleInput(tensor, args=args) - - return list(generator()) + for shape, args in shapes_and_args: + tensor = make_tensor(shape, device, dtype, low=None, high=None, + requires_grad=requires_grad) + yield SampleInput(tensor, args=args) def sample_trapezoid(op_info, device, dtype, requires_grad, **kwargs): y_shape_x_shape_and_kwargs = [ @@ -4711,17 +4582,14 @@ def sample_inputs_nn_unfold(op_info, device, dtype, requires_grad, **kwargs): paddings = (0, 1, (1, 1)) strides = (1, 2, (1, 2)) - def generator(): - cases = product(shapes, kernel_sizes, dilations, paddings, strides) - for shape, kernel_size, dilation, padding, stride in cases: - tensor = make_tensor(shape, device, dtype, requires_grad=requires_grad) - yield SampleInput(tensor, args=(kernel_size, dilation, padding, stride)) + cases 
= product(shapes, kernel_sizes, dilations, paddings, strides) + for shape, kernel_size, dilation, padding, stride in cases: + tensor = make_tensor(shape, device, dtype, requires_grad=requires_grad) + yield SampleInput(tensor, args=(kernel_size, dilation, padding, stride)) - # With default args - yield SampleInput(make_tensor((1, 1, 5, 5), device, dtype, requires_grad=requires_grad), - args=((3, 3),)) - - return list(generator()) + # With default args + yield SampleInput(make_tensor((1, 1, 5, 5), device, dtype, requires_grad=requires_grad), + args=((3, 3),)) def sample_inputs_squeeze(op_info, device, dtype, requires_grad, **kwargs): @@ -4735,14 +4603,11 @@ def sample_inputs_squeeze(op_info, device, dtype, requires_grad, **kwargs): ((), (0, )), ) - def generator(): - for shape, args in shapes_and_args: - tensor = make_tensor(shape, device, dtype, low=None, high=None, - requires_grad=requires_grad) + for shape, args in shapes_and_args: + tensor = make_tensor(shape, device, dtype, low=None, high=None, + requires_grad=requires_grad) - yield SampleInput(tensor, args=args) - - return list(generator()) + yield SampleInput(tensor, args=args) def sample_inputs_nn_pad(op_info, device, dtype, requires_grad, mode, **kwargs): @@ -4813,20 +4678,17 @@ def sample_inputs_nn_pad(op_info, device, dtype, requires_grad, mode, **kwargs): make_inp = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) - def generator(): - if mode == 'constant': - # Default args - yield SampleInput(make_inp((1, 3, 3)), args=((2, 2),)) + if mode == 'constant': + # Default args + yield SampleInput(make_inp((1, 3, 3)), args=((2, 2),)) - if mode in ['reflect', 'replicate', 'circular']: + if mode in ['reflect', 'replicate', 'circular']: + for shape, pad in cases: + yield SampleInput(make_inp(shape), args=(pad, mode)) + else: # mode == 'constant' + for pad_value in (1., 2.): for shape, pad in cases: - yield SampleInput(make_inp(shape), args=(pad, mode)) - else: # mode == 'constant' - for 
pad_value in (1., 2.): - for shape, pad in cases: - yield SampleInput(make_inp(shape), args=(pad, mode, pad_value)) - - return list(generator()) + yield SampleInput(make_inp(shape), args=(pad, mode, pad_value)) # TODO: reconcile with torch.linalg.det and torch.linalg.slogdet @@ -5378,39 +5240,36 @@ def sample_inputs_linalg_solve_triangular(op_info, device, dtype, requires_grad= ns = (3, 0) ks = (1, 3, 0) - def gen_inputs(): - for b, n, k, (left, upper, uni) in product(bs, ns, ks, product((True, False), repeat=3)): - with torch.no_grad(): - if b == 1: - A = make_arg((n, n)) if left else make_arg((k, k)) - B = make_arg((n, k)) - else: - A = make_arg((b, n, n)) if left else make_arg((b, k, k)) - B = make_arg((b, n, k)) - if uni: - # Not really necessary, but writing it for consistency - A.diagonal(0, -2, -1).fill_(1.) - else: - d = A.diagonal(0, -2, -1) - d[d.abs() < 1e-6] = 1. - if upper: - A.triu_() - else: - A.tril_() - kwargs = {"upper": upper, "left": left, "unitriangular": uni} - if requires_grad: - for grad_A, grad_B in product((True, False), repeat=2): - # Either A or B needs to have a gradient - if not grad_A and not grad_B: - continue - yield SampleInput( - A.clone().requires_grad_(grad_A), - args=(B.clone().requires_grad_(grad_B),), - kwargs=kwargs) + for b, n, k, (left, upper, uni) in product(bs, ns, ks, product((True, False), repeat=3)): + with torch.no_grad(): + if b == 1: + A = make_arg((n, n)) if left else make_arg((k, k)) + B = make_arg((n, k)) else: - yield SampleInput(A, args=(B,), kwargs=kwargs) - - return list(gen_inputs()) + A = make_arg((b, n, n)) if left else make_arg((b, k, k)) + B = make_arg((b, n, k)) + if uni: + # Not really necessary, but writing it for consistency + A.diagonal(0, -2, -1).fill_(1.) + else: + d = A.diagonal(0, -2, -1) + d[d.abs() < 1e-6] = 1. 
+ if upper: + A.triu_() + else: + A.tril_() + kwargs = {"upper": upper, "left": left, "unitriangular": uni} + if requires_grad: + for grad_A, grad_B in product((True, False), repeat=2): + # Either A or B needs to have a gradient + if not grad_A and not grad_B: + continue + yield SampleInput( + A.clone().requires_grad_(grad_A), + args=(B.clone().requires_grad_(grad_B),), + kwargs=kwargs) + else: + yield SampleInput(A, args=(B,), kwargs=kwargs) def sample_inputs_legacy_solve(op_info, device, dtype, requires_grad=False, **kwargs): """ @@ -5446,14 +5305,11 @@ def sample_inputs_cholesky_solve(op_info, device, dtype, requires_grad=False, ** def sample_inputs_lu(op_info, device, dtype, requires_grad=False, **kwargs): # not needed once OpInfo tests support Iterables - def generate_samples(): - batch_shapes = ((), (3,), (3, 3)) - for batch_shape, get_infos, size_delta in product(batch_shapes, (True, False), (-2, -1, 0, +1, +2)): - shape = batch_shape + (S + size_delta, S) - input = make_tensor(shape, device, dtype, requires_grad=requires_grad, low=None, high=None) - yield SampleInput(input, args=(True, get_infos)) - - return list(generate_samples()) + batch_shapes = ((), (3,), (3, 3)) + for batch_shape, get_infos, size_delta in product(batch_shapes, (True, False), (-2, -1, 0, +1, +2)): + shape = batch_shape + (S + size_delta, S) + input = make_tensor(shape, device, dtype, requires_grad=requires_grad, low=None, high=None) + yield SampleInput(input, args=(True, get_infos)) def sample_inputs_lu_solve(op_info, device, dtype, requires_grad=False, **kwargs): @@ -5463,61 +5319,52 @@ def sample_inputs_lu_solve(op_info, device, dtype, requires_grad=False, **kwargs ns = [5, 3, 0] nrhs = [0, 1, 6] - def generate_samples(): - for n, batch, rhs in product(ns, batches, nrhs): - a = random_fullrank_matrix_distinct_singular_value(n, *batch, dtype=dtype, device=device) - requires_grad_options = (False,) if not requires_grad else (True, False) - # we try all possible combinations of 
requires_grad for each input - for lu_requires_grad, b_requires_grad in product(requires_grad_options, requires_grad_options): - # when requires_grad == True, at least one input has to have requires_grad enabled - if requires_grad and not lu_requires_grad and not b_requires_grad: - continue - # we run LU several times to guarantee that the produced SampleInputs are independent - # this is especially important when setting different requries_grad for same tensors! - lu, pivs = a.lu() - lu.requires_grad = lu_requires_grad - b = torch.randn(*batch, n, rhs, dtype=dtype, device=device) - b.requires_grad = b_requires_grad - yield SampleInput(b, args=(lu, pivs)) - - return list(generate_samples()) + for n, batch, rhs in product(ns, batches, nrhs): + a = random_fullrank_matrix_distinct_singular_value(n, *batch, dtype=dtype, device=device) + requires_grad_options = (False,) if not requires_grad else (True, False) + # we try all possible combinations of requires_grad for each input + for lu_requires_grad, b_requires_grad in product(requires_grad_options, requires_grad_options): + # when requires_grad == True, at least one input has to have requires_grad enabled + if requires_grad and not lu_requires_grad and not b_requires_grad: + continue + # we run LU several times to guarantee that the produced SampleInputs are independent + # this is especially important when setting different requries_grad for same tensors! 
+ lu, pivs = a.lu() + lu.requires_grad = lu_requires_grad + b = torch.randn(*batch, n, rhs, dtype=dtype, device=device) + b.requires_grad = b_requires_grad + yield SampleInput(b, args=(lu, pivs)) def sample_inputs_lu_unpack(op_info, device, dtype, requires_grad=False, **kwargs): # not needed once OpInfo tests support Iterables - def generate_samples(): - for lu_sample in sample_inputs_lu(op_info, device, dtype, requires_grad, **kwargs): - lu_data, pivots = lu_sample.input.lu() + for lu_sample in sample_inputs_lu(op_info, device, dtype, requires_grad, **kwargs): + lu_data, pivots = lu_sample.input.lu() + yield SampleInput(lu_data, args=(pivots,)) + + # generate rectangular inputs + lu_data_shape = lu_data.shape + batch_shape = lu_data_shape[:-2] + n = lu_data_shape[-2] + + for shape_inc in ((1, 0), (0, 1)): + lu_data, pivots = make_tensor( + batch_shape + (n + shape_inc[0], n + shape_inc[1]), + device, dtype, + requires_grad=False, + low=None, high=None + ).lu() + lu_data.requires_grad_(requires_grad) yield SampleInput(lu_data, args=(pivots,)) - # generate rectangular inputs - lu_data_shape = lu_data.shape - batch_shape = lu_data_shape[:-2] - n = lu_data_shape[-2] - - for shape_inc in ((1, 0), (0, 1)): - lu_data, pivots = make_tensor( - batch_shape + (n + shape_inc[0], n + shape_inc[1]), - device, dtype, - requires_grad=False, - low=None, high=None - ).lu() - lu_data.requires_grad_(requires_grad) - yield SampleInput(lu_data, args=(pivots,)) - - return list(generate_samples()) - def sample_inputs_roll(op_info, device, dtype, requires_grad=False, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) args = ((0, 0), (1, 2), (0, 2), (2, 0), (-1, 0), (10000, 1), (2,), ((1, 2, -1), (0, 1, 2))) - def generator(): - for arg in args: - yield SampleInput(make_arg((S, S, S)), args=arg) - - return list(generator()) + for arg in args: + yield SampleInput(make_arg((S, S, S)), args=arg) def sample_inputs_rot90(op_info, device, dtype, 
requires_grad=False, **kwargs): @@ -5528,11 +5375,8 @@ def sample_inputs_rot90(op_info, device, dtype, requires_grad=False, **kwargs): (1, (1, -1),), ()) - def generator(): - for arg in args: - yield SampleInput(make_arg((S, S, S)), args=arg) - - return list(generator()) + for arg in args: + yield SampleInput(make_arg((S, S, S)), args=arg) def sample_inputs_std_var(op_info, device, dtype, requires_grad, **kwargs): @@ -5673,11 +5517,8 @@ def sample_inputs_permute(op_info, device, dtype, requires_grad, **kwargs): ((), ()), ((1, 2, 3, 4), (2, 1, 3, 0))] - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=(args,)) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=(args,)) # Based on erstwhile method_tests tests & some tensor_op_tests for pow @@ -5866,11 +5707,8 @@ def sample_inputs_flip(op_info, device, dtype, requires_grad): sizes = ((S, M, S), (S, 0, M)) all_dims = ((0, 1, 2), (0,), (0, 2), (-1,), ()) - def gen_samples(): - for size, dims in product(sizes, all_dims): - yield SampleInput(make_arg(size), kwargs={"dims": dims}) - - return list(gen_samples()) + for size, dims in product(sizes, all_dims): + yield SampleInput(make_arg(size), kwargs={"dims": dims}) def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad, **kwargs): tensors = ( @@ -5908,16 +5746,13 @@ def sample_inputs_fmod_remainder(op_info, device, dtype, requires_grad, *, autod samples = cases + cases_with_tensor_scalar + cases_with_broadcasting # type: ignore[assignment] - def generator(): - for shape, arg_other, broadcasts_input in samples: - if isinstance(arg_other, tuple): - arg = make_arg(arg_other, requires_grad=False, exclude_zero=True) - else: - # shape_other is scalar or torch.tensor - arg = arg_other - yield(SampleInput(make_arg(shape), args=(arg,), broadcasts_input=broadcasts_input)) - - return list(generator()) + for shape, arg_other, broadcasts_input in samples: + if isinstance(arg_other, 
tuple): + arg = make_arg(arg_other, requires_grad=False, exclude_zero=True) + else: + # shape_other is scalar or torch.tensor + arg = arg_other + yield(SampleInput(make_arg(shape), args=(arg,), broadcasts_input=broadcasts_input)) # TODO: clamp shares tensors among its sample inputs --- we should prohibit this! def sample_inputs_clamp(op_info, device, dtype, requires_grad, **kwargs): @@ -5992,22 +5827,18 @@ def sample_inputs_cumprod(op_info, device, dtype, requires_grad, **kwargs): result.narrow(dim_select[0], 4, 1).narrow(dim_select[1], 3, 1).zero_() return result - # will not be needed once OpInfo tests suport Iterables - def sample_generator(): - for dim in range(3): - yield SampleInput(make_arg((S, S, S)), args=(dim,)) - # Scalar tensors and empty tensor - for size in [(), (1,), (0,)]: - yield SampleInput(make_arg(size), args=(0,)) + for dim in range(3): + yield SampleInput(make_arg((S, S, S)), args=(dim,)) + # Scalar tensors and empty tensor + for size in [(), (1,), (0,)]: + yield SampleInput(make_arg(size), args=(0,)) - yield SampleInput(prod_zeros([0, 1]), args=(1,)) - yield SampleInput(prod_zeros([0, 2]), args=(1,)) - yield SampleInput(prod_zeros([1, 2]), args=(1,)) + yield SampleInput(prod_zeros([0, 1]), args=(1,)) + yield SampleInput(prod_zeros([0, 2]), args=(1,)) + yield SampleInput(prod_zeros([1, 2]), args=(1,)) - # test dtype kwarg - yield SampleInput(prod_zeros([1, 2]), args=(1,), kwargs={'dtype': dtype}) - - return list(sample_generator()) + # test dtype kwarg + yield SampleInput(prod_zeros([1, 2]), args=(1,), kwargs={'dtype': dtype}) def sample_inputs_view_as_complex(op_info, device, dtype, requires_grad, **kwargs): return [SampleInput(make_tensor((S, 2), device, dtype, requires_grad=requires_grad),)] @@ -6042,17 +5873,14 @@ def sample_inputs_copysign(op_info, device, dtype, requires_grad, **kwargs): # broadcast all cases.append(((S, 1, S), (M, S), True)) - def generator(): - for input_shape, arg_val, broadcasts_input in cases: - if 
isinstance(arg_val, tuple): - arg = _make_tensor(*arg_val) - else: - # arg_val is scalar - arg = arg_val + for input_shape, arg_val, broadcasts_input in cases: + if isinstance(arg_val, tuple): + arg = _make_tensor(*arg_val) + else: + # arg_val is scalar + arg = arg_val - yield SampleInput(_make_tensor(*input_shape), args=(arg, ), broadcasts_input=broadcasts_input) - - return list(generator()) + yield SampleInput(_make_tensor(*input_shape), args=(arg, ), broadcasts_input=broadcasts_input) def sample_inputs_prod(op_info, device, dtype, requires_grad): def make_arg(shape): @@ -6065,33 +5893,29 @@ def sample_inputs_prod(op_info, device, dtype, requires_grad): result[0, 1] = 0 return result - # will not be needed once OpInfo tests support Iterables - def sample_generator(): - for sample in sample_inputs_cumprod(op_info, device, dtype, requires_grad): - # only Tensor, ignore other inputs - yield SampleInput(sample.input.detach().clone().requires_grad_(requires_grad)) - yield sample + for sample in sample_inputs_cumprod(op_info, device, dtype, requires_grad): + # only Tensor, ignore other inputs + yield SampleInput(sample.input.detach().clone().requires_grad_(requires_grad)) + yield sample - # Generates samples with keepdim = True - for sample in sample_inputs_cumprod(op_info, device, dtype, requires_grad): - sample.kwargs['keepdim'] = True - yield sample + # Generates samples with keepdim = True + for sample in sample_inputs_cumprod(op_info, device, dtype, requires_grad): + sample.kwargs['keepdim'] = True + yield sample - yield SampleInput(prod_single_zero()) - yield SampleInput(make_arg((3, 3, 3)), args=(1,)) - yield SampleInput(make_arg((3, 3, 3)), args=(1,), kwargs={'keepdim': True}) + yield SampleInput(prod_single_zero()) + yield SampleInput(make_arg((3, 3, 3)), args=(1,)) + yield SampleInput(make_arg((3, 3, 3)), args=(1,), kwargs={'keepdim': True}) - # test zero scalar tensor - zero = make_arg(()) - with torch.no_grad(): - zero.zero_() - yield 
SampleInput(zero.detach().clone().requires_grad_(requires_grad)) - yield SampleInput(zero.detach().clone().requires_grad_(requires_grad), args=(0,)) - yield SampleInput(zero.detach().clone().requires_grad_(requires_grad), - args=(0,), - kwargs={'keepdim': True}) - - return list(sample_generator()) + # test zero scalar tensor + zero = make_arg(()) + with torch.no_grad(): + zero.zero_() + yield SampleInput(zero.detach().clone().requires_grad_(requires_grad)) + yield SampleInput(zero.detach().clone().requires_grad_(requires_grad), args=(0,)) + yield SampleInput(zero.detach().clone().requires_grad_(requires_grad), + args=(0,), + kwargs={'keepdim': True}) def error_inputs_neg(op_info, device, **kwargs): si = SampleInput(torch.tensor((False, True), device=device)) @@ -6109,11 +5933,8 @@ def sample_inputs_nextafter(op_info, device, dtype, requires_grad, **kwargs): ((S, ), (S, S), True) ) - def generator(): - for shape, other_shape, broadcasts_input in cases: - yield SampleInput(make_arg(shape), args=(make_arg(other_shape),), broadcasts_input=broadcasts_input) - - return list(generator()) + for shape, other_shape, broadcasts_input in cases: + yield SampleInput(make_arg(shape), args=(make_arg(other_shape),), broadcasts_input=broadcasts_input) def sample_inputs_diag(op_info, device, dtype, requires_grad, **kwargs): @@ -6145,11 +5966,8 @@ def sample_inputs_diagonal_diag_embed(op_info, device, dtype, requires_grad, **k args_2d = ((), (2,), (-2,), (1,)) args_3d = ((1, 1, 2), (2, 0, 1), (-2, 0, 1)) - def generator(): - for shape, arg in chain(product(shapes_2d, args_2d), product(shapes_3d, args_3d)): - yield SampleInput(make_arg(shape), args=arg) - - return list(generator()) + for shape, arg in chain(product(shapes_2d, args_2d), product(shapes_3d, args_3d)): + yield SampleInput(make_arg(shape), args=arg) def sample_inputs_diagonal_scatter(op_info, device, dtype, requires_grad, **kwargs): @@ -6164,20 +5982,17 @@ def sample_inputs_diagonal_scatter(op_info, device, dtype, 
requires_grad, **kwar args_2d = ((), (2,), (-2,), (1,)) args_3d = ((1, 1, 2), (2, 0, 1), (-2, 0, 1)) - def generator(): - for input_shape, arg in chain(product(shapes_2d, args_2d), product(shapes_3d, args_3d)): - input_ = make_arg(input_shape) - # We can programatically figure out the right shape for src: - # It should be the same size as input.diagonal(other_args...) - if not isinstance(arg, tuple): - arg_tuple = (arg,) - else: - arg_tuple = arg - src_shape = input_.diagonal(*arg_tuple).size() - src = make_arg(src_shape) - yield SampleInput(input_, args=(src, *arg_tuple)) - - return list(generator()) + for input_shape, arg in chain(product(shapes_2d, args_2d), product(shapes_3d, args_3d)): + input_ = make_arg(input_shape) + # We can programatically figure out the right shape for src: + # It should be the same size as input.diagonal(other_args...) + if not isinstance(arg, tuple): + arg_tuple = (arg,) + else: + arg_tuple = arg + src_shape = input_.diagonal(*arg_tuple).size() + src = make_arg(src_shape) + yield SampleInput(input_, args=(src, *arg_tuple)) def sample_inputs_to_sparse(op_info, device, dtype, requires_grad, **kwargs): @@ -6490,11 +6305,8 @@ def sample_inputs_polygamma(op_info, device, dtype, requires_grad, **kwargs): tensor_shapes = ((S, S), ()) ns = (1, 2, 3, 4, 5) - def generator(): - for shape, n in product(tensor_shapes, ns): - yield SampleInput(make_arg(shape), args=(n,)) - - return list(generator()) + for shape, n in product(tensor_shapes, ns): + yield SampleInput(make_arg(shape), args=(n,)) def sample_inputs_mvlgamma(op_info, device, dtype, requires_grad, **kwargs): @@ -6509,15 +6321,12 @@ def sample_inputs_mvlgamma(op_info, device, dtype, requires_grad, **kwargs): def compute_min_val(p): return (p - 1.) 
/ 2 - def generator(): - for shape, n in product(tensor_shapes, ns): - min_val = compute_min_val(n) - if not dtype.is_floating_point: - # Round-up minimum value for integral dtypes - min_val += 1 - yield SampleInput(make_arg(shape, low=min_val), args=(n,)) - - return list(generator()) + for shape, n in product(tensor_shapes, ns): + min_val = compute_min_val(n) + if not dtype.is_floating_point: + # Round-up minimum value for integral dtypes + min_val += 1 + yield SampleInput(make_arg(shape, low=min_val), args=(n,)) # Since `mvlgamma` has multiple entries, @@ -6763,12 +6572,9 @@ def sample_inputs_atan2(op_info, device, dtype, requires_grad, **kwargs): ((S, 1, S), (S, S), True), ) - def generator(): - for x_shape, y_shape, broadcasts_input in cases: - yield SampleInput(make_arg(x_shape), args=(make_arg(y_shape),), - broadcasts_input=broadcasts_input) - - return list(generator()) + for x_shape, y_shape, broadcasts_input in cases: + yield SampleInput(make_arg(x_shape), args=(make_arg(y_shape),), + broadcasts_input=broadcasts_input) def sample_inputs_split(op_info, device, dtype, requires_grad, *, list_args=False, **kwargs): @@ -6786,11 +6592,8 @@ def sample_inputs_split(op_info, device, dtype, requires_grad, *, list_args=Fals ((S, S, S), (S, 1)), ) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_split_with_sizes(op_info, device, dtype, requires_grad, **kwargs): @@ -6802,11 +6605,8 @@ def sample_inputs_split_with_sizes(op_info, device, dtype, requires_grad, **kwar ((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)], -2)), ) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_msort(op_info, device, dtype, requires_grad): @@ -7003,30 
+6803,20 @@ def sample_inputs_tril_triu(op_info, device, dtype, requires_grad, **kwargs): ((S, M, M), (2,)), ((3, 3, S, S), ()),) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_clone(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) - def generator(): - yield SampleInput(make_arg((S, M, S))) - yield SampleInput(make_arg(())) - - return list(generator()) + yield SampleInput(make_arg((S, M, S))) + yield SampleInput(make_arg(())) def sample_inputs_contiguous(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) - - def generator(): - yield SampleInput(make_arg((S, S))) - - return list(generator()) + yield SampleInput(make_arg((S, S))) def sample_inputs_sum_to_size(op_info, device, dtype, requires_grad, **kwargs): @@ -7057,21 +6847,18 @@ def sample_inputs_resize_ops(op_info, device, dtype, requires_grad, **kwargs): ((), (1, 1, 1)), ) - def generator(): - for shape, args_or_shape in cases: - # Update `args` based on operator - if op_info.name == 'resize_': - # resize_ takes shape/tuple of ints, - args = (args_or_shape, ) - elif op_info.name == 'resize_as_': - # resize_as_ takes another tensor - args = (make_arg(shape, requires_grad=False), ) # type:ignore[assignment] - else: - raise ValueError("sample_inputs_resize_ops is being used with incorrect operator") + for shape, args_or_shape in cases: + # Update `args` based on operator + if op_info.name == 'resize_': + # resize_ takes shape/tuple of ints, + args = (args_or_shape, ) + elif op_info.name == 'resize_as_': + # resize_as_ takes another tensor + args = (make_arg(shape, requires_grad=False), ) # type:ignore[assignment] + else: + raise ValueError("sample_inputs_resize_ops is 
being used with incorrect operator") - yield(SampleInput(make_arg(shape, requires_grad=requires_grad), args=args)) - - return list(generator()) + yield(SampleInput(make_arg(shape, requires_grad=requires_grad), args=args)) def sample_inputs_view_reshape(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) @@ -7084,18 +6871,15 @@ def sample_inputs_view_reshape(op_info, device, dtype, requires_grad, **kwargs): ((), ()), ((), (1,))) - def generator(): - for case in cases: - shape, args = case - inp = make_arg(shape, requires_grad=requires_grad) - yield(SampleInput(inp, args=(args, ))) + for case in cases: + shape, args = case + inp = make_arg(shape, requires_grad=requires_grad) + yield(SampleInput(inp, args=(args, ))) - if op_info.name != "view" and len(shape) >= 2: - yield(SampleInput( - inp.detach().clone().transpose(0, 1).requires_grad_(requires_grad), - args=(args, ))) - - return list(generator()) + if op_info.name != "view" and len(shape) >= 2: + yield(SampleInput( + inp.detach().clone().transpose(0, 1).requires_grad_(requires_grad), + args=(args, ))) def sample_inputs_view_as_reshape_as(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device) @@ -7105,18 +6889,15 @@ def sample_inputs_view_as_reshape_as(op_info, device, dtype, requires_grad, **kw ((), (1, 1)), ) - def generator(): - for case in cases: - shape, shape_other = case - inp = make_arg(shape, requires_grad=requires_grad) - yield(SampleInput(inp, args=(make_arg(shape_other, requires_grad=False),))) + for case in cases: + shape, shape_other = case + inp = make_arg(shape, requires_grad=requires_grad) + yield(SampleInput(inp, args=(make_arg(shape_other, requires_grad=False),))) - if op_info.name != "view_as" and len(shape) >= 2: - yield(SampleInput( - inp.detach().clone().transpose(0, 1).requires_grad_(requires_grad), - args=(make_arg(shape_other, requires_grad=False),))) 
- - return list(generator()) + if op_info.name != "view_as" and len(shape) >= 2: + yield(SampleInput( + inp.detach().clone().transpose(0, 1).requires_grad_(requires_grad), + args=(make_arg(shape_other, requires_grad=False),))) def sample_inputs_atleast1d2d3d(op_info, device, dtype, requires_grad, **kwargs): input_list = [] @@ -7161,11 +6942,8 @@ def sample_inputs_select(op_info, device, dtype, requires_grad, **kwargs): ((S,), (0, 2)) ) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_select_scatter(op_info, device, dtype, requires_grad, **kwargs): @@ -7178,13 +6956,10 @@ def sample_inputs_select_scatter(op_info, device, dtype, requires_grad, **kwargs ((S,), (), (0, 2)) ) - def generator(): - for input_shape, src_shape, args in cases: - input_ = make_arg(input_shape) - src = make_arg(src_shape) - yield SampleInput(input_, args=(src, *args)) - - return list(generator()) + for input_shape, src_shape, args in cases: + input_ = make_arg(input_shape) + src = make_arg(src_shape) + yield SampleInput(input_, args=(src, *args)) def sample_inputs_slice_scatter(op_info, device, dtype, requires_grad, **kwargs): @@ -7201,13 +6976,10 @@ def sample_inputs_slice_scatter(op_info, device, dtype, requires_grad, **kwargs) ((L, L, L), (L, L, L // 4,), (2, L // 2, L, 2)), ) - def generator(): - for input_shape, src_shape, args in cases: - input_ = make_arg(input_shape) - src = make_arg(src_shape) - yield SampleInput(input_, args=(src, *args)) - - return list(generator()) + for input_shape, src_shape, args in cases: + input_ = make_arg(input_shape) + src = make_arg(src_shape) + yield SampleInput(input_, args=(src, *args)) def sample_inputs_rbinops(op_info, device, dtype, requires_grad, supports_dtype_kwargs=True, **kwargs): @@ -7243,12 +7015,9 @@ def sample_inputs_expand(op_info, device, dtype, requires_grad, **kwargs): 
((), (1, 3, 2)), ) - def generator(): - for case in cases: - shape, args = case - yield(SampleInput(make_arg(shape), args=(args, ))) - - return list(generator()) + for case in cases: + shape, args = case + yield(SampleInput(make_arg(shape), args=(args, ))) def sample_inputs_conversion(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) @@ -7257,11 +7026,9 @@ def sample_inputs_conversion(op_info, device, dtype, requires_grad, **kwargs): (2, 3)) memory_format_options = [None, torch.contiguous_format] - def generator(): - for shape, memory_format in itertools.product(shapes, memory_format_options): - yield SampleInput(make_arg(shape), - kwargs={'memory_format': memory_format} if memory_format else {}) - return list(generator()) + for shape, memory_format in itertools.product(shapes, memory_format_options): + yield SampleInput(make_arg(shape), + kwargs={'memory_format': memory_format} if memory_format else {}) def sample_inputs_conversion_channels_last(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) @@ -7279,12 +7046,9 @@ def sample_inputs_expand_as(op_info, device, dtype, requires_grad, **kwargs): ((), (1, 1)), ) - def generator(): - for shape, shape_other in cases: - yield(SampleInput(make_arg(shape, requires_grad=requires_grad), - args=(make_arg(shape_other, requires_grad=False), ))) - - return list(generator()) + for shape, shape_other in cases: + yield(SampleInput(make_arg(shape, requires_grad=requires_grad), + args=(make_arg(shape_other, requires_grad=False), ))) def sample_inputs_where(op_info, device, dtype, requires_grad, **kwargs): @@ -7316,13 +7080,10 @@ def sample_inputs_where(op_info, device, dtype, requires_grad, **kwargs): ((M, 1, M), (), (M, M, 1), True), ((), (M, M), (), True),) - def generator(): - for shape, mask_shape, other_shape, broadcasts_input in cases: - yield 
SampleInput(make_arg(shape), - args=(make_bool_mask(mask_shape), make_arg(other_shape)), - broadcasts_input=broadcasts_input) - - return list(generator()) + for shape, mask_shape, other_shape, broadcasts_input in cases: + yield SampleInput(make_arg(shape), + args=(make_bool_mask(mask_shape), make_arg(other_shape)), + broadcasts_input=broadcasts_input) def sample_inputs_nonzero(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) @@ -7341,12 +7102,9 @@ def sample_inputs_nonzero(op_info, device, dtype, requires_grad, **kwargs): mixed[mask_t] = 0 inputs.append(mixed) - def generator(): - for input_t, as_tuple in product(inputs, [False, True]): - yield(SampleInput(input_t.detach().clone().requires_grad_(requires_grad), - kwargs=dict(as_tuple=as_tuple))) - - return list(generator()) + for input_t, as_tuple in product(inputs, [False, True]): + yield(SampleInput(input_t.detach().clone().requires_grad_(requires_grad), + kwargs=dict(as_tuple=as_tuple))) def sample_inputs_chunk(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device) @@ -7355,12 +7113,9 @@ def sample_inputs_chunk(op_info, device, dtype, requires_grad, **kwargs): ((S, S, S), (S, 1)), ((S, S, S), (S, -1))) - def generator(): - for case in cases: - shape, args = case - yield(SampleInput(make_arg(shape, requires_grad=requires_grad), args=args)) - - return list(generator()) + for case in cases: + shape, args = case + yield(SampleInput(make_arg(shape, requires_grad=requires_grad), args=args)) def sample_inputs_kthvalue(op_info, device, dtype, requires_grad, **kwargs): def _tensor(shape, dtype=dtype, low=None, high=None): @@ -7561,66 +7316,63 @@ def sample_inputs_embedding(op_info, device, dtype, requires_grad, **kwargs): def make_long_input(shape, *, low, high): return make_tensor(shape, device=device, dtype=torch.long, low=low, high=high) - def generator(): - # 0-D index 
tensor - idx = make_long_input((), low=0, high=M) - yield SampleInput(make_input((M, S)), args=(idx,),) + # 0-D index tensor + idx = make_long_input((), low=0, high=M) + yield SampleInput(make_input((M, S)), args=(idx,),) - # 1-D index tensor - idx = make_long_input((S,), low=0, high=M) - yield SampleInput(make_input((M, S)), args=(idx,),) + # 1-D index tensor + idx = make_long_input((S,), low=0, high=M) + yield SampleInput(make_input((M, S)), args=(idx,),) - # 2-D index tensor - idx = make_long_input((S, S), low=0, high=M) - yield SampleInput(make_input((M, S)), args=(idx,),) + # 2-D index tensor + idx = make_long_input((S, S), low=0, high=M) + yield SampleInput(make_input((M, S)), args=(idx,),) - if not requires_grad: - # Following inputs return different gradient from the numerical gradient. - # This is expected and relevant tests are present in `test_nn.py`. + if not requires_grad: + # Following inputs return different gradient from the numerical gradient. + # This is expected and relevant tests are present in `test_nn.py`. - # The gradient vector at `padding_idx` is not updated. - idx = make_long_input((2, 2), low=0, high=S) - idx[0, 0] = 2 - idx[1, 1] = 2 - yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': 2},) + # The gradient vector at `padding_idx` is not updated. + idx = make_long_input((2, 2), low=0, high=S) + idx[0, 0] = 2 + idx[1, 1] = 2 + yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': 2},) - idx = make_long_input((2, 2), low=0, high=S) - idx[0, 0] = 4 - idx[1, 1] = 4 - yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': -1},) + idx = make_long_input((2, 2), low=0, high=S) + idx[0, 0] = 4 + idx[1, 1] = 4 + yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': -1},) - # Due to inplace renorming of weight, the numerical gradient doesn't match the - # analytical gradient. 
- idx = make_long_input((2, 2), low=0, high=S) - weights = make_input((S, S)) * 2 - yield SampleInput(weights, args=(idx,), kwargs={'max_norm': 1.},) + # Due to inplace renorming of weight, the numerical gradient doesn't match the + # analytical gradient. + idx = make_long_input((2, 2), low=0, high=S) + weights = make_input((S, S)) * 2 + yield SampleInput(weights, args=(idx,), kwargs={'max_norm': 1.},) - idx = make_long_input((2, 2), low=0, high=S) - weights = make_input((S, S)) * 2 - yield SampleInput(weights, args=(idx,), kwargs={'max_norm': 1., 'norm_type': 1.0},) + idx = make_long_input((2, 2), low=0, high=S) + weights = make_input((S, S)) * 2 + yield SampleInput(weights, args=(idx,), kwargs={'max_norm': 1., 'norm_type': 1.0},) - # Scale the gradient based on the inverse frequency of a particular index. - idx = make_long_input((2, 2), low=0, high=S) - idx[0, 0] = 1 - idx[0, 1] = 1 - weights = make_input((S, S)) - yield SampleInput(weights, args=(idx,), kwargs={'scale_grad_by_freq': True},) + # Scale the gradient based on the inverse frequency of a particular index. + idx = make_long_input((2, 2), low=0, high=S) + idx[0, 0] = 1 + idx[0, 1] = 1 + weights = make_input((S, S)) + yield SampleInput(weights, args=(idx,), kwargs={'scale_grad_by_freq': True},) - # gradcheck not implemented for sparse tensors. - idx = make_long_input((2, 2), low=0, high=S) - weights = make_input((S, S)) - yield SampleInput(weights, args=(idx,), kwargs={'sparse': True}) + # gradcheck not implemented for sparse tensors. 
+ idx = make_long_input((2, 2), low=0, high=S) + weights = make_input((S, S)) + yield SampleInput(weights, args=(idx,), kwargs={'sparse': True}) - idx = make_long_input((3, 3), low=0, high=S) - idx[0, 0] = 1 # freq more than 1 - idx[0, 1] = 1 # freq more than 1 - idx[1, 0] = 0 # padding_idx - weights = make_input((S, S)) * 2 - yield SampleInput(weights, args=(idx,), - kwargs={'sparse': True, 'scale_grad_by_freq': True, - 'padding_idx': 0, 'max_norm': 1.}) - - return list(generator()) + idx = make_long_input((3, 3), low=0, high=S) + idx[0, 0] = 1 # freq more than 1 + idx[0, 1] = 1 # freq more than 1 + idx[1, 0] = 0 # padding_idx + weights = make_input((S, S)) * 2 + yield SampleInput(weights, args=(idx,), + kwargs={'sparse': True, 'scale_grad_by_freq': True, + 'padding_idx': 0, 'max_norm': 1.}) def sample_inputs_one_hot(op_info, device, dtype, requires_grad, **kwargs): @@ -7677,13 +7429,10 @@ def sample_inputs_tensorsolve(op_info, device, dtype, requires_grad, **kwargs): # a_shapes += [(0, 0, 1, 2, 3, 0)] dimss = [None, (0, 2)] - def gen_inputs(): - for a_shape, dims in itertools.product(a_shapes, dimss): - a = make_tensor(a_shape, dtype=dtype, device=device, requires_grad=requires_grad) - b = make_tensor(a_shape[:2], dtype=dtype, device=device, requires_grad=requires_grad) - yield SampleInput(a, args=(b,), kwargs=dict(dims=dims)) - - return list(gen_inputs()) + for a_shape, dims in itertools.product(a_shapes, dimss): + a = make_tensor(a_shape, dtype=dtype, device=device, requires_grad=requires_grad) + b = make_tensor(a_shape[:2], dtype=dtype, device=device, requires_grad=requires_grad) + yield SampleInput(a, args=(b,), kwargs=dict(dims=dims)) def sample_inputs_mse_loss(op_info, device, dtype, requires_grad, **kwargs): _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -7742,17 +7491,14 @@ def sample_inputs_cosine_embedding_loss(op_info, device, dtype, requires_grad, * target = t.to(dtype=dtype).detach() return target - 
def gen_inputs(): - shapes = ((S, S), (S,)) - reductions = ('none', 'mean', 'sum') - for s, r in product(shapes, reductions): - yield SampleInput( - make_input(s), - args=(make_input(s), make_target(s)), - kwargs=dict(reduction=r, margin=random.uniform(-1, 1)) - ) - - return list(gen_inputs()) + shapes = ((S, S), (S,)) + reductions = ('none', 'mean', 'sum') + for s, r in product(shapes, reductions): + yield SampleInput( + make_input(s), + args=(make_input(s), make_target(s)), + kwargs=dict(reduction=r, margin=random.uniform(-1, 1)) + ) def sample_inputs_ctc_loss(op_info, device, dtype, requires_grad, **kwargs): input_length = 50 @@ -7765,18 +7511,15 @@ def sample_inputs_ctc_loss(op_info, device, dtype, requires_grad, **kwargs): log_probs = t.log_softmax(2).to(device=device, dtype=dtype).detach().requires_grad_(requires_grad=requires_grad) return log_probs - def gen_inputs(): - reductions = ('none', 'mean', 'sum') - zero_inf = (True, False) - for r, z in product(reductions, zero_inf): - log_probs = make_log_probs((input_length, batch, num_char)) - targets = torch.randint(1, num_char, (batch, target_length), dtype=torch.long, device=device) - input_lengths = torch.full((batch, ), input_length, dtype=torch.long, device=device) - target_lengths = torch.randint(10, target_length, (batch, ), dtype=torch.long, device=device) + reductions = ('none', 'mean', 'sum') + zero_inf = (True, False) + for r, z in product(reductions, zero_inf): + log_probs = make_log_probs((input_length, batch, num_char)) + targets = torch.randint(1, num_char, (batch, target_length), dtype=torch.long, device=device) + input_lengths = torch.full((batch, ), input_length, dtype=torch.long, device=device) + target_lengths = torch.randint(10, target_length, (batch, ), dtype=torch.long, device=device) - yield SampleInput(log_probs, args=(targets, input_lengths, target_lengths,), kwargs=dict(reduction=r, zero_infinity=z)) - - return list(gen_inputs()) + yield SampleInput(log_probs, args=(targets, 
input_lengths, target_lengths,), kwargs=dict(reduction=r, zero_infinity=z)) def sample_inputs_nll_loss(op_info, device, dtype, requires_grad, **kwargs): shape = (2, 3) @@ -7816,38 +7559,32 @@ def sample_inputs_nll_loss(op_info, device, dtype, requires_grad, **kwargs): if reduction != "mean": yield make_input(s), make_target(s, zeros=True), dict(ignore_index=0, reduction=reduction) - def gen_inputs(): - for input, target, kwargs in gen_shape_kwargs(): - yield SampleInput(input, args=(target,), kwargs=kwargs) - - return list(gen_inputs()) + for input, target, kwargs in gen_shape_kwargs(): + yield SampleInput(input, args=(target,), kwargs=kwargs) def sample_inputs_argwhere(op_info, device, dtype, requires_grad, **kwargs): - def generator(): - yield SampleInput(torch.tensor([1, 0, 2, 0], dtype=dtype, device=device, requires_grad=requires_grad)) + yield SampleInput(torch.tensor([1, 0, 2, 0], dtype=dtype, device=device, requires_grad=requires_grad)) - mask = torch.tensor([[0, 1, 0, 1, 0], - [1, 1, 1, 1, 0], - [0, 0, 0, 1, 0], - [1, 0, 1, 1, 0], - [1, 0, 0, 1, 0]], dtype=torch.bool, device=device) - t = make_tensor((S, S), dtype=dtype, device=device, requires_grad=requires_grad) - with torch.no_grad(): - t[mask] = 0 - yield SampleInput(t) + mask = torch.tensor([[0, 1, 0, 1, 0], + [1, 1, 1, 1, 0], + [0, 0, 0, 1, 0], + [1, 0, 1, 1, 0], + [1, 0, 0, 1, 0]], dtype=torch.bool, device=device) + t = make_tensor((S, S), dtype=dtype, device=device, requires_grad=requires_grad) + with torch.no_grad(): + t[mask] = 0 + yield SampleInput(t) - t = make_tensor((S, S), dtype=dtype, device=device, requires_grad=requires_grad, noncontiguous=True) - with torch.no_grad(): - t[mask] = 0 - yield SampleInput(t) + t = make_tensor((S, S), dtype=dtype, device=device, requires_grad=requires_grad, noncontiguous=True) + with torch.no_grad(): + t[mask] = 0 + yield SampleInput(t) - t = make_tensor((S, 0), dtype=dtype, device=device, requires_grad=requires_grad) - yield SampleInput(t) + t = 
make_tensor((S, 0), dtype=dtype, device=device, requires_grad=requires_grad) + yield SampleInput(t) - yield SampleInput(torch.zeros((S,), dtype=dtype, device=device, requires_grad=requires_grad)) - yield SampleInput(make_tensor((), dtype=dtype, device=device, requires_grad=requires_grad)) - - return list(generator()) + yield SampleInput(torch.zeros((S,), dtype=dtype, device=device, requires_grad=requires_grad)) + yield SampleInput(make_tensor((), dtype=dtype, device=device, requires_grad=requires_grad)) def _generate_sample_shape_reduction(): shapes = ((S,), (S, S), (S, S, S)) @@ -7857,7 +7594,8 @@ def _generate_sample_shape_reduction(): def sample_inputs_gaussian_nll_loss(op_info, device, dtype, requires_grad, **kwargs): _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) - make_var = partial(make_tensor, low=0, device=device, dtype=dtype, requires_grad=requires_grad) + # Set low slightly above 0 so gradcheck doesn't accidentally dip below 0 + make_var = partial(make_tensor, low=0.1, device=device, dtype=dtype, requires_grad=requires_grad) def gen_shape(shape): yield shape @@ -7882,11 +7620,8 @@ def sample_inputs_gaussian_nll_loss(op_info, device, dtype, requires_grad, **kwa dict(full=True, eps=random.uniform(1e-6, 1e-3), reduction=r) ) - def gen_inputs(): - for input, target, var, kwargs in gen_shape_kwargs(): - yield SampleInput(input, args=(target, var, ), kwargs=kwargs) - - return list(gen_inputs()) + for input, target, var, kwargs in gen_shape_kwargs(): + yield SampleInput(input, args=(target, var, ), kwargs=kwargs) def _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwargs): _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -7895,20 +7630,14 @@ def _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwa yield _make_tensor(s), _make_tensor(s), dict(reduction=r) def sample_inputs_hinge_embedding_loss(op_info, device, dtype, 
requires_grad, **kwargs): - def gen_inputs(): - for input, target, d in _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwargs): - d['margin'] = random.uniform(-9, 9) - yield SampleInput(input, args=(target, ), kwargs=d) - - return list(gen_inputs()) + for input, target, d in _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwargs): + d['margin'] = random.uniform(-9, 9) + yield SampleInput(input, args=(target, ), kwargs=d) def sample_inputs_huber_loss(op_info, device, dtype, requires_grad, **kwargs): - def gen_inputs(): - for input, target, d in _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwargs): - d['delta'] = random.uniform(1e-3, 9) - yield SampleInput(input, args=(target, ), kwargs=d) - - return list(gen_inputs()) + for input, target, d in _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwargs): + d['delta'] = random.uniform(1e-3, 9) + yield SampleInput(input, args=(target, ), kwargs=d) def sample_inputs_poisson_nll_loss(op_info, device, dtype, requires_grad, **kwargs): _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -7928,11 +7657,8 @@ def sample_inputs_poisson_nll_loss(op_info, device, dtype, requires_grad, **kwar reduction=r) ) - def gen_inputs(): - for input, target, kwargs in gen_shape_kwargs(): - yield SampleInput(input, args=(target, ), kwargs=kwargs) - - return list(gen_inputs()) + for input, target, kwargs in gen_shape_kwargs(): + yield SampleInput(input, args=(target, ), kwargs=kwargs) def sample_inputs_pairwise_distance(op_info, device, dtype, requires_grad, **kwargs): make = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -9035,13 +8761,7 @@ op_db: List[OpInfo] = [ supports_out=False, supports_gradgrad=False, assert_autodiffed=False, - sample_inputs_func=sample_inputs_cdist, - skips=( - # RuntimeError: _cdist_backward requires X1 to be contiguous - 
DecorateInfo(unittest.skip("_cdist_backward requires X1 to be contiguous"), - 'TestCommon', 'test_noncontiguous_samples'), - ) - ), + sample_inputs_func=sample_inputs_cdist), UnaryUfuncInfo('ceil', ref=np.ceil, dtypes=floating_types_and(torch.bfloat16), @@ -9860,6 +9580,9 @@ op_db: List[OpInfo] = [ skips=( # Dispatch stub: unsupported device typemeta DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad', device_type='meta'), + # (ROCm) Memory access fault by GPU node-4 (Agent handle: 0x55cebc9e8430) on address 0x7fa17b757000 + DecorateInfo(unittest.skip("Skipped! ROCm memory exception"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM), )), BinaryUfuncInfo('floor_divide', dtypes=all_types_and(torch.half, torch.bfloat16), @@ -9939,6 +9662,9 @@ op_db: List[OpInfo] = [ DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32, torch.complex64)), # noqa: B950 # 69925: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad', device_type='cuda'), + # (ROCm) Memory exception on virtual address 0x7f6f3deb7000, node id 4: Page not present + DecorateInfo(unittest.skip("Skipped! ROCm memory exception"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64, torch.complex128], active_if=TEST_WITH_ROCM), ), supports_inplace_autograd=False, sample_inputs_func=sample_inputs_gradient), @@ -10519,6 +10245,9 @@ op_db: List[OpInfo] = [ skips=( # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...) 
DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'), + # (ROCm) unexpected success + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM), )), OpInfo('median', dtypes=all_types_and(torch.bfloat16), @@ -10531,6 +10260,9 @@ op_db: List[OpInfo] = [ skips=( # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...) DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'), + # (ROCm) unexpected success + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM), )), OpInfo('nanmedian', dtypes=all_types_and(torch.bfloat16), @@ -10543,6 +10275,9 @@ op_db: List[OpInfo] = [ skips=( # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...) DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'), + # (ROCm) unexpected success + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM), )), OpInfo('var_mean', dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16), @@ -10647,6 +10382,9 @@ op_db: List[OpInfo] = [ skips=( # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...) 
DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'), + # (ROCm) unexpected success + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM), )), OpInfo('quantile', dtypes=floating_types(), @@ -12886,6 +12624,9 @@ op_db: List[OpInfo] = [ skips=( # test does not work with passing lambda for op DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'), + # CUDA runs out of memory + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.cdouble]), )), OpInfo('linalg.pinv', aten_name='linalg_pinv', @@ -14075,6 +13816,9 @@ op_db: List[OpInfo] = [ skips=( # Dispatch stub: unsupported device typemeta DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad', device_type='meta'), + # (ROCm) Memory access fault by GPU node-4 (Agent handle: 0x55860348e690) on address 0x7f0f4ddcb000 + DecorateInfo(unittest.skip("Skipped! ROCm memory exception"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64, torch.complex128], active_if=TEST_WITH_ROCM), )), OpInfo('trapezoid', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), @@ -14085,6 +13829,9 @@ op_db: List[OpInfo] = [ skips=( # Dispatch stub: unsupported device typemeta DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad', device_type='meta'), + # (ROCm) Memory access fault by GPU node-4 (Agent handle: 0x55bbf53d5500) on address 0x7fe536eb5000 + DecorateInfo(unittest.skip("Skipped! 
ROCm memory exception"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64, torch.complex128], active_if=TEST_WITH_ROCM), )), OpInfo('cumulative_trapezoid', dtypes=all_types_and_complex_and(), @@ -14593,7 +14340,10 @@ op_db: List[OpInfo] = [ DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_view', device_type='cuda'), # On CUDA, the op is dispatched (and a few more conditions) to # _fused_dropout, which doesn't support forward AD - DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_forward_mode_AD', device_type='cuda'),), + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_forward_mode_AD', device_type='cuda'), + # (ROCm) NotImplementedError: Trying to use forward AD with native_dropout that does not support it + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM),), gradcheck_wrapper=wrapper_set_seed, supports_forward_ad=True, supports_fwgrad_bwgrad=True, diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py index e8a96d9fa847..bbe28172222d 100644 --- a/torch/utils/data/datapipes/iter/callable.py +++ b/torch/utils/data/datapipes/iter/callable.py @@ -1,6 +1,6 @@ import warnings from torch.utils.data import IterDataPipe, _utils, functional_datapipe -from typing import Callable, Dict, Iterator, Optional, Sized, Tuple, TypeVar +from typing import Callable, Iterator, Sized, TypeVar try: import dill @@ -37,8 +37,6 @@ class MapperIterDataPipe(IterDataPipe[T_co]): For `input_col` with multiple indices, the left-most one is used, and other indices will be removed. - Integer is used for list/tuple. -1 represents to append result at the end. - Key is used for dict. New key is acceptable. 
- fn_args: Positional arguments for `fn` - fn_kwargs: Keyword arguments for `fn` """ datapipe: IterDataPipe fn: Callable @@ -49,9 +47,6 @@ class MapperIterDataPipe(IterDataPipe[T_co]): fn: Callable, input_col=None, output_col=None, - *, - fn_args: Optional[Tuple] = None, - fn_kwargs: Optional[Dict] = None, ) -> None: super().__init__() self.datapipe = datapipe @@ -70,20 +65,18 @@ class MapperIterDataPipe(IterDataPipe[T_co]): raise ValueError("`output_col` must be a single-element list or tuple") output_col = output_col[0] self.output_col = output_col - self.args = () if fn_args is None else fn_args - self.kwargs = {} if fn_kwargs is None else fn_kwargs def _apply_fn(self, data): if self.input_col is None and self.output_col is None: - return self.fn(data, *self.args, **self.kwargs) + return self.fn(data) if self.input_col is None: - res = self.fn(data, *self.args, **self.kwargs) + res = self.fn(data) elif isinstance(self.input_col, (list, tuple)): args = tuple(data[col] for col in self.input_col) - res = self.fn(*args, *self.args, **self.kwargs) + res = self.fn(*args) else: - res = self.fn(data[self.input_col], *self.args, **self.kwargs) + res = self.fn(data[self.input_col]) # Copy tuple to list and run in-place modification because tuple is immutable. if isinstance(data, tuple): @@ -132,8 +125,6 @@ class MapperIterDataPipe(IterDataPipe[T_co]): dill_function, self.input_col, self.output_col, - self.args, - self.kwargs, ) return state @@ -143,8 +134,6 @@ class MapperIterDataPipe(IterDataPipe[T_co]): dill_function, self.input_col, self.output_col, - self.args, - self.kwargs, ) = state if DILL_AVAILABLE: self.fn = dill.loads(dill_function) # type: ignore[assignment] @@ -163,8 +152,6 @@ class CollatorIterDataPipe(MapperIterDataPipe): datapipe: Iterable DataPipe being collated collate_fn: Customized collate function to collect and combine data or a batch of data. Default function collates to Tensor(s) based on data type. 
- fn_args: Positional arguments for `collate_fn` - fn_kwargs: Keyword arguments for `collate_fn` Example: Convert integer data to float Tensor >>> class MyIterDataPipe(torch.utils.data.IterDataPipe): @@ -196,7 +183,5 @@ class CollatorIterDataPipe(MapperIterDataPipe): self, datapipe: IterDataPipe, collate_fn: Callable = _utils.collate.default_collate, - fn_args: Optional[Tuple] = None, - fn_kwargs: Optional[Dict] = None, ) -> None: - super().__init__(datapipe, fn=collate_fn, fn_args=fn_args, fn_kwargs=fn_kwargs) + super().__init__(datapipe, fn=collate_fn) diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py index 2edb0017b042..bd31d938c1b2 100644 --- a/torch/utils/data/datapipes/iter/selecting.py +++ b/torch/utils/data/datapipes/iter/selecting.py @@ -1,5 +1,5 @@ import warnings -from typing import Callable, Dict, Iterator, Optional, Tuple, TypeVar +from typing import Callable, Iterator, TypeVar from torch.utils.data import IterDataPipe, functional_datapipe from torch.utils.data.datapipes.dataframe import dataframe_wrapper as df_wrapper @@ -28,8 +28,6 @@ class FilterIterDataPipe(IterDataPipe[T_co]): Args: datapipe: Iterable DataPipe being filtered filter_fn: Customized function mapping an element to a boolean. 
- fn_args: Positional arguments for `filter_fn` - fn_kwargs: Keyword arguments for `filter_fn` drop_empty_batches: By default, drops batch if it is empty after filtering instead of keeping an empty list """ datapipe: IterDataPipe @@ -39,8 +37,6 @@ class FilterIterDataPipe(IterDataPipe[T_co]): def __init__(self, datapipe: IterDataPipe, filter_fn: Callable, - fn_args: Optional[Tuple] = None, - fn_kwargs: Optional[Dict] = None, drop_empty_batches: bool = True, ) -> None: super().__init__() @@ -50,8 +46,6 @@ class FilterIterDataPipe(IterDataPipe[T_co]): warnings.warn("Lambda function is not supported for pickle, please use " "regular python function or functools.partial instead.") self.filter_fn = filter_fn # type: ignore[assignment] - self.args = () if fn_args is None else fn_args - self.kwargs = {} if fn_kwargs is None else fn_kwargs self.drop_empty_batches = drop_empty_batches def __iter__(self) -> Iterator[T_co]: @@ -62,7 +56,7 @@ class FilterIterDataPipe(IterDataPipe[T_co]): yield filtered def _returnIfTrue(self, data): - condition = self.filter_fn(data, *self.args, **self.kwargs) + condition = self.filter_fn(data) if df_wrapper.is_column(condition): # We are operating on DataFrames filter here @@ -95,11 +89,11 @@ class FilterIterDataPipe(IterDataPipe[T_co]): dill_function = dill.dumps(self.filter_fn) else: dill_function = self.filter_fn - state = (self.datapipe, dill_function, self.args, self.kwargs, self.drop_empty_batches) + state = (self.datapipe, dill_function, self.drop_empty_batches) return state def __setstate__(self, state): - (self.datapipe, dill_function, self.args, self.kwargs, self.drop_empty_batches) = state + (self.datapipe, dill_function, self.drop_empty_batches) = state if DILL_AVAILABLE: self.filter_fn = dill.loads(dill_function) # type: ignore[assignment] else: diff --git a/torch/utils/data/datapipes/map/callable.py b/torch/utils/data/datapipes/map/callable.py index 8dbad957e069..a7527d24e248 100644 --- 
a/torch/utils/data/datapipes/map/callable.py +++ b/torch/utils/data/datapipes/map/callable.py @@ -1,5 +1,5 @@ import warnings -from typing import Callable, Dict, Optional, Tuple, TypeVar +from typing import Callable, TypeVar from torch.utils.data import MapDataPipe, functional_datapipe @@ -35,8 +35,6 @@ class MapperMapDataPipe(MapDataPipe[T_co]): args: datapipe: Source Map DataPipe fn: Function called over each item - fn_args: Positional arguments for `fn` - fn_kwargs: Keyword arguments for `fn` """ datapipe: MapDataPipe fn: Callable @@ -45,8 +43,6 @@ class MapperMapDataPipe(MapDataPipe[T_co]): self, datapipe: MapDataPipe, fn: Callable = default_fn, - fn_args: Optional[Tuple] = None, - fn_kwargs: Optional[Dict] = None, ) -> None: super().__init__() self.datapipe = datapipe @@ -57,25 +53,23 @@ class MapperMapDataPipe(MapDataPipe[T_co]): "regular python function or functools.partial instead." ) self.fn = fn # type: ignore[assignment] - self.args = () if fn_args is None else fn_args - self.kwargs = {} if fn_kwargs is None else fn_kwargs def __len__(self) -> int: return len(self.datapipe) def __getitem__(self, index) -> T_co: - return self.fn(self.datapipe[index], *self.args, **self.kwargs) + return self.fn(self.datapipe[index]) def __getstate__(self): if DILL_AVAILABLE: dill_function = dill.dumps(self.fn) else: dill_function = self.fn - state = (self.datapipe, dill_function, self.args, self.kwargs) + state = (self.datapipe, dill_function) return state def __setstate__(self, state): - (self.datapipe, dill_function, self.args, self.kwargs) = state + (self.datapipe, dill_function) = state if DILL_AVAILABLE: self.fn = dill.loads(dill_function) # type: ignore[assignment] else: diff --git a/torch/utils/data/dataset.pyi b/torch/utils/data/dataset.pyi index f2ac8102ea2b..a84a1bee364d 100644 --- a/torch/utils/data/dataset.pyi +++ b/torch/utils/data/dataset.pyi @@ -44,7 +44,7 @@ class MapDataPipe(Generic[T_co]): # Functional form of 'ConcaterMapDataPipe' def concat(self, 
*datapipes: MapDataPipe) -> MapDataPipe: ... # Functional form of 'MapperMapDataPipe' - def map(self, fn: Callable= ..., fn_args: Optional[Tuple] = None, fn_kwargs: Optional[Dict] = None) -> MapDataPipe: ... + def map(self, fn: Callable= ...) -> MapDataPipe: ... # Functional form of 'ShufflerMapDataPipe' def shuffle(self, *, indices: Optional[List] = None) -> MapDataPipe: ... # Functional form of 'ZipperMapDataPipe' @@ -65,7 +65,7 @@ class IterableDataset(Dataset[T_co], metaclass=_DataPipeMeta): # Functional form of 'BatcherIterDataPipe' def batch(self, batch_size: int, drop_last: bool = False, wrapper_class=DataChunk) -> IterDataPipe: ... # Functional form of 'CollatorIterDataPipe' - def collate(self, collate_fn: Callable= ..., fn_args: Optional[Tuple] = None, fn_kwargs: Optional[Dict] = None) -> IterDataPipe: ... + def collate(self, collate_fn: Callable= ...) -> IterDataPipe: ... # Functional form of 'ConcaterIterDataPipe' def concat(self, *datapipes: IterDataPipe) -> IterDataPipe: ... # Functional form of 'RoutedDecoderIterDataPipe' @@ -73,13 +73,13 @@ class IterableDataset(Dataset[T_co], metaclass=_DataPipeMeta): # Functional form of 'DemultiplexerIterDataPipe' def demux(self, num_instances: int, classifier_fn: Callable[[T_co], Optional[int]], drop_none: bool = False, buffer_size: int = 1000) -> List[IterDataPipe]: ... # Functional form of 'FilterIterDataPipe' - def filter(self, filter_fn: Callable, fn_args: Optional[Tuple] = None, fn_kwargs: Optional[Dict] = None, drop_empty_batches: bool = True) -> IterDataPipe: ... + def filter(self, filter_fn: Callable, drop_empty_batches: bool = True) -> IterDataPipe: ... # Functional form of 'ForkerIterDataPipe' def fork(self, num_instances: int, buffer_size: int = 1000) -> List[IterDataPipe]: ... 
# Functional form of 'GrouperIterDataPipe' def groupby(self, group_key_fn: Callable, *, buffer_size: int = 10000, group_size: Optional[int] = None, unbatch_level: int = 0, guaranteed_group_size: Optional[int] = None, drop_remaining: bool = False) -> IterDataPipe: ... # Functional form of 'MapperIterDataPipe' - def map(self, fn: Callable, input_col=None, output_col=None, *, fn_args: Optional[Tuple] = None, fn_kwargs: Optional[Dict] = None) -> IterDataPipe: ... + def map(self, fn: Callable, input_col=None, output_col=None) -> IterDataPipe: ... # Functional form of 'MultiplexerIterDataPipe' def mux(self, *datapipes) -> IterDataPipe: ... # Functional form of 'ShardingFilterIterDataPipe'