Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-29 19:24:55 +08:00)

Compare commits: v2.1.1-rc6...v1.8.0-rc5 (32 commits)
| SHA1 |
|---|
| 37c1f4a7fe |
| 49b74a52a4 |
| 11c78e9cb3 |
| d6943ea58d |
| 02b61b49ea |
| d553478c98 |
| 63333e2a25 |
| 8e7eebfc9a |
| f8afb8bdd0 |
| 0851cc42b0 |
| 804f7b6018 |
| 32758d30b3 |
| bcb64a8084 |
| f07991d396 |
| c458cd4852 |
| f7c4afc0f4 |
| 20554c00b6 |
| 3464d64f08 |
| c6972eb3ac |
| 25562d3d41 |
| cd63c37bc6 |
| c79decdbba |
| c307a3f336 |
| f071020756 |
| 4f436f8570 |
| ae11589710 |
| 9e5bcc1020 |
| fa8578241d |
| 1368809532 |
| 4073248fc2 |
| 75153cb730 |
| 5bb69b080c |
@@ -52,6 +52,14 @@ CONFIG_TREE_DATA = OrderedDict(
             "3.7",
         ],
     )),
+    macos_arm64=([None], OrderedDict(
+        wheel=[
+            "3.8",
+        ],
+        conda=[
+            "3.8",
+        ],
+    )),
     # Skip CUDA-9.2 builds on Windows
     windows=(
         [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS],
@@ -164,7 +164,7 @@ def gen_build_env_list(smoke):
         c.find_prop("gpu"),
         c.find_prop("package_format"),
         [c.find_prop("pyver")],
-        c.find_prop("smoke"),
+        c.find_prop("smoke") and not (c.find_prop("os_name") == "macos_arm64"),  # don't test arm64
         c.find_prop("libtorch_variant"),
         c.find_prop("gcc_config_variant"),
         c.find_prop("libtorch_config_variant"),
@@ -216,7 +216,9 @@ def get_jobs(toplevel_key, smoke):
    configs = gen_build_env_list(smoke)
    phase = "build" if toplevel_key == "binarybuilds" else "test"
    for build_config in configs:
-        jobs_list.append(build_config.gen_workflow_job(phase, nightly=True))
+        # don't test for macos_arm64 as it's cross compiled
+        if phase != "test" or build_config.os != "macos_arm64":
+            jobs_list.append(build_config.gen_workflow_job(phase, nightly=True))

    return jobs_list

@@ -3,7 +3,7 @@ PHASES = ["build", "test"]

 CUDA_VERSIONS = [
     "101",
     "102",
-    "112",
+    "111",
 ]

 ROCM_VERSIONS = [
File diff suppressed because it is too large
@@ -7,6 +7,10 @@ source /env
 # Defaults here so they can be changed in one place
 export MAX_JOBS=${MAX_JOBS:-$(( $(nproc) - 2 ))}

+if [[ "${DESIRED_CUDA}" == "cu111" ]]; then
+  export BUILD_SPLIT_CUDA="ON"
+fi
+
 # Parse the parameters
 if [[ "$PACKAGE_TYPE" == 'conda' ]]; then
   build_script='conda/build_pytorch.sh'
@@ -15,6 +15,10 @@ else
     export VC_YEAR=2019
 fi

+if [[ "${DESIRED_CUDA}" == "cu111" ]]; then
+  export BUILD_SPLIT_CUDA="ON"
+fi
+
 set +x
 export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}
 export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}
@@ -111,11 +111,11 @@ commands:
            git config --global user.email "circleci.ossci@gmail.com"
            git config --global user.name "CircleCI"
            git config remote.origin.url https://github.com/pytorch/pytorch.git
-           git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master
-           git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet
+           git config --add remote.origin.fetch +refs/heads/release/1.8:refs/remotes/origin/release/1.8
+           git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/release/1.8:refs/remotes/origin/release/1.8 --depth=100 --quiet
            # PRs generated from ghstack has format CIRCLE_PR_BASE_BRANCH=gh/xxx/1234/base
            if [[ "${CIRCLE_PR_BASE_BRANCH}" == "gh/"* ]]; then
-             CIRCLE_PR_BASE_BRANCH=master
+             CIRCLE_PR_BASE_BRANCH=release/1.8
            fi
            export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/$CIRCLE_PR_BASE_BRANCH`
            echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET}
@@ -198,6 +198,44 @@
           root: /Users/distiller/project
           paths: final_pkgs

+      - store_artifacts:
+          path: /Users/distiller/project/final_pkgs
+
+  binary_macos_arm64_build:
+    <<: *binary_mac_params
+    macos:
+      xcode: "12.3.0"
+    steps:
+      # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
+      - checkout
+      - run:
+          <<: *binary_checkout
+      - run:
+          <<: *binary_populate_env
+      - brew_update
+      - run:
+          <<: *binary_install_miniconda
+
+      - run:
+          name: Build
+          no_output_timeout: "90m"
+          command: |
+            # Do not set -u here; there is some problem with CircleCI
+            # variable expansion with PROMPT_COMMAND
+            set -ex -o pipefail
+            export CROSS_COMPILE_ARM64=1
+            script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_build.sh"
+            cat "$script"
+            source "$script"
+
+      - persist_to_workspace:
+          root: /Users/distiller/project
+          paths: final_pkgs
+
+      - store_artifacts:
+          path: /Users/distiller/project/final_pkgs
+
+
   binary_ios_build:
     <<: *pytorch_ios_params
     macos:
.github/workflows/lint.yml (vendored, 5 lines changed)
@@ -93,9 +93,12 @@ jobs:
           check_name: 'flake8-py3'
           linter_output_path: 'flake8-output.txt'
           commit_sha: ${{ steps.get_pr_tip.outputs.commit_sha }}
-          regex: '^(?<filename>.*?):(?<lineNumber>\d+):(?<columnNumber>\d+): (?<errorCode>\w\d+) (?<errorDesc>.*)'
+          regex: '^(?<filename>.*?):(?<lineNumber>\d+):(?<columnNumber>\d+): (?<errorCode>\w+\d+) (?<errorDesc>.*)'
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Catch any other warnings
+        run: |
+          [ ! -s flake8-output.txt ]

   clang-tidy:
     if: github.event_name == 'pull_request'
.gitmodules (vendored, 2 lines changed)
@@ -121,7 +121,7 @@
 [submodule "third_party/XNNPACK"]
 	ignore = dirty
 	path = third_party/XNNPACK
-	url = https://github.com/google/XNNPACK.git
+	url = https://github.com/malfet/XNNPACK.git
 [submodule "third_party/fmt"]
 	ignore = dirty
 	path = third_party/fmt
@@ -182,7 +182,7 @@ fi

 # Patch required to build xla
 if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
-  git clone --recursive https://github.com/pytorch/xla.git
+  git clone --recursive -b r1.8 https://github.com/pytorch/xla.git
   ./xla/scripts/apply_patches.sh
 fi

@@ -54,7 +54,7 @@ function file_diff_from_base() {
   set +e
   git fetch origin master --quiet
   set -e
-  git diff --name-only "$(git merge-base origin/master HEAD)" > "$1"
+  git diff --name-only "$(git merge-base origin/release/1.8 HEAD)" > "$1"
 }

 function get_bazel() {
@@ -300,7 +300,7 @@ test_backward_compatibility() {
   pushd test/backward_compatibility
   python -m venv venv
   . venv/bin/activate
-  pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
+  pip_install --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_test.html
   pip show torch
   python dump_all_function_schemas.py --filename nightly_schemas.txt
   deactivate
@@ -11,7 +11,6 @@
 #include <ATen/DeviceGuard.h>
 #include <ATen/DimVector.h>
 #include <ATen/Dispatch.h>
-#include <ATen/DynamicLibrary.h>
 #include <ATen/Formatting.h>
 #include <ATen/Functions.h>
 #include <ATen/NamedTensor.h>
@@ -25,9 +25,16 @@ static void* checkDL(void* x) {

   return x;
 }
-DynamicLibrary::DynamicLibrary(const char* name) {
+DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) {
   // NOLINTNEXTLINE(hicpp-signed-bitwise)
-  handle = checkDL(dlopen(name, RTLD_LOCAL | RTLD_NOW));
+  handle = dlopen(name, RTLD_LOCAL | RTLD_NOW);
+  if (!handle) {
+    if (alt_name) {
+      handle = checkDL(dlopen(alt_name, RTLD_LOCAL | RTLD_NOW));
+    } else {
+      AT_ERROR("Error in dlopen or dlsym: ", dlerror());
+    }
+  }
 }

 void* DynamicLibrary::sym(const char* name) {
@@ -45,7 +52,7 @@ DynamicLibrary::~DynamicLibrary() {

 // Windows

-DynamicLibrary::DynamicLibrary(const char* name) {
+DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) {
   // NOLINTNEXTLINE(hicpp-signed-bitwise)
   HMODULE theModule;
   bool reload = true;
@@ -8,7 +8,7 @@ namespace at {
 struct DynamicLibrary {
   AT_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary);

-  TORCH_API DynamicLibrary(const char* name);
+  TORCH_API DynamicLibrary(const char* name, const char* alt_name = nullptr);

   TORCH_API void* sym(const char* name);

@@ -23,10 +23,17 @@ at::DynamicLibrary& getNVRTCLibrary() {
   constexpr auto minor = ( CUDA_VERSION / 10 ) % 10;
 #if defined(_WIN32)
   auto libname = std::string("nvrtc64_") + std::to_string(major) + std::to_string(minor) + "_0.dll";
+  std::string alt_libname;
 #else
-  static auto libname = std::string("libnvrtc.so.") + std::to_string(major) + "." + std::to_string(minor);
+  static auto lib_version = std::to_string(major) + "." + std::to_string(minor);
+  static auto libname = std::string("libnvrtc.so.") + lib_version;
+#ifdef NVRTC_SHORTHASH
+  static auto alt_libname = std::string("libnvrtc-") + C10_STRINGIZE(NVRTC_SHORTHASH) + ".so." + lib_version;
+#else
+  std::string alt_libname;
 #endif
-  static at::DynamicLibrary lib(libname.c_str());
+#endif
+  static at::DynamicLibrary lib(libname.c_str(), alt_libname.empty() ? nullptr : alt_libname.c_str());
   return lib;
 }

@@ -238,7 +238,12 @@ auto ConvParams::use_mkldnn(const at::Tensor& input, const at::Tensor& weight) c
          (groups > 1
           || (weight.size(-1) > 3 && weight.size(-2) > 3)
           || input.size(0) > 1
-          || input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480)); // for some case, native is faster
+          || input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480) // for some case, native is faster
+         // OneDNN < 1.8.1 produce incorrect results in this case (see #50042)
+         // TODO(VitalyFedyunin): Remove this patch after OneDNN 1.8.1 merged in
+         && !(groups == 24 && weight.size(0) == 24 && weight.size(1) == 1)
+         );
+
 #endif
   return false;
 }
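For reference, a small Python sketch (not part of the diff above) of the depthwise convolution shape that the added condition steers away from OneDNN: 24 groups with single-channel filter slices, i.e. `weight.size(0) == 24` and `weight.size(1) == 1`:

    import torch
    import torch.nn as nn

    # Depthwise conv matching the excluded case: groups == 24 and weight shape (24, 1, k, k).
    conv = nn.Conv2d(in_channels=24, out_channels=24, kernel_size=3, groups=24, bias=False)
    x = torch.randn(1, 24, 56, 56)
    y = conv(x)  # with the workaround, this configuration no longer takes the OneDNN path
    print(conv.weight.shape, y.shape)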
@@ -26,7 +26,7 @@ static void upsample_bicubic2d_out_frame(
       const scalar_t* in = &idata[output_y * input_width + output_x];
       scalar_t* out = &odata[output_y * output_width + output_x];

-      for (int64_t c = 0; c < channels; ++c) {
+      for (int64_t c = 0; c < channels * nbatch; ++c) {
         out[0] = in[0];
         in += input_width * input_height;
         out += output_width * output_height;
@@ -19,6 +19,27 @@ namespace {

 using namespace vec256;

+// Note: Explicit implementation of copysign for Half and BFloat16
+// is needed to workaround g++-7/8 crash on aarch64, but also makes
+// copysign faster for the half-precision types
+template<typename T>
+T copysign(T a, T b) {
+  return std::copysign(a, b);
+}
+
+// Implement copysign for half precision floats using bit ops
+// Sign is the most significant bit for both half and bfloat16 types
+template<>
+c10::Half copysign(c10::Half a, c10::Half b) {
+  return c10::Half((a.x&0x7fff) | (b.x&0x8000), c10::Half::from_bits());
+}
+
+template<>
+c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) {
+  return c10::BFloat16((a.x&0x7fff) | (b.x&0x8000), c10::BFloat16::from_bits());
+}
+
+
 // Note: Undefined behavior when performing addition is intentionally
 // ignored.
 void add_kernel(TensorIteratorBase& iter, Scalar alpha_scalar) {

@@ -180,7 +201,7 @@ void div_floor_kernel(TensorIterator& iter) {
           floordiv += scalar_t(1.0);
         }
       } else {
-        floordiv = std::copysign(scalar_t(0), a / b);
+        floordiv = copysign(scalar_t(0), a / b);
       }
       return floordiv;
     });

@@ -889,23 +910,6 @@ void heaviside_kernel(TensorIterator& iter) {
   });
 }

-template<typename T>
-T copysign(T a, T b) {
-  return std::copysign(a, b);
-}
-
-// Implement copysign for half precision floats using bit ops
-// Sign is the most significant bit for both half and bfloat16 types
-template<>
-c10::Half copysign(c10::Half a, c10::Half b) {
-  return c10::Half((a.x&0x7fff) | (b.x&0x8000), c10::Half::from_bits());
-}
-
-template<>
-c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) {
-  return c10::BFloat16((a.x&0x7fff) | (b.x&0x8000), c10::BFloat16::from_bits());
-}
-
 void copysign_kernel(TensorIterator& iter) {
   AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cpu", [&]() {
     cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t {
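The sign-bit trick used by the Half/BFloat16 specializations above can be checked outside of ATen with a short NumPy sketch; the masks 0x7fff/0x8000 rely only on the sign being the most significant bit of the 16-bit layout:

    import math
    import numpy as np

    def copysign_f16(a, b):
        # Keep the magnitude bits of a (0x7fff) and the sign bit of b (0x8000),
        # mirroring the c10::Half specialization in the diff.
        ua = np.array(a, dtype=np.float16).view(np.uint16)
        ub = np.array(b, dtype=np.float16).view(np.uint16)
        return ((ua & 0x7fff) | (ub & 0x8000)).astype(np.uint16).view(np.float16)

    for a, b in [(3.5, -1.0), (-2.25, 7.0), (0.0, -4.0)]:
        assert float(copysign_f16(a, b)) == math.copysign(abs(a), b)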
@@ -113,31 +113,46 @@ __global__ void upsample_trilinear3d_out_frame(
 template <typename scalar_t, typename accscalar_t>
 C10_LAUNCH_BOUNDS_1(1024)
 __global__ void upsample_trilinear3d_backward_out_frame(
-    const size_t nc_,
-    const int depth1,
-    const int height1,
-    const int width1,
-    const int depth2,
-    const int height2,
-    const int width2,
+    const int num_kernels,
     const accscalar_t rdepth,
     const accscalar_t rheight,
     const accscalar_t rwidth,
     const bool align_corners,
-    scalar_t* __restrict__ idata,
-    const scalar_t* __restrict__ odata) {
-  const size_t i_numel = nc_ * depth1 * height1 * width1;
-  const size_t o_numel = nc_ * depth2 * height2 * width2;
+    PackedTensorAccessor64<scalar_t, 5> idata,
+    const PackedTensorAccessor64<scalar_t, 5> odata,
+    scalar_t* idata_ptr) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;

-  for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < o_numel; index += blockDim.x * gridDim.x) {
-    size_t index_temp = index;
-    const int w2 = index_temp % width2; // 0:width2-1
-    index_temp /= width2;
-    const int h2 = index_temp % height2; // 0:height2-1
-    index_temp /= height2;
-    const int t2 = index_temp % depth2; // 0:depth2-1
-    const int nc = index_temp / depth2;
+  const int batchsize = idata.size(0);
+  const int channels = idata.size(1);
+  const int depth1 = idata.size(2);
+  const int height1 = idata.size(3);
+  const int width1 = idata.size(4);
+  const int depth2 = odata.size(2);
+  const int height2 = odata.size(3);
+  const int width2 = odata.size(4);

+  const size_t i_numel = batchsize * channels * depth1 * height1 * width1;
+
+  if (index < num_kernels) {
+    const int w2 = (index % (height2 * width2)) % width2; // 0:width2-1
+    const int h2 = (index % (height2 * width2)) / width2; // 0:height2-1
+    const int t2 = index / (height2 * width2); // 0:depth2-1
+    // special case: just copy
+    if (depth1 == depth2 && height1 == height2 && width1 == width2) {
+      const int t1 = t2;
+      const int h1 = h2;
+      const int w1 = w2;
+
+      for (int n = 0; n < batchsize; n++) {
+        for (int c = 0; c < channels; ++c) {
+          const scalar_t val = odata[n][c][t1][h1][w1];
+          idata[n][c][t2][h2][w2] = val;
+        }
+      }
+      return;
+    }
+    //
     const accscalar_t t1r = area_pixel_compute_source_index<accscalar_t>(
         rdepth, t2, align_corners, /*cubic=*/false);
     const int t1 = t1r;

@@ -159,55 +174,60 @@ __global__ void upsample_trilinear3d_backward_out_frame(
     const accscalar_t w1lambda = w1r - w1;
     const accscalar_t w0lambda = static_cast<accscalar_t>(1) - w1lambda;
     //
-    const scalar_t d2val = odata[index];
-    fastAtomicAdd(
-        idata,
-        idx_3d(nc, depth1, height1, width1, t1, h1, w1),
-        i_numel,
-        static_cast<scalar_t>(t0lambda * h0lambda * w0lambda * d2val),
-        true);
-    fastAtomicAdd(
-        idata,
-        idx_3d(nc, depth1, height1, width1, t1, h1, w1 + w1p),
-        i_numel,
-        static_cast<scalar_t>(t0lambda * h0lambda * w1lambda * d2val),
-        true);
-    fastAtomicAdd(
-        idata,
-        idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1),
-        i_numel,
-        static_cast<scalar_t>(t0lambda * h1lambda * w0lambda * d2val),
-        true);
-    fastAtomicAdd(
-        idata,
-        idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1 + w1p),
-        i_numel,
-        static_cast<scalar_t>(t0lambda * h1lambda * w1lambda * d2val),
-        true);
-    fastAtomicAdd(
-        idata,
-        idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1),
-        i_numel,
-        static_cast<scalar_t>(t1lambda * h0lambda * w0lambda * d2val),
-        true);
-    fastAtomicAdd(
-        idata,
-        idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1 + w1p),
-        i_numel,
-        static_cast<scalar_t>(t1lambda * h0lambda * w1lambda * d2val),
-        true);
-    fastAtomicAdd(
-        idata,
-        idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1),
-        i_numel,
-        static_cast<scalar_t>(t1lambda * h1lambda * w0lambda * d2val),
-        true);
-    fastAtomicAdd(
-        idata,
-        idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1 + w1p),
-        i_numel,
-        static_cast<scalar_t>(t1lambda * h1lambda * w1lambda * d2val),
-        true);
+    for (int n = 0; n < batchsize; n++) {
+      for (int c = 0; c < channels; ++c) {
+        const scalar_t d2val = odata[n][c][t2][h2][w2];
+        const size_t nc = n * channels + c;
+        fastAtomicAdd(
+            idata_ptr,
+            idx_3d(nc, depth1, height1, width1, t1, h1, w1),
+            i_numel,
+            static_cast<scalar_t>(t0lambda * h0lambda * w0lambda * d2val),
+            true);
+        fastAtomicAdd(
+            idata_ptr,
+            idx_3d(nc, depth1, height1, width1, t1, h1, w1 + w1p),
+            i_numel,
+            static_cast<scalar_t>(t0lambda * h0lambda * w1lambda * d2val),
+            true);
+        fastAtomicAdd(
+            idata_ptr,
+            idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1),
+            i_numel,
+            static_cast<scalar_t>(t0lambda * h1lambda * w0lambda * d2val),
+            true);
+        fastAtomicAdd(
+            idata_ptr,
+            idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1 + w1p),
+            i_numel,
+            static_cast<scalar_t>(t0lambda * h1lambda * w1lambda * d2val),
+            true);
+        fastAtomicAdd(
+            idata_ptr,
+            idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1),
+            i_numel,
+            static_cast<scalar_t>(t1lambda * h0lambda * w0lambda * d2val),
+            true);
+        fastAtomicAdd(
+            idata_ptr,
+            idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1 + w1p),
+            i_numel,
+            static_cast<scalar_t>(t1lambda * h0lambda * w1lambda * d2val),
+            true);
+        fastAtomicAdd(
+            idata_ptr,
+            idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1),
+            i_numel,
+            static_cast<scalar_t>(t1lambda * h1lambda * w0lambda * d2val),
+            true);
+        fastAtomicAdd(
+            idata_ptr,
+            idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1 + w1p),
+            i_numel,
+            static_cast<scalar_t>(t1lambda * h1lambda * w1lambda * d2val),
+            true);
+      }
+    }
   }
 }

@@ -350,21 +370,20 @@ static void upsample_trilinear3d_backward_out_cuda_template(
   // so it has to be initialized to zero.
   grad_input.zero_();

-  // const size_t num_kernels = nbatch * channels * output_depth * output_height * output_width;
-  const size_t num_kernels = grad_output.numel();
+  const int num_kernels = output_depth * output_height * output_width;
   const int num_threads = std::min(
       at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-  if (num_kernels > 0) {
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       grad_output.scalar_type(),
       "upsample_trilinear3d_backward_out_frame",
       [&] {
         using accscalar_t = at::acc_type<scalar_t, true>;

-        auto idata = grad_input.data_ptr<scalar_t>();
-        auto odata = grad_output.data_ptr<scalar_t>();
+        auto idata = grad_input.packed_accessor64<scalar_t, 5>();
+        auto odata = grad_output.packed_accessor64<scalar_t, 5>();
+        scalar_t* idata_ptr = grad_input.data_ptr<scalar_t>();

         const accscalar_t rdepth = area_pixel_compute_scale<accscalar_t>(
             input_depth, output_depth, align_corners, scales_d);

@@ -374,26 +393,20 @@ static void upsample_trilinear3d_backward_out_cuda_template(
             input_width, output_width, align_corners, scales_w);

         upsample_trilinear3d_backward_out_frame<scalar_t, accscalar_t>
-            <<<cuda::ATenCeilDiv(num_kernels, static_cast<size_t>(num_threads)),
+            <<<cuda::ATenCeilDiv(num_kernels, num_threads),
               num_threads,
               0,
               stream>>>(
-                nbatch * channels,
-                input_depth,
-                input_height,
-                input_width,
-                output_depth,
-                output_height,
-                output_width,
+                num_kernels,
                 rdepth,
                 rheight,
                 rwidth,
                 align_corners,
                 idata,
-                odata);
+                odata,
+                idata_ptr);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       });
-  }
 }

 } // namespace
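For orientation, this kernel is the CUDA backward of 3-D (trilinear) upsampling; from Python it is reached through `F.interpolate` on a 5-D tensor (a sketch assuming a CUDA device is available):

    import torch
    import torch.nn.functional as F

    x = torch.randn(2, 3, 4, 5, 6, device="cuda", requires_grad=True)  # N, C, D, H, W
    y = F.interpolate(x, scale_factor=2, mode="trilinear", align_corners=False)
    y.sum().backward()   # exercises upsample_trilinear3d_backward_out_frame
    print(x.grad.shape)  # torch.Size([2, 3, 4, 5, 6])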
@@ -133,7 +133,9 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) {
     ASSERT_EQ(buffer1[i].z, buffer2[i].z);
     ASSERT_EQ(buffer1[i].w, buffer2[i].w);
   }
+  // Skipping this part until https://github.com/pytorch/pytorch/issues/51863 is resolved

+#if 0
   // unaligned
   for (int i = 0; i < 16; i++) {
     for (int j = 0; j < 16; j++) {

@@ -151,4 +153,5 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) {
     }
   }
 }
+#endif
 }
@@ -16,7 +16,7 @@ int32_t driver_version() {
   return driver_version;
 }

-int device_count_impl() {
+int device_count_impl(bool fail_if_no_driver) {
   int count;
   auto err = cudaGetDeviceCount(&count);
   if (err == cudaSuccess) {

@@ -34,6 +34,11 @@ int device_count_impl() {
     case cudaErrorInsufficientDriver: {
       auto version = driver_version();
       if (version <= 0) {
+        if (!fail_if_no_driver) {
+          // No CUDA driver means no devices
+          count = 0;
+          break;
+        }
         TORCH_CHECK(
             false,
             "Found no NVIDIA driver on your system. Please check that you "

@@ -95,9 +100,9 @@ DeviceIndex device_count() noexcept {
   // initialize number of devices only once
   static int count = []() {
     try {
-      auto result = device_count_impl();
+      auto result = device_count_impl(/*fail_if_no_driver=*/false);
       TORCH_INTERNAL_ASSERT(result <= std::numeric_limits<DeviceIndex>::max(), "Too many CUDA devices, DeviceIndex overflowed");
-      return device_count_impl();
+      return result;
     } catch (const c10::Error& ex) {
       // We don't want to fail, but still log the warning
       // msg() returns the message without the stack trace

@@ -110,7 +115,7 @@ DeviceIndex device_count() noexcept {

 DeviceIndex device_count_ensure_non_zero() {
   // Call the implementation every time to throw the exception
-  int count = device_count_impl();
+  int count = device_count_impl(/*fail_if_no_driver=*/true);
   // Zero gpus doesn't produce a warning in `device_count` but we fail here
   TORCH_CHECK(count, "No CUDA GPUs are available");
   return static_cast<DeviceIndex>(count);
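The user-visible effect of the `fail_if_no_driver` flag, sketched from Python (exact error text depends on the build):

    import torch

    # torch.cuda.device_count() now goes through device_count_impl(fail_if_no_driver=False),
    # so on a machine without an NVIDIA driver it reports 0 instead of raising.
    print(torch.cuda.device_count())

    # APIs that actually need a GPU still fail via device_count_ensure_non_zero().
    try:
        torch.zeros(1, device="cuda")
    except RuntimeError as err:
        print("expected failure:", err)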
@@ -590,6 +590,10 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     list(APPEND Caffe2_GPU_SRCS
       ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
   endif()
+  set_source_files_properties(
+    ${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
+    PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}"
+  )
 endif()

 if(USE_ROCM)

@@ -741,6 +745,10 @@ file(WRITE ${DUMMY_EMPTY_FILE} ${DUMMY_FILE_CONTENT})
 # Wrapper library for people who link against torch and expect both CPU and CUDA support
 # Contains "torch_cpu" and "torch_cuda"
 add_library(torch ${DUMMY_EMPTY_FILE})
+if(BUILD_SPLIT_CUDA)
+  # When we split torch_cuda, we want a dummy torch_cuda library that contains both parts
+  add_library(torch_cuda ${DUMMY_EMPTY_FILE})
+endif()
 if(HAVE_SOVERSION)
   set_target_properties(torch PROPERTIES
       VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})

@@ -1233,11 +1241,12 @@ endif()

 caffe2_interface_library(torch_cpu torch_cpu_library)

-if(BUILD_SPLIT_CUDA)
-  caffe2_interface_library(torch_cuda_cu torch_cuda_cu_library)
-  caffe2_interface_library(torch_cuda_cpp torch_cuda_cpp_library)
-elseif(USE_CUDA)
+if(USE_CUDA)
   caffe2_interface_library(torch_cuda torch_cuda_library)
+  if(BUILD_SPLIT_CUDA)
+    caffe2_interface_library(torch_cuda_cu torch_cuda_cu_library)
+    caffe2_interface_library(torch_cuda_cpp torch_cuda_cpp_library)
+  endif()
 elseif(USE_ROCM)
   caffe2_interface_library(torch_hip torch_hip_library)
 endif()

@@ -1245,22 +1254,26 @@ endif()
 caffe2_interface_library(torch torch_library)

 install(TARGETS torch_cpu torch_cpu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
-if(BUILD_SPLIT_CUDA)
-  install(TARGETS torch_cuda_cu torch_cuda_cu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
-  install(TARGETS torch_cuda_cpp torch_cuda_cpp_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
-elseif(USE_CUDA)
+if(USE_CUDA)
   install(TARGETS torch_cuda torch_cuda_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+  if(BUILD_SPLIT_CUDA)
+    install(TARGETS torch_cuda_cu torch_cuda_cu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+    install(TARGETS torch_cuda_cpp torch_cuda_cpp_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+  endif()
 elseif(USE_ROCM)
   install(TARGETS torch_hip torch_hip_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
 endif()
 install(TARGETS torch torch_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")

 target_link_libraries(torch PUBLIC torch_cpu_library)
-if(BUILD_SPLIT_CUDA)
-  target_link_libraries(torch PUBLIC torch_cuda_cu_library)
-  target_link_libraries(torch PUBLIC torch_cuda_cpp_library)
-elseif(USE_CUDA)
+if(USE_CUDA)
   target_link_libraries(torch PUBLIC torch_cuda_library)
+  if(BUILD_SPLIT_CUDA)
+    target_link_libraries(torch_cuda PUBLIC torch_cuda_cu_library)
+    target_link_libraries(torch_cuda PUBLIC torch_cuda_cpp_library)
+  endif()
 elseif(USE_ROCM)
   target_link_libraries(torch PUBLIC torch_hip_library)
 endif()
@@ -188,6 +188,20 @@ find_library(CUDA_CUDA_LIB cuda
 find_library(CUDA_NVRTC_LIB nvrtc
     PATHS ${CUDA_TOOLKIT_ROOT_DIR}
     PATH_SUFFIXES lib lib64 lib/x64)
+if(CUDA_NVRTC_LIB AND NOT CUDA_NVRTC_SHORTHASH)
+  execute_process(
+    COMMAND "${PYTHON_EXECUTABLE}" -c
+    "import hashlib;hash=hashlib.sha256();hash.update(open('${CUDA_NVRTC_LIB}','rb').read());print(hash.hexdigest()[:8])"
+    RESULT_VARIABLE _retval
+    OUTPUT_VARIABLE CUDA_NVRTC_SHORTHASH)
+  if(NOT _retval EQUAL 0)
+    message(WARNING "Failed to compute shorthash for libnvrtc.so")
+    set(CUDA_NVRTC_SHORTHASH "XXXXXXXX")
+  else()
+    string(STRIP "${CUDA_NVRTC_SHORTHASH}" CUDA_NVRTC_SHORTHASH)
+    message(STATUS "${CUDA_NVRTC_LIB} shorthash is ${CUDA_NVRTC_SHORTHASH}")
+  endif()
+endif()

 # Create new style imported libraries.
 # Several of these libraries have a hardcoded path if CAFFE2_STATIC_LINK_CUDA

@@ -338,6 +352,12 @@ if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32)
     set_property(
       TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES
       "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublasLt_static.a")
+    # Add explicit dependency to cudart_static to fix
+    # libcublasLt_static.a.o): undefined reference to symbol 'cudaStreamWaitEvent'
+    # error adding symbols: DSO missing from command line
+    set_property(
+      TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES
+      "${CUDA_cudart_static_LIBRARY}" rt dl)
   endif()
 else()
   set_property(
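The shorthash that this CMake change embeds as `NVRTC_SHORTHASH` is simply the first eight hex digits of a SHA-256 over the library file; the embedded Python one-liner is equivalent to the following standalone script (example path, adjust to the libnvrtc CMake found on your system):

    import hashlib

    def nvrtc_shorthash(path):
        # First 8 hex chars of the SHA-256 of the file, as computed by execute_process() above.
        h = hashlib.sha256()
        with open(path, "rb") as f:
            h.update(f.read())
        return h.hexdigest()[:8]

    print(nvrtc_shorthash("/usr/local/cuda/lib64/libnvrtc.so"))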
docs/source/ddp_comm_hooks.rst (new file, 74 lines)
@@ -0,0 +1,74 @@
+DDP Communication Hooks
+=======================
+
+DDP communication hook is a generic interface to control how to communicate
+gradients across workers by overriding the vanilla allreduce in
+`DistributedDataParallel <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.>`_.
+A few built-in communication hooks are provided,
+and users can easily apply any of these hooks to optimize communication.
+Besides, the hook interface can also support user-defined communication
+strategies for more advanced use cases.
+
+.. warning ::
+    DDP communication hook is experimental and subject to change.
+
+.. warning ::
+    DDP communication hooks can only support single process single device mode
+    on NCCL backend.
+
+How to Use a Communication Hook?
+--------------------------------
+
+To use a communication hook, the user just needs to let the DDP model register
+the hook before the training loop as below.
+
+:func:`torch.nn.parallel.DistributedDataParallel.register_comm_hook`.
+    :noindex:
+
+Default Communication Hooks
+---------------------------
+
+Default communication hooks are simple **stateless** hooks, so the input state
+in ``register_comm_hook`` is either a process group or ``None``.
+
+.. automodule:: torch.distributed.algorithms.ddp_comm_hooks.default_hooks
+    :members:
+
+PowerSGD Communication Hook
+---------------------------
+
+PowerSGD (`Vogels et al., NeurIPS 2019 <https://arxiv.org/abs/1905.13727>`_)
+is a gradient compression algorithm, which can provide very high compression
+rates and accelerate bandwidth-bound distributed training.
+This algorithm needs to maintain both some hyperparameters and the internal
+state. Therefore, PowerSGD communication hook is a **stateful** hook,
+and the user needs to provide a state object defined as below.
+
+PowerSGD State
+^^^^^^^^^^^^^^^^
+
+.. currentmodule:: torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook
+.. autoclass:: PowerSGDState
+
+PowerSGD Hooks
+^^^^^^^^^^^^^^^^
+
+.. warning ::
+    PowerSGD typically requires extra memory of the same size as the model's
+    gradients to enable error feedback, which can compensate for biased
+    compressed communication and improve accuracy.
+
+.. warning ::
+    The current implementation may cause gradient overflow for FP16 input.
+
+.. autofunction:: powerSGD_hook
+.. autofunction:: batched_powerSGD_hook
+
+Acknowledgements
+----------------
+
+Many thanks to PowerSGD paper author **Thijs Vogels** for the code review on
+PowerSGD communication hook, as well as the
+`comparison experiments <https://observablehq.com/@tvogels/powersgd-benchmark>`_,
+which show that the performance of PowerSGD communication hook is on par with
+the implementation in the original `paper <https://arxiv.org/abs/1905.13727>`_.
@@ -58,16 +58,16 @@ distributed (NCCL only when building with CUDA). MPI is an optional backend that
 included if you build PyTorch from source. (e.g.building PyTorch on a host that has MPI
 installed.)

-.. warning ::
-    As of PyTorch v1.7, Windows support for the distributed package only covers collective
-    communications with Gloo backend, `FileStore`, and `DistributedDataParallel`. Therefore,
-    the `init_method` argument in :func:`init_process_group` must point to a file. This works
-    for both local and shared file systems:
+.. note ::
+    As of PyTorch v1.8, Windows supports all collective communications backend but NCCL,
+    If the `init_method` argument of :func:`init_process_group` points to a file it must adhere
+    to the following schema:

     - Local file system, ``init_method="file:///d:/tmp/some_file"``
     - Shared file system, ``init_method="file://////{machine_name}/{share_folder_name}/some_file"``

-    Similarly, if you directly pass in a `store` argument, it must be a ``FileStore`` instance.
+    Same as on Linux platform, you can enable TcpStore by setting environment variables,
+    MASTER_ADDR and MASTER_PORT.

 Which backend to use?
 ^^^^^^^^^^^^^^^^^^^^^

@@ -330,13 +330,13 @@ as they should never be created manually, but they are guaranteed to support two

 Synchronous and asynchronous collective operations
 --------------------------------------------------
 Every collective operation function supports the following two kinds of operations,
 depending on the setting of the ``async_op`` flag passed into the collective:

 **Synchronous operation** - the default mode, when ``async_op`` is set to ``False``.
 When the function returns, it is guaranteed that
 the collective operation is performed. In the case of CUDA operations, it is not guaranteed
 that the CUDA operation is completed, since CUDA operations are asynchronous. For CPU collectives, any
 further function calls utilizing the output of the collective call will behave as expected. For CUDA collectives,
 function calls utilizing the output on the same CUDA stream will behave as expected. Users must take care of
 synchronization under the scenario of running under different streams. For details on CUDA semantics such as stream

@@ -347,12 +347,12 @@ See the below script to see examples of differences in these semantics for CPU a
 returns a distributed request object. In general, you don't need to create it manually and it
 is guaranteed to support two methods:

 * ``is_completed()`` - in the case of CPU collectives, returns ``True`` if completed. In the case of CUDA operations,
   returns ``True`` if the operation has been successfully enqueued onto a CUDA stream and the output can be utilized on the
   default stream without further synchronization.
 * ``wait()`` - in the case of CPU collectives, will block the process until the operation is completed. In the case
   of CUDA collectives, will block until the operation has been successfully enqueued onto a CUDA stream and the
   output can be utilized on the default stream without further synchronization.

 **Example**

@@ -368,7 +368,7 @@ It shows the explicit need to synchronize when using collective outputs on diffe
     handle = dist.all_reduce(output, async_op=True)
     # Wait ensures the operation is enqueued, but not necessarily complete.
     handle.wait()
     # Using result on non-default stream.
     with torch.cuda.stream(s):
         s.wait_stream(torch.cuda.default_stream())
         output.add_(100)

@@ -382,7 +382,7 @@ It shows the explicit need to synchronize when using collective outputs on diffe
 Collective functions
 --------------------

 .. autofunction:: broadcast

 .. autofunction:: broadcast_object_list

@@ -426,7 +426,7 @@ you can find an implementation of those in the `torch.distributed.nn.*` module.
 Functions here are synchronous and will be inserted in the autograd graph, so
 you need to ensure that all the processes that participated in the collective operation
 will do the backward pass for the backward communication to effectively happen and
 don't cause a deadlock.

 Please notice that currently the only backend where all the functions are guaranteed to work is ``gloo``.
 .. autofunction:: torch.distributed.nn.broadcast
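A short sketch of the two Windows initialization paths the updated note describes (file-based store, or TCP rendezvous via MASTER_ADDR/MASTER_PORT):

    import torch.distributed as dist

    # File-based rendezvous on a local or shared file system (Gloo backend on Windows).
    dist.init_process_group(
        backend="gloo",
        init_method="file:///d:/tmp/some_file",
        rank=0,
        world_size=1,
    )
    dist.destroy_process_group()

    # Alternatively, a TcpStore-style rendezvous driven by environment variables:
    #   set MASTER_ADDR=127.0.0.1
    #   set MASTER_PORT=29500
    # dist.init_process_group(backend="gloo", rank=0, world_size=1)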
@@ -176,6 +176,15 @@ Probability distributions - torch.distributions
     :undoc-members:
     :show-inheritance:

+:hidden:`LKJCholesky`
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. currentmodule:: torch.distributions.lkj_cholesky
+.. autoclass:: LKJCholesky
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 :hidden:`Laplace`
 ~~~~~~~~~~~~~~~~~~~~~~~

@@ -337,7 +346,7 @@ Probability distributions - torch.distributions
     :members:
     :undoc-members:
     :show-inheritance:

 :hidden:`Weibull`
 ~~~~~~~~~~~~~~~~~~~~~~~

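Usage sketch for the newly documented distribution (module path as given above; sampled values are random):

    import torch
    from torch.distributions.lkj_cholesky import LKJCholesky

    # Sample a 3x3 lower-triangular Cholesky factor of a random correlation matrix.
    lkj = LKJCholesky(dim=3, concentration=1.0)
    L = lkj.sample()
    corr = L @ L.T        # unit diagonal, positive semi-definite
    print(L.shape, torch.diagonal(corr))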
@@ -71,6 +71,7 @@ Features described in this documentation are classified by release status:
    onnx
    optim
    complex_numbers
+   ddp_comm_hooks
    pipeline
    quantization
    rpc
@@ -484,6 +484,7 @@ Sparse tensor functions
 +++++++++++++++++++++++

 .. autofunction:: torch.sparse_coo_tensor
+   :noindex:
 .. autofunction:: torch.sparse.sum
 .. autofunction:: torch.sparse.addmm
 .. autofunction:: torch.sparse.mm
@@ -563,5 +563,4 @@ Utilities
     promote_types
     use_deterministic_algorithms
     are_deterministic_algorithms_enabled
-    vmap
     _assert
setup.py (45 lines changed)
@@ -552,6 +552,50 @@ class build_ext(setuptools.command.build_ext.build_ext):
         with open('compile_commands.json', 'w') as f:
             f.write(new_contents)

+
+class concat_license_files():
+    """Merge LICENSE and LICENSES_BUNDLED.txt as a context manager
+
+    LICENSE is the main PyTorch license, LICENSES_BUNDLED.txt is auto-generated
+    from all the licenses found in ./third_party/. We concatenate them so there
+    is a single license file in the sdist and wheels with all of the necessary
+    licensing info.
+    """
+    def __init__(self):
+        self.f1 = 'LICENSE'
+        self.f2 = 'third_party/LICENSES_BUNDLED.txt'
+
+    def __enter__(self):
+        """Concatenate files"""
+        with open(self.f1, 'r') as f1:
+            self.bsd_text = f1.read()
+
+        with open(self.f1, 'a') as f1:
+            with open(self.f2, 'r') as f2:
+                self.bundled_text = f2.read()
+                f1.write('\n\n')
+                f1.write(self.bundled_text)
+
+    def __exit__(self, exception_type, exception_value, traceback):
+        """Restore content of f1"""
+        with open(self.f1, 'w') as f:
+            f.write(self.bsd_text)
+
+
+try:
+    from wheel.bdist_wheel import bdist_wheel
+except ImportError:
+    # This is useful when wheel is not installed and bdist_wheel is not
+    # specified on the command line. If it _is_ specified, parsing the command
+    # line will fail before wheel_concatenate is needed
+    wheel_concatenate = None
+else:
+    # Need to create the proper LICENSE.txt for the wheel
+    class wheel_concatenate(bdist_wheel):
+        """ check submodules on sdist to prevent incomplete tarballs """
+        def run(self):
+            with concat_license_files():
+                super().run()
+
+
 class install(setuptools.command.install.install):
     def run(self):

@@ -724,6 +768,7 @@ def configure_extension_build():
         'build_ext': build_ext,
         'clean': clean,
         'install': install,
+        'bdist_wheel': wheel_concatenate,
     }

     entry_points = {
@@ -3,9 +3,11 @@
 #include <test/cpp/jit/test_utils.h>

 #include <ATen/core/qualified_name.h>
+#include <torch/csrc/jit/api/module.h>
 #include <torch/csrc/jit/frontend/resolver.h>
 #include <torch/csrc/jit/serialization/import.h>
 #include <torch/csrc/jit/serialization/import_source.h>
+#include <torch/csrc/jit/testing/file_check.h>
 #include <torch/torch.h>

 namespace torch {

@@ -341,6 +343,20 @@ TEST(ModuleAPITest, Define) {
   AT_ASSERT(result.toTensor().item<float>() == 6);
 }

+TEST(ModuleAPITest, Freezing) {
+  Module m("m");
+  m.register_parameter("foo", torch::ones({}), false);
+  m.define(R"(
+    def forward(self, x, b : int = 4):
+      return self.foo + x + b
+  )");
+  m.eval();
+  auto frozen_mod = torch::jit::freeze(m);
+  auto forward_g = frozen_mod.get_method("forward").graph();
+  testing::FileCheck().check_not("GetAttr")->run(*forward_g);
+  ;
+}
+
 TEST(ModuleAPITest, To_CUDA) {
   Module m("test");
   {
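The new C++ Freezing test has a direct Python analogue (a sketch using the public torch.jit.freeze API):

    import torch

    class M(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.foo = torch.nn.Parameter(torch.ones(()), requires_grad=False)

        def forward(self, x, b: int = 4):
            return self.foo + x + b

    frozen = torch.jit.freeze(torch.jit.script(M()).eval())

    # After freezing, attribute accesses are inlined, so no GetAttr nodes remain in the graph.
    assert "GetAttr" not in str(frozen.graph)
    print(frozen(torch.tensor(1.0)))  # tensor(6.)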
test/distributed/test_c10d.py (mode changed: Executable file → Normal file, 0 lines changed)
@@ -1508,7 +1508,7 @@ class TestFrozenOptimizations(JitTestCase):
         bn = torch.nn.BatchNorm2d(out_channels, eps=.001)
         mod = torch.nn.Sequential(conv, bn)
         # set optimize to False here, by default freezing runs optimize_frozen_module
-        frozen_mod = torch.jit.freeze(torch.jit.script(mod.eval()), optimize=False)
+        frozen_mod = torch.jit.freeze(torch.jit.script(mod.eval()), optimize_numerics=False)
         # inspect frozen mod
         FileCheck().check("batch_norm").run(frozen_mod.graph)
         torch.jit.optimize_frozen_module(frozen_mod)
@ -182,7 +182,7 @@ class TestModels(TestCase):
         self.exportTest(toC(FakeQuantNet()), toC(x))

     @skipIfUnsupportedMinOpsetVersion(10)
-    def test_qat_resnet(self):
+    def test_qat_resnet_pertensor(self):
         # Quantize ResNet50 model
         x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0))
         qat_resnet50 = resnet50()
@ -202,6 +202,27 @@ class TestModels(TestCase):

         self.exportTest(toC(qat_resnet50), toC(x))

+    @skipIfUnsupportedMinOpsetVersion(13)
+    def test_qat_resnet_per_channel(self):
+        # Quantize ResNet50 model
+        x = torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)
+        qat_resnet50 = resnet50()
+
+        qat_resnet50.qconfig = quantization.QConfig(
+            activation=quantization.default_fake_quant,
+            weight=quantization.default_per_channel_weight_fake_quant)
+        quantization.prepare_qat(qat_resnet50, inplace=True)
+        qat_resnet50.apply(torch.quantization.enable_observer)
+        qat_resnet50.apply(torch.quantization.enable_fake_quant)
+
+        _ = qat_resnet50(x)
+        for module in qat_resnet50.modules():
+            if isinstance(module, quantization.FakeQuantize):
+                module.calculate_qparams()
+        qat_resnet50.apply(torch.quantization.disable_observer)
+
+        self.exportTest(toC(qat_resnet50), toC(x))
+
     @disableScriptTest()  # None type in outputs
     def test_googlenet(self):
         x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0))
@ -5998,6 +5998,20 @@ class TestONNXRuntime(unittest.TestCase):
         x = torch.randn(6, 4, 3, 3)
         self.run_test(FakeQuantizePerTensorModel(), (x))

+    @skipIfUnsupportedMinOpsetVersion(13)
+    def test_fake_quantize_per_channel(self):
+        class FakeQuantizePerChannelModel(torch.nn.Module):
+            def forward(self, input):
+                amax = torch.ones(4)
+                scale = amax / 127.
+                zero_point = torch.zeros_like(amax, dtype=torch.long)
+                # Quantize twice to test differnet branches
+                y = torch.fake_quantize_per_channel_affine(input, scale, zero_point, 1, 0, 255)
+                return torch.fake_quantize_per_channel_affine(y, scale, zero_point, 1, -128, 127)
+
+        x = torch.randn(6, 4, 3, 3)
+        self.run_test(FakeQuantizePerChannelModel(), (x))
+
     def test_batchnorm_training(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@ -2,6 +2,8 @@ import unittest
 import onnxruntime  # noqa
 import torch

+from torch.cuda.amp import autocast
+
 from test_pytorch_common import skipIfUnsupportedMinOpsetVersion
 from test_pytorch_common import skipIfNoCuda

@ -24,6 +26,43 @@ class TestONNXRuntime_cuda(unittest.TestCase):
         x = torch.randn(2, 4, 5, 6, requires_grad=True, dtype=torch.float16, device=torch.device('cuda'))
         self.run_test(GeluModel(), x, rtol=1e-3, atol=1e-5)

+    @skipIfUnsupportedMinOpsetVersion(9)
+    @skipIfNoCuda
+    def test_layer_norm_fp16(self):
+        class LayerNormModel(torch.nn.Module):
+            def __init__(self):
+                super(LayerNormModel, self).__init__()
+                self.layer_norm = torch.nn.LayerNorm([10, 10])
+
+            def forward(self, x):
+                return self.layer_norm(x)
+
+        x = torch.randn(20, 5, 10, 10, requires_grad=True, dtype=torch.float16, device=torch.device('cuda'))
+        self.run_test(LayerNormModel(), x, rtol=1e-3, atol=1e-5)
+
+
+    @skipIfUnsupportedMinOpsetVersion(12)
+    @skipIfNoCuda
+    def test_softmaxCrossEntropy_fusion_fp16(self):
+        class FusionModel(torch.nn.Module):
+            def __init__(self):
+                super(FusionModel, self).__init__()
+                self.loss = torch.nn.NLLLoss(reduction='none')
+                self.m = torch.nn.LogSoftmax(dim=1)
+
+            @autocast()
+            def forward(self, input, target):
+                output = self.loss(self.m(2 * input), target)
+                return output
+
+        N, C = 5, 4
+        input = torch.randn(N, 16, dtype=torch.float16, device=torch.device('cuda'))
+        target = torch.empty(N, dtype=torch.long, device=torch.device('cuda')).random_(0, C)
+
+        # using test data containing default ignore_index=-100
+        target[target == 1] = -100
+        self.run_test(FusionModel(), (input, target))
+
 TestONNXRuntime_cuda.setUp = TestONNXRuntime.setUp
 TestONNXRuntime_cuda.run_test = TestONNXRuntime.run_test

@ -872,7 +872,7 @@ class TestFakeQuantize(TestCase):
         scale, zero_point = float(scale), int(zero_point)
         quant_min, quant_max = obs._calculate_qmin_qmax()

-        Y_test, _mask = torch.fake_quantize_per_tensor_affine_cachemask(
+        Y_test = torch.fake_quantize_per_tensor_affine(
             X, scale, zero_point, quant_min, quant_max)
         Y_ref = _fake_quantize_per_tensor_affine_reference(
             X.cpu(), scale, zero_point, quant_min, quant_max).to(device)
@ -899,7 +899,7 @@ class TestFakeQuantize(TestCase):
         quant_min, quant_max = obs._calculate_qmin_qmax()

         # forward pass
-        Y_test, mask = torch.fake_quantize_per_tensor_affine_cachemask(
+        Y_test = torch.fake_quantize_per_tensor_affine(
             X, scale, zero_point, quant_min, quant_max)
         Y_ref = _fake_quantize_per_tensor_affine_reference(
             X.cpu(), scale, zero_point, quant_min, quant_max).to(device)
@ -1246,7 +1246,7 @@ class TestFakeQuantize(TestCase):

         Y = _fake_quantize_per_channel_affine_reference(
             X.cpu(), scale.cpu(), zero_point.cpu(), axis, quant_min, quant_max)
-        Y_prime, _mask = torch.fake_quantize_per_channel_affine_cachemask(
+        Y_prime = torch.fake_quantize_per_channel_affine(
             X, scale, zero_point, axis, quant_min, quant_max)
         np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance)

@ -1339,7 +1339,7 @@ class TestFakeQuantize(TestCase):
         zero_point = zero_point.to(torch.int64)
         quant_min, quant_max = obs._calculate_qmin_qmax()
         X.requires_grad_()
-        Y_prime, _mask = torch.fake_quantize_per_channel_affine_cachemask(
+        Y_prime = torch.fake_quantize_per_channel_affine(
             X, scale, zero_point, axis, quant_min, quant_max)
         dout = torch.rand(X.shape, dtype=torch.float).to(device)
         dX = _fake_quantize_per_channel_affine_grad_reference(
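The tests above switch from the private ``*_cachemask`` variants to the public fake-quantize ops. A small sketch of the public calls, with shapes and quantization ranges chosen only for illustration:

    import torch

    x = torch.randn(2, 4, 8, 8)

    # Per-tensor fake quantization: one scale/zero_point for the whole tensor.
    y_tensor = torch.fake_quantize_per_tensor_affine(x, scale=0.1, zero_point=0,
                                                     quant_min=0, quant_max=255)

    # Per-channel fake quantization along dim 1: one scale/zero_point per channel.
    scale = torch.full((4,), 0.1)
    zero_point = torch.zeros(4, dtype=torch.long)
    y_channel = torch.fake_quantize_per_channel_affine(x, scale, zero_point,
                                                       axis=1, quant_min=-128, quant_max=127)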
@ -108,6 +108,7 @@ TESTS = [
     'test_fx_experimental',
     'test_functional_autograd_benchmark',
     'test_package',
+    'test_license',
     'distributed/pipeline/sync/skip/test_api',
     'distributed/pipeline/sync/skip/test_gpipe',
     'distributed/pipeline/sync/skip/test_inspect_skip_layout',
@ -14,7 +14,7 @@ from math import sqrt
 from pathlib import Path
 from torch.multiprocessing import Process
 from torch.fx import symbolic_trace, Proxy, Node, GraphModule, Interpreter, Tracer, Transformer, Graph, wrap
-from torch.fx.node import Target
+from torch.fx.node import Target, Argument
 from torch.fx.passes import shape_prop
 from torch.fx.immutable_collections import immutable_dict, immutable_list
 from copy import deepcopy
@ -187,7 +187,7 @@ class TestFX(JitTestCase):
         # Custom delegate to disallow in-place tensor operations
         class NoMutableCallTracer(Tracer):
             def create_node(self, kind : str, target : Union[str, Callable],
-                            args : Tuple[Any], kwargs : Dict[str, Any], name : Optional[str] = None,
+                            args : Tuple[Argument, ...], kwargs : Dict[str, Any], name : Optional[str] = None,
                             type_expr : Optional[Any] = None) -> Node:
                 name = target if isinstance(target, str) else torch.typename(target)
                 if name[-1] == '_':
@ -539,7 +539,7 @@ class TestFX(JitTestCase):
     def test_node_tagging(self):
         class TaggingTracer(Tracer):
             def create_node(self, kind : str, target : Union[str, Callable],
-                            args : Tuple[Any], kwargs : Dict[str, Any], name : Optional[str] = None,
+                            args : Tuple[Argument, ...], kwargs : Dict[str, Any], name : Optional[str] = None,
                             type_expr : Optional[Any] = None) -> Node:
                 n = super().create_node(kind, target, args, kwargs, name)
                 n.tag = 'foo'
@ -1057,6 +1057,13 @@ class TestFX(JitTestCase):
         result = interp.run(torch.ones(3, 4), torch.ones(3, 4), torch.rand(3, 4))
         self.assertEqual(result, torch.ones(3, 4) * 2.0)

+    @skipIfNoTorchVision
+    def test_interpreter_noop_resnet18(self):
+        rn18 = resnet18()
+        transformed = torch.fx.Transformer(symbolic_trace(rn18)).transform()
+        inp = torch.randn(5, 3, 224, 224)
+        self.assertEqual(transformed(inp), rn18(inp))
+
     def test_transformer_noop(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@ -1377,6 +1384,45 @@ class TestFX(JitTestCase):
         x, y = torch.randn(3, 4), torch.randn(3, 4)
         self.checkGraphModule(foo, (x, y))

+    def test_trace_dict_int_keys(self):
+        class ModWithDictArg(torch.nn.Module):
+            def forward(self, d : Dict[int, torch.Tensor]):
+                return d[42]
+
+        class CallsModWithDict(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.m = ModWithDictArg()
+
+            def forward(self, x):
+                return self.m({42: x})
+
+        class MyTracer(torch.fx.Tracer):
+            def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool:
+                return isinstance(m, ModWithDictArg)
+
+        traced_graph = MyTracer().trace(CallsModWithDict())
+
+    def test_trace_dict_proxy_keys(self):
+        class ModWithDictArg(torch.nn.Module):
+            def forward(self, d : Dict[torch.Tensor, torch.Tensor]):
+                return d[42]
+
+        class CallsModWithDict(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.m = ModWithDictArg()
+
+            def forward(self, x):
+                return self.m({x: x})
+
+        class MyTracer(torch.fx.Tracer):
+            def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool:
+                return isinstance(m, ModWithDictArg)
+
+        with self.assertRaisesRegex(RuntimeError, 'cannot contain a Node'):
+            traced_graph = MyTracer().trace(CallsModWithDict())
+
     def test_direct_param_use(self):
         class TransposeTest(torch.nn.Module):
             def __init__(self):
@ -5,14 +5,14 @@ from typing import Callable, Dict, Union, List
 from torch.fx.symbolic_trace import symbolic_trace
 from torch.fx.graph_module import GraphModule
 from torch.fx.node import Node
-from torch.fx.experimental import graph_manipulation
-from torch.fx.experimental.accelerator_partitioner import Partitioner
-from torch.fx.experimental.rewriter import RewritingTracer
-from torch.fx.experimental.param_fetch import lift_lowering_attrs_to_nodes
+from torch.fx._experimental import graph_manipulation
+from torch.fx._experimental.accelerator_partitioner import Partitioner
+from torch.fx._experimental.rewriter import RewritingTracer
+from torch.fx._experimental.param_fetch import lift_lowering_attrs_to_nodes
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.jit_utils import JitTestCase
 from torch.fx.passes.split_module import split_module
-from torch.fx.experimental.partitioner_utils import (
+from torch.fx._experimental.partitioner_utils import (
     NodeLatency,
     get_partition_to_latency_mapping,
     get_latency_of_partitioned_graph,
@ -20,8 +20,8 @@ from torch.fx.experimental.partitioner_utils import (
     PartitionerConfig,
     PartitionMode
 )
-from torch.fx.experimental.fuser import fuse
-from torch.fx.experimental import merge_matmul
+from torch.fx._experimental.fuser import fuse
+from torch.fx._experimental import merge_matmul

 try:
     from torchvision.models import resnet18
@ -849,7 +849,7 @@ terrible spacing

     def test_merge_matmuls(self):
         """
-        A collection of test cases for torch.fx.experimental.merge_matmul,
+        A collection of test cases for torch.fx._experimental.merge_matmul,
         a graph transformation that merges matrix multiplication operations.
         """
         # Utility function for counting matmuls for test assertions.
@ -6503,6 +6503,38 @@ a")
         self.checkModule(module().train(), ())
         self.checkModule(module().eval(), ())

+    def test_ternary_static_if(self):
+        # Test for True branch when condition variable
+        # is annotated as Final
+        class M1(torch.nn.Module):
+            flag: torch.jit.Final[bool]
+
+            def __init__(self):
+                super().__init__()
+                self.flag = True
+
+            def forward(self) -> torch.Tensor:
+                return torch.ones(3) if self.flag else {}
+
+        # Test for True branch when condition variable
+        # is annotated as Final
+        class M2(torch.nn.Module):
+            flag: torch.jit.Final[bool]
+
+            def __init__(self):
+                super().__init__()
+                self.flag = False
+
+            def forward(self) -> torch.Tensor:
+                return {} if self.flag else torch.ones(3)
+
+        model1 = M1()
+        model2 = M2()
+        script_model_1 = torch.jit.script(model1)
+        script_model_2 = torch.jit.script(model2)
+        self.assertEqual(model1.forward(), script_model_1.forward())
+        self.assertEqual(model2.forward(), script_model_2.forward())
+
     def test_print(self):
         def func(x, y):
             q = (x + y).sigmoid()
@ -1,6 +1,9 @@
+import glob
 import io
+import os
 import unittest

+import torch
 from torch.testing._internal.common_utils import TestCase, run_tests


@ -10,11 +13,14 @@ except ImportError:
     create_bundled = None

 license_file = 'third_party/LICENSES_BUNDLED.txt'
+starting_txt = 'The Pytorch repository and source distributions bundle'
+site_packages = os.path.dirname(os.path.dirname(torch.__file__))
+distinfo = glob.glob(os.path.join(site_packages, 'torch-*dist-info'))

 class TestLicense(TestCase):

     @unittest.skipIf(not create_bundled, "can only be run in a source tree")
-    def test_license_in_wheel(self):
+    def test_license_for_wheel(self):
         current = io.StringIO()
         create_bundled('third_party', current)
         with open(license_file) as fid:
@ -25,6 +31,18 @@ class TestLicense(TestCase):
                 'match the current state of the third_party files. Use '
                 '"python third_party/build_bundled.py" to regenerate it')

+    @unittest.skipIf(len(distinfo) == 0, "no installation in site-package to test")
+    def test_distinfo_license(self):
+        """If run when pytorch is installed via a wheel, the license will be in
+        site-package/torch-*dist-info/LICENSE. Make sure it contains the third
+        party bundle of licenses"""
+
+        if len(distinfo) > 1:
+            raise AssertionError('Found too many "torch-*dist-info" directories '
+                                 f'in "{site_packages}, expected only one')
+        with open(os.path.join(os.path.join(distinfo[0], 'LICENSE'))) as fid:
+            txt = fid.read()
+            self.assertTrue(starting_txt in txt)
+
 if __name__ == '__main__':
     run_tests()
@ -4276,6 +4276,37 @@ class TestNN(NNTestCase):
             with torch.backends.mkldnn.flags(enabled=enabled):
                 gradcheck(F.conv2d, (input, mod.weight))

+    def test_Conv2d_OneDNN(self):
+        def run_once():
+            group_val = 24
+            ifm = torch.ones([1, group_val, 6, 6], dtype=torch.float32)
+            weights = torch.ones([group_val, 1, 3, 3], dtype=torch.float32)
+            op = torch.nn.Conv2d(
+                in_channels=group_val,
+                out_channels=group_val,
+                kernel_size=[3, 3],
+                stride=[2, 2],
+                padding=[1, 1],
+                dilation=[1, 1],
+                groups=group_val,
+                bias=False,
+                padding_mode='zeros'
+            )
+
+            op.weight.data = weights
+            res = op(ifm)
+            grad_in = torch.ones(res.shape, dtype=torch.float32)
+            res.backward(grad_in)
+            return op.weight.grad
+
+        with torch.backends.mkldnn.flags(enabled=False):
+            without_onednn = run_once()
+
+        with torch.backends.mkldnn.flags(enabled=True):
+            with_onednn = run_once()
+
+        self.assertEqual(without_onednn, with_onednn)
+
     @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
     @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
     def test_cudnn_non_contiguous(self):
@ -8643,7 +8674,7 @@ class TestNN(NNTestCase):
         kwargs = dict(mode='bicubic', align_corners=align_corners)
         # test float scale factor up & downsampling
         for device in device_list:
-            for scale_factor in [0.5, 1.5, 2]:
+            for scale_factor in [0.5, 1, 1.5, 2]:
                 in_t = torch.ones(2, 2, 2, 2).to(device)
                 out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs)
                 out_size = int(math.floor(in_t.shape[-1] * scale_factor))
@ -1,7 +1,8 @@
 from torch.testing._internal.common_utils import TestCase, run_tests
 import torch
 import torch.nn.functional as F
-from torch import Tensor, vmap
+from torch import Tensor
+from torch._vmap_internals import vmap
 import functools
 import itertools
 import warnings

2	third_party/XNNPACK	vendored
Submodule third_party/XNNPACK updated: e1ffe15459...383b0752fe
2	third_party/tensorpipe	vendored
Submodule third_party/tensorpipe updated: a814dda3ef...05467ba9bc
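With this change the prototype ``vmap`` is imported from its private module rather than the top-level ``torch`` namespace (see also the ``torch/__init__.py`` hunk below). A minimal usage sketch, assuming the 1.8-era private location:

    import torch
    from torch._vmap_internals import vmap  # prototype API, private location in this release

    def dot(a, b):
        return (a * b).sum()

    x, y = torch.randn(8, 3), torch.randn(8, 3)
    print(vmap(dot)(x, y).shape)  # vectorizes over dim 0 -> torch.Size([8])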
@ -82,7 +82,8 @@ SKIP_PYTHON_BINDINGS = [
     'set_data',
     '.*_overrideable',  # overrideable functions for backend extension
     'data', 'is_leaf', 'output_nr', '_version', 'requires_grad_', 'retain_grad', 'set_',
-    '_fw_primal'
+    '_fw_primal', 'fake_quantize_per_tensor_affine_cachemask',
+    'fake_quantize_per_channel_affine_cachemask',
 ]

 # These function signatures are not exposed to Python. Note that this signature
@ -350,8 +350,8 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) -
         'saddmm': ['def saddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Number=1, '
                    'alpha: Number=1, out: Optional[Tensor]=None) -> Tensor: ...'],
         'spmm': ['def spmm(input: Tensor, mat2: Tensor) -> Tensor: ...'],
-        'div': ['def div(input: Union[Tensor, Number], other: Union[Tensor, Number], '
-                'rounding_mode: str = "true", *, out: Optional[Tensor]=None) -> Tensor: ...'],
+        'div': ['def div(input: Union[Tensor, Number], other: Union[Tensor, Number], *, '
+                'rounding_mode: Optional[str]=None, out: Optional[Tensor]=None) -> Tensor: ...'],
     })
     for binop in ['mul', 'true_divide', 'floor_divide']:
         unsorted_function_hints[binop].append(
@ -462,9 +462,9 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) -
                  'def set_(self, storage: Storage) -> Tensor: ...'],
        'split': ['def split(self, split_size: _int, dim: _int=0) -> Sequence[Tensor]: ...',
                  'def split(self, split_size: Tuple[_int, ...], dim: _int=0) -> Sequence[Tensor]: ...'],
-        'div': ['def div(self, other: Union[Tensor, Number], '
-                'rounding_mode: str = "true", *, out: Optional[Tensor]=None) -> Tensor: ...'],
-        'div_': ['def div_(self, other: Union[Tensor, Number], rounding_mode: str = "true") -> Tensor: ...'],
+        'div': ['def div(self, other: Union[Tensor, Number], *, '
+                'rounding_mode: Optional[str]=None, out: Optional[Tensor]=None) -> Tensor: ...'],
+        'div_': ['def div_(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str]=None) -> Tensor: ...'],
     })
     for binop in ['mul', 'true_divide', 'floor_divide']:
         for inplace in [False, True]:
@ -162,7 +162,7 @@ endif()

 # In the most recent CMake versions, a new 'TRANSFORM' subcommand of 'list' allows much of the boilerplate of defining the lists
 # of type stub files to be omitted.
-# For comptability with older CMake versions, we omit it for now, but leave it as a comment in case comptability with the older
+# For compatibility with older CMake versions, we omit it for now, but leave it as a comment in case compatibility with the older
 # CMake versions is eventually dropped.
 # set(Modules
 #     __init__
@ -174,6 +174,11 @@ def _freeze_module(module: ScriptModule,
                    freeze_interfaces: _bool = True,
                    preserveParameters: _bool = True) -> ScriptModule: ...
 def _jit_pass_optimize_frozen_graph(Graph) -> None: ...
+def _jit_pass_fold_frozen_conv_bn(graph: Graph): ...
+def _jit_pass_fold_frozen_conv_add_or_sub(graph: Graph): ...
+def _jit_pass_fold_frozen_conv_mul_or_div(graph: Graph): ...
+def _jit_pass_remove_dropout(module: 'torch.jit.ScriptModule'): ...
+
 def _is_tracing() -> _bool: ...
 def _jit_init() -> _bool: ...
 def _jit_flatten(arg: Any) -> Tuple[List[Tensor], IODescriptor]: ...
@ -662,8 +662,6 @@ del register_after_fork
 # torch.jit.script as a decorator, for instance):
 from ._lobpcg import lobpcg

-from ._vmap_internals import vmap
-
 # These were previously defined in native_functions.yaml and appeared on the
 # `torch` namespace, but we moved them to c10 dispatch to facilitate custom
 # class usage. We add these lines here to preserve backward compatibility.
@ -1194,25 +1194,25 @@ See :func:`torch.dist`
 """)

 add_docstr_all('div', r"""
-div(value, *, rounding_mode='true') -> Tensor
+div(value, *, rounding_mode=None) -> Tensor

 See :func:`torch.div`
 """)

 add_docstr_all('div_', r"""
-div_(value, *, rounding_mode='true') -> Tensor
+div_(value, *, rounding_mode=None) -> Tensor

 In-place version of :meth:`~Tensor.div`
 """)

 add_docstr_all('divide', r"""
-divide(value, *, rounding_mode='true') -> Tensor
+divide(value, *, rounding_mode=None) -> Tensor

 See :func:`torch.divide`
 """)

 add_docstr_all('divide_', r"""
-divide_(value, *, rounding_mode='true') -> Tensor
+divide_(value, *, rounding_mode=None) -> Tensor

 In-place version of :meth:`~Tensor.divide`
 """)
@ -2741,7 +2741,7 @@ Example::
 """.format(**common_args))

 add_docstr(torch.div, r"""
-div(input, other, *, rounding_mode='true' out=None) -> Tensor
+div(input, other, *, rounding_mode=None, out=None) -> Tensor

 Divides each element of the input ``input`` by the corresponding element of
 :attr:`other`.
@ -2764,7 +2764,7 @@ Args:
 Keyword args:
   rounding_mode (str, optional): Type of rounding applied to the result:

-    * ``"true"`` - default behavior. Performs no rounding and, if both :attr:`input` and
+    * None - default behavior. Performs no rounding and, if both :attr:`input` and
       :attr:`other` are integer types, promotes the inputs to the default scalar type.
       Equivalent to true division in Python (the ``/`` operator) and NumPy's ``np.true_divide``.
     * ``"trunc"`` - rounds the results of the division towards zero.
@ -2806,7 +2806,7 @@ Examples::
 """.format(**common_args))

 add_docstr(torch.divide, r"""
-divide(input, other, *, rounding_mode='true', out=None) -> Tensor
+divide(input, other, *, rounding_mode=None, out=None) -> Tensor

 Alias for :func:`torch.div`.
 """)
@ -8515,9 +8515,9 @@ If :attr:`upper` is ``False``, then lower triangular portion is used.
 .. note:: Irrespective of the original strides, the returned matrix `V` will
           be transposed, i.e. with strides `V.contiguous().transpose(-1, -2).stride()`.

-.. note:: Extra care needs to be taken when backward through outputs. Such
-          operation is really only stable when all eigenvalues are distinct.
-          Otherwise, ``NaN`` can appear as the gradients are not properly defined.
+.. warning:: Extra care needs to be taken when backward through outputs. Such
+             operation is only stable when all eigenvalues are distinct and becomes
+             less stable the smaller :math:`\min_{i \neq j} |\lambda_i - \lambda_j|` is.

 Args:
     input (Tensor): the input tensor of size :math:`(*, n, n)` where `*` is zero or more
@ -9207,7 +9207,7 @@ Example::
 add_docstr(torch.true_divide, r"""
 true_divide(dividend, divisor, *, out) -> Tensor

-Alias for :func:`torch.div` with ``rounding_mode='true'``.
+Alias for :func:`torch.div` with ``rounding_mode=None``.
 """.format(**common_args))

 add_docstr(torch.trunc,
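A short sketch of the behaviour the updated ``torch.div`` documentation describes, with values chosen only for illustration (the ``"floor"`` mode is assumed from the release API; the excerpt above only shows ``"trunc"``):

    import torch

    a = torch.tensor([7., -7.])
    b = torch.tensor([2., 2.])

    torch.div(a, b)                          # tensor([ 3.5000, -3.5000]); rounding_mode=None is true division
    torch.div(a, b, rounding_mode='trunc')   # tensor([ 3., -3.]); rounds toward zero
    torch.div(a, b, rounding_mode='floor')   # tensor([ 3., -4.]); rounds toward negative infinity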
@ -8,6 +8,8 @@
 #include <torch/csrc/jit/frontend/schema_matching.h>
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
+#include <torch/csrc/jit/passes/freeze_module.h>
+#include <torch/csrc/jit/passes/frozen_graph_optimizations.h>
 #include <torch/csrc/jit/passes/inliner.h>
 #include <torch/csrc/jit/runtime/operator.h>

@ -336,6 +338,21 @@ IValue Module::create_class(const c10::QualifiedName& name, Stack stack) const {
   return obj;
 }

+Module freeze(
+    const Module& module,
+    c10::optional<std::vector<std::string>> preserved_attrs,
+    bool optimize_numerics) {
+  TORCH_CHECK(
+      module.is_training(),
+      "Freezing is currently only implemented for modules in eval mode. Please call .eval() before freezing");
+
+  Module out_mod = freeze_module(
+      module, preserved_attrs.value_or(std::vector<std::string>({})));
+  auto graph = module.get_method("forward").graph();
+  OptimizeFrozenGraph(graph, optimize_numerics);
+  return out_mod;
+}
+
 buffer_list Module::buffers(bool recurse) const {
   return buffer_list(*this, recurse, /*return_module=*/false);
 }
@ -276,6 +276,13 @@ struct TORCH_API Module : public Object {
       bool non_blocking);
 };

+// C++ equivalent api of `torch.jit.freeze`. See documentation there for
+// details.
+TORCH_API Module freeze(
+    const Module& module,
+    c10::optional<std::vector<std::string>> preserved_attrs = c10::nullopt,
+    bool optimize_numerics = true);
+
 namespace detail {

 struct TORCH_API SlotCursor {
@ -1,5 +1,6 @@
 #include <torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h>

+#include <ATen/DynamicLibrary.h>
 #include <c10/util/Exception.h>
 #include <c10/util/Optional.h>
 #include <torch/csrc/jit/codegen/fuser/compiler.h>
@ -9,13 +9,18 @@
 #include <memory>
 #include <string>

+// Forward declare DynamicLibrary
+namespace at {
+struct DynamicLibrary;
+}
+
 namespace torch {
 namespace jit {
 namespace fuser {
 namespace cpu {

 // Represents a compiled CPU kernel and the metadata necessary to run it
-struct TORCH_API FusedKernelCPU : public ::torch::jit::fuser::FusedKernel {
+struct TORCH_API FusedKernelCPU : public FusedKernel {
   FusedKernelCPU(
       std::string name,
       std::string code,
@ -1258,6 +1258,15 @@ struct to_ir {
       const TernaryIf& expr,
       const TypePtr& type_hint = nullptr) {
     CondValue cond_value = emitCondExpr(expr.cond());
+    // If the cond expr is a static value, then we metacompile the `if`
+    // statemement and only emit true or false branch
+    if (cond_value.staticIf()) {
+      if (*cond_value.staticIf()) {
+        return emitExpr(expr.true_expr(), type_hint);
+      } else {
+        return emitExpr(expr.false_expr(), type_hint);
+      }
+    }
     auto true_expr = [&] { return emitExpr(expr.true_expr(), type_hint); };
     auto false_expr = [&] { return emitExpr(expr.false_expr(), type_hint); };
     return emitIfExpr(expr.range(), cond_value, true_expr, false_expr);
@ -8,12 +8,16 @@
 namespace torch {
 namespace jit {

-void OptimizeFrozenGraph(std::shared_ptr<Graph>& graph) {
+void OptimizeFrozenGraph(
+    std::shared_ptr<Graph>& graph,
+    bool optimize_numerics) {
   // run a couple times to capture Conv -> Mul -> Add etc
-  for (size_t i = 0; i < 2; i++) {
-    FoldFrozenConvBatchnorm(graph);
-    FoldFrozenConvAddOrSub(graph);
-    FoldFrozenConvMulOrDiv(graph);
+  if (optimize_numerics) {
+    for (size_t i = 0; i < 2; i++) {
+      FoldFrozenConvBatchnorm(graph);
+      FoldFrozenConvAddOrSub(graph);
+      FoldFrozenConvMulOrDiv(graph);
+    }
   }
 }

@ -13,7 +13,9 @@
 namespace torch {
 namespace jit {

-TORCH_API void OptimizeFrozenGraph(std::shared_ptr<Graph>& graph);
+TORCH_API void OptimizeFrozenGraph(
+    std::shared_ptr<Graph>& graph,
+    bool optimize_numerics = true);

 } // namespace jit
 } // namespace torch
@ -668,6 +668,24 @@ static void fuseLogSoftmaxNllLoss(Block* b) {
     auto prev = it->input(0)->node();
     Node* origNllLossNode = *it;
     Node* origLogSoftmaxNode;
+
+    // Check for patterns especially in cases with autocasting enabled
+    // in which a cast node is inserted before the NegativeLogLikelihoodLoss
+    // node and this causes the patterns below not to be recognizable by the
+    // fuseLogSoftmaxNllLoss function
+    // For example if the input is 2D
+    // graph(%input : Half(3, 5),
+    //       %target : Long(3)):
+    //   %4 : Half(3, 5) = onnx::LogSoftmaxaxis=1
+    //   %8 : Float = onnx::Cast[to=1](%4)
+    //   %9 : Float(3) = onnx::NegativeLogLikelihoodLoss[reduction="none"]
+    //   return (%8)
+    Node* castNode = nullptr;
+    if (prev->kind() == onnx::Cast) {
+      castNode = prev;
+      prev = prev->input(0)->node();
+    }
+
     if (prev->kind() == onnx::LogSoftmax) {
       // if the input is 2D
       // graph(%input : Float(3, 5),
@ -675,7 +693,7 @@ static void fuseLogSoftmaxNllLoss(Block* b) {
       //   %4 : Float(3, 5) = onnx::LogSoftmaxaxis=1
       //   %8 : Float(3) = onnx::NegativeLogLikelihoodLoss[reduction="none"]
       //   return (%8)
-      origLogSoftmaxNode = it->input(0)->node();
+      origLogSoftmaxNode = prev;
     } else if (
         prev->kind() == onnx::Transpose &&
         prev->input(0)->node()->kind() == onnx::LogSoftmax) {
@ -751,6 +769,19 @@ static void fuseLogSoftmaxNllLoss(Block* b) {
       continue;
     }
+
+    // If the pattern indeed consists of a cast node before the
+    // NegativeLogLikelihoodLoss node, place a cast node in the beginning
+    // of the pattern instead
+    if (castNode != nullptr) {
+      auto onnx_type = castNode->i(attr::to);
+      Node* cast_node = b->owningGraph()->create(onnx::Cast, 1);
+      cast_node->addInput(origLogSoftmaxNode->inputs().at(0));
+      cast_node->i_(attr::to, onnx_type);
+      cast_node->insertBefore(origLogSoftmaxNode);
+      origLogSoftmaxNode->replaceInputWith(
+          origLogSoftmaxNode->inputs().at(0), cast_node->output());
+    }
+
     Node* softmaxCrossEntropyNode = b->owningGraph()->create(
         onnx::SoftmaxCrossEntropyLoss, it->outputs().size());
     for (size_t i = 0; i < softmaxCrossEntropyNode->outputs().size(); ++i) {
@ -33,39 +33,38 @@ def _orthogonalize(matrix, epsilon=1e-8):


 class PowerSGDState(object):
-    """
-    Stores both the gradient compression configs and the internal states for all the gradients during the training.
-    Particularly, `matrix_approximation_rank` and `start_powerSGD_iter` are the main configs that need to be tuned by the user.
-    Although `use_error_feedback` and `warm_start` can also be tuned by the user,
-    they are typically turned on for performance.
-
-    Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`]
-    ~~~~~~~~~~~~~~~~~~~~~~~~~~
-    1) To tune `matrix_approximation_rank`, the user can increase it from 1 by factors of 2,
-    until a satisfying accuracy can be reached.
-    The increase of `matrix_approximation_rank` can substantially increase the computation costs of the compression.
-    However, the accuracy may not be futher improved beyond a certain `matrix_approximation_rank` value.
-    2) To tune `start_powerSGD_iter`, the user can typically start with 10% of total training steps,
-    and increase it until a satisfying accuracy can be reached.
-    Deferrring PowerSGD can effectively improve the accuracy,
-    even a relatively small `matrix_approximation_rank` is used.
-    This is because that, the beginning of training phase is usually very sensitive to inaccurate gradients,
-    and compressing gradients too early may make the training quickly take a suboptimal trajectory,
-    which can result in an irrecoverable impact on the accuracy.
-    The minimum value allowed in DDP is 2, if error feedback or warm-up is enabled.
+    r"""
+    Stores both the algorithm's hyperparameters and the internal state for all the gradients during the training.
+    Particularly, ``matrix_approximation_rank`` and ``start_powerSGD_iter`` are the main hyperparameters that should be tuned by the user.
+    For performance, we suggest to keep binary hyperparameters ``use_error_feedback`` and ``warm_start`` on.
+
+    1. ``matrix_approximation_rank`` controls the size of compressed low-rank tensors, which determines the compression rate. The lower the rank, the stronger the compression.
+
+    1.1. If ``matrix_approximation_rank`` is too low, the full model quality will need more training steps to reach or will never reach and yield loss in accuracy.
+
+    1.2. The increase of ``matrix_approximation_rank`` can substantially increase the computation costs of the compression, and the accuracy may not be futher improved beyond a certain ``matrix_approximation_rank`` threshold.
+
+    To tune ``matrix_approximation_rank``, we suggest to start from 1 and increase by factors of 2 (like an expoential grid search, 1, 2, 4, ...), until a satisfactory accuracy is reached. Typically only a small value 1-4 is used. For some NLP tasks (as shown in Appendix D of the original paper), this value has been increased to 32.
+
+    2. ``start_powerSGD_iter`` defers PowerSGD compression util step ``start_powerSGD_iter``, and vanilla allreduce runs prior to step ``start_powerSGD_iter``. This hybrid scheme of **vanilla allreduce + PowerSGD** can effectively improve the accuracy, even a relatively small ``matrix_approximation_rank`` is used. This is because that, the beginning of training phase is usually very sensitive to inaccurate gradients, and compressing gradients too early may make the training quickly take a suboptimal trajectory, which can result in an irrecoverable impact on the accuracy.
+
+    To tune ``start_powerSGD_iter``, we suggest to start with 10% of total training steps, and increase it until a satisfactory accuracy is reached.
+
+    .. warning ::
+        If error feedback or warm-up is enabled, the minimum value of ``start_powerSGD_iter`` allowed in DDP is 2.
     This is because there is another internal optimization that rebuilds buckets at iteration 1 in DDP,
     and this can conflict with any tensor memorized before the rebuild process.
-    """
+    """  # noqa

     __slots__ = [
         "process_group",
-        # The two fields below are the configs that usually need to be tuned by the user.
+        # The two fields below are the hyperparameters that should be tuned by the user.
         "matrix_approximation_rank",
         "start_powerSGD_iter",
-        # The two fields below are the configs that usually need to be turned on for performance.
+        # The two fields below are the binary hyperparameters recommended to be turned on for performance.
        "use_error_feedback",
         "warm_start",
-        # The fields below are not configs.
+        # The fields below are internal state.
         "rng",
         "error_dict",
         "p_memory_dict",
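A hedged usage sketch for the state object documented above; it assumes ``torch.distributed`` is already initialized and that ``ddp_model`` is an existing ``DistributedDataParallel`` instance:

    from torch.distributed.algorithms.ddp_comm_hooks import powerSGD_hook as powerSGD

    state = powerSGD.PowerSGDState(
        process_group=None,            # use the default process group
        matrix_approximation_rank=1,   # start small; increase by factors of 2 if accuracy suffers
        start_powerSGD_iter=1000,      # e.g. roughly 10% of training steps run vanilla allreduce first
    )
    ddp_model.register_comm_hook(state, powerSGD.powerSGD_hook)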
@ -93,21 +92,12 @@ class PowerSGDState(object):
         )

         self.process_group = process_group
-        # The low rank for matrix approximation controls the size of compressed low-rank tensors,
-        # which determines the computation ratio.
-        # Typically only a small value 1-4 is used.
-        # For some NLP tasks (as shown in Appendix D of the original paper
-        # https://arxiv.org/pdf/1905.13727.pdf, the rank value has been increased to 32.
-        # A high rank value will increase the computation costs of compression exponentially.
-        # A good choice depends on how much extra computation can be hidden by the dominating communication costs.
         self.matrix_approximation_rank = matrix_approximation_rank
-        # This defers PowerSGD compression util step 'start_powerSGD_iter',
-        # and vanilla allreduce runs before step 'start_powerSGD_iter'.
-        # This hybrid scheme of vanilla allreduce + PowerSGD can have two advantages:
+        # Deferring PowerSGD compression util step 'start_powerSGD_iter' can have two advantages:
         # 1) It turns out that PowerSGD may lead to a non-trivial accuracy loss,
         # even if the matrix approximation rank is increased to a large value.
         # To mitigate the accuracy loss, a simple yet effective way is mixing vanilla allreduce
-        # (or a more convervative compression such as FP16 compression) with PowerSGD.
+        # (or a more conservative compression such as FP16 compression) with PowerSGD.
         # 2) There is an internal optimization of rebuilding buckets process in DDP,
         # in order to save the memory space.
         # This step takes place after the first iteration.
@ -162,38 +152,44 @@ class PowerSGDState(object):
|
|||||||
|
|
||||||
|
|
||||||
def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
|
def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
|
||||||
"""
|
r"""
|
||||||
This DDP communication hook implements the original PowerSGD gradient compression
|
This DDP communication hook implements PowerSGD gradient compression
|
||||||
algorithm described in https://arxiv.org/abs/1905.13727.
|
algorithm described in the `paper <https://arxiv.org/abs/1905.13727>`_.
|
||||||
Once gradient tensors are aggregated across all workers, this hook applies
|
Once gradient tensors are aggregated across all workers, this hook applies
|
||||||
compression as follows:
|
compression as follows:
|
||||||
1) Views the input flattened 1D gradient tensor as two groups of per-parameter tensors:
|
|
||||||
high-rank tensors and vector-like rank-1 tensors (for biases).
|
1. Views the input flattened 1D gradient tensor as two groups of per-parameter tensors: high-rank tensors and vector-like rank-1 tensors (for biases).
|
||||||
2) Handles rank-1 tensors by allreducing them without compression:
|
|
||||||
2.1) Allocate contiguous memory for those rank-1 tensors,
|
2. Handles rank-1 tensors by allreducing them without compression:
|
||||||
and allreduces all the rank-1 tensors as a batch, without compression;
|
|
||||||
2.2) Copies the individual rank-1 tensors from the contiguous memory back to the input tensor.
|
2.1. Allocate contiguous memory for those rank-1 tensors, and allreduces all the rank-1 tensors as a batch, without compression;
|
||||||
3) Handles high-rank tensors by PowerSGD compression:
|
|
||||||
3.1) For each high-rank tensor M, creates two low-rank tensors P and Q for decomposing M,
|
2.2. Copies the individual rank-1 tensors from the contiguous memory back to the input tensor.
|
||||||
|
|
||||||
|
3. Handles high-rank tensors by PowerSGD compression:
|
||||||
|
|
||||||
|
3.1. For each high-rank tensor M, creates two low-rank tensors P and Q for decomposing M,
|
||||||
such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
|
such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
|
||||||
3.2) Computes each P in Ps, which is equal to MQ;
|
|
||||||
3.3) Allreduces Ps as a batch;
|
|
||||||
3.4) Orthogonalizes each P in Ps;
|
|
||||||
3.5) Computes each Q in Qs, which is approximately equal to M^TP;
|
|
||||||
3.6) Allreduces Qs as a batch;
|
|
||||||
3.7) Computes each M among all the high-rank tensors, which is approximately equal to PQ^T.
|
|
||||||
|
|
||||||
Note that this communication hook enforces vanilla allreduce for the first `state.start_powerSGD_iter` iterations.
|
3.2. Computes each P in Ps, which is equal to MQ;
|
||||||
This can not only allow the user to have a finer tuning over the tradeoff between speedup and accuracy,
|
|
||||||
but also help abstract away some complexity of the internal optimization of DDP for future communication hook developers.
|
|
||||||
|
|
||||||
TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration --
|
3.3. Allreduces Ps as a batch;
|
||||||
one left multiplication and one right multiplication.
|
|
||||||
For warm-start, can take one such step at a time, and alternate between them.
|
3.4. Orthogonalizes each P in Ps;
|
||||||
|
|
||||||
|
3.5. Computes each Q in Qs, which is approximately equal to M^TP;
|
||||||
|
|
||||||
|
3.6. Allreduces Qs as a batch;
|
||||||
|
|
||||||
|
3.7. Computes each M among all the high-rank tensors, which is approximately equal to PQ^T.
|
||||||
|
|
||||||
|
Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations.
|
||||||
|
This not only gives the user more control over the tradeoff between speedup and accuracy,
|
||||||
|
but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
|
state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
|
||||||
To tune the compression configs, see Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`].
|
To tune the compression configs, the main knobs to adjust are ``matrix_approximation_rank`` and ``start_powerSGD_iter``.
|
||||||
bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
|
bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
|
||||||
Note that since DDP comm hook only supports single process single device mode at this time,
|
Note that since DDP comm hook only supports single process single device mode at this time,
|
||||||
only exactly one tensor is stored in this bucket.
|
only exactly one tensor is stored in this bucket.
|
||||||
@ -202,9 +198,9 @@ def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
|
|||||||
Future handler of the communication, which updates the gradients in place.
|
Future handler of the communication, which updates the gradients in place.
|
||||||
|
|
||||||
Example::
|
Example::
|
||||||
state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
|
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
|
||||||
>>> ddp_model.register_comm_hook(state, powerSGD_hook)
|
>>> ddp_model.register_comm_hook(state, powerSGD_hook)
|
||||||
"""
|
""" # noqa
|
||||||
process_group = state.process_group
|
process_group = state.process_group
|
||||||
group_to_use = process_group if process_group is not None else dist.group.WORLD
|
group_to_use = process_group if process_group is not None else dist.group.WORLD
|
||||||
world_size = group_to_use.size()
|
world_size = group_to_use.size()
|
||||||
@ -374,6 +370,10 @@ def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
|
|||||||
for tensor, p, q in zip(high_rank_tensors, ps, qs):
|
for tensor, p, q in zip(high_rank_tensors, ps, qs):
|
||||||
torch.matmul(tensor.t(), p, out=q)
|
torch.matmul(tensor.t(), p, out=q)
|
||||||
|
|
||||||
|
# TODO: The above procedure does two matmul+allreduce steps per iteration --
|
||||||
|
# one left multiplication and one right multiplication.
|
||||||
|
# For warm-start, can take one such step at a time, and alternate between them.
|
||||||
|
|
||||||
# Allreduce Qs.
|
# Allreduce Qs.
|
||||||
return [
|
return [
|
||||||
dist.all_reduce(
|
dist.all_reduce(
|
||||||
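To make the numbered steps above concrete, here is a minimal single-process sketch of the rank-r approximation applied to one high-rank gradient tensor M. The helper and variable names are illustrative rather than the hook's internals, and the allreduce steps (3.3 and 3.6) are omitted because only one process is involved.

import torch

def orthogonalize(m: torch.Tensor) -> None:
    # Column-wise Gram-Schmidt, the same normalization PowerSGD applies to Q and P.
    for i in range(m.shape[1]):
        col = m[:, i : i + 1]
        col /= col.norm()
        if i + 1 < m.shape[1]:
            rest = m[:, i + 1 :]
            rest -= (col.t() @ rest) * col

torch.manual_seed(0)
M = torch.randn(64, 32)            # one per-parameter gradient viewed as a matrix
rank = 4                           # plays the role of matrix_approximation_rank

Q = torch.randn(M.shape[1], rank)  # step 3.1: random right factor ...
orthogonalize(Q)                   # ... orthogonalized
P = M @ Q                          # step 3.2 (P would be allreduced in step 3.3)
orthogonalize(P)                   # step 3.4
Q = M.t() @ P                      # step 3.5 (Q would be allreduced in step 3.6)
M_hat = P @ Q.t()                  # step 3.7: the low-rank reconstruction

print(torch.norm(M - M_hat) / torch.norm(M))   # relative approximation error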
@ -412,40 +412,48 @@ def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
|
|||||||
|
|
||||||
|
|
||||||
def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
|
def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
|
||||||
"""
|
r"""
|
||||||
This DDP communication hook implements a simplified PowerSGD gradient compression
|
This DDP communication hook implements a simplified PowerSGD gradient compression
|
||||||
algorithm described in https://arxiv.org/abs/1905.13727.
|
algorithm described in the `paper <https://arxiv.org/abs/1905.13727>`_.
|
||||||
|
This variant does not compress the gradients layer by layer,
|
||||||
|
but instead compresses the flattened input tensor that batches all the gradients.
|
||||||
|
Therefore, it is **faster** than :meth:`powerSGD_hook`,
|
||||||
|
but usually results in a **much lower accuracy**, unless ``matrix_approximation_rank`` is 1.
|
||||||
|
|
||||||
|
.. warning ::
|
||||||
|
Increasing ``matrix_approximation_rank`` here may not necessarily increase the accuracy,
|
||||||
|
because batching per-parameter tensors without column/row alignment can destroy low-rank structure.
|
||||||
|
Therefore, the user should always consider :meth:`powerSGD_hook` first,
|
||||||
|
and only consider this variant when a satisfactory accuracy can be achieved when ``matrix_approximation_rank`` is 1.
|
||||||
|
|
||||||
Once gradient tensors are aggregated across all workers, this hook applies
|
Once gradient tensors are aggregated across all workers, this hook applies
|
||||||
compression to the flattened input tensor that batches per-parameter tensors as follows:
|
compression as follows:
|
||||||
1) Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
|
|
||||||
2) Creates two low-rank tensors P and Q for decomposing M,
|
|
||||||
such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
|
|
||||||
2) Computes P, which is equal to MQ;
|
|
||||||
3) Allreduces P;
|
|
||||||
4) Orthogonalizes P;
|
|
||||||
5) Computes Q, which is approximately equal to M^TP;
|
|
||||||
6) Allreduces Q;
|
|
||||||
7) Computes M, which is approximately equal to PQ^T.
|
|
||||||
8) Truncates the input tensor to the original length.
|
|
||||||
|
|
||||||
This variant is faster than `powerSGD_hook` that runs layer-wise gradient compression,
|
1. Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
|
||||||
but it usually results in a much lower accuracy, unless `matrix_approximation_rank` in the state is 1.
|
|
||||||
Increasing `matrix_approximation_rank` may not necessarily increase the accuracy,
|
|
||||||
because batching per-parameter tensors without column/row alignment can destroy low-rank structure.
|
|
||||||
Therefore, the user shoud always consider `powerSGD_hook` first,
|
|
||||||
and only consider this variant when a satisfying accuracy can be achieved when `matrix_approximation_rank` is 1.
|
|
||||||
|
|
||||||
Note that this communication hook enforces vanilla allreduce for the first `state.start_powerSGD_iter` iterations.
|
2. Creates two low-rank tensors P and Q for decomposing M, such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
|
||||||
This can not only allow the user to have a finer tuning over the tradeoff between speedup and accuracy,
|
|
||||||
but also help abstract away some complexity of the internal optimization of DDP for future communication hook developers.
|
|
||||||
|
|
||||||
TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration --
|
3. Computes P, which is equal to MQ;
|
||||||
one left multiplication and one right multiplication.
|
|
||||||
For warm-start, can take one such step at a time, and alternate between them.
|
4. Allreduces P;
|
||||||
|
|
||||||
|
5. Orthogonalizes P;
|
||||||
|
|
||||||
|
6. Computes Q, which is approximately equal to M^TP;
|
||||||
|
|
||||||
|
7. Allreduces Q;
|
||||||
|
|
||||||
|
8. Computes M, which is approximately equal to PQ^T.
|
||||||
|
|
||||||
|
9. Truncates the input tensor to the original length.
|
||||||
|
|
||||||
|
Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations.
|
||||||
|
This not only gives the user more control over the tradeoff between speedup and accuracy,
|
||||||
|
but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
|
state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
|
||||||
To tune the compression configs, see Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`].
|
To tune the compression configs, the main knobs to adjust are ``matrix_approximation_rank`` and ``start_powerSGD_iter``.
|
||||||
bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
|
bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
|
||||||
Note that since DDP comm hook only supports single process single device mode at this time,
|
Note that since DDP comm hook only supports single process single device mode at this time,
|
||||||
only exactly one tensor is stored in this bucket.
|
only exactly one tensor is stored in this bucket.
|
||||||
@ -454,9 +462,9 @@ def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
|
|||||||
Future handler of the communication, which updates the gradients in place.
|
Future handler of the communication, which updates the gradients in place.
|
||||||
|
|
||||||
Example::
|
Example::
|
||||||
state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
|
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
|
||||||
>>> ddp_model.register_comm_hook(state, batched_powerSGD_hook)
|
>>> ddp_model.register_comm_hook(state, batched_powerSGD_hook)
|
||||||
"""
|
""" # noqa
|
||||||
process_group = state.process_group
|
process_group = state.process_group
|
||||||
group_to_use = process_group if process_group is not None else dist.group.WORLD
|
group_to_use = process_group if process_group is not None else dist.group.WORLD
|
||||||
world_size = group_to_use.size()
|
world_size = group_to_use.size()
|
||||||
@ -563,6 +571,10 @@ def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
|
|||||||
out=state.q_memory_dict[bucket_index],
|
out=state.q_memory_dict[bucket_index],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# TODO: The above procedure does two matmul+allreduce steps per iteration --
|
||||||
|
# one left multiplication and one right multiplication.
|
||||||
|
# For warm-start, can take one such step at a time, and alternate between them.
|
||||||
|
|
||||||
return [
|
return [
|
||||||
dist.all_reduce(
|
dist.all_reduce(
|
||||||
state.q_memory_dict[bucket_index], group=group_to_use, async_op=True
|
state.q_memory_dict[bucket_index], group=group_to_use, async_op=True
|
||||||
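The batched variant's pad-to-square trick from steps 1-9 looks like this in a single-process sketch; the names are illustrative, the two allreduces are again skipped, and error feedback and warm start are ignored.

import math
import torch

torch.manual_seed(0)
flat = torch.randn(1000)                         # flattened 1D gradient of the whole bucket
side = int(math.ceil(math.sqrt(flat.numel())))   # smallest square that can hold it

padded = torch.zeros(side * side)
padded[: flat.numel()] = flat                    # step 1: zero-pad ...
M = padded.view(side, side)                      # ... and view as a square matrix

rank = 1                                         # matrix_approximation_rank
Q, _ = torch.qr(torch.randn(side, rank))         # step 2: orthogonalized right factor
P = M @ Q                                        # step 3 (allreduced in step 4)
P, _ = torch.qr(P)                               # step 5: orthogonalize P
Q = M.t() @ P                                    # step 6 (allreduced in step 7)
M_hat = P @ Q.t()                                # step 8: low-rank reconstruction
approx = M_hat.reshape(-1)[: flat.numel()]       # step 9: truncate to the original length

print(torch.norm(flat - approx) / torch.norm(flat))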
|
|||||||
@ -4,7 +4,7 @@ from typing import Dict, List, Set, NamedTuple, Tuple
|
|||||||
import torch
|
import torch
|
||||||
from torch.fx.passes.split_module import split_module
|
from torch.fx.passes.split_module import split_module
|
||||||
import operator
|
import operator
|
||||||
from torch.fx.experimental.partitioner_utils import Partition, \
|
from torch.fx._experimental.partitioner_utils import Partition, \
|
||||||
Device, PartitionerConfig, get_partition_to_latency_mapping,\
|
Device, PartitionerConfig, get_partition_to_latency_mapping,\
|
||||||
get_latency_of_partitioned_graph, NodeLatency, get_extra_size_of, \
|
get_latency_of_partitioned_graph, NodeLatency, get_extra_size_of, \
|
||||||
PartitionMode
|
PartitionMode
|
||||||
@ -2,7 +2,7 @@ from typing import Dict, List, NamedTuple, Any
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch.fx.passes.shape_prop import ShapeProp
|
from torch.fx.passes.shape_prop import ShapeProp
|
||||||
from torch.fx.experimental.param_fetch import lift_lowering_attrs_to_nodes
|
from torch.fx._experimental.param_fetch import lift_lowering_attrs_to_nodes
|
||||||
from torch.fx.graph import Graph, get_qualified_name
|
from torch.fx.graph import Graph, get_qualified_name
|
||||||
from torch.fx.graph_module import GraphModule
|
from torch.fx.graph_module import GraphModule
|
||||||
from torch.fx.node import Node, Target, map_arg
|
from torch.fx.node import Node, Target, map_arg
|
||||||
@ -116,7 +116,7 @@ class Interpreter:
|
|||||||
|
|
||||||
# Main Node running APIs
|
# Main Node running APIs
|
||||||
|
|
||||||
def placeholder(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
|
def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
|
||||||
"""
|
"""
|
||||||
Execute a ``placeholder`` node. Note that this is stateful:
|
Execute a ``placeholder`` node. Note that this is stateful:
|
||||||
``Interpreter`` maintains an internal iterator over
|
``Interpreter`` maintains an internal iterator over
|
||||||
@ -141,7 +141,7 @@ class Interpreter:
|
|||||||
else:
|
else:
|
||||||
return next(self.args_iter)
|
return next(self.args_iter)
|
||||||
|
|
||||||
def get_attr(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
|
def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
|
||||||
"""
|
"""
|
||||||
Execute a ``get_attr`` node. Will retrieve an attribute
|
Execute a ``get_attr`` node. Will retrieve an attribute
|
||||||
value from the ``Module`` hierarchy of ``self.module``.
|
value from the ``Module`` hierarchy of ``self.module``.
|
||||||
@ -159,7 +159,7 @@ class Interpreter:
|
|||||||
assert isinstance(target, str)
|
assert isinstance(target, str)
|
||||||
return self.fetch_attr(target)
|
return self.fetch_attr(target)
|
||||||
|
|
||||||
def call_function(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
|
def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
|
||||||
"""
|
"""
|
||||||
Execute a ``call_function`` node and return the result.
|
Execute a ``call_function`` node and return the result.
|
||||||
|
|
||||||
@ -178,7 +178,7 @@ class Interpreter:
|
|||||||
# Execute the function and return the result
|
# Execute the function and return the result
|
||||||
return target(*args, **kwargs)
|
return target(*args, **kwargs)
|
||||||
|
|
||||||
def call_method(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
|
def call_method(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
|
||||||
"""
|
"""
|
||||||
Execute a ``call_method`` node and return the result.
|
Execute a ``call_method`` node and return the result.
|
||||||
|
|
||||||
@ -199,7 +199,7 @@ class Interpreter:
|
|||||||
assert isinstance(target, str)
|
assert isinstance(target, str)
|
||||||
return getattr(self_obj, target)(*args_tail, **kwargs)
|
return getattr(self_obj, target)(*args_tail, **kwargs)
|
||||||
|
|
||||||
def call_module(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
|
def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
|
||||||
"""
|
"""
|
||||||
Execute a ``call_module`` node and return the result.
|
Execute a ``call_module`` node and return the result.
|
||||||
|
|
||||||
@ -221,7 +221,7 @@ class Interpreter:
|
|||||||
|
|
||||||
return submod(*args, **kwargs)
|
return submod(*args, **kwargs)
|
||||||
|
|
||||||
def output(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
|
def output(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
|
||||||
"""
|
"""
|
||||||
Execute an ``output`` node. This really just retrieves
|
Execute an ``output`` node. This really just retrieves
|
||||||
the value referenced by the ``output`` node and returns it.
|
the value referenced by the ``output`` node and returns it.
|
||||||
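As a usage sketch of the node-running APIs changed above: an ``Interpreter`` executes a traced graph node by node, so overriding one of the call_* methods changes behaviour without rewriting the graph. This assumes ``Interpreter`` is importable from ``torch.fx`` (it is defined in the interpreter module diffed here); the toy module and subclass are illustrative.

import torch
import torch.fx

class AddRelu(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x + 1)

gm = torch.fx.symbolic_trace(AddRelu())

class LoggingInterpreter(torch.fx.Interpreter):
    # Print every function-typed node while still executing it normally.
    def call_function(self, target, args, kwargs):
        print("calling", target)
        return super().call_function(target, args, kwargs)

x = torch.randn(3)
out = LoggingInterpreter(gm).run(x)
print(torch.allclose(out, torch.relu(x + 1)))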
@ -307,12 +307,12 @@ class Transformer(Interpreter):
|
|||||||
method equivalents). We could subclass ``Transformer`` like so::
|
method equivalents). We could subclass ``Transformer`` like so::
|
||||||
|
|
||||||
class NegSigmSwapXformer(Transformer):
|
class NegSigmSwapXformer(Transformer):
|
||||||
def call_function(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
|
def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
|
||||||
if target == torch.sigmoid:
|
if target == torch.sigmoid:
|
||||||
return torch.neg(*args, **kwargs)
|
return torch.neg(*args, **kwargs)
|
||||||
return super().call_function(n)
|
return super().call_function(n)
|
||||||
|
|
||||||
def call_method(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
|
def call_method(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
|
||||||
if target == 'neg':
|
if target == 'neg':
|
||||||
call_self, *args_tail = args
|
call_self, *args_tail = args
|
||||||
return call_self.sigmoid(*args_tail, **kwargs)
|
return call_self.sigmoid(*args_tail, **kwargs)
|
||||||
@ -344,7 +344,7 @@ class Transformer(Interpreter):
|
|||||||
self.tracer = TransformerTracer(self.new_graph)
|
self.tracer = TransformerTracer(self.new_graph)
|
||||||
self.tracer.root = module
|
self.tracer.root = module
|
||||||
|
|
||||||
def placeholder(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Proxy:
|
def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Proxy:
|
||||||
"""
|
"""
|
||||||
Execute a ``placeholder`` node. In ``Transformer``, this is
|
Execute a ``placeholder`` node. In ``Transformer``, this is
|
||||||
overridden to insert a new ``placeholder`` into the output
|
overridden to insert a new ``placeholder`` into the output
|
||||||
@ -360,7 +360,7 @@ class Transformer(Interpreter):
|
|||||||
assert isinstance(target, str)
|
assert isinstance(target, str)
|
||||||
return Proxy(self.new_graph.placeholder(target), self.tracer)
|
return Proxy(self.new_graph.placeholder(target), self.tracer)
|
||||||
|
|
||||||
def get_attr(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Proxy:
|
def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Proxy:
|
||||||
"""
|
"""
|
||||||
Execute a ``get_attr`` node. In ``Transformer``, this is
|
Execute a ``get_attr`` node. In ``Transformer``, this is
|
||||||
overridden to insert a new ``get_attr`` node into the output
|
overridden to insert a new ``get_attr`` node into the output
|
||||||
@ -376,6 +376,12 @@ class Transformer(Interpreter):
|
|||||||
assert isinstance(target, str)
|
assert isinstance(target, str)
|
||||||
return Proxy(self.new_graph.get_attr(target), self.tracer)
|
return Proxy(self.new_graph.get_attr(target), self.tracer)
|
||||||
|
|
||||||
|
def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
|
||||||
|
# Override so that the leaf module policy from `self.tracer` is respected.
|
||||||
|
assert isinstance(target, str)
|
||||||
|
submod = self.fetch_attr(target)
|
||||||
|
return self.tracer.call_module(submod, submod.forward, args, kwargs)
|
||||||
|
|
||||||
def transform(self) -> GraphModule:
|
def transform(self) -> GraphModule:
|
||||||
"""
|
"""
|
||||||
Transform ``self.module`` and return the transformed
|
Transform ``self.module`` and return the transformed
|
||||||
|
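And a small end-to-end sketch of the Transformer pattern from the docstring above, using the corrected super() call; it likewise assumes ``Transformer`` is importable from ``torch.fx``, and the module and class names are illustrative.

import torch
import torch.fx
from torch.fx.node import Argument, Target
from typing import Any, Dict, Tuple

class SigmoidToNeg(torch.fx.Transformer):
    # Copy the graph, swapping every call to torch.sigmoid for torch.neg.
    def call_function(self, target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any:
        if target == torch.sigmoid:
            return torch.neg(*args, **kwargs)
        return super().call_function(target, args, kwargs)

class M(torch.nn.Module):
    def forward(self, x):
        return torch.sigmoid(x) + 1

transformed = SigmoidToNeg(torch.fx.symbolic_trace(M())).transform()
x = torch.randn(4)
print(torch.allclose(transformed(x), torch.neg(x) + 1))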
|||||||
@ -5,7 +5,7 @@ import operator
|
|||||||
|
|
||||||
from .graph import magic_methods, reflectable_magic_methods, Graph
|
from .graph import magic_methods, reflectable_magic_methods, Graph
|
||||||
from typing import Tuple, Dict, Optional, Iterable, Any, Iterator
|
from typing import Tuple, Dict, Optional, Iterable, Any, Iterator
|
||||||
from .node import Target, Node, Argument, base_types
|
from .node import Target, Node, Argument, base_types, map_aggregate
|
||||||
|
|
||||||
class TracerBase:
|
class TracerBase:
|
||||||
graph: Graph
|
graph: Graph
|
||||||
@ -61,8 +61,17 @@ class TracerBase:
|
|||||||
elif isinstance(a, dict):
|
elif isinstance(a, dict):
|
||||||
r = {}
|
r = {}
|
||||||
for k, v in a.items():
|
for k, v in a.items():
|
||||||
if not isinstance(k, str):
|
# Check for invalid dict keys. We do not want a Proxy to appear
|
||||||
raise NotImplementedError(f"dictionaries with non-string keys: {a}")
|
# anywhere within the key. Since keys can be collection types,
|
||||||
|
# we iterate through the key with map_aggregate
|
||||||
|
k = self.create_arg(k)
|
||||||
|
|
||||||
|
def no_node(arg):
|
||||||
|
if isinstance(arg, Node):
|
||||||
|
raise RuntimeError("Keys for dictionaries used as an argument cannot contain a "
|
||||||
|
"Node. Got key: {k}")
|
||||||
|
map_aggregate(k, no_node)
|
||||||
|
|
||||||
r[k] = self.create_arg(v)
|
r[k] = self.create_arg(v)
|
||||||
return r
|
return r
|
||||||
elif isinstance(a, slice):
|
elif isinstance(a, slice):
|
||||||
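A hedged illustration of the dictionary handling above: Proxy values inside a dict are fine during symbolic tracing, while a Proxy anywhere inside a key is rejected by the new map_aggregate check. The module below is illustrative.

import torch
import torch.fx

class ReturnsDict(torch.nn.Module):
    def forward(self, x):
        # Dict *values* may be traced Proxies; only the keys are checked.
        return {"doubled": x * 2, "shifted": x + 1}

gm = torch.fx.symbolic_trace(ReturnsDict())
out = gm(torch.ones(2))
print(out["doubled"], out["shifted"])

# By contrast, building a dict whose key contains a traced value, e.g.
# ``return {x: 1}`` inside ``forward``, trips the check added above and
# raises a RuntimeError during tracing.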
|
|||||||
@ -10,7 +10,7 @@ import torch
|
|||||||
from torch.jit._script import RecursiveScriptModule, ScriptModule
|
from torch.jit._script import RecursiveScriptModule, ScriptModule
|
||||||
|
|
||||||
|
|
||||||
def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize: bool = True):
|
def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize_numerics: bool = True):
|
||||||
r"""
|
r"""
|
||||||
Freezing a :class:`ScriptModule` will clone it and attempt to inline the cloned
|
Freezing a :class:`ScriptModule` will clone it and attempt to inline the cloned
|
||||||
module's submodules, parameters, and attributes as constants in the TorchScript IR Graph.
|
module's submodules, parameters, and attributes as constants in the TorchScript IR Graph.
|
||||||
@ -26,10 +26,8 @@ def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize: bool = Tr
|
|||||||
preserved_attrs (Optional[List[str]]): a list of attributes to preserve in addition to the forward method.
|
preserved_attrs (Optional[List[str]]): a list of attributes to preserve in addition to the forward method.
|
||||||
Attributes modified in preserved methods will also be preserved.
|
Attributes modified in preserved methods will also be preserved.
|
||||||
|
|
||||||
optimize (bool): If ``True``, a set of optimization passes will be run to prepare the graph for inference,
|
optimize_numerics (bool): If ``True``, a set of optimization passes will be run that does not strictly
|
||||||
in addition to the graph cleanup that already occurs. The details of the optimizations can be found in
|
preserve numerics. Full details of optimization can be found at `torch.jit.optimize_frozen_module`.
|
||||||
`torch.jit.optimize_frozen_module.`
|
|
||||||
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Frozen :class:`ScriptModule`.
|
Frozen :class:`ScriptModule`.
|
||||||
@ -102,16 +100,16 @@ def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize: bool = Tr
|
|||||||
|
|
||||||
out = RecursiveScriptModule(torch._C._freeze_module(mod._c, preserved_attrs))
|
out = RecursiveScriptModule(torch._C._freeze_module(mod._c, preserved_attrs))
|
||||||
RecursiveScriptModule._finalize_scriptmodule(out)
|
RecursiveScriptModule._finalize_scriptmodule(out)
|
||||||
if optimize:
|
optimize_frozen_module(out, optimize_numerics)
|
||||||
optimize_frozen_module(out)
|
|
||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def optimize_frozen_module(mod):
|
def optimize_frozen_module(mod, optimize_numerics: bool = True):
|
||||||
r"""
|
r"""
|
||||||
Runs a series of optimizations looking for patterns that occur in frozen graphs.
|
Runs a series of optimizations looking for patterns that occur in frozen graphs.
|
||||||
The current set of optimizations is:
|
The current set of optimizations is:
|
||||||
|
- Dropout Removal
|
||||||
- Conv -> Batchnorm folding
|
- Conv -> Batchnorm folding
|
||||||
- Conv -> Add/Sub folding
|
- Conv -> Add/Sub folding
|
||||||
- Conv -> Mul/Div folding
|
- Conv -> Mul/Div folding
|
||||||
@ -119,6 +117,12 @@ def optimize_frozen_module(mod):
|
|||||||
Args:
|
Args:
|
||||||
mod (:class:`ScriptModule`): a frozen module to be optimized
|
mod (:class:`ScriptModule`): a frozen module to be optimized
|
||||||
|
|
||||||
|
optimize_numerics (bool): If ``True``, a set of optimization passes will be run that does not strictly
|
||||||
|
preserve numerics. These optimizations preserve default rtol and atol of `torch.testing.assert_allclose`
|
||||||
|
when applied on a single transformation, however in a module where many transformations are applied
|
||||||
|
the rtol or atol may no longer fall within the default `assert_allclose` tolerance. Conv -> Batchnorm folding,
|
||||||
|
Conv -> Add/Sub, and Conv -> Mul/Div folding all may alter numerics.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
None
|
None
|
||||||
|
|
||||||
@ -140,4 +144,12 @@ def optimize_frozen_module(mod):
|
|||||||
assert "batch_norm" not in str(frozen_mod.graph)
|
assert "batch_norm" not in str(frozen_mod.graph)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
torch._C._jit_pass_optimize_frozen_graph(mod.graph)
|
# xxx: keep in sync with frozen_graph_optimization.cpp
|
||||||
|
# intentionally duplicated to make it easier to create custom optimization sequences
|
||||||
|
torch._C._jit_pass_remove_dropout(mod._c)
|
||||||
|
if optimize_numerics:
|
||||||
|
# run a couple times to capture Conv -> Mul -> Add etc
|
||||||
|
for _ in range(2):
|
||||||
|
torch._C._jit_pass_fold_frozen_conv_bn(mod.graph)
|
||||||
|
torch._C._jit_pass_fold_frozen_conv_add_or_sub(mod.graph)
|
||||||
|
torch._C._jit_pass_fold_frozen_conv_mul_or_div(mod.graph)
|
||||||
|
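A short usage sketch of the new split between freeze and optimize_frozen_module, assuming both are exported from ``torch.jit`` in this release as the docstring implies; the toy Conv+BatchNorm module is illustrative.

import torch

class ConvBN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, 3)
        self.bn = torch.nn.BatchNorm2d(8)

    def forward(self, x):
        return self.bn(self.conv(x))

scripted = torch.jit.script(ConvBN().eval())          # freezing requires eval mode

# Numerics-affecting folds (e.g. Conv -> BatchNorm) run by default ...
frozen = torch.jit.freeze(scripted)
print("batch_norm" in str(frozen.graph))              # expected: False after folding

# ... or can be skipped at freeze time and applied explicitly later.
frozen_exact = torch.jit.freeze(scripted, optimize_numerics=False)
torch.jit.optimize_frozen_module(frozen_exact, optimize_numerics=True)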
|||||||
@ -24,6 +24,7 @@ from .replicate import replicate
|
|||||||
from .scatter_gather import scatter_kwargs, gather, is_namedtuple
|
from .scatter_gather import scatter_kwargs, gather, is_namedtuple
|
||||||
from .parallel_apply import parallel_apply
|
from .parallel_apply import parallel_apply
|
||||||
from torch._utils import _get_device_index, _get_all_device_indices
|
from torch._utils import _get_device_index, _get_all_device_indices
|
||||||
|
from ._functions import _get_stream
|
||||||
|
|
||||||
|
|
||||||
def _find_tensors(obj):
|
def _find_tensors(obj):
|
||||||
@ -438,6 +439,8 @@ class DistributedDataParallel(Module):
|
|||||||
|
|
||||||
# reduction bucket size
|
# reduction bucket size
|
||||||
self.bucket_bytes_cap = int(bucket_cap_mb * 1024 * 1024)
|
self.bucket_bytes_cap = int(bucket_cap_mb * 1024 * 1024)
|
||||||
|
# Whether to perform input tensor CPU to GPU copies on a side-stream
|
||||||
|
self.use_side_stream_for_tensor_copies = os.environ.get("PYTORCH_DDP_USE_SIDE_STREAM", "1") == "1"
|
||||||
|
|
||||||
# Sync params and buffers
|
# Sync params and buffers
|
||||||
self._sync_params_and_buffers(authoritative_rank=0)
|
self._sync_params_and_buffers(authoritative_rank=0)
|
||||||
@ -732,7 +735,23 @@ class DistributedDataParallel(Module):
|
|||||||
"""
|
"""
|
||||||
def to_map(obj):
|
def to_map(obj):
|
||||||
if isinstance(obj, torch.Tensor):
|
if isinstance(obj, torch.Tensor):
|
||||||
return (obj.to(target_gpu), )
|
if not self.use_side_stream_for_tensor_copies:
|
||||||
|
return (obj.to(target_gpu), )
|
||||||
|
else:
|
||||||
|
# Perform CPU -> GPU copies in a background stream. This code is
|
||||||
|
# motivated from similar logic in torch/nn/parallel/_functions.py
|
||||||
|
stream = _get_stream(target_gpu)
|
||||||
|
with torch.cuda.stream(stream):
|
||||||
|
output = obj.to(target_gpu)
|
||||||
|
# synchronize with the copy stream
|
||||||
|
with torch.cuda.device(target_gpu):
|
||||||
|
current_stream = torch.cuda.current_stream()
|
||||||
|
# Sync the current stream with the copy stream
|
||||||
|
current_stream.wait_stream(stream)
|
||||||
|
# Ensure tensor memory is not reused until work on
|
||||||
|
# main stream is complete
|
||||||
|
output.record_stream(current_stream)
|
||||||
|
return (output, )
|
||||||
if is_namedtuple(obj):
|
if is_namedtuple(obj):
|
||||||
return [type(obj)(*args) for args in zip(*map(to_map, obj))]
|
return [type(obj)(*args) for args in zip(*map(to_map, obj))]
|
||||||
if isinstance(obj, tuple) and len(obj) > 0:
|
if isinstance(obj, tuple) and len(obj) > 0:
|
||||||
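The side-stream copy added to DDP above, shown in isolation as a hedged sketch; it assumes a CUDA device is available, and inside DDP the behaviour can be switched off with PYTORCH_DDP_USE_SIDE_STREAM=0 via the new attribute.

import torch

def copy_on_side_stream(cpu_tensor: torch.Tensor, device: torch.device) -> torch.Tensor:
    # Issue the host-to-device copy on a dedicated stream so it can overlap
    # with kernels already queued on the current stream.
    copy_stream = torch.cuda.Stream(device=device)
    with torch.cuda.stream(copy_stream):
        gpu_tensor = cpu_tensor.to(device, non_blocking=True)
    current = torch.cuda.current_stream(device)
    current.wait_stream(copy_stream)   # order the copy before later work
    gpu_tensor.record_stream(current)  # keep the memory alive for the main stream
    return gpu_tensor

if torch.cuda.is_available():
    x = torch.randn(1024, pin_memory=True)   # pinned memory makes the copy truly async
    print(copy_on_side_stream(x, torch.device("cuda:0")).device)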
@ -1021,13 +1040,14 @@ class DistributedDataParallel(Module):
|
|||||||
parameter syncs while running Distributed DataParallel training.
|
parameter syncs while running Distributed DataParallel training.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
state (object): state is passed to the hook and can be used to maintain
|
state (object): Passed to the hook to maintain any state information during the training process.
|
||||||
and update any state information that users would like to
|
Examples include error feedback in gradient compression,
|
||||||
maintain as part of the training process. Examples: error
|
peers to communicate with next in GossipGrad, etc.
|
||||||
feedback in gradient compression, peers to communicate with
|
|
||||||
next in GossipGrad etc.
|
It is locally stored by each worker
|
||||||
hook (callable): is defined as:
|
and shared by all the gradient tensors on the worker.
|
||||||
hook(state: object, bucket: dist._GradBucket) -> torch.futures.Future:
|
hook (callable): Averages gradient tensors across workers and is defined as:
|
||||||
|
``hook(state: object, bucket: dist._GradBucket) -> torch.futures.Future``:
|
||||||
|
|
||||||
This function is called once the bucket is ready. The
|
This function is called once the bucket is ready. The
|
||||||
hook can perform whatever processing is needed and return
|
hook can perform whatever processing is needed and return
|
||||||
@ -1067,7 +1087,7 @@ class DistributedDataParallel(Module):
|
|||||||
DDP communication hook is experimental and subject to change.
|
DDP communication hook is experimental and subject to change.
|
||||||
|
|
||||||
Example::
|
Example::
|
||||||
Below is an example of a noop hook that returns back the same tensors:
|
Below is an example of a noop hook that returns the same tensors.
|
||||||
|
|
||||||
>>> def noop(state: object, bucket: dist._GradBucket) -> torch.futures.Future:
|
>>> def noop(state: object, bucket: dist._GradBucket) -> torch.futures.Future:
|
||||||
>>> fut = torch.futures.Future()
|
>>> fut = torch.futures.Future()
|
||||||
@ -1091,7 +1111,6 @@ class DistributedDataParallel(Module):
|
|||||||
>>> return fut.then(decode)
|
>>> return fut.then(decode)
|
||||||
|
|
||||||
>>> ddp.register_comm_hook(state = None, hook = encode_and_decode)
|
>>> ddp.register_comm_hook(state = None, hook = encode_and_decode)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
self._check_comm_hook(hook)
|
self._check_comm_hook(hook)
|
||||||
dist._register_comm_hook(self.reducer, state, hook)
|
dist._register_comm_hook(self.reducer, state, hook)
|
||||||
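For completeness, a minimal averaging hook that matches the documented signature. It is a sketch, not the built-in default: it assumes the default process group is initialized, that the backend's work objects support get_future() (e.g. NCCL), and that the bucket exposes get_tensors() as in this release's _GradBucket binding.

import torch
import torch.distributed as dist

def allreduce_avg_hook(state: object, bucket) -> torch.futures.Future:
    # Average the bucket's single flattened gradient tensor across all workers.
    tensor = bucket.get_tensors()[0]
    world_size = dist.get_world_size()
    fut = dist.all_reduce(tensor, async_op=True).get_future()

    def average(fut):
        return [fut.value()[0].div_(world_size)]

    return fut.then(average)

# ddp_model.register_comm_hook(state=None, hook=allreduce_avg_hook)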
|
|||||||
@ -296,6 +296,22 @@ def _is_fp(value):
|
|||||||
return (type == 'Float') or (type == 'Double') or (type == 'Half')
|
return (type == 'Float') or (type == 'Double') or (type == 'Half')
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _generate_wrapped_number(g, scalar):
|
||||||
|
"""
|
||||||
|
Create a wrapped number based on https://github.com/pytorch/pytorch/issues/9515
|
||||||
|
A Tensor is considered a "wrapped number" if it is
|
||||||
|
auto-wrapped from a C++ or Python number type. Integer types are
|
||||||
|
wrapped as 0-dim int64 tensors and floating-point types are
|
||||||
|
wrapped as 0-dim double tensors.
|
||||||
|
|
||||||
|
The input to this function is a constant value. If the data type
|
||||||
|
is a floating point type, it is converted to a 0-dim double
|
||||||
|
tensor, else it is converted to a 0-dim tensor of its original type
|
||||||
|
"""
|
||||||
|
assert not isinstance(scalar, torch.Tensor)
|
||||||
|
if isinstance(scalar, float):
|
||||||
|
return g.op("Constant", value_t=torch.tensor(scalar, dtype=torch.double))
|
||||||
|
return g.op("Constant", value_t=torch.tensor(scalar))
|
||||||
|
|
||||||
def _sort_helper(g, input, dim, decending=True, out=None):
|
def _sort_helper(g, input, dim, decending=True, out=None):
|
||||||
if out is not None:
|
if out is not None:
|
||||||
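A brief illustration of the wrapped-number semantics the helper above mirrors: a Python scalar behaves like a 0-dim wrapped tensor in type promotion and does not widen the other operand's dtype, which is why the exporter emits such constants as 0-dim values.

import torch

x = torch.ones(3, dtype=torch.float16)

# A Python float acts as a wrapped number: the fp16 dtype is preserved.
print((x * 2.0).dtype)            # torch.float16

# A dimensioned float32 tensor, by contrast, does promote the result.
print((x * torch.ones(3)).dtype)  # torch.float32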
|
|||||||
@ -121,6 +121,21 @@ def where(g, condition, self=None, other=None, _outputs=None):
|
|||||||
return sym_help._unbind_helper(g, condition, g.op("Constant", value_t=torch.tensor(1)), _outputs)
|
return sym_help._unbind_helper(g, condition, g.op("Constant", value_t=torch.tensor(1)), _outputs)
|
||||||
return g.op("Where", condition, self, other)
|
return g.op("Where", condition, self, other)
|
||||||
|
|
||||||
|
@parse_args('v', 'v', 'v', 'i', 'i', 'i')
|
||||||
|
def fake_quantize_per_channel_affine(g, inputs, scale, zero_point, axis, quant_min=-128, quant_max=127):
|
||||||
|
if quant_min not in [0, -128] or quant_max not in [127, 255]:
|
||||||
|
raise RuntimeError(
|
||||||
|
"ONNX defines [0, 255] for quint8 and [-128, 127] for qint8, got [{}, {}]".format(quant_min, quant_max))
|
||||||
|
|
||||||
|
# ONNX defines zero_point to be int8 or uint8
|
||||||
|
if quant_min == 0:
|
||||||
|
zero_point = g.op("Cast", zero_point, to_i=sym_help.cast_pytorch_to_onnx['Byte'])
|
||||||
|
else:
|
||||||
|
zero_point = g.op("Cast", zero_point, to_i=sym_help.cast_pytorch_to_onnx['Char'])
|
||||||
|
return g.op(
|
||||||
|
"DequantizeLinear",
|
||||||
|
g.op("QuantizeLinear", inputs, scale, zero_point, axis_i=axis),
|
||||||
|
scale, zero_point, axis_i=axis)
|
||||||
|
|
||||||
def _reduce_op_symbolic(onnx_op_name):
|
def _reduce_op_symbolic(onnx_op_name):
|
||||||
def symbolic(g, self, dim=None, keepdim=None):
|
def symbolic(g, self, dim=None, keepdim=None):
|
||||||
|
|||||||
@ -1319,8 +1319,8 @@ def layer_norm(g, input, normalized_shape, weight, bias, eps, cudnn_enable):
|
|||||||
|
|
||||||
axes = [-i for i in range(len(normalized_shape), 0, -1)]
|
axes = [-i for i in range(len(normalized_shape), 0, -1)]
|
||||||
|
|
||||||
two_cst = g.op("Constant", value_t=torch.tensor(2.))
|
two_cst = sym_help._generate_wrapped_number(g, 2.)
|
||||||
eps_cst = g.op("Constant", value_t=torch.tensor(eps))
|
eps_cst = sym_help._generate_wrapped_number(g, eps)
|
||||||
|
|
||||||
mean = g.op("ReduceMean", input, axes_i=axes)
|
mean = g.op("ReduceMean", input, axes_i=axes)
|
||||||
numerator = sub(g, input, mean)
|
numerator = sub(g, input, mean)
|
||||||
|
|||||||
@ -391,9 +391,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
|
|||||||
torch.exp2: lambda input, out=None: -1,
|
torch.exp2: lambda input, out=None: -1,
|
||||||
torch.expm1: lambda input, out=None: -1,
|
torch.expm1: lambda input, out=None: -1,
|
||||||
torch.fake_quantize_per_channel_affine: lambda input, scale, zero_point, axis, quant_min, quant_max: -1,
|
torch.fake_quantize_per_channel_affine: lambda input, scale, zero_point, axis, quant_min, quant_max: -1,
|
||||||
torch.fake_quantize_per_channel_affine_cachemask: lambda input, scale, zero_point, axis, quant_min, quant_max: -1,
|
|
||||||
torch.fake_quantize_per_tensor_affine: lambda input, scale, zero_point, quant_min, quant_max: -1,
|
torch.fake_quantize_per_tensor_affine: lambda input, scale, zero_point, quant_min, quant_max: -1,
|
||||||
torch.fake_quantize_per_tensor_affine_cachemask: lambda input, scale, zero_point, quant_min, quant_max: -1,
|
|
||||||
torch.fbgemm_linear_fp16_weight: lambda input, packed_weight, bias: -1,
|
torch.fbgemm_linear_fp16_weight: lambda input, packed_weight, bias: -1,
|
||||||
torch.fbgemm_linear_fp16_weight_fp32_activation: lambda input, packed_weight, bias: -1,
|
torch.fbgemm_linear_fp16_weight_fp32_activation: lambda input, packed_weight, bias: -1,
|
||||||
torch.fbgemm_linear_int8_weight: lambda input, weight, packed, col_offsets, weight_scale, weight_zero_point, bias: -1,
|
torch.fbgemm_linear_int8_weight: lambda input, weight, packed, col_offsets, weight_scale, weight_zero_point, bias: -1,
|
||||||
|
|||||||
@ -22,16 +22,21 @@ from typing import List, Optional, Union
|
|||||||
from setuptools.command.build_ext import build_ext
|
from setuptools.command.build_ext import build_ext
|
||||||
from pkg_resources import packaging # type: ignore
|
from pkg_resources import packaging # type: ignore
|
||||||
|
|
||||||
BUILD_SPLIT_CUDA = os.getenv('BUILD_SPLIT_CUDA')
|
|
||||||
IS_WINDOWS = sys.platform == 'win32'
|
IS_WINDOWS = sys.platform == 'win32'
|
||||||
LIB_EXT = '.pyd' if IS_WINDOWS else '.so'
|
LIB_EXT = '.pyd' if IS_WINDOWS else '.so'
|
||||||
EXEC_EXT = '.exe' if IS_WINDOWS else ''
|
EXEC_EXT = '.exe' if IS_WINDOWS else ''
|
||||||
|
CLIB_PREFIX = '' if IS_WINDOWS else 'lib'
|
||||||
|
CLIB_EXT = '.dll' if IS_WINDOWS else '.so'
|
||||||
SHARED_FLAG = '/DLL' if IS_WINDOWS else '-shared'
|
SHARED_FLAG = '/DLL' if IS_WINDOWS else '-shared'
|
||||||
|
|
||||||
_HERE = os.path.abspath(__file__)
|
_HERE = os.path.abspath(__file__)
|
||||||
_TORCH_PATH = os.path.dirname(os.path.dirname(_HERE))
|
_TORCH_PATH = os.path.dirname(os.path.dirname(_HERE))
|
||||||
TORCH_LIB_PATH = os.path.join(_TORCH_PATH, 'lib')
|
TORCH_LIB_PATH = os.path.join(_TORCH_PATH, 'lib')
|
||||||
|
|
||||||
|
|
||||||
|
BUILD_SPLIT_CUDA = os.getenv('BUILD_SPLIT_CUDA') or (os.path.exists(os.path.join(
|
||||||
|
TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cu{CLIB_EXT}')) and os.path.exists(os.path.join(TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cpp{CLIB_EXT}')))
|
||||||
|
|
||||||
# Taken directly from python stdlib < 3.9
|
# Taken directly from python stdlib < 3.9
|
||||||
# See https://github.com/pytorch/pytorch/issues/48617
|
# See https://github.com/pytorch/pytorch/issues/48617
|
||||||
def _nt_quote_args(args: Optional[List[str]]) -> List[str]:
|
def _nt_quote_args(args: Optional[List[str]]) -> List[str]:
|
||||||
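A standalone sketch of the detection logic introduced above, handy for checking whether a local install shipped the split torch_cuda libraries; the path construction mirrors the diff rather than any public API.

import os
import sys
import torch

IS_WINDOWS = sys.platform == "win32"
CLIB_PREFIX = "" if IS_WINDOWS else "lib"
CLIB_EXT = ".dll" if IS_WINDOWS else ".so"

torch_lib_path = os.path.join(os.path.dirname(torch.__file__), "lib")

split_cuda = bool(os.getenv("BUILD_SPLIT_CUDA")) or (
    os.path.exists(os.path.join(torch_lib_path, f"{CLIB_PREFIX}torch_cuda_cu{CLIB_EXT}"))
    and os.path.exists(os.path.join(torch_lib_path, f"{CLIB_PREFIX}torch_cuda_cpp{CLIB_EXT}"))
)
print("split torch_cuda build:", split_cuda)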
|
|||||||
0
torch/utils/data/datapipes/__init__.py
Executable file → Normal file
0
torch/utils/data/datapipes/iter/__init__.py
Executable file → Normal file
0
torch/utils/data/datapipes/iter/listdirfiles.py
Executable file → Normal file
0
torch/utils/data/datapipes/iter/loadfilesfromdisk.py
Executable file → Normal file
0
torch/utils/data/datapipes/iter/readfilesfromtar.py
Executable file → Normal file
0
torch/utils/data/datapipes/iter/readfilesfromzip.py
Executable file → Normal file
0
torch/utils/data/datapipes/utils/__init__.py
Executable file → Normal file
0
torch/utils/data/datapipes/utils/common.py
Executable file → Normal file
17
torch/utils/hipify/hipify_python.py
Executable file → Normal file
@ -174,7 +174,7 @@ def preprocess_file_and_save_result(
|
|||||||
result = preprocessor(output_directory, filepath, all_files, includes, stats,
|
result = preprocessor(output_directory, filepath, all_files, includes, stats,
|
||||||
hip_clang_launch, is_pytorch_extension, clean_ctx, show_progress)
|
hip_clang_launch, is_pytorch_extension, clean_ctx, show_progress)
|
||||||
|
|
||||||
fin_path = os.path.join(output_directory, filepath)
|
fin_path = os.path.abspath(os.path.join(output_directory, filepath))
|
||||||
# Show what happened
|
# Show what happened
|
||||||
if show_progress:
|
if show_progress:
|
||||||
print(
|
print(
|
||||||
@ -711,7 +711,7 @@ def preprocessor(
|
|||||||
clean_ctx: GeneratedFileCleaner,
|
clean_ctx: GeneratedFileCleaner,
|
||||||
show_progress: bool) -> HipifyResult:
|
show_progress: bool) -> HipifyResult:
|
||||||
""" Executes the CUDA -> HIP conversion on the specified file. """
|
""" Executes the CUDA -> HIP conversion on the specified file. """
|
||||||
fin_path = os.path.join(output_directory, filepath)
|
fin_path = os.path.abspath(os.path.join(output_directory, filepath))
|
||||||
|
|
||||||
with open(fin_path, 'r', encoding='utf-8') as fin:
|
with open(fin_path, 'r', encoding='utf-8') as fin:
|
||||||
if fin.readline() == HIPIFY_C_BREADCRUMB:
|
if fin.readline() == HIPIFY_C_BREADCRUMB:
|
||||||
@ -721,7 +721,7 @@ def preprocessor(
|
|||||||
|
|
||||||
orig_output_source = output_source
|
orig_output_source = output_source
|
||||||
|
|
||||||
fout_path = os.path.join(output_directory, get_hip_file_path(filepath, is_pytorch_extension))
|
fout_path = os.path.abspath(os.path.join(output_directory, get_hip_file_path(filepath, is_pytorch_extension)))
|
||||||
if not os.path.exists(os.path.dirname(fout_path)):
|
if not os.path.exists(os.path.dirname(fout_path)):
|
||||||
clean_ctx.makedirs(os.path.dirname(fout_path))
|
clean_ctx.makedirs(os.path.dirname(fout_path))
|
||||||
|
|
||||||
@ -829,9 +829,14 @@ def preprocessor(
|
|||||||
with open(fout_path, 'r', encoding='utf-8') as fout_old:
|
with open(fout_path, 'r', encoding='utf-8') as fout_old:
|
||||||
do_write = fout_old.read() != output_source
|
do_write = fout_old.read() != output_source
|
||||||
if do_write:
|
if do_write:
|
||||||
with clean_ctx.open(fout_path, 'w', encoding='utf-8') as fout:
|
try:
|
||||||
fout.write(output_source)
|
with clean_ctx.open(fout_path, 'w', encoding='utf-8') as fout:
|
||||||
return {"hipified_path": fout_path, "status": "ok"}
|
fout.write(output_source)
|
||||||
|
return {"hipified_path": fout_path, "status": "ok"}
|
||||||
|
except PermissionError as e:
|
||||||
|
print(f"{bcolors.WARNING}Failed to save {fout_path} with \"{e.strerror}\", leaving {fin_path} unchanged.{bcolors.ENDC}",
|
||||||
|
file=sys.stderr)
|
||||||
|
return {"hipified_path": fin_path, "status": "skipped"}
|
||||||
else:
|
else:
|
||||||
return {"hipified_path": fout_path, "status": "skipped"}
|
return {"hipified_path": fout_path, "status": "skipped"}
|
||||||
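A reduced sketch of the fallback added above: write the converted source only when it changed, and degrade gracefully to the original path when the destination is read-only. The function and variable names are illustrative.

import sys

def write_if_changed(src_path: str, dst_path: str, new_source: str) -> dict:
    # Skip identical output; tolerate read-only destinations like hipify now does.
    try:
        with open(dst_path, "r", encoding="utf-8") as existing:
            if existing.read() == new_source:
                return {"hipified_path": dst_path, "status": "skipped"}
    except FileNotFoundError:
        pass
    try:
        with open(dst_path, "w", encoding="utf-8") as out:
            out.write(new_source)
        return {"hipified_path": dst_path, "status": "ok"}
    except PermissionError as e:
        print(f"Failed to save {dst_path} ({e.strerror}); leaving {src_path} unchanged.",
              file=sys.stderr)
        return {"hipified_path": src_path, "status": "skipped"}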
|
|
||||||
|
|||||||