Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-24 15:44:58 +08:00)

Compare commits: dev/joona/... → v1.5.1 (167 commits)
The commits in this comparison, by SHA1:

3c31d73c87, dfe8cdff5a, e7a6ed8151, fc0dde5db3, 83edd5164a, 833c4201ad, 5579c9e4c2, 367901e1f9, c4903bde61, 7d2fcd505c,
bb33e5fc85, c5424a85dc, 5d01f87e58, 82f549b0a8, f306655d49, 409e42e3b8, 6151405f6c, d01065e50c, 67508dadaa, b54a731c8e,
3920c1d173, 8d48a6490a, 17eae0e0cd, 4a9e45d50e, eb387a0a2b, 420c6dc43d, 39f0a2752a, 366026ab10, 408e158df9, 3598dea7ad,
a5b05e8867, 7fc2433458, aba610b9e8, dc30c519dd, 9bf2aaa659, 25621d05df, 96f218d7dd, f810011c40, 5f8bb352c3, 52469a512b,
c56adee862, 4ff3872a20, d7bdffabed, 9ba0a89489, c164fbccb1, 9a51e477ac, 375566fb78, dfdc788076, 9e6ef814cc, 31461800f6,
e741839b0e, 8eb39c9cfd, b5e4c0993d, 6bc6832bda, 593594839c, cf65c8ef15, ca0dc1fcdc, b58f89b2e4, 87b6685c6b, f746f1b746,
1379415150, 7d638d2596, bad005d331, 16d8a52407, a33b264588, 3a67e00889, 6bd039551d, b6c3058d61, ed908b4fbc, b66e0af58b,
bf8a5ede96, c2bc5c56c5, db3c3ed662, 9de4770bbd, 911a2a6b63, 60375bcfdf, 63dcd9eccc, e8236d2ed4, 0058b1bb7e, 419283e291,
0e6f6ba218, ec8dbaf920, 7e168d134f, 6daae58871, fee0ff1bf6, deaf3b65cf, dca9c2501d, 842cd47416, a30b49085c, 82626f8ad9,
27fddfda4f, 7ecf6a1c10, beb07a44c4, a01c3bd1fe, ffd010f8a0, 8ad59f03a8, ed3640df68, fb88942f6c, 5d05c51887, df5986fbf3,
165403f614, fbf18c34ff, 84f806c821, 94139a7d95, 75e36186b2, f4a0b406dd, e884e720f0, dacdbc22d1, 2a789cd0e0, f9b010f399,
55614ff306, b12579da53, 920e3eb761, bec01e755a, 6a880e1bc9, fa86e32a4e, 5aabaf2b18, 4a707e8f95, db127b21eb, 45313cd9e1,
df531973e1, 9e3c577caa, 5357b8e4d9, 0f23d23db4, 7c24280a3f, 7100f0be13, f7f611c2ec, acb982d0b0, aa8b7ad989, 2d403ed8be,
c25a664f77, ab660ae394, 3c476a8858, 651fa88645,565c3400b4, 3e332778b4, f598738920, 4c6bfa0187, 6f25003682, 752c129fa1,
fb59a9caca, 4d30dbdd35, b7f4a1a397, afda1dc943, d506ae882b, 36e5abe531, 6e6f62230e, 5d15577e6c, 6aa5298c5c, f3df13725b,
4eee3caa11, 4d96463130, 246b824644, 5ca9014318, 48590d6a9b, 75148df1f5, b89eb7c654, 8877885454, e2184ba083, 8ef47ad2f0,
6725b6f503, bcd3f6da1a, 0b3d2f7b7d, f522651a7e, 01c8ef2757, 7cfe68ce3a, 6f3120c6b9
@@ -466,7 +466,7 @@ But if you want to try, then I’d recommend
 # Always install miniconda 3, even if building for Python <3
 new_conda="~/my_new_conda"
 conda_sh="$new_conda/install_miniconda.sh"
-curl -o "$conda_sh" https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+curl -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
 chmod +x "$conda_sh"
 "$conda_sh" -b -p "$MINICONDA_ROOT"
 rm -f "$conda_sh"
@@ -34,8 +34,6 @@ def get_processor_arch_name(cuda_version):
 
 LINUX_PACKAGE_VARIANTS = OrderedDict(
     manywheel=[
-        "2.7m",
-        "2.7mu",
         "3.5m",
         "3.6m",
         "3.7m",
@@ -43,7 +41,7 @@ LINUX_PACKAGE_VARIANTS = OrderedDict(
     ],
     conda=dimensions.STANDARD_PYTHON_VERSIONS,
     libtorch=[
-        "2.7m",
+        "3.7m",
     ],
 )
 
@@ -53,11 +51,21 @@ CONFIG_TREE_DATA = OrderedDict(
         wheel=dimensions.STANDARD_PYTHON_VERSIONS,
         conda=dimensions.STANDARD_PYTHON_VERSIONS,
         libtorch=[
-            "2.7",
+            "3.7",
+        ],
+    )),
+    windows=(dimensions.CUDA_VERSIONS, OrderedDict(
+        wheel=dimensions.STANDARD_PYTHON_VERSIONS,
+        conda=dimensions.STANDARD_PYTHON_VERSIONS,
+        libtorch=[
+            "3.7",
         ],
     )),
 )
 
+CONFIG_TREE_DATA_NO_WINDOWS = CONFIG_TREE_DATA.copy()
+CONFIG_TREE_DATA_NO_WINDOWS.pop("windows")
+
 # GCC config variants:
 #
 # All the nightlies (except libtorch with new gcc ABI) are built with devtoolset7,
@@ -74,6 +82,11 @@ LINUX_GCC_CONFIG_VARIANTS = OrderedDict(
     ],
 )
 
+WINDOWS_LIBTORCH_CONFIG_VARIANTS = [
+    "debug",
+    "release",
+]
+
 
 class TopLevelNode(ConfigNode):
     def __init__(self, node_name, config_tree_data, smoke):
@@ -108,6 +121,8 @@ class PackageFormatConfigNode(ConfigNode):
     def get_children(self):
         if self.find_prop("os_name") == "linux":
             return [LinuxGccConfigNode(self, v) for v in LINUX_GCC_CONFIG_VARIANTS[self.find_prop("package_format")]]
+        elif self.find_prop("os_name") == "windows" and self.find_prop("package_format") == "libtorch":
+            return [WindowsLibtorchConfigNode(self, v) for v in WINDOWS_LIBTORCH_CONFIG_VARIANTS]
         else:
             return [ArchConfigNode(self, v) for v in self.find_prop("cuda_versions")]
 
@@ -129,6 +144,16 @@ class LinuxGccConfigNode(ConfigNode):
         return [ArchConfigNode(self, v) for v in cuda_versions]
 
 
+class WindowsLibtorchConfigNode(ConfigNode):
+    def __init__(self, parent, libtorch_config_variant):
+        super(WindowsLibtorchConfigNode, self).__init__(parent, "LIBTORCH_CONFIG_VARIANT=" + str(libtorch_config_variant))
+
+        self.props["libtorch_config_variant"] = libtorch_config_variant
+
+    def get_children(self):
+        return [ArchConfigNode(self, v) for v in self.find_prop("cuda_versions")]
+
+
 class ArchConfigNode(ConfigNode):
     def __init__(self, parent, cu):
         super(ArchConfigNode, self).__init__(parent, get_processor_arch_name(cu))
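To make the effect of this tree concrete, here is a minimal, self-contained sketch of how an os → package-format → python → cuda tree like CONFIG_TREE_DATA fans out into individual build configurations. This is not the actual cimodel implementation; the data values and traversal are simplified stand-ins.

```python
# Illustrative sketch only: the real code walks ConfigNode subclasses
# (TopLevelNode -> ... -> ArchConfigNode) rather than nested loops.
from collections import OrderedDict

CONFIG_TREE_DATA = OrderedDict(
    macos=([None], OrderedDict(             # no CUDA variants on macOS
        wheel=["3.5", "3.6", "3.7"],
        conda=["3.5", "3.6", "3.7"],
        libtorch=["3.7"],
    )),
    windows=(["cu101", "cu102", None], OrderedDict(
        wheel=["3.5", "3.6", "3.7"],
        conda=["3.5", "3.6", "3.7"],
        libtorch=["3.7"],
    )),
)

def expand(tree):
    """Fan out os -> package format -> python version -> cuda arch."""
    for os_name, (cuda_versions, formats) in tree.items():
        for pkg_format, pythons in formats.items():
            for py in pythons:
                for cu in cuda_versions:
                    yield (os_name, pkg_format, py, cu or "cpu")

for conf in expand(CONFIG_TREE_DATA):
    print(conf)
```

The WindowsLibtorchConfigNode added above inserts one extra level (debug/release) between the package format and the arch for Windows libtorch builds, and CONFIG_TREE_DATA_NO_WINDOWS simply drops the windows subtree for smoke runs.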
@@ -6,7 +6,7 @@ import cimodel.lib.miniutils as miniutils
 
 
 class Conf(object):
-    def __init__(self, os, cuda_version, pydistro, parms, smoke, libtorch_variant, gcc_config_variant):
+    def __init__(self, os, cuda_version, pydistro, parms, smoke, libtorch_variant, gcc_config_variant, libtorch_config_variant):
 
         self.os = os
         self.cuda_version = cuda_version
@@ -15,11 +15,14 @@ class Conf(object):
         self.smoke = smoke
         self.libtorch_variant = libtorch_variant
         self.gcc_config_variant = gcc_config_variant
+        self.libtorch_config_variant = libtorch_config_variant
 
     def gen_build_env_parms(self):
         elems = [self.pydistro] + self.parms + [binary_build_data.get_processor_arch_name(self.cuda_version)]
         if self.gcc_config_variant is not None:
             elems.append(str(self.gcc_config_variant))
+        if self.libtorch_config_variant is not None:
+            elems.append(str(self.libtorch_config_variant))
         return elems
 
     def gen_docker_image(self):
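The net effect of the two appends is that the build-environment name gains optional trailing variant tokens. A small illustrative stand-in (assuming the same field meanings as the Conf class above):

```python
# Sketch of gen_build_env_parms after this change; not the real class.
def gen_build_env_parms(pydistro, parms, arch, gcc_variant, libtorch_config_variant):
    elems = [pydistro] + parms + [arch]
    if gcc_variant is not None:
        elems.append(str(gcc_variant))
    if libtorch_config_variant is not None:
        elems.append(str(libtorch_config_variant))
    return elems

# e.g. a Windows libtorch debug build:
print(" ".join(gen_build_env_parms("libtorch", ["3.7"], "cpu", None, "debug")))
# -> "libtorch 3.7 cpu debug"
```

This is exactly the shape of the BUILD_ENVIRONMENT strings ("libtorch 3.7 cpu debug", etc.) that the new Windows workflow jobs below consume.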
@@ -67,9 +70,14 @@ class Conf(object):
             job_def["requires"].append("update_s3_htmls_for_nightlies_devtoolset7")
             job_def["filters"] = {"branches": {"only": "postnightly"}}
         else:
+            filter_branches = ["nightly"]
+            # we only want to add the release branch filter if we aren't
+            # uploading
+            if phase not in ["upload"]:
+                filter_branches.append(r"/release\/.*/")
             job_def["filters"] = {
                 "branches": {
-                    "only": "nightly"
+                    "only": filter_branches
                 },
                 # Will run on tags like v1.5.0-rc1, etc.
                 "tags": {
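For clarity, the branch-filter logic introduced here, isolated as a sketch:

```python
# Nightly builds always run; release/* branches additionally run every
# phase except upload (mirrors the hunk above).
def branch_filters(phase):
    filter_branches = ["nightly"]
    if phase not in ["upload"]:
        filter_branches.append(r"/release\/.*/")
    return {"branches": {"only": filter_branches}}

print(branch_filters("build"))   # {'branches': {'only': ['nightly', '/release\\/.*/']}}
print(branch_filters("upload"))  # {'branches': {'only': ['nightly']}}
```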
@@ -105,11 +113,18 @@ class Conf(object):
 
 def get_root(smoke, name):
 
-    return binary_build_data.TopLevelNode(
-        name,
-        binary_build_data.CONFIG_TREE_DATA,
-        smoke,
-    )
+    if smoke:
+        return binary_build_data.TopLevelNode(
+            name,
+            binary_build_data.CONFIG_TREE_DATA_NO_WINDOWS,
+            smoke,
+        )
+    else:
+        return binary_build_data.TopLevelNode(
+            name,
+            binary_build_data.CONFIG_TREE_DATA,
+            smoke,
+        )
 
 
 def gen_build_env_list(smoke):
@@ -127,6 +142,7 @@ def gen_build_env_list(smoke):
             c.find_prop("smoke"),
             c.find_prop("libtorch_variant"),
             c.find_prop("gcc_config_variant"),
+            c.find_prop("libtorch_config_variant"),
         )
         newlist.append(conf)
 
@@ -4,7 +4,6 @@ from cimodel.lib.conf_tree import Ver
 
 CONFIG_TREE_DATA = [
     (Ver("ubuntu", "16.04"), [
-        ([Ver("gcc", "5")], [XImportant("onnx_py2")]),
         ([Ver("clang", "7")], [XImportant("onnx_main_py3.6"),
                                XImportant("onnx_ort1_py3.6"),
                                XImportant("onnx_ort2_py3.6")]),
@@ -33,8 +33,7 @@ class Conf:
     # TODO: Eventually we can probably just remove the cudnn7 everywhere.
     def get_cudnn_insertion(self):
 
-        omit = self.language == "onnx_py2" \
-            or self.language == "onnx_main_py3.6" \
+        omit = self.language == "onnx_main_py3.6" \
             or self.language == "onnx_ort1_py3.6" \
             or self.language == "onnx_ort2_py3.6" \
             or set(self.compiler_names).intersection({"android", "mkl", "clang"}) \
@@ -71,11 +70,10 @@ class Conf:
     def gen_docker_image(self):
 
         lang_substitutions = {
-            "onnx_py2": "py2",
             "onnx_main_py3.6": "py3.6",
             "onnx_ort1_py3.6": "py3.6",
             "onnx_ort2_py3.6": "py3.6",
-            "cmake": "py2",
+            "cmake": "py3",
         }
 
         lang = miniutils.override(self.language, lang_substitutions)
@@ -85,7 +83,7 @@ class Conf:
     def gen_workflow_params(self, phase):
         parameters = OrderedDict()
         lang_substitutions = {
-            "onnx_py2": "onnx-py2",
+            "onnx_py3": "onnx-py3",
             "onnx_main_py3.6": "onnx-main-py3.6",
             "onnx_ort1_py3.6": "onnx-ort1-py3.6",
             "onnx_ort2_py3.6": "onnx-ort2-py3.6",
@@ -129,7 +127,7 @@ class Conf:
         job_name = "caffe2_" + self.get_platform() + "_build"
 
         if not self.is_important:
-            job_def["filters"] = {"branches": {"only": ["master", r"/ci-all\/.*/"]}}
+            job_def["filters"] = {"branches": {"only": ["master", r"/ci-all\/.*/", r"/release\/.*/"]}}
         job_def.update(self.gen_workflow_params(phase))
         return {job_name : job_def}
 
@@ -8,7 +8,6 @@ CUDA_VERSIONS = [
 ]
 
 STANDARD_PYTHON_VERSIONS = [
-    "2.7",
     "3.5",
     "3.6",
     "3.7",
@@ -114,7 +114,7 @@ class Conf:
         if not self.is_important:
             # If you update this, update
             # caffe2_build_definitions.py too
-            job_def["filters"] = {"branches": {"only": ["master", r"/ci-all\/.*/"]}}
+            job_def["filters"] = {"branches": {"only": ["master", r"/ci-all\/.*/", r"/release\/.*/"]}}
         job_def.update(self.gen_workflow_params(phase))
 
         return {job_name : job_def}
.circleci/config.yml: 3290 lines changed (diff suppressed because it is too large)
@@ -4,7 +4,7 @@ set -ex
 
 # Optionally install conda
 if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
-  BASE_URL="https://repo.continuum.io/miniconda"
+  BASE_URL="https://repo.anaconda.com/miniconda"
 
   MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1)
 
@@ -10,6 +10,11 @@ retry () {
 if [[ "$(uname)" == Darwin ]]; then
   # macos executor (builds and tests)
   workdir="/Users/distiller/project"
+elif [[ "$OSTYPE" == "msys" ]]; then
+  # windows executor (builds and tests)
+  rm -rf /c/w
+  ln -s "/c/Users/circleci/project" /c/w
+  workdir="/c/w"
 elif [[ -d "/home/circleci/project" ]]; then
   # machine executor (binary tests)
   workdir="/home/circleci/project"
@@ -19,8 +24,14 @@ else
 fi
 
 # It is very important that this stays in sync with binary_populate_env.sh
-export PYTORCH_ROOT="$workdir/pytorch"
-export BUILDER_ROOT="$workdir/builder"
+if [[ "$OSTYPE" == "msys" ]]; then
+  # We need to make the paths as short as possible on Windows
+  export PYTORCH_ROOT="$workdir/p"
+  export BUILDER_ROOT="$workdir/b"
+else
+  export PYTORCH_ROOT="$workdir/pytorch"
+  export BUILDER_ROOT="$workdir/builder"
+fi
 
 # Clone the Pytorch branch
 retry git clone https://github.com/pytorch/pytorch.git "$PYTORCH_ROOT"
@@ -31,9 +31,9 @@ fi
 
 conda_sh="$workdir/install_miniconda.sh"
 if [[ "$(uname)" == Darwin ]]; then
-  curl --retry 3 -o "$conda_sh" https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+  curl --retry 3 -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
 else
-  curl --retry 3 -o "$conda_sh" https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+  curl --retry 3 -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
 fi
 chmod +x "$conda_sh"
 "$conda_sh" -b -p "$MINICONDA_ROOT"
@@ -2,11 +2,31 @@
 set -eux -o pipefail
 export TZ=UTC
 
+tagged_version() {
+  # Grabs version from either the env variable CIRCLE_TAG
+  # or the pytorch git described version
+  if [[ "$OSTYPE" == "msys" ]]; then
+    GIT_DESCRIBE="git --git-dir ${workdir}/p/.git describe"
+  else
+    GIT_DESCRIBE="git --git-dir ${workdir}/pytorch/.git describe"
+  fi
+  if [[ -n "${CIRCLE_TAG:-}" ]]; then
+    echo "${CIRCLE_TAG}"
+  elif ${GIT_DESCRIBE} --exact --tags >/dev/null; then
+    ${GIT_DESCRIBE} --tags
+  else
+    return 1
+  fi
+}
+
 # We need to write an envfile to persist these variables to following
 # steps, but the location of the envfile depends on the circleci executor
 if [[ "$(uname)" == Darwin ]]; then
   # macos executor (builds and tests)
   workdir="/Users/distiller/project"
+elif [[ "$OSTYPE" == "msys" ]]; then
+  # windows executor (builds and tests)
+  workdir="/c/w"
 elif [[ -d "/home/circleci/project" ]]; then
   # machine executor (binary tests)
   workdir="/home/circleci/project"
@@ -23,7 +43,15 @@ configs=($BUILD_ENVIRONMENT)
 export PACKAGE_TYPE="${configs[0]}"
 export DESIRED_PYTHON="${configs[1]}"
 export DESIRED_CUDA="${configs[2]}"
-export DESIRED_DEVTOOLSET="${configs[3]:-}"
+if [[ "${BUILD_FOR_SYSTEM:-}" == "windows" ]]; then
+  export DESIRED_DEVTOOLSET=""
+  export LIBTORCH_CONFIG="${configs[3]:-}"
+  if [[ "$LIBTORCH_CONFIG" == 'debug' ]]; then
+    export DEBUG=1
+  fi
+else
+  export DESIRED_DEVTOOLSET="${configs[3]:-}"
+fi
 if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
   export BUILD_PYTHONLESS=1
 fi
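In other words, BUILD_ENVIRONMENT is a space-separated tuple whose fourth token is overloaded: a devtoolset on Linux, a libtorch config on Windows. An illustrative Python rendering of the parse (names mirror the shell variables; the dict return is this sketch's invention):

```python
# Sketch only; the real logic is the bash above in binary_populate_env.sh.
def parse_build_environment(build_environment, build_for_system=None):
    configs = build_environment.split()
    env = {
        "PACKAGE_TYPE": configs[0],
        "DESIRED_PYTHON": configs[1],
        "DESIRED_CUDA": configs[2],
    }
    if build_for_system == "windows":
        env["DESIRED_DEVTOOLSET"] = ""
        env["LIBTORCH_CONFIG"] = configs[3] if len(configs) > 3 else ""
        if env["LIBTORCH_CONFIG"] == "debug":
            env["DEBUG"] = "1"
    else:
        env["DESIRED_DEVTOOLSET"] = configs[3] if len(configs) > 3 else ""
    return env

print(parse_build_environment("libtorch 3.7 cpu debug", "windows"))
```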
@@ -47,15 +75,17 @@ export DATE="$(date -u +%Y%m%d)"
 #TODO: We should be pulling semver version from the base version.txt
 BASE_BUILD_VERSION="1.5.0.dev$DATE"
 # Change BASE_BUILD_VERSION to git tag when on a git tag
-if git describe --tags --exact >/dev/null 2>/dev/null; then
+# Use 'git -C' to make doubly sure we're in the correct directory for checking
+# the git tag
+if tagged_version >/dev/null; then
   # Switch upload folder to 'test/' if we are on a tag
   PIP_UPLOAD_FOLDER='test/'
   # Grab git tag, remove prefixed v and remove everything after -
   # Used to clean up tags that are for release candidates like v1.5.0-rc1
   # Turns tag v1.5.0-rc1 -> v1.5.0
-  BASE_BUILD_VERSION="$(git describe --tags | sed -e 's/^v//' -e 's/-.*$//')"
+  BASE_BUILD_VERSION="$(tagged_version | sed -e 's/^v//' -e 's/-.*$//')"
 fi
-if [[ "$(uname)" == 'Darwin' ]] || [[ "$DESIRED_CUDA" == "cu101" ]] || [[ "$PACKAGE_TYPE" == conda ]]; then
+if [[ "$(uname)" == 'Darwin' ]] || [[ "$DESIRED_CUDA" == "cu102" ]] || [[ "$PACKAGE_TYPE" == conda ]]; then
   export PYTORCH_BUILD_VERSION="${BASE_BUILD_VERSION}"
 else
   export PYTORCH_BUILD_VERSION="${BASE_BUILD_VERSION}+$DESIRED_CUDA"
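The sed pipeline that cleans BASE_BUILD_VERSION strips the leading "v" and anything after the first "-". Equivalent logic in Python, for reference:

```python
import re

def clean_tag(tag):
    # "s/^v//" then "s/-.*$//", matching the sed expressions above
    return re.sub(r"-.*$", "", re.sub(r"^v", "", tag))

assert clean_tag("v1.5.0-rc1") == "1.5.0"
assert clean_tag("v1.5.1") == "1.5.1"
```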
@@ -94,6 +124,10 @@ export DESIRED_CUDA="$DESIRED_CUDA"
 export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
 export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
 export DESIRED_DEVTOOLSET="$DESIRED_DEVTOOLSET"
+if [[ "${BUILD_FOR_SYSTEM:-}" == "windows" ]]; then
+  export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
+  export DEBUG="${DEBUG:-}"
+fi
 
 export DATE="$DATE"
 export NIGHTLIES_DATE_PREAMBLE=1.5.0.dev
@@ -113,8 +147,13 @@ export DOCKER_IMAGE="$DOCKER_IMAGE"
 
 export workdir="$workdir"
 export MAC_PACKAGE_WORK_DIR="$workdir"
-export PYTORCH_ROOT="$workdir/pytorch"
-export BUILDER_ROOT="$workdir/builder"
+if [[ "$OSTYPE" == "msys" ]]; then
+  export PYTORCH_ROOT="$workdir/p"
+  export BUILDER_ROOT="$workdir/b"
+else
+  export PYTORCH_ROOT="$workdir/pytorch"
+  export BUILDER_ROOT="$workdir/builder"
+fi
 export MINICONDA_ROOT="$workdir/miniconda"
 export PYTORCH_FINAL_PACKAGE_DIR="$workdir/final_pkgs"
 
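The diff itself only says the Windows checkout roots must be "as short as possible"; a plausible reason (an assumption, not stated in the source) is Windows' historical 260-character MAX_PATH limit, which deeply nested build outputs can approach. A quick illustrative check:

```python
# Hypothetical nested path, chosen only to illustrate the headroom gained;
# the MAX_PATH motivation is an assumption, not stated in the diff.
nested = "build/third_party/protobuf/cmake/CMakeFiles/libprotobuf.dir/io/zero_copy_stream_impl_lite.cc.obj"
for root in ("C:/w/pytorch/", "C:/w/p/"):
    print(root, len(root + nested))
```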
.circleci/scripts/binary_windows_build.sh (new file, 33 lines)

@@ -0,0 +1,33 @@
+#!/bin/bash
+set -eux -o pipefail
+
+source "/c/w/env"
+mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
+
+export CUDA_VERSION="${DESIRED_CUDA/cu/}"
+export VC_YEAR=2017
+export USE_SCCACHE=1
+export SCCACHE_BUCKET=ossci-compiler-cache-windows
+export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
+
+set +x
+export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}
+export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}
+set -x
+
+if [[ "$CIRCLECI" == 'true' && -d "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019" ]]; then
+  rm -rf "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019"
+fi
+
+echo "Free space on filesystem before build:"
+df -h
+
+pushd "$BUILDER_ROOT"
+if [[ "$PACKAGE_TYPE" == 'conda' ]]; then
+  ./windows/internal/build_conda.bat
+elif [[ "$PACKAGE_TYPE" == 'wheel' || "$PACKAGE_TYPE" == 'libtorch' ]]; then
+  ./windows/internal/build_wheels.bat
+fi
+
+echo "Free space on filesystem after build:"
+df -h
.circleci/scripts/binary_windows_upload.sh (new file, 37 lines)

@@ -0,0 +1,37 @@
+#!/bin/bash
+set -eu -o pipefail
+set +x
+declare -x "AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}"
+declare -x "AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}"
+
+#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!
+# DO NOT TURN -x ON BEFORE THIS LINE
+#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!
+set -eux -o pipefail
+
+source "/env"
+
+# This gets set in binary_populate_env.sh, but lets have a sane default just in case
+PIP_UPLOAD_FOLDER=${PIP_UPLOAD_FOLDER:-nightly/}
+# TODO: Combine CONDA_UPLOAD_CHANNEL and PIP_UPLOAD_FOLDER into one variable
+# The only difference is the trailing slash
+# Strip trailing slashes if there
+CONDA_UPLOAD_CHANNEL=$(echo "${PIP_UPLOAD_FOLDER}" | sed 's:/*$::')
+
+pushd /root/workspace/final_pkgs
+# Upload the package to the final location
+if [[ "$PACKAGE_TYPE" == conda ]]; then
+  retry conda install -yq anaconda-client
+  anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload "$(ls)" -u "pytorch-${CONDA_UPLOAD_CHANNEL}" --label main --no-progress --force
+elif [[ "$PACKAGE_TYPE" == libtorch ]]; then
+  retry conda install -c conda-forge -yq awscli
+  s3_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
+  for pkg in $(ls); do
+    retry aws s3 cp "$pkg" "$s3_dir" --acl public-read
+  done
+else
+  retry conda install -c conda-forge -yq awscli
+  s3_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
+  retry aws s3 cp "$(ls)" "$s3_dir" --acl public-read
+fi
+
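The upload script routes packages by type: conda packages go to an anaconda channel, everything else to S3, with the channel name derived from PIP_UPLOAD_FOLDER by stripping the trailing slash. A compact sketch of just the destination selection (function name and return format are illustrative):

```python
# Sketch of the routing in binary_windows_upload.sh, not a real uploader.
def upload_destination(package_type, pip_upload_folder="nightly/", desired_cuda="cpu"):
    conda_channel = pip_upload_folder.rstrip("/")  # the sed 's:/*$::' step
    if package_type == "conda":
        return f"anaconda channel: pytorch-{conda_channel}"
    prefix = "libtorch" if package_type == "libtorch" else "whl"
    return f"s3://pytorch/{prefix}/{pip_upload_folder}{desired_cuda}/"

print(upload_destination("libtorch", desired_cuda="cu102"))
# -> s3://pytorch/libtorch/nightly/cu102/
```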
@@ -72,10 +72,10 @@ time python tools/setup_helpers/generate_code.py \
 
 # Build the docs
 pushd docs/cpp
-pip install breathe>=4.13.0 bs4 lxml six
+pip install breathe==4.13.0 bs4 lxml six
 pip install --no-cache-dir -e "git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme"
 pip install exhale>=0.2.1
-pip install sphinx>=2.0
+pip install sphinx==2.4.4
 # Uncomment once it is fixed
 # pip install -r requirements.txt
 time make VERBOSE=1 html -j
@@ -52,3 +52,12 @@ binary_mac_params: &binary_mac_params
   environment:
     BUILD_ENVIRONMENT: << parameters.build_environment >>
 
+binary_windows_params: &binary_windows_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+    BUILD_FOR_SYSTEM: windows
+
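binary_windows_params is a YAML anchor that the new Windows jobs merge in via `<<:`. Roughly equivalent to a Python dict merge (illustrative values only):

```python
# Rough analogue of the YAML anchor/merge; not how CircleCI evaluates it.
binary_windows_params = {
    "parameters": {"build_environment": {"type": "string", "default": ""}},
    "environment": {"BUILD_ENVIRONMENT": "<< parameters.build_environment >>",
                    "BUILD_FOR_SYSTEM": "windows"},
}
binary_windows_build = {**binary_windows_params,
                        "executor": {"name": "windows-cpu-with-nvidia-cuda"}}
print(sorted(binary_windows_build))  # environment, executor, parameters
```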
@@ -275,3 +275,46 @@
           script="/Users/distiller/project/.circleci/scripts/binary_ios_upload.sh"
           cat "$script"
           source "$script"
+
+  binary_windows_build:
+    <<: *binary_windows_params
+    executor:
+      name: windows-cpu-with-nvidia-cuda
+    steps:
+      # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
+      - attach_scripts
+      - run:
+          <<: *binary_checkout
+      - run:
+          <<: *binary_populate_env
+      - run:
+          name: Build
+          no_output_timeout: "1h"
+          command: |
+            set -eux -o pipefail
+            script="/c/w/p/.circleci/scripts/binary_windows_build.sh"
+            cat "$script"
+            source "$script"
+      - persist_to_workspace:
+          root: "C:/w"
+          paths: final_pkgs
+
+  binary_windows_upload:
+    <<: *binary_windows_params
+    docker:
+      - image: continuumio/miniconda
+    steps:
+      # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
+      - attach_scripts
+      - run:
+          <<: *binary_checkout
+      - run:
+          <<: *binary_populate_env
+      - run:
+          name: Upload
+          no_output_timeout: "10m"
+          command: |
+            set -eux -o pipefail
+            script="/pytorch/.circleci/scripts/binary_windows_upload.sh"
+            cat "$script"
+            source "$script"
@@ -151,7 +151,7 @@
 # Install Anaconda if we need to
 if [ -n "${CAFFE2_USE_ANACONDA}" ]; then
     rm -rf ${TMPDIR}/anaconda
-    curl --retry 3 -o ${TMPDIR}/conda.sh https://repo.continuum.io/miniconda/Miniconda${ANACONDA_VERSION}-latest-MacOSX-x86_64.sh
+    curl --retry 3 -o ${TMPDIR}/conda.sh https://repo.anaconda.com/miniconda/Miniconda${ANACONDA_VERSION}-latest-MacOSX-x86_64.sh
     chmod +x ${TMPDIR}/conda.sh
     /bin/bash ${TMPDIR}/conda.sh -b -p ${TMPDIR}/anaconda
     rm -f ${TMPDIR}/conda.sh
@@ -20,16 +20,16 @@ jobs:
       export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE})
 
       # TODO We may want to move the rebase logic to a separate step after checkout
-      # Rebase to master only if in xenial_py3_6_gcc5_4 case
-      if [[ "${CIRCLE_BRANCH}" != "master" && "${BUILD_ENVIRONMENT}" == *"gcc5"* ]]; then
-        echo "Merge master branch into $CIRCLE_BRANCH before build in environment $BUILD_ENVIRONMENT"
+      # Rebase to release/1.5 only if in xenial_py3_6_gcc5_4 case
+      if [[ "${CIRCLE_BRANCH}" != "release/1.5" && "${BUILD_ENVIRONMENT}" == *"gcc5"* ]]; then
+        echo "Merge release/1.5 branch into $CIRCLE_BRANCH before build in environment $BUILD_ENVIRONMENT"
         set -x
         git config --global user.email "circleci.ossci@gmail.com"
         git config --global user.name "CircleCI"
         git config remote.origin.url https://github.com/pytorch/pytorch.git
-        git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master
-        git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet
-        export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/master`
+        git config --add remote.origin.fetch +refs/heads/release/1.5:refs/remotes/origin/release/1.5
+        git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/release/1.5:refs/remotes/origin/release/1.5 --depth=100 --quiet
+        export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/release/1.5`
         echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET}
         export GIT_COMMIT=${CIRCLE_SHA1}
         echo "GIT_COMMIT: " ${GIT_COMMIT}
@@ -38,7 +38,7 @@ jobs:
         git merge --allow-unrelated-histories --no-edit --no-ff ${GIT_MERGE_TARGET}
         set +x
       else
-        echo "Do NOT merge master branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT"
+        echo "Do NOT merge release/1.5 branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT"
       fi
 
       git submodule sync && git submodule update -q --init --recursive
@@ -15,6 +15,7 @@
             only:
               - master
               - /ci-all\/.*/
+              - /release\/.*/
     - pytorch_windows_test:
         name: pytorch_windows_vs2017_14.11_py36_cuda10.1_test1
         test_name: pytorch-windows-test1
@@ -32,6 +33,7 @@
             only:
               - master
               - /ci-all\/.*/
+              - /release\/.*/
     - pytorch_windows_test:
         name: pytorch_windows_vs2017_14.11_py36_cuda10.1_test2
         test_name: pytorch-windows-test2
@@ -49,6 +51,7 @@
             only:
               - master
               - /ci-all\/.*/
+              - /release\/.*/
     - pytorch_windows_build:
         name: pytorch_windows_vs2017_14.16_py36_cuda10.1_build
         cuda_version: "10"
@@ -64,6 +67,7 @@
             only:
               - master
               - /ci-all\/.*/
+              - /release\/.*/
     - pytorch_windows_test:
         name: pytorch_windows_vs2017_14.16_py36_cuda10.1_test1
         test_name: pytorch-windows-test1
@@ -81,6 +85,7 @@
             only:
               - master
               - /ci-all\/.*/
+              - /release\/.*/
     - pytorch_windows_test:
         name: pytorch_windows_vs2017_14.16_py36_cuda10.1_test2
         test_name: pytorch-windows-test2
@@ -98,6 +103,7 @@
             only:
               - master
               - /ci-all\/.*/
+              - /release\/.*/
    - pytorch_windows_build:
        name: pytorch_windows_vs2019_py36_cuda10.1_build
        cuda_version: "10"
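The `/release\/.*/` entries added throughout are regex branch filters (CircleCI treats slash-delimited `only:` entries as regular expressions, and bare entries as exact names). A quick illustrative check of what the combined filter admits:

```python
import re

# Patterns shown with the surrounding slashes already stripped (illustrative).
only = ["master", r"ci-all/.*", r"release/.*"]
for branch in ["master", "release/1.5", "ci-all/foo", "my-feature"]:
    ok = any(re.fullmatch(pat, branch) for pat in only)
    print(branch, "->", "runs" if ok else "skipped")
```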
@@ -7,12 +7,6 @@
 # pytorch-ci-hud to adjust the list of whitelisted builds
 # at https://github.com/ezyang/pytorch-ci-hud/blob/master/src/BuildHistoryDisplay.js
 
-    - binary_linux_build:
-        name: binary_linux_manywheel_2_7mu_cpu_devtoolset7_build
-        build_environment: "manywheel 2.7mu cpu devtoolset7"
-        requires:
-          - setup
-        docker_image: "pytorch/manylinux-cuda102"
     - binary_linux_build:
         name: binary_linux_manywheel_3_7m_cu102_devtoolset7_build
         build_environment: "manywheel 3.7m cu102 devtoolset7"
@@ -23,24 +17,21 @@
         branches:
           only:
             - master
-    - binary_linux_build:
-        name: binary_linux_conda_2_7_cpu_devtoolset7_build
-        build_environment: "conda 2.7 cpu devtoolset7"
-        requires:
-          - setup
-        docker_image: "pytorch/conda-cuda"
+            - /ci-all\/.*/
+            - /release\/.*/
     # This binary build is currently broken, see https://github.com/pytorch/pytorch/issues/16710
     # - binary_linux_conda_3_6_cu90_devtoolset7_build
+    # TODO rename to remove python version for libtorch
     - binary_linux_build:
-        name: binary_linux_libtorch_2_7m_cpu_devtoolset7_shared-with-deps_build
-        build_environment: "libtorch 2.7m cpu devtoolset7"
+        name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build
+        build_environment: "libtorch 3.7m cpu devtoolset7"
         requires:
           - setup
         libtorch_variant: "shared-with-deps"
         docker_image: "pytorch/manylinux-cuda102"
     - binary_linux_build:
-        name: binary_linux_libtorch_2_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build
-        build_environment: "libtorch 2.7m cpu gcc5.4_cxx11-abi"
+        name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build
+        build_environment: "libtorch 3.7m cpu gcc5.4_cxx11-abi"
         requires:
           - setup
         libtorch_variant: "shared-with-deps"
@@ -48,45 +39,51 @@
     # TODO we should test a libtorch cuda build, but they take too long
     # - binary_linux_libtorch_2_7m_cu90_devtoolset7_static-without-deps_build
     - binary_mac_build:
-        name: binary_macos_wheel_3_6_cpu_build
-        build_environment: "wheel 3.6 cpu"
-        requires:
-          - setup
-        filters:
-          branches:
-            only:
-              - master
-    - binary_mac_build:
-        name: binary_macos_conda_2_7_cpu_build
-        build_environment: "conda 2.7 cpu"
+        name: binary_macos_wheel_3_7_cpu_build
+        build_environment: "wheel 3.7 cpu"
         requires:
           - setup
         filters:
           branches:
             only:
               - master
+              - /ci-all\/.*/
+              - /release\/.*/
     # This job has an average run time of 3 hours o.O
     # Now only running this on master to reduce overhead
+    # TODO rename to remove python version for libtorch
     - binary_mac_build:
-        name: binary_macos_libtorch_2_7_cpu_build
-        build_environment: "libtorch 2.7 cpu"
+        name: binary_macos_libtorch_3_7_cpu_build
+        build_environment: "libtorch 3.7 cpu"
         requires:
           - setup
         filters:
           branches:
             only:
               - master
-    - binary_linux_test:
-        name: binary_linux_manywheel_2_7mu_cpu_devtoolset7_test
-        build_environment: "manywheel 2.7mu cpu devtoolset7"
+              - /ci-all\/.*/
+              - /release\/.*/
+    - binary_windows_build:
+        name: binary_windows_libtorch_3_7_cpu_debug_build
+        build_environment: "libtorch 3.7 cpu debug"
+        requires:
+          - setup
+    - binary_windows_build:
+        name: binary_windows_libtorch_3_7_cpu_release_build
+        build_environment: "libtorch 3.7 cpu release"
+        requires:
+          - setup
+    - binary_windows_build:
+        name: binary_windows_wheel_3_7_cu102_build
+        build_environment: "wheel 3.7 cu102"
         requires:
           - setup
-          - binary_linux_manywheel_2_7mu_cpu_devtoolset7_build
-        docker_image: "pytorch/manylinux-cuda102"
         filters:
           branches:
             only:
               - master
+              - /ci-all\/.*/
+              - /release\/.*/
     - binary_linux_test:
         name: binary_linux_manywheel_3_7m_cu102_devtoolset7_test
         build_environment: "manywheel 3.7m cu102 devtoolset7"
@@ -100,29 +97,25 @@
         branches:
           only:
             - master
-    - binary_linux_test:
-        name: binary_linux_conda_2_7_cpu_devtoolset7_test
-        build_environment: "conda 2.7 cpu devtoolset7"
-        requires:
-          - setup
-          - binary_linux_conda_2_7_cpu_devtoolset7_build
-        docker_image: "pytorch/conda-cuda"
+            - /ci-all\/.*/
+            - /release\/.*/
     # This binary build is currently broken, see https://github.com/pytorch/pytorch/issues/16710
     # - binary_linux_conda_3_6_cu90_devtoolset7_test:
+    # TODO rename to remove python version for libtorch
     - binary_linux_test:
-        name: binary_linux_libtorch_2_7m_cpu_devtoolset7_shared-with-deps_test
-        build_environment: "libtorch 2.7m cpu devtoolset7"
+        name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test
+        build_environment: "libtorch 3.7m cpu devtoolset7"
         requires:
           - setup
-          - binary_linux_libtorch_2_7m_cpu_devtoolset7_shared-with-deps_build
+          - binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build
         libtorch_variant: "shared-with-deps"
         docker_image: "pytorch/manylinux-cuda102"
     - binary_linux_test:
-        name: binary_linux_libtorch_2_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test
-        build_environment: "libtorch 2.7m cpu gcc5.4_cxx11-abi"
+        name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test
+        build_environment: "libtorch 3.7m cpu gcc5.4_cxx11-abi"
         requires:
           - setup
-          - binary_linux_libtorch_2_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build
+          - binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build
         libtorch_variant: "shared-with-deps"
         docker_image: "pytorch/pytorch-binary-docker-image-ubuntu16.04:latest"
 
@@ -20,21 +20,12 @@
     - docker_build_job:
         name: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
         image_name: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
-    - docker_build_job:
-        name: "pytorch-linux-xenial-cuda9-cudnn7-py2"
-        image_name: "pytorch-linux-xenial-cuda9-cudnn7-py2"
     - docker_build_job:
         name: "pytorch-linux-xenial-cuda9-cudnn7-py3"
         image_name: "pytorch-linux-xenial-cuda9-cudnn7-py3"
     - docker_build_job:
         name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
         image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
-    - docker_build_job:
-        name: "pytorch-linux-xenial-py2.7.9"
-        image_name: "pytorch-linux-xenial-py2.7.9"
-    - docker_build_job:
-        name: "pytorch-linux-xenial-py2.7"
-        image_name: "pytorch-linux-xenial-py2.7"
     - docker_build_job:
         name: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
         image_name: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
@@ -4,6 +4,8 @@
         branches:
           only:
             - master
+            - /ci-all\/.*/
+            - /release\/.*/
       requires:
         - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build
 
@@ -13,6 +15,8 @@
         branches:
           only:
             - master
+            - /ci-all\/.*/
+            - /release\/.*/
       requires:
         - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build
         - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build
@@ -7,10 +7,10 @@
         docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:f990c76a-a798-42bb-852f-5be5006f8026"
         resource_class: large
     - pytorch_linux_test:
-        name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test
+        name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_profiling_test
         requires:
           - setup
           - pytorch_linux_xenial_py3_6_gcc5_4_build
-        build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-ge_config_simple-test"
+        build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-ge_config_profiling-test"
         docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:f990c76a-a798-42bb-852f-5be5006f8026"
         resource_class: large
@@ -31,6 +31,7 @@
           only:
             - master
             - /ci-all\/.*/
+            - /release\/.*/
       build_environment: "pytorch-linux-xenial-py3-clang5-mobile-code-analysis"
       build_only: "1"
       # Use LLVM-DEV toolchain in android-ndk-r19c docker image
.github/workflows/lint.yml (vendored, 42 lines changed)

@@ -67,7 +67,7 @@ jobs:
     - name: Run flake8
       run: |
         set -eux
-        pip install flake8 flake8-mypy flake8-bugbear flake8-comprehensions flake8-executable flake8-pyi mccabe pycodestyle pyflakes
+        pip install flake8==3.7.9 flake8-mypy flake8-bugbear flake8-comprehensions flake8-executable flake8-pyi mccabe pycodestyle==2.5.0 pyflakes==2.1.1
         flake8 --version
         flake8 --exit-zero > ${GITHUB_WORKSPACE}/flake8-output.txt
         cat ${GITHUB_WORKSPACE}/flake8-output.txt
@@ -81,44 +81,6 @@ jobs:
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-  flake8-py2:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Setup Python
-        uses: actions/setup-python@v1
-        with:
-          python-version: 2.x
-          architecture: x64
-      - name: Fetch PyTorch
-        uses: actions/checkout@v1
-      - name: Checkout PR tip
-        run: |
-          set -eux
-          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
-            # We are on a PR, so actions/checkout leaves us on a merge commit.
-            # Check out the actual tip of the branch.
-            git checkout ${{ github.event.pull_request.head.sha }}
-          fi
-          echo ::set-output name=commit_sha::$(git rev-parse HEAD)
-        id: get_pr_tip
-      - name: Run flake8
-        run: |
-          set -eux
-          pip install flake8
-          rm -rf .circleci tools/clang_format_new.py
-          flake8 --exit-zero > ${GITHUB_WORKSPACE}/flake8-output.txt
-          cat ${GITHUB_WORKSPACE}/flake8-output.txt
-      - name: Add annotations
-        uses: pytorch/add-annotations-github-action@master
-        with:
-          check_name: 'flake8-py2'
-          linter_output_path: 'flake8-output.txt'
-          commit_sha: ${{ steps.get_pr_tip.outputs.commit_sha }}
-          regex: '^(?<filename>.*?):(?<lineNumber>\d+):(?<columnNumber>\d+): (?<errorCode>\w\d+) (?<errorDesc>.*)'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-
   clang-tidy:
     if: github.event_name == 'pull_request'
     runs-on: ubuntu-latest
@@ -198,6 +160,8 @@ jobs:
           -g"-torch/csrc/jit/export.cpp" \
           -g"-torch/csrc/jit/import.cpp" \
           -g"-torch/csrc/jit/netdef_converter.cpp" \
+          -g"-torch/csrc/cuda/nccl.*" \
+          -g"-torch/csrc/cuda/python_nccl.cpp" \
           "$@" > ${GITHUB_WORKSPACE}/clang-tidy-output.txt
 
         cat ${GITHUB_WORKSPACE}/clang-tidy-output.txt
@@ -6,6 +6,69 @@ TEST_DIR="$ROOT_DIR/caffe2_tests"
 gtest_reports_dir="${TEST_DIR}/cpp"
 pytest_reports_dir="${TEST_DIR}/python"
 
+# This is needed to work around ROCm using old docker images until
+# the transition to new images is complete.
+# TODO: Remove once ROCm CI is using new images.
+if [[ $BUILD_ENVIRONMENT == py3.6-devtoolset7-rocmrpm-centos* ]]; then
+  # This file is sourced multiple times, only install conda the first time.
+  # We must install conda where we have write access.
+  CONDA_DIR="$ROOT_DIR/conda"
+  if [[ ! -d $CONDA_DIR ]]; then
+    ANACONDA_PYTHON_VERSION=3.6
+    BASE_URL="https://repo.anaconda.com/miniconda"
+    CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
+    mkdir $CONDA_DIR
+    pushd /tmp
+    wget -q "${BASE_URL}/${CONDA_FILE}"
+    chmod +x "${CONDA_FILE}"
+    ./"${CONDA_FILE}" -b -f -p "$CONDA_DIR"
+    popd
+    export PATH="$CONDA_DIR/bin:$PATH"
+    # Ensure we run conda in a directory that jenkins has write access to
+    pushd $CONDA_DIR
+    # Track latest conda update
+    conda update -n base conda
+    # Install correct Python version
+    conda install python="$ANACONDA_PYTHON_VERSION"
+
+    conda_install() {
+      # Ensure that the install command don't upgrade/downgrade Python
+      # This should be called as
+      # conda_install pkg1 pkg2 ... [-c channel]
+      conda install -q -y python="$ANACONDA_PYTHON_VERSION" $*
+    }
+
+    # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
+    conda_install numpy pyyaml mkl mkl-include setuptools cffi typing future six
+
+    # TODO: This isn't working atm
+    conda_install nnpack -c killeent
+
+    # Install some other packages
+
+    # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by
+    # defaults installs the most recent networkx version, so we install this lower
+    # version explicitly before scikit-image pulls it in as a dependency
+    pip install networkx==2.0
+
+    # TODO: Why is scipy pinned
+    # numba & llvmlite is pinned because of https://github.com/numba/numba/issues/4368
+    # scikit-learn is pinned because of
+    # https://github.com/scikit-learn/scikit-learn/issues/14485 (affects gcc 5.5
+    # only)
+    pip install --progress-bar off pytest scipy==1.1.0 scikit-learn==0.20.3 scikit-image librosa>=0.6.2 psutil numba==0.46.0 llvmlite==0.30.0
+
+    # click - onnx
+    # hypothesis - tests
+    # jupyter - for tutorials
+    pip install --progress-bar off click hypothesis jupyter protobuf tabulate virtualenv mock typing-extensions
+
+    popd
+  else
+    export PATH="$CONDA_DIR/bin:$PATH"
+  fi
+fi
+
 # Figure out which Python to use
 PYTHON="$(which python)"
 if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
@@ -144,7 +144,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
   # default pip version is too old(9.0.2), unable to support tag `manylinux2010`.
   # Fix the pip error: Couldn't find a version that satisfies the requirement
   sudo pip install --upgrade pip
-  pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.1.0.dev1228
+  pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.3.0.dev202005123
   fi
   "$ROOT_DIR/scripts/onnx/test.sh"
 fi
@@ -167,7 +167,7 @@ fi
 
 # Patch required to build xla
 if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
-  git clone --recursive https://github.com/pytorch/xla.git
+  git clone --recursive -b r1.5 https://github.com/pytorch/xla.git
   ./xla/scripts/apply_patches.sh
 fi
 
@@ -259,7 +259,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
   # XLA build requires Bazel
   # We use bazelisk to avoid updating Bazel version manually.
   sudo npm install -g @bazel/bazelisk
-  sudo ln -s "$(command -v bazelisk)" /usr/bin/bazel
+  sudo ln -sf "$(command -v bazelisk)" /usr/bin/bazel
 
   # Install bazels3cache for cloud cache
   sudo npm install -g bazels3cache
@ -13,12 +13,12 @@ mkdir -p ${WORKSPACE_DIR}
|
|||||||
# If a local installation of conda doesn't exist, we download and install conda
|
# If a local installation of conda doesn't exist, we download and install conda
|
||||||
if [ ! -d "${WORKSPACE_DIR}/miniconda3" ]; then
|
if [ ! -d "${WORKSPACE_DIR}/miniconda3" ]; then
|
||||||
mkdir -p ${WORKSPACE_DIR}
|
mkdir -p ${WORKSPACE_DIR}
|
||||||
curl --retry 3 https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o ${WORKSPACE_DIR}/miniconda3.sh
|
curl --retry 3 https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o ${WORKSPACE_DIR}/miniconda3.sh
|
||||||
retry bash ${WORKSPACE_DIR}/miniconda3.sh -b -p ${WORKSPACE_DIR}/miniconda3
|
retry bash ${WORKSPACE_DIR}/miniconda3.sh -b -p ${WORKSPACE_DIR}/miniconda3
|
||||||
fi
|
fi
|
||||||
export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
|
export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
|
||||||
source ${WORKSPACE_DIR}/miniconda3/bin/activate
|
source ${WORKSPACE_DIR}/miniconda3/bin/activate
|
||||||
retry conda install -y mkl mkl-include numpy pyyaml setuptools cmake cffi ninja
|
retry conda install -y mkl mkl-include numpy pyyaml=5.3 setuptools=46.0.0 cmake cffi ninja
|
||||||
|
|
||||||
# The torch.hub tests make requests to GitHub.
|
# The torch.hub tests make requests to GitHub.
|
||||||
#
|
#
|
||||||
|
|||||||
@ -20,7 +20,7 @@ if [ -n "${IN_CIRCLECI}" ]; then
|
|||||||
sudo apt-get install -y --allow-downgrades --allow-change-held-packages libnccl-dev=2.5.6-1+cuda10.1 libnccl2=2.5.6-1+cuda10.1
|
sudo apt-get install -y --allow-downgrades --allow-change-held-packages libnccl-dev=2.5.6-1+cuda10.1 libnccl2=2.5.6-1+cuda10.1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda10.1-cudnn7-py3* ]]; then
|
||||||
# TODO: move this to Docker
|
# TODO: move this to Docker
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev
|
sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev
|
||||||
|
|||||||
@ -21,7 +21,7 @@ if [ -n "${IN_CIRCLECI}" ]; then
|
|||||||
sudo apt-get -qq install --allow-downgrades --allow-change-held-packages libnccl-dev=2.5.6-1+cuda10.1 libnccl2=2.5.6-1+cuda10.1
|
sudo apt-get -qq install --allow-downgrades --allow-change-held-packages libnccl-dev=2.5.6-1+cuda10.1 libnccl2=2.5.6-1+cuda10.1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda10.1-cudnn7-py3* ]]; then
|
||||||
# TODO: move this to Docker
|
# TODO: move this to Docker
|
||||||
sudo apt-get -qq update
|
sudo apt-get -qq update
|
||||||
sudo apt-get -qq install --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev
|
sudo apt-get -qq install --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev
|
||||||
@ -141,8 +141,8 @@ test_python_nn() {
|
|||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
}
|
}
|
||||||
|
|
||||||
test_python_ge_config_simple() {
|
test_python_ge_config_profiling() {
|
||||||
time python test/run_test.py --include test_jit_simple --verbose --determine-from="$DETERMINE_FROM"
|
time python test/run_test.py --include test_jit_profiling test_jit_fuser_profiling --verbose --determine-from="$DETERMINE_FROM"
|
||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -152,7 +152,7 @@ test_python_ge_config_legacy() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
test_python_all_except_nn() {
|
test_python_all_except_nn() {
|
||||||
time python test/run_test.py --exclude test_nn test_jit_simple test_jit_legacy test_jit_fuser_legacy --verbose --bring-to-front test_quantization test_quantized test_quantized_tensor test_quantized_nn_mods --determine-from="$DETERMINE_FROM"
|
time python test/run_test.py --exclude test_nn test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_profiling --verbose --bring-to-front test_quantization test_quantized test_quantized_tensor test_quantized_nn_mods --determine-from="$DETERMINE_FROM"
|
||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -244,7 +244,7 @@ test_backward_compatibility() {
|
|||||||
pushd test/backward_compatibility
|
pushd test/backward_compatibility
|
||||||
python dump_all_function_schemas.py --filename new_schemas.txt
|
python dump_all_function_schemas.py --filename new_schemas.txt
|
||||||
pip_uninstall torch
|
pip_uninstall torch
|
||||||
pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
|
pip_install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
|
||||||
python check_backward_compatibility.py --new-schemas new_schemas.txt
|
python check_backward_compatibility.py --new-schemas new_schemas.txt
|
||||||
popd
|
popd
|
||||||
set +x
|
set +x
|
||||||
@ -264,8 +264,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xla* || "${JOB_BASE_NAME}" == *xla* ]]; then
|
|||||||
test_xla
|
test_xla
|
||||||
elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_legacy* || "${JOB_BASE_NAME}" == *ge_config_legacy* ]]; then
|
elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_legacy* || "${JOB_BASE_NAME}" == *ge_config_legacy* ]]; then
|
||||||
test_python_ge_config_legacy
|
test_python_ge_config_legacy
|
||||||
elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_simple* || "${JOB_BASE_NAME}" == *ge_config_simple* ]]; then
|
elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_profiling* || "${JOB_BASE_NAME}" == *ge_config_profiling* ]]; then
|
||||||
test_python_ge_config_simple
|
test_python_ge_config_profiling
|
||||||
elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
|
elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
|
||||||
# TODO: run some C++ tests
|
# TODO: run some C++ tests
|
||||||
echo "no-op at the moment"
|
echo "no-op at the moment"
|
||||||
|
|||||||
@ -5,7 +5,7 @@ if "%BUILD_ENVIRONMENT%"=="" (
|
|||||||
)
|
)
|
||||||
if "%REBUILD%"=="" (
|
if "%REBUILD%"=="" (
|
||||||
IF EXIST %CONDA_PARENT_DIR%\Miniconda3 ( rd /s /q %CONDA_PARENT_DIR%\Miniconda3 )
|
IF EXIST %CONDA_PARENT_DIR%\Miniconda3 ( rd /s /q %CONDA_PARENT_DIR%\Miniconda3 )
|
||||||
curl --retry 3 -k https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
|
curl --retry 3 -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
|
||||||
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
|
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
|
||||||
)
|
)
|
||||||
call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
|
call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
|
||||||
|
|||||||
@ -13,7 +13,7 @@ if "%BUILD_ENVIRONMENT%"=="" (
|
|||||||
)
|
)
|
||||||
if NOT "%BUILD_ENVIRONMENT%"=="" (
|
if NOT "%BUILD_ENVIRONMENT%"=="" (
|
||||||
IF EXIST %CONDA_PARENT_DIR%\Miniconda3 ( rd /s /q %CONDA_PARENT_DIR%\Miniconda3 )
|
IF EXIST %CONDA_PARENT_DIR%\Miniconda3 ( rd /s /q %CONDA_PARENT_DIR%\Miniconda3 )
|
||||||
curl --retry 3 https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
|
curl --retry 3 https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
|
||||||
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
|
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
|
||||||
)
|
)
|
||||||
call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
|
call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
|
||||||
|
|||||||
@ -1,3 +1,3 @@
|
|||||||
call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat
|
call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat
|
||||||
cd test && python run_test.py --exclude test_nn test_jit_simple test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="%1" && cd ..
|
cd test && python run_test.py --exclude test_nn test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_profiling --verbose --determine-from="%1" && cd ..
|
||||||
if ERRORLEVEL 1 exit /b 1
|
if ERRORLEVEL 1 exit /b 1
|
||||||
|
|||||||
@ -160,20 +160,18 @@ ENDIF(BLAS_FOUND)
|
|||||||
|
|
||||||
IF(LAPACK_FOUND)
|
IF(LAPACK_FOUND)
|
||||||
list(APPEND ATen_CPU_DEPENDENCY_LIBS ${LAPACK_LIBRARIES})
|
list(APPEND ATen_CPU_DEPENDENCY_LIBS ${LAPACK_LIBRARIES})
|
||||||
if(USE_CUDA)
|
if(USE_CUDA AND MSVC)
|
||||||
# Although Lapack provides CPU (and thus, one might expect that ATen_cuda
|
# Although Lapack provides CPU (and thus, one might expect that ATen_cuda
|
||||||
# would not need this at all), some of our libraries (magma in particular)
|
# would not need this at all), some of our libraries (magma in particular)
|
||||||
# backend to CPU BLAS/LAPACK implementations, and so it is very important
|
# backend to CPU BLAS/LAPACK implementations, and so it is very important
|
||||||
# we get the *right* implementation, because even if the symbols are the
|
# we get the *right* implementation, because even if the symbols are the
|
||||||
# same, LAPACK implementions may have different calling conventions.
|
# same, LAPACK implementions may have different calling conventions.
|
||||||
# This caused https://github.com/pytorch/pytorch/issues/7353
|
# This caused https://github.com/pytorch/pytorch/issues/7353
|
||||||
|
#
|
||||||
|
# We do NOT do this on Linux, since we just rely on torch_cpu to
|
||||||
|
# provide all of the symbols we need
|
||||||
list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${LAPACK_LIBRARIES})
|
list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${LAPACK_LIBRARIES})
|
||||||
endif()
|
endif()
|
||||||
if(USE_ROCM)
|
|
||||||
# It's not altogether clear that HIP behaves the same way, but it
|
|
||||||
# seems safer to assume that it needs it too
|
|
||||||
list(APPEND ATen_HIP_DEPENDENCY_LIBS ${LAPACK_LIBRARIES})
|
|
||||||
endif()
|
|
||||||
ENDIF(LAPACK_FOUND)
|
ENDIF(LAPACK_FOUND)
|
||||||
|
|
||||||
IF (UNIX AND NOT APPLE)
|
IF (UNIX AND NOT APPLE)
|
||||||
@ -331,8 +329,12 @@ IF(USE_CUDA AND NOT USE_ROCM)
|
|||||||
IF(USE_MAGMA)
|
IF(USE_MAGMA)
|
||||||
list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${MAGMA_LIBRARIES})
|
list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${MAGMA_LIBRARIES})
|
||||||
IF ($ENV{TH_BINARY_BUILD})
|
IF ($ENV{TH_BINARY_BUILD})
|
||||||
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
|
IF (MSVC)
|
||||||
"${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}")
|
# Do not do this on Linux: see Note [Extra MKL symbols for MAGMA in torch_cpu]
|
||||||
|
# in caffe2/CMakeLists.txt
|
||||||
|
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
|
||||||
|
"${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}")
|
||||||
|
ENDIF(MSVC)
|
||||||
ENDIF($ENV{TH_BINARY_BUILD})
|
ENDIF($ENV{TH_BINARY_BUILD})
|
||||||
ENDIF(USE_MAGMA)
|
ENDIF(USE_MAGMA)
|
||||||
IF ($ENV{ATEN_STATIC_CUDA})
|
IF ($ENV{ATEN_STATIC_CUDA})
|
||||||
|
|||||||
@ -125,13 +125,15 @@ void _parallel_run(
|
|||||||
std::tie(num_tasks, chunk_size) =
|
std::tie(num_tasks, chunk_size) =
|
||||||
internal::calc_num_tasks_and_chunk_size(begin, end, grain_size);
|
internal::calc_num_tasks_and_chunk_size(begin, end, grain_size);
|
||||||
|
|
||||||
std::atomic_flag err_flag = ATOMIC_FLAG_INIT;
|
struct {
|
||||||
std::exception_ptr eptr;
|
std::atomic_flag err_flag = ATOMIC_FLAG_INIT;
|
||||||
std::vector<std::shared_ptr<c10::ivalue::Future>> futures(num_tasks);
|
std::exception_ptr eptr;
|
||||||
for (size_t task_id = 0; task_id < num_tasks; ++task_id) {
|
std::mutex mutex;
|
||||||
futures[task_id] = std::make_shared<c10::ivalue::Future>(c10::NoneType::get());
|
volatile size_t remaining;
|
||||||
}
|
std::condition_variable cv;
|
||||||
auto task = [f, &eptr, &err_flag, &futures, begin, end, chunk_size]
|
} state;
|
||||||
|
|
||||||
|
auto task = [f, &state, begin, end, chunk_size]
|
||||||
(int /* unused */, size_t task_id) {
|
(int /* unused */, size_t task_id) {
|
||||||
int64_t local_start = begin + task_id * chunk_size;
|
int64_t local_start = begin + task_id * chunk_size;
|
||||||
if (local_start < end) {
|
if (local_start < end) {
|
||||||
@ -140,21 +142,30 @@ void _parallel_run(
|
|||||||
ParallelRegionGuard guard(task_id);
|
ParallelRegionGuard guard(task_id);
|
||||||
f(local_start, local_end, task_id);
|
f(local_start, local_end, task_id);
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
if (!err_flag.test_and_set()) {
|
if (!state.err_flag.test_and_set()) {
|
||||||
eptr = std::current_exception();
|
state.eptr = std::current_exception();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
futures[task_id]->markCompleted();
|
{
|
||||||
|
std::unique_lock<std::mutex> lk(state.mutex);
|
||||||
|
if (--state.remaining == 0) {
|
||||||
|
state.cv.notify_one();
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
state.remaining = num_tasks;
|
||||||
_run_with_pool(task, num_tasks);
|
_run_with_pool(task, num_tasks);
|
||||||
|
|
||||||
// Wait for all tasks to finish.
|
// Wait for all tasks to finish.
|
||||||
for (size_t task_id = 0; task_id < num_tasks; ++task_id) {
|
{
|
||||||
futures[task_id]->wait();
|
std::unique_lock<std::mutex> lk(state.mutex);
|
||||||
|
if (state.remaining != 0) {
|
||||||
|
state.cv.wait(lk);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (eptr) {
|
if (state.eptr) {
|
||||||
std::rethrow_exception(eptr);
|
std::rethrow_exception(state.eptr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
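The `_parallel_run` hunk above swaps per-task `ivalue::Future` objects for a plain counter guarded by a mutex and condition variable. Below is a minimal standalone sketch of that completion-counter pattern; `run_tasks` and the use of `std::thread` are illustrative stand-ins for the internal thread pool, not PyTorch API.

#include <condition_variable>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

// Run every task on its own thread and block until all of them finish.
void run_tasks(const std::vector<std::function<void()>>& tasks) {
  std::mutex mutex;
  std::condition_variable cv;
  size_t remaining = tasks.size();

  std::vector<std::thread> workers;
  for (const auto& t : tasks) {
    workers.emplace_back([&mutex, &cv, &remaining, t] {
      t();  // user work; a real version would also capture exceptions
      std::lock_guard<std::mutex> lk(mutex);
      if (--remaining == 0) {
        cv.notify_one();  // last finisher wakes the waiting submitter
      }
    });
  }
  {
    std::unique_lock<std::mutex> lk(mutex);
    cv.wait(lk, [&remaining] { return remaining == 0; });
  }
  for (auto& w : workers) {
    w.join();
  }
}

The predicate form of `cv.wait` covers both spurious wakeups and the case where every task finishes before the waiter takes the lock, which is why the kernel code above re-checks `state.remaining` under the mutex before waiting.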
@@ -16,14 +16,6 @@
 #include <numeric>
 #include <memory>

-#if defined(__clang__)
-#define __ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero")))
-#define __ubsan_ignore_vptr__ __attribute__((no_sanitize("vptr")))
-#else
-#define __ubsan_ignore_float_divide_by_zero__
-#define __ubsan_ignore_vptr__
-#endif
-
 #define AT_DISALLOW_COPY_AND_ASSIGN(TypeName) \
 TypeName(const TypeName&) = delete; \
 void operator=(const TypeName&) = delete

@@ -27,14 +27,9 @@ using c10::KernelFunction;

 namespace {

-void variable_fallback_kernel(const OperatorHandle& op, Stack* stack) {
-at::AutoNonVariableTypeMode _var_guard(true);
-Dispatcher::singleton().callBoxed(op, stack);
-}
-
-static auto registry = Dispatcher::singleton().registerBackendFallbackKernel(
+static auto registry = c10::Dispatcher::singleton().registerBackendFallbackKernel(
 DispatchKey::VariableTensorId,
-KernelFunction::makeFromBoxedFunction<&variable_fallback_kernel>()
+KernelFunction::makeFallthrough()
 );

 }

@@ -20,6 +20,10 @@ void registerCustomClass(at::ClassTypePtr class_type) {
 }

 at::ClassTypePtr getCustomClass(const std::string& name) {
+// BC hack so we can upgrade a binary internally
+if (name == "__torch__.torch.classes.SentencePiece") {
+return getCustomClass("__torch__.torch.classes.fb.SentencePiece");
+}
 return customClasses().count(name) ? customClasses()[name] : nullptr;
 }

@@ -15,6 +15,7 @@
 #include <c10/util/math_compat.h>
 #include <ATen/native/cpu/zmath.h>
 #include <c10/util/TypeCast.h>
+#include <c10/macros/Macros.h>

 #if defined(__GNUC__)
 #define __at_align32__ __attribute__((aligned(32)))

@@ -145,7 +145,7 @@ private:

 std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d);

-class FilterDescriptor
+class TORCH_CUDA_API FilterDescriptor
 : public Descriptor<cudnnFilterStruct,
 &cudnnCreateFilterDescriptor,
 &cudnnDestroyFilterDescriptor>
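`TORCH_CUDA_API` in the hunk above is a shared-library export annotation: without it, `FilterDescriptor` is not visible to Windows code linking against the CUDA DLL. A generic sketch of how such a macro is typically defined; `MYLIB_API` and `MYLIB_BUILD_MAIN_LIB` are placeholder names, not the actual PyTorch macros.

#if defined(_WIN32)
#  if defined(MYLIB_BUILD_MAIN_LIB)
#    define MYLIB_API __declspec(dllexport)  // building the DLL itself
#  else
#    define MYLIB_API __declspec(dllimport)  // consuming the DLL
#  endif
#else
#  define MYLIB_API __attribute__((visibility("default")))
#endif

// The annotated class is now usable across the DLL boundary.
class MYLIB_API Widget {
 public:
  void frob();
};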
@@ -550,7 +550,6 @@ FunctionOption = TypedDict('FunctionOption', {
 'type_method_definition_dispatch': str,
 'type_method_formals': List[str],
 'variants': str,
-'with_gil': bool,
 'zero_dim_dispatch_when_scalar': str,
 })

@@ -673,11 +673,11 @@ Tensor & leaky_relu_(
 return at::leaky_relu_out(self, self, neg_val);
 }

-// Note: leakyReLu backward calculation doesn't support in-place call with non-positive slope.
+// Note: leakyReLu backward calculation doesn't support in-place call with negative slope.
 // The reason is that for in-place forward call, the forward result will be saved into autograd
 // node instead of the input itself, when calculating backward gradient, there is no way to know
 // whether the original input for current node is positive or not if the input slope is
-// non-positive. eg. forward is 2, slope is -0.2, the original input for this node could be
+// negative. eg. forward is 2, slope is -0.2, the original input for this node could be
 // either 2, or -10, so no way to get a correct backward gradient in this case.
 Tensor leaky_relu_backward(
 const Tensor& grad_output,

@@ -685,11 +685,11 @@ Tensor leaky_relu_backward(
 Scalar negval,
 bool is_result) {
 TORCH_CHECK(
-!is_result || negval.to<double>() > 0.0,
-"In-place leakyReLu backward calculation is triggered with a non-positive slope which is not supported. "
-"This is caused by calling in-place forward function with a non-positive slope, "
+!is_result || negval.to<double>() >= 0.0,
+"In-place leakyReLu backward calculation is triggered with a negative slope which is not supported. "
+"This is caused by calling in-place forward function with a negative slope, "
 "please call out-of-place version instead. File an issue at https://github.com/pytorch/pytorch if you do "
-"require supporting in-place leakRelu backward calculation with non-positive slope");
+"require supporting in-place leakRelu backward calculation with negative slope");

 Tensor result;
 auto iter = TensorIterator::binary_op(result, self_or_result, grad_output);
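Stated with the definitions the comment relies on, the ambiguity is the following (a restatement of the source comment, not a new claim):

\[
\mathrm{LeakyReLU}_s(x) = \begin{cases} x, & x \ge 0 \\ s\,x, & x < 0 \end{cases}
\qquad
\frac{d}{dx}\,\mathrm{LeakyReLU}_s(x) = \begin{cases} 1, & x \ge 0 \\ s, & x < 0 \end{cases}
\]

After an in-place forward only the output y is kept. For s >= 0, sign(y) = sign(x), so the derivative branch can be recovered from y alone; for s < 0 it cannot: with s = -0.2, y = 2 is produced by both x = 2 and x = -10, which is exactly the comment's example. That is also why the check is relaxed from "> 0.0" to ">= 0.0": slope zero is unambiguous.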
@@ -698,17 +698,34 @@ Tensor leaky_relu_backward(
 }

 std::tuple<Tensor, Tensor> log_sigmoid_forward_cpu(const Tensor& input) {
-auto result = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-auto buffer = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+// FIXME: do these actually need to be zeros_like or can they be empty_like?
+auto result = at::zeros_like(input, at::MemoryFormat::Contiguous);
+auto buffer = at::zeros_like(input, at::MemoryFormat::Contiguous);
 log_sigmoid_cpu_stub(kCPU, result, buffer, input.contiguous());
 return std::make_tuple(result, buffer);
 }

 std::tuple<Tensor&, Tensor&> log_sigmoid_forward_out_cpu(Tensor& result, Tensor& buffer, const Tensor& input) {
-log_sigmoid_cpu_stub(kCPU, result, buffer, input);
+result.resize_as_(input);
+buffer.resize_as_(input, at::MemoryFormat::Contiguous);
+TORCH_CHECK(buffer.is_contiguous(), "Contiguous buffer required for log_sigmoid with out parameter");
+Tensor result_tmp = result.is_contiguous() ? result : at::empty_like(result, at::MemoryFormat::Contiguous);
+log_sigmoid_cpu_stub(kCPU, result_tmp, buffer, input.contiguous());
+if (!result.is_contiguous()) {
+result.copy_(result_tmp);
+}
 return std::forward_as_tuple(result, buffer);
 }

+Tensor & log_sigmoid_out(Tensor & output, const Tensor & self) {
+Tensor buffer = at::empty({0}, self.options());
+return std::get<0>(at::log_sigmoid_forward_out(output, buffer, self));
+}
+
+Tensor log_sigmoid(const Tensor & self) {
+return std::get<0>(at::log_sigmoid_forward(self));
+}
+
 Tensor log_sigmoid_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& buffer) {
 Tensor grad_input;
 auto iter = at::TensorIterator();
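The rewritten `log_sigmoid_forward_out_cpu` follows a standard out-variant pattern: when the caller's output tensor is not contiguous, run the contiguous-only stub on a temporary and copy the result back through the caller's strides. A minimal sketch of that pattern, with `contig_only_kernel` and `my_op_out` as hypothetical stand-ins for a stub like `log_sigmoid_cpu_stub` and its wrapper:

#include <ATen/ATen.h>

// Hypothetical kernel that requires a contiguous output buffer.
void contig_only_kernel(at::Tensor& out, const at::Tensor& in);

at::Tensor& my_op_out(at::Tensor& result, const at::Tensor& input) {
  result.resize_as_(input);
  // Use `result` directly when it is already contiguous; otherwise compute
  // into contiguous scratch space of the same shape.
  at::Tensor tmp = result.is_contiguous()
      ? result
      : at::empty_like(result, at::MemoryFormat::Contiguous);
  contig_only_kernel(tmp, input.contiguous());
  if (!result.is_contiguous()) {
    result.copy_(tmp);  // write back through the caller's strided layout
  }
  return result;
}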
@@ -138,6 +138,10 @@ Tensor true_divide(const Tensor& self, const Tensor& divisor) {
 return iter.output();
 }

+Tensor& true_divide_(Tensor& self, const Tensor& divisor) {
+return native::true_divide_out(self, self, divisor);
+}
+
 Tensor& floor_divide_out(Tensor& result, const Tensor& self, const Tensor& other) {
 auto iter = TensorIterator::binary_op(result, self, other,
 /*check_mem_overlap=*/true);

@@ -731,7 +735,11 @@ Tensor& fmod_(Tensor& self, Scalar other) {
 }

 Tensor true_divide(const Tensor& self, Scalar divisor) {
-return at::true_divide(self, wrapped_scalar_tensor(divisor)); // redispatch!
+return self.true_divide(wrapped_scalar_tensor(divisor)); // redispatch!
+}
+
+Tensor& true_divide_(Tensor& self, Scalar divisor) {
+return self.true_divide_(wrapped_scalar_tensor(divisor)); // redispatch!
 }

 }

@@ -609,7 +609,7 @@ at::Tensor _convolution(
 auto weight_view = at::_unsafe_view(weight, -1);
 auto out = input*weight_view[0];
 if (bias.defined())
-out = out + bias[0];
+out.add_(bias[0]);
 return out.view(o);
 }

@@ -639,7 +639,7 @@ at::Tensor _convolution(
 input.contiguous(cudnn_memory_format), weight,
 padding, stride, dilation, params.groups, params.benchmark, params.deterministic);
 if (bias.defined()) {
-output = output + reshape_bias(input.dim(), bias);
+output.add_(reshape_bias(input.dim(), bias));
 }

 } else if (params.use_miopen(input, bias.defined())){

@@ -662,14 +662,14 @@ at::Tensor _convolution(
 input.contiguous(cudnn_memory_format), weight,
 params.padding, params.output_padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic);
 if (bias.defined()) {
-output = output + reshape_bias(input.dim(), bias);
+output.add_(reshape_bias(input.dim(), bias));
 }
 } else {
 output = at::cudnn_convolution(
 input.contiguous(cudnn_memory_format), weight,
 params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic);
 if (bias.defined()) {
-output = output + reshape_bias(input.dim(), bias);
+output.add_(reshape_bias(input.dim(), bias));
 }
 }
 } else if (params.use_miopen(input, bias.defined())) {

@@ -70,8 +70,8 @@ struct CAFFE2_API DispatchStub<rT (*)(Args...), T> {
 // they will still compute the same value for cpu_dispatch_ptr.
 if (!cpu_dispatch_ptr.load(std::memory_order_relaxed)) {
 FnPtr tmp_cpu_dispatch_ptr = nullptr;
-cpu_dispatch_ptr.compare_exchange_weak(
-tmp_cpu_dispatch_ptr, choose_cpu_impl(), std::memory_order_relaxed);
+while(!cpu_dispatch_ptr.compare_exchange_weak(
+tmp_cpu_dispatch_ptr, choose_cpu_impl(), std::memory_order_relaxed));
 }
 return (*cpu_dispatch_ptr)(std::forward<ArgTypes>(args)...);
 } else if (device_type == DeviceType::CUDA) {
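`compare_exchange_weak` is allowed to fail spuriously, returning false even when the atomic holds the expected value, so the single unlooped call in the old code could leave `cpu_dispatch_ptr` unset; the fix retries until the exchange sticks. A self-contained illustration of the looped pattern (illustrative names, not the DispatchStub API):

#include <atomic>

using FnPtr = int (*)(int);

int impl_scalar(int x) { return x + 1; }
FnPtr choose_impl() { return &impl_scalar; }  // stand-in for CPU-capability dispatch

std::atomic<FnPtr> dispatch_ptr{nullptr};

FnPtr get_impl() {
  if (!dispatch_ptr.load(std::memory_order_relaxed)) {
    FnPtr expected = nullptr;
    // compare_exchange_weak may fail spuriously; loop until either this
    // thread installs the pointer or it observes another thread's value.
    while (!dispatch_ptr.compare_exchange_weak(
        expected, choose_impl(), std::memory_order_relaxed)) {
      if (expected != nullptr) {
        break;  // another thread won the race; its value is equivalent
      }
    }
  }
  return dispatch_ptr.load(std::memory_order_relaxed);
}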
@@ -31,15 +31,6 @@ Tensor nll_loss2d(const Tensor & self, const Tensor & target, const Tensor & wei
 return std::get<0>(at::nll_loss2d_forward(self, target, weight, reduction, ignore_index));
 }

-Tensor & log_sigmoid_out(Tensor & output, const Tensor & self) {
-Tensor buffer = at::empty({0}, self.options());
-return std::get<0>(at::log_sigmoid_forward_out(output, buffer, self));
-}
-
-Tensor log_sigmoid(const Tensor & self) {
-return std::get<0>(at::log_sigmoid_forward(self));
-}
-
 Tensor & thnn_conv2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding) {
 Tensor finput = at::empty({0}, self.options());
 Tensor fgrad_input = at::empty({0}, self.options());

@@ -533,7 +533,7 @@ Tensor frobenius_norm(const Tensor& self, IntArrayRef dim, bool keepdim) {
 return at::norm(self, 2, dim, keepdim, self.scalar_type());
 }
 if (self.is_complex()){
-return at::sqrt(at::sum((self.conj() * self).real(), dim, keepdim));
+return at::sqrt(at::sum(at::real(self.conj() * self), dim, keepdim));
 } else {
 return at::sqrt(at::sum((self * self), dim, keepdim));
 }

@@ -553,7 +553,7 @@ Tensor &frobenius_norm_out(
 return at::norm_out(result, self, 2, dim, keepdim, self.scalar_type());
 }
 if (self.is_complex()){
-return at::sqrt_out(result, at::sum((self.conj() * self).real(), dim, keepdim));
+return at::sqrt_out(result, at::sum(at::real(self.conj() * self), dim, keepdim));
 } else {
 return at::sqrt_out(result, at::sum((self * self), dim, keepdim));
 }
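Both Frobenius-norm hunks compute the same quantity; the switch from the `Tensor::real()` method to the `at::real(...)` free function tracks the `real`/`imag` change further below. For complex entries the summand is the squared magnitude:

\[
\|A\|_F = \sqrt{\sum_i |a_i|^2}
        = \sqrt{\sum_i \operatorname{Re}\!\bigl(\overline{a_i}\,a_i\bigr)},
\qquad \overline{a}\,a = |a|^2 \in \mathbb{R}_{\ge 0},
\]

which is what `at::real(self.conj() * self)` evaluates before the sum and square root.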
@@ -799,7 +799,7 @@ static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim,

 if (at::isComplexType(self.scalar_type())){
 ScalarType dtype = c10::toValueType(get_dtype(result, self, {}, true));
-Tensor real_in = self.real().to(dtype);
+Tensor real_in = at::real(self).to(dtype);
 Tensor real_out = at::empty({0}, self.options().dtype(dtype));
 auto iter = make_reduction("std or var", real_out, real_in, dim, keepdim, dtype);
 if (iter.numel() == 0) {

@@ -807,7 +807,7 @@ static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim,
 } else {
 std_var_stub(iter.device_type(), iter, unbiased, false);
 }
-Tensor imag_in = self.imag().to(dtype);
+Tensor imag_in = at::imag(self).to(dtype);
 Tensor imag_out = at::empty({0}, self.options().dtype(dtype));
 iter = make_reduction("std or var", imag_out, imag_in, dim, keepdim, dtype);
 if (iter.numel() == 0) {

@@ -845,7 +845,7 @@ static std::tuple<Tensor&,Tensor&> std_var_mean_out(const char* fname, Tensor &r
 ".");
 if (at::isComplexType(self.scalar_type())){
 ScalarType dtype = c10::toValueType(get_dtype(result1, self, {}, true));
-Tensor real_in = self.real().to(dtype);
+Tensor real_in = at::real(self).to(dtype);
 Tensor real_out_var = at::empty({0}, self.options().dtype(dtype));
 Tensor real_out_mean = at::empty({0}, self.options().dtype(dtype));
 auto iter = make_reduction(fname, real_out_var, real_out_mean, real_in, dim, keepdim, dtype);

@@ -855,7 +855,7 @@ static std::tuple<Tensor&,Tensor&> std_var_mean_out(const char* fname, Tensor &r
 } else {
 std_var_stub(iter.device_type(), iter, unbiased, false);
 }
-Tensor imag_in = self.imag().to(dtype);
+Tensor imag_in = at::imag(self).to(dtype);
 Tensor imag_out_var = at::empty({0}, self.options().dtype(dtype));
 Tensor imag_out_mean = at::empty({0}, self.options().dtype(dtype));
 iter = make_reduction(fname, imag_out_var, imag_out_mean, imag_in, dim, keepdim, dtype);
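These four hunks all reduce the real and imaginary parts separately; that is valid because, with the variance defined through squared magnitudes, the two components decouple:

\[
\operatorname{Var}(z) = \mathbb{E}\,\lvert z - \mathbb{E}z \rvert^{2}
 = \mathbb{E}\bigl[(\operatorname{Re}(z-\mathbb{E}z))^{2}\bigr]
 + \mathbb{E}\bigl[(\operatorname{Im}(z-\mathbb{E}z))^{2}\bigr]
 = \operatorname{Var}(\operatorname{Re} z) + \operatorname{Var}(\operatorname{Im} z).
\]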
@@ -85,6 +85,7 @@ inline void setStrided(
 IntArrayRef size,
 IntArrayRef stride,
 int64_t storage_offset) {
+TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape");
 auto* self_ = self.unsafeGetTensorImpl();
 checkInBoundsForStorage(size, stride, storage_offset, self_->storage());

@@ -93,7 +94,6 @@ inline void setStrided(
 self_->set_storage_offset(storage_offset);

 /* size and stride */
-AT_ASSERT(size.size() == stride.size());
 if (self_->sizes() == size && self_->strides() == stride) {
 return;
 }

@@ -130,6 +130,28 @@ static Tensor reshape_indexer(const Tensor& index, int64_t dims_before, int64_t
 return index.reshape(shape);
 }

+// checks whether index.dtype == int64
+// and self.dtype == src.dtype if src is a Tensor
+static void scatter_gather_dtype_check(
+const std::string& method_name,
+const Tensor& self,
+const Tensor& index,
+const c10::optional<const Tensor>& src_opt = c10::nullopt
+) {
+TORCH_CHECK(
+index.scalar_type() == at::ScalarType::Long,
+method_name, "(): Expected dtype int64 for index"
+);
+
+if (src_opt.has_value()) {
+auto src = src_opt.value();
+TORCH_CHECK(
+self.scalar_type() == src.scalar_type(),
+method_name, "(): Expected self.dtype to be equal to src.dtype"
+);
+}
+}
+
 AdvancedIndex::AdvancedIndex(const Tensor& src, TensorList indices_list)
 {
 int64_t element_size_bytes = src.element_size();

@@ -493,40 +515,48 @@ Tensor index_fill(const Tensor & self, int64_t dim, const Tensor & index, const
 }

 Tensor & gather_out_cpu(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index, bool sparse_grad) {
+scatter_gather_dtype_check("gather_out_cpu", self, index, result);
 result.resize_(index.sizes());
 gather_stub(result.device().type(), result, self, dim, index);
 return result;
 }

 Tensor gather_cpu(const Tensor & self, int64_t dim, const Tensor & index, bool sparse_grad) {
+scatter_gather_dtype_check("gather_cpu", self, index);
 Tensor result = at::empty({0}, self.options());
 return gather_out_cpu(result, self, dim, index, sparse_grad);
 }

 Tensor & scatter_cpu_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) {
+scatter_gather_dtype_check("scatter_cpu", self, index, src);
 scatter_stub(self.device().type(), self, dim, index, src);
 return self;
 }

 Tensor & scatter_fill_cpu_(Tensor & self, int64_t dim, const Tensor & index, Scalar src) {
+scatter_gather_dtype_check("scatter_fill_cpu", self, index);
 scatter_fill_stub(self.device().type(), self, dim, index, src);
 return self;
 }

 Tensor scatter(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) {
+scatter_gather_dtype_check("scatter", self, index, source);
 return self.clone(at::MemoryFormat::Preserve).scatter_(dim, index, source);
 }

 Tensor scatter(const Tensor & self, int64_t dim, const Tensor & index, Scalar source) {
+scatter_gather_dtype_check("scatter", self, index);
 return self.clone(at::MemoryFormat::Preserve).scatter_(dim, index, source);
 }

 Tensor & scatter_add_cpu_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) {
+scatter_gather_dtype_check("scatter_add_cpu", self, index, src);
 scatter_add_stub(self.device().type(), self, dim, index, src);
 return self;
 }

 Tensor scatter_add(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) {
+scatter_gather_dtype_check("scatter_add", self, index, source);
 return self.clone(at::MemoryFormat::Preserve).scatter_add_(dim, index, source);
 }

@@ -33,7 +33,7 @@ static inline Tensor to_impl(const Tensor& self, const TensorOptions& options, b
 if (self.is_non_overlapping_and_dense()) {
 // Copy all strides
 auto r = at::empty_strided(self.sizes(), self.strides(), options.memory_format(c10::nullopt));
-r.copy_(self);
+r.copy_(self, non_blocking);
 return r;
 } else {
 memory_format = self.suggest_memory_format();

@@ -99,7 +99,7 @@ Tensor _dim_arange(const Tensor& like, int64_t dim) {

 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Tensor empty_cpu(IntArrayRef size, const TensorOptions& options_, c10::optional<c10::MemoryFormat> optional_memory_format) {
+TORCH_CHECK(!isComplexType(at::typeMetaToScalarType(options_.dtype())), "Complex dtype not supported.");
 TORCH_CHECK(
 !(options_.has_memory_format() && optional_memory_format.has_value()),
 "Cannot set memory_format both in TensorOptions and explicit argument; please delete "

@@ -638,7 +638,7 @@ void TensorIterator::narrow(int dim, int64_t start, int64_t size) {
 for (auto& op : operands_) {
 op.data = ((char*)op.data) + op.stride_bytes[dim] * start;
 }
-if (size == 1) {
+if (size == 1 && !is_reduction_) {
 coalesce_dimensions();
 }
 }

@@ -891,10 +891,13 @@ std::unique_ptr<TensorIterator> TensorIterator::split(int dim) {
 }

 int TensorIterator::get_dim_to_split() const {
-TORCH_INTERNAL_ASSERT(ndim() >= 1 && shape()[ndim() - 1] >= 2);
+TORCH_INTERNAL_ASSERT(ndim() >= 1);
 int64_t max_extent = -1;
 int dim_to_split = -1;
 for (int dim = ndim() - 1; dim >= 0; dim--) {
+if (shape_[dim] == 0) {
+continue;
+}
 int64_t size = shape_[dim];
 for (auto& op : operands_) {
 int64_t extent = (size - 1) * op.stride_bytes[dim];

@@ -98,6 +98,15 @@ Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) {
 "output memory locations. Found overlap in input tensor ", i);
 }

+// Dtypes should be the same
+const auto first_in_cat = tensors[0];
+for (int64_t i = 1; i < tensors.size(); i++) {
+TORCH_CHECK(first_in_cat.dtype() == tensors[i].dtype(),
+"Expected object of scalar type ", first_in_cat.dtype(),
+" but got scalar type ", tensors[i].dtype(),
+" for sequence element ", i, ".");
+}
+
 auto should_skip = [](const Tensor& t) { return t.numel() == 0 && t.dim() == 1; };
 for (auto const &tensor : tensors) {
 if (should_skip(tensor)) {

@@ -73,11 +73,17 @@ Tensor& abs_(Tensor& self) { return unary_op_impl_(self, at::abs_out); }
 Tensor& angle_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, angle_stub); }
 Tensor angle(const Tensor& self) { return unary_op_impl(self, at::angle_out); }

-Tensor& real_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, real_stub); }
-Tensor real(const Tensor& self) { return unary_op_impl(self, at::real_out); }
+Tensor real(const Tensor& self) {
+TORCH_CHECK(!self.is_complex(), "real is not yet implemented for complex tensors.");
+return self;
+}

-Tensor& imag_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, imag_stub); }
-Tensor imag(const Tensor& self) { return unary_op_impl(self, at::imag_out); }
+Tensor imag(const Tensor& self) {
+TORCH_CHECK(false, "imag is not yet implemented.");
+
+// Note: unreachable
+return at::zeros_like(self);
+}

 Tensor& conj_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, conj_stub); }
 Tensor conj(const Tensor& self) { return unary_op_impl(self, at::conj_out); }

@@ -7,6 +7,7 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/BinaryOps.h>
 #include <ATen/native/cpu/Loops.h>
+#include <c10/macros/Macros.h>

 namespace at { namespace native {
 namespace {

@@ -87,6 +87,10 @@ static void max_kernel_impl(
 Tensor& max_indices,
 const Tensor& self,
 c10::optional<int64_t> dim) {
+TORCH_CHECK(max.scalar_type() == self.scalar_type() && max_indices.scalar_type() == kLong,
+"Expect dtype ", self.scalar_type(), "and torch.long, but got ", max.scalar_type(), "and", max_indices.scalar_type());
+
 AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(ScalarType::Bool, self.scalar_type(), "max", [&] {
 Reduction<scalar_t, int64_t>::apply(max, max_indices, self, dim, true);
 });

@@ -97,6 +101,10 @@ static void min_kernel_impl(
 Tensor& min_indices,
 const Tensor& self,
 c10::optional<int64_t> dim) {
+TORCH_CHECK(min.scalar_type() == self.scalar_type() && min_indices.scalar_type() == kLong,
+"Expect dtype ", self.scalar_type(), "and torch.long, but got ", min.scalar_type(), "and", min_indices.scalar_type());
+
 AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(ScalarType::Bool, self.scalar_type(), "min", [&] {
 Reduction<scalar_t, int64_t>::apply(min, min_indices, self, dim, false);
 });

@@ -4,7 +4,7 @@
 #include <ATen/native/cuda/zmath.cuh>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/BinaryOps.h>
+#include <c10/macros/Macros.h>

 // NOTE: CUDA on Windows requires that the enclosing function
 // of a __device__ lambda not have internal linkage.

@@ -69,7 +69,6 @@ void remainder_kernel_cuda(TensorIterator& iter) {
 AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "remainder_cuda", [&]() {
 using thrust_t = typename ztype_cuda<scalar_t>::thrust_t;
 gpu_kernel_with_scalars(iter, []GPU_LAMBDA(thrust_t a, thrust_t b) -> thrust_t {
-CUDA_KERNEL_ASSERT(b != 0);
 thrust_t r = a % b;
 if ((r != 0) && ((r < 0) != (b < 0))) {
 r += b;

@@ -358,7 +358,7 @@ void max_pool2d_with_indices_out_cuda_template(

 Tensor input = input_.contiguous(memory_format);

-const int64_t in_stride_n = input.stride(-4);
+const int64_t in_stride_n = input_.ndimension() == 4 ? input.stride(-4) : 0;
 const int64_t in_stride_c = input.stride(-3);
 const int64_t in_stride_h = input.stride(-2);
 const int64_t in_stride_w = input.stride(-1);

@@ -506,7 +506,7 @@ void max_pool2d_with_indices_backward_out_cuda_template(
 const int64_t inputHeight = input.size(-2);
 const int64_t inputWidth = input.size(-1);

-const int64_t in_stride_n = input.stride(-4);
+const int64_t in_stride_n = input.ndimension() == 4 ? input.stride(-4) : 0;
 const int64_t in_stride_c = input.stride(-3);
 const int64_t in_stride_h = input.stride(-2);
 const int64_t in_stride_w = input.stride(-1);

@@ -54,7 +54,7 @@ __global__ void EmbeddingBag_updateOutputKernel(
 scalar_t *weightFeat = weight + featureDim * weight_stride1;
 int64_t begin = bag == 0 ? 0 : offsets[bag]; // forces first offset to be 0 instead of asserting on it
 int64_t end = (bag < numBags - 1) ? (offsets[bag + 1]) : numIndices;
-assert(end >= begin);
+CUDA_KERNEL_ASSERT(end >= begin);

 accscalar_t weightFeatSum = 0;
 scalar_t weightFeatMax;
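The begin/end arithmetic in the EmbeddingBag kernel is the usual offsets-to-segments convention: bag b covers indices [offsets[b], offsets[b+1]) and the last bag runs to numIndices, which is why end >= begin must hold. A host-side C++ sketch of the same segmentation (illustrative names, not the kernel's signature):

#include <cstdint>
#include <utility>
#include <vector>

// Splits a flat index list into bags using an offsets array, mirroring the
// kernel's begin/end computation. offsets[0] is treated as 0.
std::vector<std::pair<int64_t, int64_t>> bag_ranges(
    const std::vector<int64_t>& offsets, int64_t num_indices) {
  const int64_t num_bags = static_cast<int64_t>(offsets.size());
  std::vector<std::pair<int64_t, int64_t>> ranges;
  for (int64_t bag = 0; bag < num_bags; ++bag) {
    int64_t begin = bag == 0 ? 0 : offsets[bag];
    int64_t end = (bag < num_bags - 1) ? offsets[bag + 1] : num_indices;
    // Each bag's indices live in [begin, end); end >= begin must hold.
    ranges.emplace_back(begin, end);
  }
  return ranges;
}
// e.g. offsets = {0, 2, 5}, num_indices = 7 -> bags [0,2), [2,5), [5,7).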
@@ -192,13 +192,13 @@ void index_put_accum_kernel(Tensor & self, TensorList indices, const Tensor & va
 if (num_indices > 0 && sliceSize > 0) {
 const bool permuted = !src.is_contiguous();
 auto src_ = permuted ? src.contiguous() : src;
-linearIndex = linearIndex.view(-1);
+linearIndex = linearIndex.reshape(-1);
 auto sorted_indices = at::empty_like(linearIndex, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
 auto orig_indices = at::empty_like(linearIndex, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
 using device_ptr = thrust::device_ptr<int64_t>;
 const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-linearIndex.div_(sliceSize);
+linearIndex.floor_divide_(sliceSize);
 {
 sorted_indices.copy_(linearIndex);
 auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA());

@@ -35,13 +35,13 @@ __global__ void renormRowsL1(scalar_t* dist, long rows, long cols) {
 scalar_t sum = static_cast<scalar_t>(0);
 for (int64_t col = threadIdx.x; col < cols; col += blockDim.x) {
 val = dist[row * cols + col];
-CUDA_ALWAYS_ASSERT(!THCNumerics<scalar_t>::lt(val, zero)); // ! < 0 for NaN handling
+CUDA_KERNEL_ASSERT(!THCNumerics<scalar_t>::lt(val, zero)); // ! < 0 for NaN handling
 sum = sum + val;
 }

 sum = reduceBlock(smem, blockDim.x, sum, ReduceAdd<scalar_t>(), zero);
 if (threadIdx.x == 0) {
-CUDA_ALWAYS_ASSERT(!THCNumerics<scalar_t>::lt(val, zero)); // ! < 0 for NaN handling
+CUDA_KERNEL_ASSERT(!THCNumerics<scalar_t>::lt(val, zero)); // ! < 0 for NaN handling
 smem[0] = sum;
 }
 __syncthreads();

@@ -61,7 +61,7 @@ void renormRows(Tensor& t) {
 int64_t cols = t.size(1);

 auto props = at::cuda::getCurrentDeviceProperties();
-CUDA_ALWAYS_ASSERT(props != NULL);
+CUDA_KERNEL_ASSERT(props != NULL);
 int numSM = props->multiProcessorCount;
 int maxThreads = props->maxThreadsPerBlock;
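For reference, renormRowsL1 turns each nonnegative row of the weight matrix into a probability distribution by dividing through the row sum (the quantity whose nonnegativity and positivity the asserts above guard):

\[
p_{r,c} = \frac{w_{r,c}}{\sum_{j} w_{r,j}}, \qquad w_{r,c} \ge 0,
\qquad \sum_{c} p_{r,c} = 1 .
\]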
@@ -84,7 +84,7 @@ __device__ int binarySearchForMultinomial(scalar_t* cumdist,
 int start = 0;
 int end = size;
 // cumdist[size - 1] = 0 => all zero prob dist
-CUDA_ALWAYS_ASSERT(cumdist[size - 1] > static_cast<scalar_t>(0));
+CUDA_KERNEL_ASSERT(cumdist[size - 1] > static_cast<scalar_t>(0));

 while (end - start > 0) {
 int mid = start + (end - start) / 2;
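binarySearchForMultinomial is inverse-CDF sampling: draw u uniformly and binary-search the row's prefix sums for the first bucket whose cumulative weight reaches u, which is why an all-zero distribution (cumdist[size - 1] == 0) must be rejected up front. A host-side C++ sketch of the same idea (the device code adds extra handling for zero-probability buckets):

#include <algorithm>
#include <random>
#include <vector>

// Assumes cumdist is the inclusive prefix sum of a normalized distribution,
// so cumdist.back() == 1.0. Returns the sampled bucket index.
int sample_multinomial_once(const std::vector<double>& cumdist, std::mt19937& rng) {
  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  double u = uniform(rng);
  // lower_bound performs the same binary search as the device code.
  auto it = std::lower_bound(cumdist.begin(), cumdist.end(), u);
  return static_cast<int>(it - cumdist.begin());
}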
@@ -124,36 +124,33 @@ sampleMultinomialWithReplacement(std::pair<uint64_t, uint64_t> seeds,
 // search due to divergence. It seems possible to compute multiple
 // values and limit divergence though later on.

-// global index formula for 1D grid of 2D blocks
-int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
+// global index formula for 2D grid of 1D blocks
+int idx = blockIdx.y * gridDim.x * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;

 curandStatePhilox4_32_10_t state;
 curand_init(seeds.first, idx, seeds.second, &state);

 // The block determines the distribution for which we generate a point
-for (int64_t curDist = blockIdx.x;
+for (int64_t curDist = blockIdx.y;
 curDist < distributions;
-curDist += gridDim.x) {
-for (int sampleBase = 0;
-sampleBase < totalSamples; sampleBase += blockDim.y) {
-// The warp determines the sample
-int sample = sampleBase + threadIdx.y;
-
-// All threads participate in this
+curDist += gridDim.y) {
+for (int sample = blockIdx.x*blockDim.x + threadIdx.x;
+sample < totalSamples; sample += blockDim.x*gridDim.x) {
+
+//we are losing 3 out of 4 generated numbers but it's ok
+//this kernel is not very efficient anyway
 auto rand = curand_uniform4(&state);
 scalar_t r = static_cast<scalar_t>(rand.x);

-if (threadIdx.x == 0 && sample < totalSamples) {
-// Find the bucket that a uniform sample lies in
-int choice = binarySearchForMultinomial<scalar_t>(
-normDistPrefixSum + curDist * categories,
-normDist + curDist * categories,
-categories,
-r);
-
-// Torch indices are 1-based
-dest[curDist * totalSamples + sample] = choice;
-}
+// Find the bucket that a uniform sample lies in
+int choice = binarySearchForMultinomial<scalar_t>(
+normDistPrefixSum + curDist * categories,
+normDist + curDist * categories,
+categories,
+r);
+
+dest[curDist * totalSamples + sample] = choice;
 }
 }
 }
||||||
@@ -180,17 +177,14 @@ sampleMultinomialWithoutReplacement(std::pair<uint64_t, uint64_t> seeds,

 // The block and warp determines the distribution for which we
 // generate a point
-for (int64_t curDistBase = blockIdx.x * blockDim.y;
-curDistBase < distributions;
-curDistBase += gridDim.x * blockDim.y) {
-// The warp determines the distribution
-int64_t curDist = curDistBase + threadIdx.y;
+for (int64_t curDist = blockIdx.x * blockDim.y + threadIdx.y;
+curDist < distributions;
+curDist += gridDim.x * blockDim.y) {

-// All threads must participate in this
 auto rand = curand_uniform4(&state);
 scalar_t r = static_cast<scalar_t>(rand.x);

-if (threadIdx.x == 0 && curDist < distributions) {
+if (threadIdx.x == 0) {
 // Find the bucket that a uniform sample lies in
 int choice = binarySearchForMultinomial<scalar_t>(
 normDistPrefixSum + curDist * categories,
@@ -240,9 +234,9 @@ sampleMultinomialOnce(int64_t* dest,
 scalar_t val;
 for (int cat = threadIdx.x; cat < categories; cat += blockDim.x) {
 val = dist[curDist * stride_dist + cat * stride_categories];
-CUDA_ALWAYS_ASSERT(val >= zero);
-CUDA_ALWAYS_ASSERT(!THCNumerics<scalar_t>::isinf(val));
-CUDA_ALWAYS_ASSERT(!THCNumerics<scalar_t>::isnan(val));
+CUDA_KERNEL_ASSERT(val >= zero);
+CUDA_KERNEL_ASSERT(!THCNumerics<scalar_t>::isinf(val));
+CUDA_KERNEL_ASSERT(!THCNumerics<scalar_t>::isnan(val));
 sum = sum + static_cast<accscalar_t>(val);
 }

@@ -252,8 +246,8 @@ sampleMultinomialOnce(int64_t* dest,
 // Broadcast sum and sample value
 if (threadIdx.x == 0) {
 // Make sure the sum of our distribution didn't overflow
-CUDA_ALWAYS_ASSERT(!THCNumerics<accscalar_t>::isinf(sum));
-CUDA_ALWAYS_ASSERT(sum > accZero);
+CUDA_KERNEL_ASSERT(!THCNumerics<accscalar_t>::isinf(sum));
+CUDA_KERNEL_ASSERT(sum > accZero);

 asmem[0] = sum;
 smem[0] = sampled[curDist];
@@ -363,7 +357,7 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n
 AT_DISPATCH_FLOATING_TYPES_AND_HALF(self_v.scalar_type(), "multinomial_kernel_cuda", [&] {
 using accscalar_t = at::acc_type<scalar_t, true>;
 auto props = at::cuda::getCurrentDeviceProperties();
-CUDA_ALWAYS_ASSERT(props != NULL);
+CUDA_KERNEL_ASSERT(props != NULL);
 int numSM = props->multiProcessorCount;
 int maxThreads = props->maxThreadsPerBlock;
 int maxShared = props->sharedMemPerBlock;
@@ -415,26 +409,27 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n
 std::pair<uint64_t, uint64_t> rng_engine_inputs;

 if (with_replacement) {
+// Binary search is warp divergent (so effectively we're running
+// with just a single thread), but for better utilization,
+// we need each block to have at least 4 warps.
+dim3 block(128);
+
+// Each block will generate a sample from one
+// distribution concurrently.
+int grid_y=std::min<int>(numDist, at::cuda::getCurrentDeviceProperties()->maxGridSize[1]);
+dim3 grid((n_sample-1)/block.x+1, grid_y);
 {
 // See Note [Acquire lock when using random generators]
 std::lock_guard<std::mutex> lock(gen->mutex_);

-// each thread will utilize one random, however, since we have to use
+// each thread generates a single sample for (numdist/numblocks.y) distributions, however, since we have to use
 // curand_uniform4 (See Note [Register spilling in curand call for CUDA < 10]),
-// offset is 4.
-rng_engine_inputs = gen->philox_engine_inputs(4);
+// offset is 4 times that.
+auto offset = ((numDist-1)/grid.y+1)*4;
+rng_engine_inputs = gen->philox_engine_inputs(offset);
 }
 // Sample with replacement

-// Binary search is warp divergent (so effectively we're running
-// with just a single thread), but for better utilization,
-// we need each block to have at least 4 warps.
-dim3 block(32, 4);
-
-// Each warp in a block will generate a sample from one
-// distribution concurrently.
-dim3 grid(numDist < MAX_NUM_BLOCKS ? numDist : MAX_NUM_BLOCKS);

 sampleMultinomialWithReplacement
 <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
 rng_engine_inputs,
@@ -470,10 +465,11 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n
 // See Note [Acquire lock when using random generators]
 std::lock_guard<std::mutex> lock(gen->mutex_);

-// each thread will utilize one random, however, since we have to use
+// each thread will utilize distributions/(gridDim.x*blockDim.y) randoms, however, since we have to use
 // curand_uniform4 (See Note [Register spilling in curand call for CUDA < 10]),
-// offset is 4.
-rng_engine_inputs = gen->philox_engine_inputs(4);
+// offset is 4 times that.
+auto offset = ((numDist-1)/(grid.x*block.y)+1)*4;
+rng_engine_inputs = gen->philox_engine_inputs(offset);
 }

 // The kernel can only draw one sample before we have to
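Both hunks compute the philox offset the same way: each loop iteration of a thread consumes one curand_uniform4 call (4 random values), so the engine must be advanced by 4 times the worst-case per-thread iteration count. A standalone sketch of the arithmetic with made-up sizes:

    #include <cstdint>
    #include <cassert>

    int main() {
      const int64_t numDist = 1000;  // hypothetical number of distributions
      const int64_t grid_y  = 128;   // hypothetical grid.y
      // ceil(numDist / grid_y) iterations per thread, 4 randoms per iteration.
      const int64_t iterations = (numDist - 1) / grid_y + 1;
      const int64_t offset = iterations * 4;
      assert(iterations == 8 && offset == 32);
      return 0;
    }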
@@ -431,13 +431,12 @@ __global__ void batch_norm_backward_reduce_kernel(
 const GenericPackedTensorAccessor<input_scalar_t, 3, DefaultPtrTraits, index_t> grad_output,
 GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> mean,
 GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> invstd,
-GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> mean_dy,
-GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> mean_dy_xmu,
+GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> sum_dy,
+GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> sum_dy_xmu,
 GenericPackedTensorAccessor<stat_scalar_t, 1, DefaultPtrTraits, index_t> grad_weight,
 GenericPackedTensorAccessor<stat_scalar_t, 1, DefaultPtrTraits, index_t> grad_bias) {

 index_t plane = blockIdx.x;
-index_t N = input.size(0) * input.size(2);

 stat_accscalar_t r_mean = mean[plane];
 stat_accscalar_t factor = invstd[plane];
@@ -446,7 +445,6 @@ __global__ void batch_norm_backward_reduce_kernel(
 Float2<input_scalar_t, stat_accscalar_t> res = reduce<Float2<input_scalar_t, stat_accscalar_t>, GradOp<input_scalar_t, stat_accscalar_t,
 GenericPackedTensorAccessor<input_scalar_t, 3, DefaultPtrTraits, index_t>>>(g, grad_output, plane);

-stat_accscalar_t norm = stat_accscalar_t(1) / N;
 if (threadIdx.x == 0) {
 if (grad_weight.size(0) > 0) {
 grad_weight[plane] = static_cast<stat_scalar_t>(res.v2 * factor);
@@ -454,11 +452,11 @@ __global__ void batch_norm_backward_reduce_kernel(
 if (grad_bias.size(0) > 0) {
 grad_bias[plane] = static_cast<stat_scalar_t>(res.v1);
 }
-if (mean_dy.size(0) > 0) {
-mean_dy[plane] = static_cast<stat_accscalar_t>(res.v1 * norm);
+if (sum_dy.size(0) > 0) {
+sum_dy[plane] = static_cast<stat_accscalar_t>(res.v1);
 }
-if (mean_dy_xmu.size(0) > 0) {
-mean_dy_xmu[plane] = static_cast<stat_accscalar_t>(res.v2 * norm);
+if (sum_dy_xmu.size(0) > 0) {
+sum_dy_xmu[plane] = static_cast<stat_accscalar_t>(res.v2);
 }
 }
 }
@@ -740,16 +738,16 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce_cuda_templ

 using stat_accscalar_t = at::acc_type<stat_scalar_t, true>;
 int64_t n_input = input_.size(1);
-Tensor mean_dy_;
-Tensor mean_dy_xmu_;
+Tensor sum_dy_;
+Tensor sum_dy_xmu_;
 Tensor grad_weight_;
 Tensor grad_bias_;
 auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions
 auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes());

 if (input_g) {
-mean_dy_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-mean_dy_xmu_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+sum_dy_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+sum_dy_xmu_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
 }
 if (weight_g) {
 grad_weight_ = at::empty({n_input}, weight_.options());
@@ -764,8 +762,8 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce_cuda_templ
 auto grad_bias = packed_accessor_or_dummy<stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_bias_);
 auto mean = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_);
 auto invstd = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(invstd_);
-auto mean_dy = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_dy_);
-auto mean_dy_xmu = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_dy_xmu_);
+auto sum_dy = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_);
+auto sum_dy_xmu = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_xmu_);

 auto batch_size = input_reshaped.size(0);
 auto feature_size = input_reshaped.size(2);
@@ -778,10 +776,10 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce_cuda_templ
 const dim3 grid(n_input);

 batch_norm_backward_reduce_kernel<input_scalar_t, stat_scalar_t, stat_accscalar_t, index_t> <<<grid, block, 0, stream>>>
-(input, grad_output, mean, invstd, mean_dy, mean_dy_xmu, grad_weight, grad_bias);
+(input, grad_output, mean, invstd, sum_dy, sum_dy_xmu, grad_weight, grad_bias);
 AT_CUDA_CHECK(cudaGetLastError());

-return std::make_tuple(mean_dy_, mean_dy_xmu_, grad_weight_, grad_bias_);
+return std::make_tuple(sum_dy_, sum_dy_xmu_, grad_weight_, grad_bias_);
 }

 template<typename input_scalar_t, typename stat_scalar_t, typename index_t>
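The reduce kernel now hands back raw sums (sum_dy, sum_dy_xmu) rather than pre-divided means, dropping the norm = 1/N factor; a consumer that wants the old means simply divides by the reduced element count. A sketch of the equivalence with hypothetical shapes:

    #include <torch/torch.h>

    int main() {
      auto grad_output = torch::randn({8, 3, 16});         // (N, C, spatial)
      const auto count = grad_output.size(0) * grad_output.size(2);
      auto sum_dy  = grad_output.sum({0, 2});              // what the kernel returns now
      auto mean_dy = sum_dy / static_cast<double>(count);  // old mean_dy, on demand
      TORCH_CHECK(torch::allclose(mean_dy, grad_output.mean({0, 2})));
      return 0;
    }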
@@ -307,6 +307,15 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
 "tensor ", i);
 }

+// Dtypes should be the same
+const auto first_in_cat = inputs[0];
+for (int64_t i = 1; i < inputs.size(); i++) {
+TORCH_CHECK(first_in_cat.dtype() == inputs[i].dtype(),
+"Expected object of scalar type ", first_in_cat.dtype(),
+" but got scalar type ", inputs[i].dtype(),
+" for sequence element ", i, ".");
+}
+
 for (int i = 0; i < inputs.size(); i++)
 {
 if (should_skip(inputs[i])) {
@@ -325,6 +334,12 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
 TORCH_CHECK(inputs.size() > 0, "invalid number of inputs ", inputs.size());
 TORCH_CHECK(dimension >= 0, "invalid dimension ", dimension);

+for (const Tensor& t: inputs) {
+TORCH_CHECK(t.device() == notSkippedTensor->device(),
+"All input tensors must be on the same device. Received ",
+t.device(), " and ", notSkippedTensor->device());
+}
+
 c10::MemoryFormat memory_format = compute_output_memory_format(inputs);

 std::vector<int64_t> size(notSkippedTensor->sizes().vec());
@@ -355,17 +370,11 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
 // 4. The number of dimensions is <= 4
 // 5. All input tensors are contiguous (output tensor may be non-contig)
 // 6. All input tensors can use 32-bit indexing
-// 7. All input tensors are on the same device

 const bool all32BitIndexable = std::all_of(inputs.begin(), inputs.end(),
 [] (const Tensor& t) {
 return at::cuda::detail::canUse32BitIndexMath(t);
 });
-Device firstDevice = notSkippedTensor->device();
-const bool allSameDevice = std::all_of(inputs.begin(), inputs.end(),
-[firstDevice](const Tensor& t) {
-return t.device() == firstDevice;
-});
 const bool allContiguous = std::all_of(inputs.begin(), inputs.end(),
 [=](const Tensor& t) {
 return !t.defined() || t.is_contiguous(memory_format);
@@ -375,8 +384,7 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
 out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS &&
 at::cuda::detail::canUse32BitIndexMath(out) &&
 allContiguous &&
-all32BitIndexable &&
-allSameDevice) {
+all32BitIndexable) {

 AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
 at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16,
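The two added loops reject mixed dtypes and mixed devices up front with TORCH_CHECK, instead of letting the fast path silently fall through or a later kernel mis-handle the inputs. A standalone sketch of the validation pattern (helper name and values hypothetical):

    #include <torch/torch.h>
    #include <vector>

    // Sketch of the fail-fast checks added above, outside of cat itself.
    void check_cat_inputs(const std::vector<torch::Tensor>& inputs) {
      const auto& first = inputs[0];
      for (size_t i = 1; i < inputs.size(); i++) {
        TORCH_CHECK(first.dtype() == inputs[i].dtype(),
                    "Expected scalar type ", first.dtype(),
                    " but got ", inputs[i].dtype(), " for element ", i);
        TORCH_CHECK(first.device() == inputs[i].device(),
                    "All input tensors must be on the same device");
      }
    }

    int main() {
      auto a = torch::ones({2}, torch::kFloat);
      auto b = torch::ones({2}, torch::kFloat);
      check_cat_inputs({a, b});  // passes; a kDouble tensor here would throw
      return 0;
    }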
@@ -65,7 +65,7 @@ struct TopKTypeConfig<int16_t> {
 typedef uint32_t RadixType;

 static inline __device__ RadixType convert(int16_t v) {
-assert(sizeof(short) == 2);
+static_assert(sizeof(short) == 2, "");
 return 32768u + v;
 }

@@ -79,7 +79,7 @@ struct TopKTypeConfig<int32_t> {
 typedef uint32_t RadixType;

 static inline __device__ RadixType convert(int32_t v) {
-assert(sizeof(int) == 4);
+static_assert(sizeof(int) == 4, "");
 return 2147483648u + v;
 }

@@ -93,7 +93,7 @@ struct TopKTypeConfig<int64_t> {
 typedef uint64_t RadixType;

 static inline __device__ RadixType convert(int64_t v) {
-assert(sizeof(int64_t) == 8);
+static_assert(sizeof(int64_t) == 8, "");
 return 9223372036854775808ull + v;
 }

@@ -125,7 +125,7 @@ struct TopKTypeConfig<at::Half> {
 static inline __device__ RadixType convert(at::Half v) {
 #if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)
 RadixType x = __half_as_ushort(v);
-RadixType mask = -((x >> 15)) | 0x8000;
+RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000;
 return (v == v) ? (x ^ mask) : 0xffff;
 #else
 assert(false);
@@ -135,7 +135,7 @@ struct TopKTypeConfig<at::Half> {

 static inline __device__ at::Half deconvert(RadixType v) {
 #if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)
-RadixType mask = ((v >> 15) - 1) | 0x8000;
+RadixType mask = (v & 0x00008000) ? 0x00008000 : 0x0000ffff;
 return __ushort_as_half(v ^ mask);
 #else
 assert(false);
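The new masks implement the standard order-preserving map from IEEE floating-point bits to unsigned integers: negative values get all bits flipped, non-negative values get only the sign bit flipped, so unsigned comparison of the keys matches floating-point comparison. A host-side sketch of the same trick at 32-bit width (the Half code above uses the 16-bit masks 0x8000/0xffff):

    #include <cstdint>
    #include <cstring>
    #include <cassert>

    uint32_t to_radix_key(float f) {
      uint32_t x;
      std::memcpy(&x, &f, sizeof(x));
      // Negative: flip everything; non-negative: flip just the sign bit.
      const uint32_t mask = (x & 0x80000000u) ? 0xffffffffu : 0x80000000u;
      return x ^ mask;
    }

    int main() {
      assert(to_radix_key(-2.0f) < to_radix_key(-1.0f));
      assert(to_radix_key(-1.0f) < to_radix_key(0.0f));
      assert(to_radix_key(0.0f)  < to_radix_key(1.0f));
      assert(to_radix_key(1.0f)  < to_radix_key(2.0f));
      return 0;
    }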
@@ -44,6 +44,7 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) {
 }

 Tensor empty_cuda(IntArrayRef size, const TensorOptions& options, c10::optional<MemoryFormat> optional_memory_format) {
+TORCH_CHECK(!isComplexType(at::typeMetaToScalarType(options.dtype())), "Complex dtype not supported.");
 AT_ASSERT(options.device().type() == at::DeviceType::CUDA);
 TORCH_INTERNAL_ASSERT(impl::variable_excluded_from_dispatch());
 TORCH_CHECK(!options.pinned_memory(), "Only dense CPU tensors can be pinned");
@@ -238,18 +238,12 @@

 - func: real(Tensor self) -> Tensor
   use_c10_dispatcher: full
-  variants: function, method
-  supports_named_tensor: True
-
-- func: real.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
   supports_named_tensor: True

 - func: imag(Tensor self) -> Tensor
   use_c10_dispatcher: full
-  variants: function, method
-  supports_named_tensor: True
-
-- func: imag.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
   supports_named_tensor: True

 - func: conj(Tensor self) -> Tensor
@@ -2872,7 +2866,7 @@

 - func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor
   use_c10_dispatcher: full
-  variants: function
+  variants: function, method
   dispatch:
     CPU: true_divide
     CUDA: true_divide
@@ -2880,6 +2874,15 @@
     SparseCUDA: true_divide_sparse
   supports_named_tensor: True

+- func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  variants: method
+  dispatch:
+    CPU: true_divide_
+    CUDA: true_divide_
+    SparseCPU: true_divide_sparse_
+    SparseCUDA: true_divide_sparse_
+  supports_named_tensor: True
+
 - func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU: true_divide_out
@@ -2890,7 +2893,11 @@

 - func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor
   use_c10_dispatcher: full
-  variants: function
+  variants: function, method
+  supports_named_tensor: True
+
+- func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  variants: method
   supports_named_tensor: True

 - func: trunc(Tensor self) -> Tensor
|||||||
@ -272,6 +272,10 @@ SparseTensor& true_divide_out_sparse_scalar(
|
|||||||
return true_divide_out_sparse_zerodim(result, dividend, wrapped_scalar_tensor(divisor));
|
return true_divide_out_sparse_zerodim(result, dividend, wrapped_scalar_tensor(divisor));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Tensor& true_divide_sparse_(Tensor& self, const Tensor& divisor) {
|
||||||
|
return true_divide_out_sparse_zerodim(self, self, divisor);
|
||||||
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------
|
// --------------------------------------------------------------------
|
||||||
// floor_divide(SparseTensor, Scalar)
|
// floor_divide(SparseTensor, Scalar)
|
||||||
// --------------------------------------------------------------------
|
// --------------------------------------------------------------------
|
||||||
|
|||||||
@@ -138,7 +138,7 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) {
 // broadcasting logic; instead, it will blast the elements from one
 // to the other so long as the numel is the same
 indicesSlice.copy_(indices1D);
-indices1D.div_(self.size(d));
+indices1D.floor_divide_(self.size(d));
 indicesSlice.add_(indices1D, -self.size(d));
 }
 }
@@ -14,7 +14,7 @@ namespace xnnpack {
 namespace {
 torch::jit::class_<XNNPackLinearOpContext> register_xnnpack_linear_op_context_class() {
 static auto register_linear_op_context_class =
-torch::jit::class_<XNNPackLinearOpContext>("XNNPackLinearOpContext")
+torch::jit::class_<XNNPackLinearOpContext>("xnnpack", "XNNPackLinearOpContext")
 .def_pickle(
 [](const c10::intrusive_ptr<XNNPackLinearOpContext>& op_context)
 -> SerializationTypeLinearPrePack { // __getstate__
@@ -38,7 +38,7 @@ torch::jit::class_<XNNPackLinearOpContext> register_xnnpack_linear_op_context_cl

 torch::jit::class_<XNNPackConv2dOpContext> register_xnnpack_conv2d_op_context_class() {
 static auto register_conv2d_op_context_class =
-torch::jit::class_<XNNPackConv2dOpContext>("XNNPackConv2dOpContext")
+torch::jit::class_<XNNPackConv2dOpContext>("xnnpack", "XNNPackConv2dOpContext")
 .def_pickle(
 [](const c10::intrusive_ptr<XNNPackConv2dOpContext>& op_context)
 -> SerializationTypeConv2dPrePack { // __getstate__
@@ -74,25 +74,25 @@ static auto registry =
 // Registering under _xnnpack namespace for now. As we add more backend requiring similar functionality
 // We can refactor the code and use a better namespace.
 torch::RegisterOperators()
-.op("_xnnpack::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.XNNPackLinearOpContext",
+.op("_xnnpack::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.xnnpack.XNNPackLinearOpContext",
 torch::RegisterOperators::options()
 .aliasAnalysis(at::AliasAnalysisKind::PURE_FUNCTION)
 .kernel<internal::linear::LinearPrePack>(
 DispatchKey::CPUTensorId))
-.op("_xnnpack::linear_packed(Tensor X, __torch__.torch.classes.XNNPackLinearOpContext W_prepack) -> Tensor Y",
+.op("_xnnpack::linear_packed(Tensor X, __torch__.torch.classes.xnnpack.XNNPackLinearOpContext W_prepack) -> Tensor Y",
 torch::RegisterOperators::options()
 .aliasAnalysis(at::AliasAnalysisKind::PURE_FUNCTION)
 .kernel<internal::linear::LinearPacked>(
 DispatchKey::CPUTensorId))
 .op("_xnnpack::conv2d_prepack(Tensor W, Tensor? B, int[2] stride, "
 "int[2] padding, int[2] dilation, int groups) "
-"-> __torch__.torch.classes.XNNPackConv2dOpContext",
+"-> __torch__.torch.classes.xnnpack.XNNPackConv2dOpContext",
 torch::RegisterOperators::options()
 .aliasAnalysis(at::AliasAnalysisKind::PURE_FUNCTION)
 .kernel<internal::convolution2d::Conv2dPrePack>(
 DispatchKey::CPUTensorId))
 .op("_xnnpack::conv2d_packed(Tensor X, "
-"__torch__.torch.classes.XNNPackConv2dOpContext W_prepack) -> Tensor Y",
+"__torch__.torch.classes.xnnpack.XNNPackConv2dOpContext W_prepack) -> Tensor Y",
 torch::RegisterOperators::options()
 .aliasAnalysis(at::AliasAnalysisKind::PURE_FUNCTION)
 .kernel<internal::convolution2d::Conv2dPacked>(
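The registration now uses the two-argument form, which places the custom class under a namespace so its qualified name in operator schemas becomes __torch__.torch.classes.xnnpack.<ClassName>, as reflected in the updated _xnnpack::* schema strings. A hypothetical sketch of the same pattern (class and namespace names made up):

    #include <torch/custom_class.h>

    // Hypothetical op context registered under a namespace; schemas would
    // refer to it as __torch__.torch.classes.my_ns.MyOpContext.
    struct MyOpContext : torch::CustomClassHolder {
      int64_t value;
      explicit MyOpContext(int64_t v) : value(v) {}
    };

    static auto register_my_op_context =
        torch::jit::class_<MyOpContext>("my_ns", "MyOpContext")
            .def(torch::jit::init<int64_t>());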
@@ -423,6 +423,85 @@ class CAFFE2_API Tensor {

 // ~~~~~ Autograd API ~~~~~

+/// \fn bool is_leaf() const;
+///
+/// All Tensors that have `requires_grad()` which is ``false`` will be leaf Tensors by convention.
+///
+/// For Tensors that have `requires_grad()` which is ``true``, they will be leaf Tensors if they were
+/// created by the user. This means that they are not the result of an operation and so
+/// `grad_fn()` is `nullptr`.
+///
+/// Only leaf Tensors will have their `grad()` populated during a call to `backward()`.
+/// To get `grad()` populated for non-leaf Tensors, you can use `retain_grad()`.
+///
+/// Example:
+/// @code
+/// auto a = torch::rand(10, torch::requires_grad());
+/// std::cout << a.is_leaf() << std::endl; // prints `true`
+///
+/// auto b = torch::rand(10, torch::requires_grad()).to(torch::kCUDA);
+/// std::cout << b.is_leaf() << std::endl; // prints `false`
+/// // b was created by the operation that cast a cpu Tensor into a cuda Tensor
+///
+/// auto c = torch::rand(10, torch::requires_grad()) + 2;
+/// std::cout << c.is_leaf() << std::endl; // prints `false`
+/// // c was created by the addition operation
+///
+/// auto d = torch::rand(10).cuda();
+/// std::cout << d.is_leaf() << std::endl; // prints `true`
+/// // d does not require gradients and so has no operation creating it (that is tracked by the autograd engine)
+///
+/// auto e = torch::rand(10).cuda().requires_grad_();
+/// std::cout << e.is_leaf() << std::endl; // prints `true`
+/// // e requires gradients and has no operations creating it
+///
+/// auto f = torch::rand(10, torch::device(torch::kCUDA).requires_grad(true));
+/// std::cout << f.is_leaf() << std::endl; // prints `true`
+/// // f requires grad, has no operation creating it
+/// @endcode
+
+/// \fn void backward(const Tensor & gradient={}, bool keep_graph=false, bool create_graph=false) const;
+///
+/// Computes the gradient of current tensor with respect to graph leaves.
+///
+/// The graph is differentiated using the chain rule. If the tensor is
+/// non-scalar (i.e. its data has more than one element) and requires
+/// gradient, the function additionally requires specifying ``gradient``.
+/// It should be a tensor of matching type and location, that contains
+/// the gradient of the differentiated function w.r.t. this Tensor.
+///
+/// This function accumulates gradients in the leaves - you might need to
+/// zero them before calling it.
+///
+/// \param gradient Gradient w.r.t. the
+/// tensor. If it is a tensor, it will be automatically converted
+/// to a Tensor that does not require grad unless ``create_graph`` is True.
+/// None values can be specified for scalar Tensors or ones that
+/// don't require grad. If a None value would be acceptable then
+/// this argument is optional.
+/// \param keep_graph If ``false``, the graph used to compute
+/// the grads will be freed. Note that in nearly all cases setting
+/// this option to True is not needed and often can be worked around
+/// in a much more efficient way. Defaults to the value of
+/// ``create_graph``.
+/// \param create_graph If ``true``, graph of the derivative will
+/// be constructed, allowing to compute higher order derivative
+/// products. Defaults to ``false``.
+
+/// \fn Tensor detach() const;
+///
+/// Returns a new Tensor, detached from the current graph.
+/// The result will never require gradient.
+
+/// \fn Tensor & detach_() const;
+///
+/// Detaches the Tensor from the graph that created it, making it a leaf.
+/// Views cannot be detached in-place.
+
+/// \fn void retain_grad() const;
+///
+/// Enables .grad() for non-leaf Tensors.
+
 Tensor& set_requires_grad(bool requires_grad) {
 impl_->set_requires_grad(requires_grad);
 return *this;
@@ -431,9 +510,16 @@ class CAFFE2_API Tensor {
 return impl_->requires_grad();
 }

+/// Return a mutable reference to the gradient. This is conventionally
+/// used as `t.grad() = x` to set a gradient to a completely new tensor.
 Tensor& grad() {
 return impl_->grad();
 }

+/// This function returns an undefined tensor by default and returns a defined tensor
+/// the first time a call to `backward()` computes gradients for this Tensor.
+/// The attribute will then contain the gradients computed and future calls
+/// to `backward()` will accumulate (add) gradients into it.
 const Tensor& grad() const {
 return impl_->grad();
 }
@@ -505,11 +591,38 @@ class CAFFE2_API Tensor {
 template <typename T>
 using hook_return_var_t = std::enable_if_t<std::is_same<typename std::result_of<T&(Tensor)>::type, Tensor>::value, unsigned>;

-// Returns the index of the hook in the list which can be used to remove hook
-// Register a hook with no return value
+/// Registers a backward hook.
+///
+/// The hook will be called every time a gradient with respect to the Tensor is computed.
+/// The hook should have one of the following signature:
+/// ```
+/// hook(Tensor grad) -> Tensor
+/// ```
+/// ```
+/// hook(Tensor grad) -> void
+/// ```
+/// The hook should not modify its argument, but it can optionally return a new gradient
+/// which will be used in place of `grad`.
+///
+/// This function returns the index of the hook in the list which can be used to remove hook.
+///
+/// Example:
+/// @code
+/// auto v = torch::tensor({0., 0., 0.}, torch::requires_grad());
+/// auto h = v.register_hook([](torch::Tensor grad){ return grad * 2; }); // double the gradient
+/// v.backward(torch::tensor({1., 2., 3.}));
+/// // This prints:
+/// // ```
+/// // 2
+/// // 4
+/// // 6
+/// // [ CPUFloatType{3} ]
+/// // ```
+/// std::cout << v.grad() << std::endl;
+/// v.remove_hook(h); // removes the hook
+/// @endcode
 template <typename T>
 hook_return_void_t<T> register_hook(T&& hook) const;
-// Register a hook with variable return value
 template <typename T>
 hook_return_var_t<T> register_hook(T&& hook) const;

@@ -518,7 +631,7 @@ private:

 public:

-// Remove hook at given position
+/// Remove hook at given position
 void remove_hook(unsigned pos) const;

 // View Variables
@@ -69,12 +69,6 @@
 # define TH_UNUSED
 #endif

-#if defined(__clang__)
-#define __ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero")))
-#else
-#define __ubsan_ignore_float_divide_by_zero__
-#endif
-
 #ifndef M_PI
 # define M_PI 3.14159265358979323846
 #endif
@@ -9,7 +9,7 @@ set(extra_src)
 # loop over all types
 foreach(THC_TYPE Byte Char Short Int Long Half Float Double)
 # loop over files which need to be split between types (because of long compile times)
-foreach(THC_FILE TensorSort TensorMathPointwise TensorMathReduce TensorMasked)
+foreach(THC_FILE TensorSort TensorMathPointwise TensorMathReduce TensorMasked TensorTopK)
 if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu")
 FILE(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu"
 "#include <THC/THC${THC_FILE}.cuh>\n#include <THC/THCTensor.hpp>\n\n#include <THC/generic/THC${THC_FILE}.cu>\n#include <THC/THCGenerate${THC_TYPE}Type.h>\n")
@@ -56,7 +56,6 @@ set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS}
 ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorIndex.cu
 ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorRandom.cu
 ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorScatterGather.cu
-${CMAKE_CURRENT_SOURCE_DIR}/THCTensorTopK.cu
 ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorSort.cu
 ${CMAKE_CURRENT_SOURCE_DIR}/THCSortUtils.cu
 ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMode.cu
@@ -73,7 +73,7 @@ TensorInfo<T, IndexType>::TensorInfo(T* p,
 template <typename T, typename IndexType>
 void
 TensorInfo<T, IndexType>::reduceDim(int dim) {
-assert(dim < dims && dim >= 0);
+TORCH_INTERNAL_ASSERT(dim < dims && dim >= 0);
 sizes[dim] = 1;
 }

@@ -81,7 +81,7 @@ template <typename T, typename IndexType>
 int
 TensorInfo<T, IndexType>::collapseDims(const int excludeDim) {

-assert(excludeDim >= -1 && excludeDim < dims);
+TORCH_INTERNAL_ASSERT(excludeDim >= -1 && excludeDim < dims);

 int stopDim = (excludeDim == -1) ? dims : excludeDim;
 int newIndex = -1;
@@ -1,19 +0,0 @@
-#include <THC/THC.h>
-#include <THC/THCReduceApplyUtils.cuh>
-#include <THC/THCTensorCopy.h>
-#include <THC/THCTensorMath.h>
-#include <THC/THCAsmUtils.cuh>
-#include <THC/THCScanUtils.cuh>
-#include <THC/THCTensorTypeUtils.cuh>
-#include <THC/THCTensorMathReduce.cuh>
-#include <ATen/WrapDimUtils.h>
-#include <algorithm> // for std::min
-
-#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__
-#include <thrust/system/cuda/execution_policy.h>
-#endif
-
-#include <THC/THCTensorTopK.cuh>
-
-#include <THC/generic/THCTensorTopK.cu>
-#include <THC/THCGenerateAllTypes.h>
@@ -1,6 +1,21 @@
 #ifndef THC_TENSOR_TOPK_CUH
 #define THC_TENSOR_TOPK_CUH

+#include <THC/THC.h>
+#include <THC/THCReduceApplyUtils.cuh>
+#include <THC/THCTensorCopy.h>
+#include <THC/THCTensorMath.h>
+#include <THC/THCAsmUtils.cuh>
+#include <THC/THCScanUtils.cuh>
+#include <THC/THCTensorTypeUtils.cuh>
+#include <THC/THCTensorMathReduce.cuh>
+#include <ATen/WrapDimUtils.h>
+#include <algorithm> // for std::min
+
+#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__
+#include <thrust/system/cuda/execution_policy.h>
+#endif
+
 #include <c10/macros/Macros.h>
 #include <ATen/native/cuda/SortingRadixSelect.cuh>

@@ -52,6 +67,7 @@ __global__ void gatherTopK(TensorInfo<T, IndexType> input,
 inputSliceStart, outputSliceSize,
 inputSliceSize, inputWithinSliceStride,
 smem, &topKValue);
+const auto topKConverted = at::native::TopKTypeConfig<T>::convert(topKValue);

 // Every value that is strictly less/greater than `pattern`
 // (depending on sort dir) in sorted int format is in the top-K.
@@ -74,11 +90,12 @@ __global__ void gatherTopK(TensorInfo<T, IndexType> input,
 bool inRange = (i < inputSliceSize);
 T v =
 inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert<int, T>::to(0);
+const auto convertedV = at::native::TopKTypeConfig<T>::convert(v);
 bool hasTopK;
 if (Order) {
-hasTopK = inRange && (THCNumerics<T>::gt(v, topKValue));
+hasTopK = inRange && (convertedV > topKConverted);
 } else {
-hasTopK = inRange && (THCNumerics<T>::lt(v, topKValue));
+hasTopK = inRange && (convertedV < topKConverted);
 }

 int index;
@@ -111,7 +128,8 @@ __global__ void gatherTopK(TensorInfo<T, IndexType> input,
 bool inRange = (i < inputSliceSize);
 T v =
 inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert<int, T>::to(0);
-bool hasTopK = inRange && (THCNumerics<T>::eq(v, topKValue));
+const auto convertedV = at::native::TopKTypeConfig<T>::convert(v);
+bool hasTopK = inRange && (convertedV == topKConverted);

 int index;
 int carry;
aten/src/THC/generated/THCTensorTopKByte.cu (new file, +5)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateByteType.h>

aten/src/THC/generated/THCTensorTopKChar.cu (new file, +5)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateCharType.h>

aten/src/THC/generated/THCTensorTopKDouble.cu (new file, +5)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateDoubleType.h>

aten/src/THC/generated/THCTensorTopKFloat.cu (new file, +5)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateFloatType.h>

aten/src/THC/generated/THCTensorTopKHalf.cu (new file, +5)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateHalfType.h>

aten/src/THC/generated/THCTensorTopKInt.cu (new file, +5)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateIntType.h>

aten/src/THC/generated/THCTensorTopKLong.cu (new file, +5)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateLongType.h>

aten/src/THC/generated/THCTensorTopKShort.cu (new file, +5)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateShortType.h>
@@ -269,7 +269,7 @@ void THCTensor_(mode)(THCState *state,
 break;
 case 1:
 default:
-assert(false);
+TORCH_INTERNAL_ASSERT(false);
 }
 THCudaCheck(cudaGetLastError());

@@ -101,7 +101,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
 /* Nothing to do, data already sorted */ \
 break; \
 default: \
-assert(false); \
+TORCH_INTERNAL_ASSERT(false); \
 } \
 }

@@ -23,6 +23,14 @@

 #include "c10/macros/Export.h"

+#if defined(__clang__)
+#define __ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero")))
+#define __ubsan_ignore_float_cast_overflow__ __attribute__((no_sanitize("float-cast-overflow")))
+#else
+#define __ubsan_ignore_float_divide_by_zero__
+#define __ubsan_ignore_float_cast_overflow__
+#endif
+
 // Disable the copy and assignment operator for a class. Note that this will
 // disable the usage of the class in std containers.
 #define C10_DISABLE_COPY_AND_ASSIGN(classname) \
@@ -196,25 +204,29 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 #define __func__ __FUNCTION__
 #endif

-// CUDA_KERNEL_ASSERT is a macro that wraps an assert() call inside cuda
-// kernels. This is not supported by Apple platforms so we special case it.
-// See http://docs.nvidia.com/cuda/cuda-c-programming-guide/#assertion
-#if defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__)
-#define CUDA_KERNEL_ASSERT(...)
-#else // __APPLE__
-#define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__)
-#endif // __APPLE__
-
-// CUDA_ALWAYS_ASSERT is similar to CUDA_KERNEL_ASSERT but checks the assertion
+// CUDA_KERNEL_ASSERT checks the assertion
 // even when NDEBUG is defined. This is useful for important assertions in CUDA
 // code that when building Release.
 #if defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__)
 // Those platforms do not support assert()
-#define CUDA_ALWAYS_ASSERT(cond)
+#define CUDA_KERNEL_ASSERT(cond)
 #elif defined(_MSC_VER)
-// TODO: This should be defined but I don't have the environment to properly
-// test it. See e.g., https://github.com/pytorch/pytorch/pull/32719#discussion_r379918384
-#define CUDA_ALWAYS_ASSERT(cond)
+#if defined(NDEBUG)
+extern "C" {
+C10_IMPORT
+#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) || defined(__HIP__)
+__host__ __device__
+#endif // __CUDA_ARCH__
+void _wassert(
+wchar_t const* _Message,
+wchar_t const* _File,
+unsigned _Line);
+}
+#endif
+#define CUDA_KERNEL_ASSERT(cond) \
+if (C10_UNLIKELY(!(cond))) { \
+(void)(_wassert(_CRT_WIDE(#cond), _CRT_WIDE(__FILE__), static_cast<unsigned>(__LINE__)), 0); \
+}
 #else // __APPLE__, _MSC_VER
 #if defined(NDEBUG)
 extern "C" {
@@ -233,7 +245,7 @@ __host__ __device__
 const char* function) throw();
 }
 #endif // NDEBUG
-#define CUDA_ALWAYS_ASSERT(cond) \
+#define CUDA_KERNEL_ASSERT(cond) \
 if (C10_UNLIKELY(!(cond))) { \
 __assert_fail(#cond, __FILE__, static_cast<unsigned int>(__LINE__), \
 __func__); \
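After the rename there is a single CUDA_KERNEL_ASSERT that stays active even under NDEBUG, with a _wassert-based expansion for MSVC and a no-op only on platforms without device-side assert support. A hypothetical kernel showing the intended use:

    #include <c10/macros/Macros.h>

    // Hypothetical kernel: the check fires even in Release builds, unlike a
    // plain assert(), and compiles to nothing only on __APPLE__/HIP.
    __global__ void checked_copy(const float* src, float* dst, int n) {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        CUDA_KERNEL_ASSERT(src != nullptr && dst != nullptr);
        dst[i] = src[i];
      }
    }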
@@ -66,24 +66,44 @@ void Error::AppendMessage(const std::string& new_msg) {
 namespace Warning {
 
 namespace {
-WarningHandler* getHandler() {
+WarningHandler* getBaseHandler() {
   static WarningHandler base_warning_handler_ = WarningHandler();
   return &base_warning_handler_;
 };
-static thread_local WarningHandler* warning_handler_ = getHandler();
+class ThreadWarningHandler {
+ public:
+  ThreadWarningHandler() = delete;
+
+  static WarningHandler* get_handler() {
+    if (!warning_handler_) {
+      warning_handler_ = getBaseHandler();
+    }
+    return warning_handler_;
+  }
+
+  static void set_handler(WarningHandler* handler) {
+    warning_handler_ = handler;
+  }
+
+ private:
+  static thread_local WarningHandler* warning_handler_;
+};
+
+thread_local WarningHandler* ThreadWarningHandler::warning_handler_ = nullptr;
+
 }
 
 void warn(SourceLocation source_location, const std::string& msg) {
-  warning_handler_->process(source_location, msg);
+  ThreadWarningHandler::get_handler()->process(source_location, msg);
 }
 
 void set_warning_handler(WarningHandler* handler) noexcept(true) {
-  warning_handler_ = handler;
+  ThreadWarningHandler::set_handler(handler);
 }
 
 WarningHandler* get_warning_handler() noexcept(true) {
-  return warning_handler_;
+  return ThreadWarningHandler::get_handler();
 }
 
 } // namespace Warning
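
To see what the thread-local indirection buys, a hedged sketch (CountingHandler and worker are hypothetical; the c10::Warning functions, the process() signature implied by the warn() call above, and the nullptr-falls-back-to-base behavior are what the hunk implements): a handler installed on one thread no longer affects warnings raised on other threads.

    #include <c10/util/Exception.h>

    // Hypothetical handler that counts warnings instead of printing them.
    struct CountingHandler : public c10::WarningHandler {
      int count = 0;
      void process(const c10::SourceLocation& loc, const std::string& msg) override {
        ++count;  // swallow the warning
      }
    };

    void worker() {
      CountingHandler h;
      c10::Warning::set_warning_handler(&h);       // installs for this thread only
      // ... run code that may call c10::Warning::warn(...) ...
      c10::Warning::set_warning_handler(nullptr);  // revert to the base handler
    }
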
@@ -67,7 +67,7 @@ struct maybe_real<true, src_t> {
 
 template <typename dest_t, typename src_t>
 struct static_cast_with_inter_type {
-  C10_HOST_DEVICE static inline dest_t apply(src_t src) {
+  C10_HOST_DEVICE __ubsan_ignore_float_cast_overflow__ static inline dest_t apply(src_t src) {
     constexpr bool real = needs_real<dest_t, src_t>::value;
     return static_cast<dest_t>(
         static_cast<inter_copy_type_t<dest_t>>(maybe_real<real, src_t>::apply(src)));
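
Background for the annotation, with an illustrative snippet that is not from the diff: a float-to-integer static_cast whose value is out of the destination's range is undefined behavior in C++, and UBSan's float-cast-overflow check reports it at runtime. Since static_cast_with_inter_type can legitimately be asked to perform such conversions, the attribute suppresses that one check for this function.

    #include <cstdint>

    // What -fsanitize=float-cast-overflow complains about:
    int32_t truncate(float f) {
      // Undefined behavior whenever f is outside int32_t's range,
      // e.g. truncate(1e30f); UBSan reports that the value is outside
      // the range of representable values of type 'int'.
      return static_cast<int32_t>(f);
    }
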
@@ -748,7 +748,7 @@ if (NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     target_include_directories(torch_cuda PUBLIC "${NVTOOLEXT_HOME}/include")
     # -INCLUDE is used to ensure torch_cuda is linked against in a project that relies on it.
     # Related issue: https://github.com/pytorch/pytorch/issues/31611
-    target_link_libraries(torch_cuda INTERFACE "-INCLUDE:\"?warp_size@cuda@at@@YAHXZ\"")
+    target_link_libraries(torch_cuda INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ")
 
   elseif(APPLE)
     set(TORCH_CUDA_LIBRARIES
@@ -949,6 +949,31 @@ if (USE_OPENMP AND OPENMP_FOUND)
   target_link_libraries(torch_cpu PRIVATE ${OpenMP_CXX_LIBRARIES})
 endif()
 
+if ($ENV{TH_BINARY_BUILD})
+  if (NOT MSVC AND USE_CUDA AND NOT APPLE)
+    # Note [Extra MKL symbols for MAGMA in torch_cpu]
+    #
+    # When we build CUDA libraries and link against MAGMA, MAGMA makes use of
+    # some BLAS symbols in its CPU fallbacks when it has no GPU versions
+    # of kernels. Previously, we ensured the BLAS symbols were filled in by
+    # MKL by linking torch_cuda with BLAS, but when we are statically linking
+    # against MKL (when we do wheel builds), this actually ends up pulling in a
+    # decent chunk of MKL into torch_cuda, inflating our torch_cuda binary
+    # size by 8M. torch_cpu exposes most of the MKL symbols we need, but
+    # empirically we determined that there are four which it doesn't provide. If
+    # we link torch_cpu with these --undefined symbols, we can ensure they
+    # do get pulled in, and then we can avoid statically linking in MKL to
+    # torch_cuda at all!
+    #
+    # We aren't really optimizing for binary size on Windows (and this link
+    # line doesn't work on Windows), so don't do it there.
+    #
+    # These linker commands do not work on OS X, do not attempt this there.
+    # (It shouldn't matter anyway, though, because OS X has dropped CUDA support)
+    set_target_properties(torch_cpu PROPERTIES LINK_FLAGS "-Wl,--undefined=mkl_lapack_slaed0 -Wl,--undefined=mkl_lapack_dlaed0 -Wl,--undefined=mkl_lapack_dormql -Wl,--undefined=mkl_lapack_sormql")
+  endif()
+endif()
+
 target_link_libraries(torch_cpu PUBLIC c10)
 target_link_libraries(torch_cpu PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS})
 target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_LIBS})
@@ -261,15 +261,6 @@ CAFFE2_CUDA_API const char* curandGetErrorString(curandStatus_t error);
   for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
        j += blockDim.y * gridDim.y)
 
-// CUDA_KERNEL_ASSERT is a macro that wraps an assert() call inside cuda
-// kernels. This is not supported by Apple platforms so we special case it.
-// See http://docs.nvidia.com/cuda/cuda-c-programming-guide/#assertion
-#if defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__)
-#define CUDA_KERNEL_ASSERT(...)
-#else // __APPLE__
-#define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__)
-#endif // __APPLE__
-
 // The following helper functions are here so that you can write a kernel call
 // when you are not particularly interested in maxing out the kernels'
 // performance. Usually, this will give you a reasonable speed, but if you
@@ -1,6 +1,8 @@
 #include "caffe2/operators/fused_rowwise_nbitfake_conversion_ops.h"
 #include <fp16.h>
+#ifdef __AVX__
 #include <immintrin.h>
+#endif
 #include "c10/util/Registry.h"
 
 namespace caffe2 {
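
The added guard is the standard compile-time dispatch pattern for intrinsics; a self-contained sketch under the assumption of a GCC/Clang-style __AVX__ macro (sum8 is illustrative): intrinsics are referenced only when the compiler advertises AVX, with a scalar fallback otherwise.

    #ifdef __AVX__
    #include <immintrin.h>
    #endif

    // Sum eight floats; uses AVX when compiled with -mavx, scalar code otherwise.
    float sum8(const float* p) {
    #ifdef __AVX__
      __m256 v = _mm256_loadu_ps(p);
      // Add the upper and lower 128-bit halves, then horizontally reduce.
      __m128 s = _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
      s = _mm_hadd_ps(s, s);
      s = _mm_hadd_ps(s, s);
      return _mm_cvtss_f32(s);
    #else
      float s = 0.f;
      for (int i = 0; i < 8; ++i) s += p[i];
      return s;
    #endif
    }
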
@@ -50,8 +50,13 @@ __global__ void ReluCUDAKernel<half2>(const int N, const half2* X, half2* Y) {
   Y[i] = __hmul2(__hgt2(__ldg(X + i), kZero), __ldg(X + i));
 #else
   const float2 xx = __half22float2(X[i]);
-  Y[i] =
-      __floats2half2_rn(xx.x > 0.0f ? xx.x : 0.0f, xx.y > 0.0f ? xx.y : 0.0f);
+  // There are explicit casts to float here, because it may otherwise cause ambiguity on ROCm and can be triggered
+  // sometimes:
+  //
+  // error: conditional expression is ambiguous; 'const hip_impl::Scalar_accessor<float, Native_vec_, 0>' can be
+  // converted to 'float' and vice versa
+  Y[i] = __floats2half2_rn(xx.x > 0.0f ? static_cast<float>(xx.x) : 0.0f,
+                           xx.y > 0.0f ? static_cast<float>(xx.y) : 0.0f);
 #endif
 }
 }
@@ -100,8 +105,14 @@ __global__ void ReluGradientCUDAKernel<half2>(
 #else
   const float2 dy = __half22float2(dY[i]);
   const float2 yy = __half22float2(Y[i]);
-  dX[i] =
-      __floats2half2_rn(yy.x > 0.0f ? dy.x : 0.0f, yy.y > 0.0f ? dy.y : 0.0f);
+  // There are explicit casts to float here, because it may otherwise cause ambiguity on ROCm and can be triggered
+  // sometimes:
+  //
+  // error: conditional expression is ambiguous; 'const hip_impl::Scalar_accessor<float, Native_vec_, 1>' can be
+  // converted to 'float' and vice versa
+
+  dX[i] = __floats2half2_rn(yy.x > 0.0f ? static_cast<float>(dy.x) : 0.0f,
+                            yy.y > 0.0f ? static_cast<float>(dy.y) : 0.0f);
 #endif
 }
 }
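
Both Relu hunks apply the same fix. A distilled, hypothetical reproduction of the ambiguity (Wrapped stands in for hip_impl::Scalar_accessor): when a type converts to float and float converts back to it, a conditional expression mixing the two has no unique common type until one arm is cast explicitly.

    // 'Wrapped' mimics a HIP vector-component accessor: it converts to float,
    // and float converts to it, so a mixed ternary has two conversion paths.
    struct Wrapped {
      float v;
      Wrapped(float f) : v(f) {}
      operator float() const { return v; }
    };

    float relu_component(bool positive, Wrapped x) {
      // return positive ? x : 0.0f;                   // ambiguous on such compilers
      return positive ? static_cast<float>(x) : 0.0f;  // explicit cast resolves it
    }
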
@@ -76,7 +76,7 @@ struct TopKTypeConfig<short> {
   typedef unsigned int RadixType;
 
   static inline __device__ RadixType convert(short v) {
-    CUDA_KERNEL_ASSERT(sizeof(short) == 2);
+    static_assert(sizeof(short) == 2, "");
     return 32768u + v;
   }
 
@@ -90,7 +90,7 @@ struct TopKTypeConfig<int> {
   typedef unsigned int RadixType;
 
   static inline __device__ RadixType convert(int v) {
-    CUDA_KERNEL_ASSERT(sizeof(int) == 4);
+    static_assert(sizeof(int) == 4, "");
     return 2147483648u + v;
   }
 
@@ -104,6 +104,7 @@ struct TopKTypeConfig<long> {
   typedef unsigned long long int RadixType;
 
   static inline __device__ RadixType convert(long v) {
+    // static_assert fails on Windows, so leave it as CUDA_KERNEL_ASSERT
     CUDA_KERNEL_ASSERT(sizeof(long) == 8);
     return 9223372036854775808ull + v;
   }
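
For background, a sketch of the conversion these assertions guard (to_radix is an illustrative name mirroring TopKTypeConfig<int>::convert above): the radix-select code biases signed keys into unsigned space, which is order-preserving only if the type has the assumed width, hence the size checks. static_assert runs at compile time and emits no device code; the long variant keeps the runtime CUDA_KERNEL_ASSERT because, per the new comment, static_assert failed to build on Windows there.

    // Illustrative device helper mirroring TopKTypeConfig<int>::convert.
    __device__ unsigned int to_radix(int v) {
      static_assert(sizeof(int) == 4, "radix trick assumes a 32-bit int");
      // Adding 2^31 maps INT_MIN..INT_MAX monotonically onto 0..2^32-1, so
      // signed keys can be ranked by their unsigned bit patterns.
      return 2147483648u + v;
    }
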
cmake/External/nccl.cmake (vendored, 40 changed lines)
@@ -15,6 +15,7 @@ if (NOT __NCCL_INCLUDED)
     # this second replacement is needed when there are multiple archs
     string(REPLACE ";-gencode" " -gencode" NVCC_GENCODE "${NVCC_GENCODE}")
 
+    set(__NCCL_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/nccl")
     ExternalProject_Add(nccl_external
       SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/nccl/nccl
       BUILD_IN_SOURCE 1
@@ -30,20 +31,49 @@ if (NOT __NCCL_INCLUDED)
         "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}"
         "NVCC=${CUDA_NVCC_EXECUTABLE}"
         "NVCC_GENCODE=${NVCC_GENCODE}"
-        "BUILDDIR=${CMAKE_CURRENT_BINARY_DIR}/nccl"
+        "BUILDDIR=${__NCCL_BUILD_DIR}"
         "VERBOSE=0"
         "-j"
       BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/nccl/lib/libnccl_static.a"
-      BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/nccl/lib/libnccl_static.a"
+      BUILD_BYPRODUCTS "${__NCCL_BUILD_DIR}/lib/libnccl_static.a"
       INSTALL_COMMAND ""
       )
 
+    # Detect objcopy version
+    execute_process (COMMAND "${CMAKE_OBJCOPY}" "--version" OUTPUT_VARIABLE OBJCOPY_VERSION_STR)
+    string(REGEX REPLACE "GNU objcopy version ([0-9])\\.([0-9]+).*" "\\1" OBJCOPY_VERSION_MAJOR ${OBJCOPY_VERSION_STR})
+    string(REGEX REPLACE "GNU objcopy version ([0-9])\\.([0-9]+).*" "\\2" OBJCOPY_VERSION_MINOR ${OBJCOPY_VERSION_STR})
+
+    if ((${OBJCOPY_VERSION_MAJOR} GREATER 2) OR ((${OBJCOPY_VERSION_MAJOR} EQUAL 2) AND (${OBJCOPY_VERSION_MINOR} GREATER 27)))
+      message(WARNING "Enabling NCCL library slimming")
+      add_custom_command(
+        OUTPUT "${__NCCL_BUILD_DIR}/lib/libnccl_slim_static.a"
+        DEPENDS nccl_external
+        COMMAND "${CMAKE_COMMAND}" -E make_directory "${__NCCL_BUILD_DIR}/objects"
+        COMMAND cd objects
+        COMMAND "${CMAKE_AR}" x "${__NCCL_BUILD_DIR}/lib/libnccl_static.a"
+        COMMAND for obj in all_gather_* all_reduce_* broadcast_* reduce_*.o$<SEMICOLON> do "${CMAKE_OBJCOPY}" --remove-relocations .nvFatBinSegment --remove-section __nv_relfatbin $$obj$<SEMICOLON> done
+        COMMAND "${CMAKE_AR}" cr "${__NCCL_BUILD_DIR}/lib/libnccl_slim_static.a" "*.o"
+        COMMAND cd -
+        COMMAND "${CMAKE_COMMAND}" -E remove_directory "${__NCCL_BUILD_DIR}/objects"
+        WORKING_DIRECTORY "${__NCCL_BUILD_DIR}"
+        COMMENT "Slimming NCCL"
+        )
+      add_custom_target(nccl_slim_external DEPENDS "${__NCCL_BUILD_DIR}/lib/libnccl_slim_static.a")
+      set(__NCCL_LIBRARY_DEP nccl_slim_external)
+      set(NCCL_LIBRARIES ${__NCCL_BUILD_DIR}/lib/libnccl_slim_static.a)
+    else()
+      message(WARNING "Objcopy version is too old to support NCCL library slimming")
+      set(__NCCL_LIBRARY_DEP nccl_external)
+      set(NCCL_LIBRARIES ${__NCCL_BUILD_DIR}/lib/libnccl_static.a)
+    endif()
+
+
     set(NCCL_FOUND TRUE)
     add_library(__caffe2_nccl INTERFACE)
     # The following old-style variables are set so that other libs, such as Gloo,
     # can still use it.
-    set(NCCL_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/nccl/include)
-    set(NCCL_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/nccl/lib/libnccl_static.a)
-    add_dependencies(__caffe2_nccl nccl_external)
+    set(NCCL_INCLUDE_DIRS ${__NCCL_BUILD_DIR}/include)
+    add_dependencies(__caffe2_nccl ${__NCCL_LIBRARY_DEP})
     target_link_libraries(__caffe2_nccl INTERFACE ${NCCL_LIBRARIES})
     target_include_directories(__caffe2_nccl INTERFACE ${NCCL_INCLUDE_DIRS})
 endif()
@@ -56,6 +56,10 @@ INPUT = ../../../aten/src/ATen/ATen.h \
         ../../../c10/cuda/CUDAStream.h \
         ../../../torch/csrc/api/include \
         ../../../torch/csrc/api/src \
+        ../../../torch/csrc/autograd/autograd.h \
+        ../../../torch/csrc/autograd/custom_function.h \
+        ../../../torch/csrc/autograd/function.h \
+        ../../../torch/csrc/autograd/variable.h \
         ../../../torch/csrc/autograd/generated/variable_factories.h \
         ../../../torch/csrc/jit/runtime/custom_operator.h \
         ../../../torch/csrc/jit/serialization/import.h \
@@ -281,7 +281,9 @@ change one property, this is quite practical.
 In conclusion, we can now compare how ``TensorOptions`` defaults, together with
 the abbreviated API for creating ``TensorOptions`` using free functions, allow
 tensor creation in C++ with the same convenience as in Python. Compare this
-call in Python::
+call in Python:
+
+.. code-block:: python
 
   torch.randn(3, 4, dtype=torch.float32, device=torch.device('cuda', 1), requires_grad=True)
 
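
For reference, the C++ call this note pairs with the Python line above; a sketch assuming the TensorOptions builder API the same document describes:

    // Equivalent libtorch call: free functions build up a TensorOptions value.
    torch::Tensor t = torch::randn({3, 4},
        torch::dtype(torch::kFloat32).device(torch::kCUDA, 1).requires_grad(true));
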
docs/cpp/source/notes/tensor_indexing.rst (new file, 99 lines)
@@ -0,0 +1,99 @@
Tensor Indexing API
===================

Indexing a tensor in the PyTorch C++ API works very similarly to the Python API.
All index types such as ``None`` / ``...`` / integer / boolean / slice / tensor
are available in the C++ API, making translation from Python indexing code to C++
very simple. The main difference is that, instead of using the ``[]``-operator
similar to the Python API syntax, in the C++ API the indexing methods are:

- ``torch::Tensor::index`` (`link <https://pytorch.org/cppdocs/api/classat_1_1_tensor.html#_CPPv4NK2at6Tensor5indexE8ArrayRefIN2at8indexing11TensorIndexEE>`_)
- ``torch::Tensor::index_put_`` (`link <https://pytorch.org/cppdocs/api/classat_1_1_tensor.html#_CPPv4N2at6Tensor10index_put_E8ArrayRefIN2at8indexing11TensorIndexEERK6Tensor>`_)

It's also important to note that index types such as ``None`` / ``Ellipsis`` / ``Slice``
live in the ``torch::indexing`` namespace, and it's recommended to put ``using namespace torch::indexing``
before any indexing code for convenient use of those index types.

Here are some examples of translating Python indexing code to C++:

Getter
------

+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| Python                                                   | C++ (assuming ``using namespace torch::indexing``)                                   |
+==========================================================+======================================================================================+
| ``tensor[None]``                                         | ``tensor.index({None})``                                                             |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| ``tensor[Ellipsis, ...]``                                | ``tensor.index({Ellipsis, "..."})``                                                  |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| ``tensor[1, 2]``                                         | ``tensor.index({1, 2})``                                                             |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| ``tensor[True, False]``                                  | ``tensor.index({true, false})``                                                      |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| ``tensor[1::2]``                                         | ``tensor.index({Slice(1, None, 2)})``                                                |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| ``tensor[torch.tensor([1, 2])]``                         | ``tensor.index({torch::tensor({1, 2})})``                                            |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| ``tensor[..., 0, True, 1::2, torch.tensor([1, 2])]``     | ``tensor.index({"...", 0, true, Slice(1, None, 2), torch::tensor({1, 2})})``         |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+

Setter
------

+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| Python                                                   | C++ (assuming ``using namespace torch::indexing``)                                   |
+==========================================================+======================================================================================+
| ``tensor[None] = 1``                                     | ``tensor.index_put_({None}, 1)``                                                     |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| ``tensor[Ellipsis, ...] = 1``                            | ``tensor.index_put_({Ellipsis, "..."}, 1)``                                          |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| ``tensor[1, 2] = 1``                                     | ``tensor.index_put_({1, 2}, 1)``                                                     |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| ``tensor[True, False] = 1``                              | ``tensor.index_put_({true, false}, 1)``                                              |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| ``tensor[1::2] = 1``                                     | ``tensor.index_put_({Slice(1, None, 2)}, 1)``                                        |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| ``tensor[torch.tensor([1, 2])] = 1``                     | ``tensor.index_put_({torch::tensor({1, 2})}, 1)``                                    |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+
| ``tensor[..., 0, True, 1::2, torch.tensor([1, 2])] = 1`` | ``tensor.index_put_({"...", 0, true, Slice(1, None, 2), torch::tensor({1, 2})}, 1)`` |
+----------------------------------------------------------+--------------------------------------------------------------------------------------+


Translating between Python/C++ index types
------------------------------------------

The one-to-one translation between Python and C++ index types is as follows:

+-------------------------+------------------------------------------------------------------------+
| Python                  | C++ (assuming ``using namespace torch::indexing``)                     |
+=========================+========================================================================+
| ``None``                | ``None``                                                               |
+-------------------------+------------------------------------------------------------------------+
| ``Ellipsis``            | ``Ellipsis``                                                           |
+-------------------------+------------------------------------------------------------------------+
| ``...``                 | ``"..."``                                                              |
+-------------------------+------------------------------------------------------------------------+
| ``123``                 | ``123``                                                                |
+-------------------------+------------------------------------------------------------------------+
| ``True``                | ``true``                                                               |
+-------------------------+------------------------------------------------------------------------+
| ``False``               | ``false``                                                              |
+-------------------------+------------------------------------------------------------------------+
| ``:`` or ``::``         | ``Slice()`` or ``Slice(None, None)`` or ``Slice(None, None, None)``    |
+-------------------------+------------------------------------------------------------------------+
| ``1:`` or ``1::``       | ``Slice(1, None)`` or ``Slice(1, None, None)``                         |
+-------------------------+------------------------------------------------------------------------+
| ``:3`` or ``:3:``       | ``Slice(None, 3)`` or ``Slice(None, 3, None)``                         |
+-------------------------+------------------------------------------------------------------------+
| ``::2``                 | ``Slice(None, None, 2)``                                               |
+-------------------------+------------------------------------------------------------------------+
| ``1:3``                 | ``Slice(1, 3)``                                                        |
+-------------------------+------------------------------------------------------------------------+
| ``1::2``                | ``Slice(1, None, 2)``                                                  |
+-------------------------+------------------------------------------------------------------------+
| ``:3:2``                | ``Slice(None, 3, 2)``                                                  |
+-------------------------+------------------------------------------------------------------------+
| ``1:3:2``               | ``Slice(1, 3, 2)``                                                     |
+-------------------------+------------------------------------------------------------------------+
| ``torch.tensor([1, 2])``| ``torch::tensor({1, 2})``                                              |
+-------------------------+------------------------------------------------------------------------+
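
Putting the new page's tables together, a short hedged example (tensor contents, shapes, and the function name are illustrative; index and index_put_ are the methods documented above):

    #include <torch/torch.h>
    using namespace torch::indexing;

    void indexing_demo() {
      torch::Tensor t = torch::randn({3, 4});
      torch::Tensor row = t.index({1});                // Python: t[1]
      torch::Tensor col = t.index({Slice(), 2});       // Python: t[:, 2]
      t.index_put_({Slice(None, 2), Slice(1, 3)}, 0);  // Python: t[:2, 1:3] = 0
    }
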
Some files were not shown because too many files have changed in this diff.