Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-31 04:04:57 +08:00)

Compare commits: v2.5.0-rc3 ... release/1. (169 commits)
| SHA1 |
|---|
| 923085e424 |
| 841c9cf491 |
| 3c31d73c87 |
| dfe8cdff5a |
| e7a6ed8151 |
| fc0dde5db3 |
| 83edd5164a |
| 833c4201ad |
| 5579c9e4c2 |
| 367901e1f9 |
| c4903bde61 |
| 7d2fcd505c |
| bb33e5fc85 |
| c5424a85dc |
| 5d01f87e58 |
| 82f549b0a8 |
| f306655d49 |
| 409e42e3b8 |
| 6151405f6c |
| d01065e50c |
| 67508dadaa |
| b54a731c8e |
| 3920c1d173 |
| 8d48a6490a |
| 17eae0e0cd |
| 4a9e45d50e |
| eb387a0a2b |
| 420c6dc43d |
| 39f0a2752a |
| 366026ab10 |
| 408e158df9 |
| 3598dea7ad |
| a5b05e8867 |
| 7fc2433458 |
| aba610b9e8 |
| dc30c519dd |
| 9bf2aaa659 |
| 25621d05df |
| 96f218d7dd |
| f810011c40 |
| 5f8bb352c3 |
| 52469a512b |
| c56adee862 |
| 4ff3872a20 |
| d7bdffabed |
| 9ba0a89489 |
| c164fbccb1 |
| 9a51e477ac |
| 375566fb78 |
| dfdc788076 |
| 9e6ef814cc |
| 31461800f6 |
| e741839b0e |
| 8eb39c9cfd |
| b5e4c0993d |
| 6bc6832bda |
| 593594839c |
| cf65c8ef15 |
| ca0dc1fcdc |
| b58f89b2e4 |
| 87b6685c6b |
| f746f1b746 |
| 1379415150 |
| 7d638d2596 |
| bad005d331 |
| 16d8a52407 |
| a33b264588 |
| 3a67e00889 |
| 6bd039551d |
| b6c3058d61 |
| ed908b4fbc |
| b66e0af58b |
| bf8a5ede96 |
| c2bc5c56c5 |
| db3c3ed662 |
| 9de4770bbd |
| 911a2a6b63 |
| 60375bcfdf |
| 63dcd9eccc |
| e8236d2ed4 |
| 0058b1bb7e |
| 419283e291 |
| 0e6f6ba218 |
| ec8dbaf920 |
| 7e168d134f |
| 6daae58871 |
| fee0ff1bf6 |
| deaf3b65cf |
| dca9c2501d |
| 842cd47416 |
| a30b49085c |
| 82626f8ad9 |
| 27fddfda4f |
| 7ecf6a1c10 |
| beb07a44c4 |
| a01c3bd1fe |
| ffd010f8a0 |
| 8ad59f03a8 |
| ed3640df68 |
| fb88942f6c |
| 5d05c51887 |
| df5986fbf3 |
| 165403f614 |
| fbf18c34ff |
| 84f806c821 |
| 94139a7d95 |
| 75e36186b2 |
| f4a0b406dd |
| e884e720f0 |
| dacdbc22d1 |
| 2a789cd0e0 |
| f9b010f399 |
| 55614ff306 |
| b12579da53 |
| 920e3eb761 |
| bec01e755a |
| 6a880e1bc9 |
| fa86e32a4e |
| 5aabaf2b18 |
| 4a707e8f95 |
| db127b21eb |
| 45313cd9e1 |
| df531973e1 |
| 9e3c577caa |
| 5357b8e4d9 |
| 0f23d23db4 |
| 7c24280a3f |
| 7100f0be13 |
| f7f611c2ec |
| acb982d0b0 |
| aa8b7ad989 |
| 2d403ed8be |
| c25a664f77 |
| ab660ae394 |
| 3c476a8858 |
| 651fa88645 |
| 565c3400b4 |
| 3e332778b4 |
| f598738920 |
| 4c6bfa0187 |
| 6f25003682 |
| 752c129fa1 |
| fb59a9caca |
| 4d30dbdd35 |
| b7f4a1a397 |
| afda1dc943 |
| d506ae882b |
| 36e5abe531 |
| 6e6f62230e |
| 5d15577e6c |
| 6aa5298c5c |
| f3df13725b |
| 4eee3caa11 |
| 4d96463130 |
| 246b824644 |
| 5ca9014318 |
| 48590d6a9b |
| 75148df1f5 |
| b89eb7c654 |
| 8877885454 |
| e2184ba083 |
| 8ef47ad2f0 |
| 6725b6f503 |
| bcd3f6da1a |
| 0b3d2f7b7d |
| f522651a7e |
| 01c8ef2757 |
| 7cfe68ce3a |
| 6f3120c6b9 |
```diff
@@ -466,7 +466,7 @@ But if you want to try, then I’d recommend
 # Always install miniconda 3, even if building for Python <3
 new_conda="~/my_new_conda"
 conda_sh="$new_conda/install_miniconda.sh"
-curl -o "$conda_sh" https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+curl -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
 chmod +x "$conda_sh"
 "$conda_sh" -b -p "$MINICONDA_ROOT"
 rm -f "$conda_sh"
```
```diff
@@ -34,8 +34,6 @@ def get_processor_arch_name(cuda_version):
 
 LINUX_PACKAGE_VARIANTS = OrderedDict(
     manywheel=[
-        "2.7m",
-        "2.7mu",
         "3.5m",
         "3.6m",
         "3.7m",
@@ -43,7 +41,7 @@ LINUX_PACKAGE_VARIANTS = OrderedDict(
     ],
     conda=dimensions.STANDARD_PYTHON_VERSIONS,
     libtorch=[
-        "2.7m",
+        "3.7m",
     ],
 )
@@ -53,11 +51,21 @@ CONFIG_TREE_DATA = OrderedDict(
         wheel=dimensions.STANDARD_PYTHON_VERSIONS,
         conda=dimensions.STANDARD_PYTHON_VERSIONS,
         libtorch=[
-            "2.7",
+            "3.7",
         ],
     )),
+    windows=(dimensions.CUDA_VERSIONS, OrderedDict(
+        wheel=dimensions.STANDARD_PYTHON_VERSIONS,
+        conda=dimensions.STANDARD_PYTHON_VERSIONS,
+        libtorch=[
+            "3.7",
+        ],
+    )),
 )
 
+CONFIG_TREE_DATA_NO_WINDOWS = CONFIG_TREE_DATA.copy()
+CONFIG_TREE_DATA_NO_WINDOWS.pop("windows")
+
 # GCC config variants:
 #
 # All the nightlies (except libtorch with new gcc ABI) are built with devtoolset7,
@@ -74,6 +82,11 @@ LINUX_GCC_CONFIG_VARIANTS = OrderedDict(
     ],
 )
 
+WINDOWS_LIBTORCH_CONFIG_VARIANTS = [
+    "debug",
+    "release",
+]
+
 
 class TopLevelNode(ConfigNode):
     def __init__(self, node_name, config_tree_data, smoke):
@@ -108,6 +121,8 @@ class PackageFormatConfigNode(ConfigNode):
     def get_children(self):
         if self.find_prop("os_name") == "linux":
             return [LinuxGccConfigNode(self, v) for v in LINUX_GCC_CONFIG_VARIANTS[self.find_prop("package_format")]]
+        elif self.find_prop("os_name") == "windows" and self.find_prop("package_format") == "libtorch":
+            return [WindowsLibtorchConfigNode(self, v) for v in WINDOWS_LIBTORCH_CONFIG_VARIANTS]
         else:
             return [ArchConfigNode(self, v) for v in self.find_prop("cuda_versions")]
 
@@ -129,6 +144,16 @@ class LinuxGccConfigNode(ConfigNode):
         return [ArchConfigNode(self, v) for v in cuda_versions]
 
 
+class WindowsLibtorchConfigNode(ConfigNode):
+    def __init__(self, parent, libtorch_config_variant):
+        super(WindowsLibtorchConfigNode, self).__init__(parent, "LIBTORCH_CONFIG_VARIANT=" + str(libtorch_config_variant))
+
+        self.props["libtorch_config_variant"] = libtorch_config_variant
+
+    def get_children(self):
+        return [ArchConfigNode(self, v) for v in self.find_prop("cuda_versions")]
+
+
 class ArchConfigNode(ConfigNode):
     def __init__(self, parent, cu):
         super(ArchConfigNode, self).__init__(parent, get_processor_arch_name(cu))
```
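The OrderedDict tree above is what the binary-build generator walks to emit one configuration per leaf, and `CONFIG_TREE_DATA_NO_WINDOWS` is just a shallow copy with the `windows` subtree popped. A minimal Python sketch of that expansion, using simplified stand-in values rather than the real cimodel API:

```python
from collections import OrderedDict

# Simplified stand-ins for dimensions.* (illustrative values only).
STANDARD_PYTHON_VERSIONS = ["3.5", "3.6", "3.7"]
CUDA_VERSIONS = [None, "92", "101", "102"]

CONFIG_TREE_DATA = OrderedDict(
    linux=(CUDA_VERSIONS, OrderedDict(manywheel=["3.7m"], libtorch=["3.7m"])),
    windows=(CUDA_VERSIONS, OrderedDict(wheel=STANDARD_PYTHON_VERSIONS, libtorch=["3.7"])),
)

# Shallow copy + pop drops the windows subtree without mutating the
# original mapping, mirroring CONFIG_TREE_DATA_NO_WINDOWS above.
CONFIG_TREE_DATA_NO_WINDOWS = CONFIG_TREE_DATA.copy()
CONFIG_TREE_DATA_NO_WINDOWS.pop("windows")

def expand(tree):
    # Walk os -> package format -> python variant -> cuda arch, yielding
    # one flat tuple per build configuration, like the ConfigNode children do.
    for os_name, (cuda_versions, pkg_formats) in tree.items():
        for pkg_format, pythons in pkg_formats.items():
            for py in pythons:
                for cu in cuda_versions:
                    yield (os_name, pkg_format, py, cu or "cpu")

assert all(cfg[0] != "windows" for cfg in expand(CONFIG_TREE_DATA_NO_WINDOWS))
```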
```diff
@@ -6,7 +6,7 @@ import cimodel.lib.miniutils as miniutils
 
 
 class Conf(object):
-    def __init__(self, os, cuda_version, pydistro, parms, smoke, libtorch_variant, gcc_config_variant):
+    def __init__(self, os, cuda_version, pydistro, parms, smoke, libtorch_variant, gcc_config_variant, libtorch_config_variant):
 
         self.os = os
         self.cuda_version = cuda_version
@@ -15,11 +15,14 @@ class Conf(object):
         self.smoke = smoke
         self.libtorch_variant = libtorch_variant
         self.gcc_config_variant = gcc_config_variant
+        self.libtorch_config_variant = libtorch_config_variant
 
     def gen_build_env_parms(self):
         elems = [self.pydistro] + self.parms + [binary_build_data.get_processor_arch_name(self.cuda_version)]
         if self.gcc_config_variant is not None:
             elems.append(str(self.gcc_config_variant))
+        if self.libtorch_config_variant is not None:
+            elems.append(str(self.libtorch_config_variant))
         return elems
 
     def gen_docker_image(self):
@@ -67,9 +70,14 @@ class Conf(object):
             job_def["requires"].append("update_s3_htmls_for_nightlies_devtoolset7")
             job_def["filters"] = {"branches": {"only": "postnightly"}}
         else:
+            filter_branches = ["nightly"]
+            # we only want to add the release branch filter if we aren't
+            # uploading
+            if phase not in ["upload"]:
+                filter_branches.append(r"/release\/.*/")
             job_def["filters"] = {
                 "branches": {
-                    "only": "nightly"
+                    "only": filter_branches
                 },
                 # Will run on tags like v1.5.0-rc1, etc.
                 "tags": {
@@ -105,11 +113,18 @@ class Conf(object):
 
 def get_root(smoke, name):
 
-    return binary_build_data.TopLevelNode(
-        name,
-        binary_build_data.CONFIG_TREE_DATA,
-        smoke,
-    )
+    if smoke:
+        return binary_build_data.TopLevelNode(
+            name,
+            binary_build_data.CONFIG_TREE_DATA_NO_WINDOWS,
+            smoke,
+        )
+    else:
+        return binary_build_data.TopLevelNode(
+            name,
+            binary_build_data.CONFIG_TREE_DATA,
+            smoke,
+        )
 
 
 def gen_build_env_list(smoke):
@@ -127,6 +142,7 @@ def gen_build_env_list(smoke):
             c.find_prop("smoke"),
             c.find_prop("libtorch_variant"),
             c.find_prop("gcc_config_variant"),
+            c.find_prop("libtorch_config_variant"),
         )
         newlist.append(conf)
```
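The `filter_branches` hunk above makes every nightly binary job also run on release branches, except during the upload phase. A small sketch of that selection logic (the helper name is mine, not the repo's):

```python
def nightly_branch_filters(phase):
    # Nightly jobs always run on the "nightly" branch; every phase except
    # "upload" additionally runs on release branches such as release/1.5.
    filter_branches = ["nightly"]
    if phase not in ["upload"]:
        # CircleCI treats /.../-delimited strings as branch-name regexes.
        filter_branches.append(r"/release\/.*/")
    return {"branches": {"only": filter_branches}}

assert nightly_branch_filters("upload") == {"branches": {"only": ["nightly"]}}
assert r"/release\/.*/" in nightly_branch_filters("build")["branches"]["only"]
```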
```diff
@@ -4,7 +4,6 @@ from cimodel.lib.conf_tree import Ver
 
 CONFIG_TREE_DATA = [
     (Ver("ubuntu", "16.04"), [
-        ([Ver("gcc", "5")], [XImportant("onnx_py2")]),
         ([Ver("clang", "7")], [XImportant("onnx_main_py3.6"),
                                XImportant("onnx_ort1_py3.6"),
                                XImportant("onnx_ort2_py3.6")]),
```
```diff
@@ -33,8 +33,7 @@ class Conf:
     # TODO: Eventually we can probably just remove the cudnn7 everywhere.
     def get_cudnn_insertion(self):
 
-        omit = self.language == "onnx_py2" \
-            or self.language == "onnx_main_py3.6" \
+        omit = self.language == "onnx_main_py3.6" \
             or self.language == "onnx_ort1_py3.6" \
             or self.language == "onnx_ort2_py3.6" \
             or set(self.compiler_names).intersection({"android", "mkl", "clang"}) \
@@ -71,11 +70,10 @@ class Conf:
     def gen_docker_image(self):
 
         lang_substitutions = {
-            "onnx_py2": "py2",
             "onnx_main_py3.6": "py3.6",
             "onnx_ort1_py3.6": "py3.6",
             "onnx_ort2_py3.6": "py3.6",
-            "cmake": "py2",
+            "cmake": "py3",
         }
 
         lang = miniutils.override(self.language, lang_substitutions)
@@ -85,7 +83,7 @@ class Conf:
     def gen_workflow_params(self, phase):
         parameters = OrderedDict()
         lang_substitutions = {
-            "onnx_py2": "onnx-py2",
+            "onnx_py3": "onnx-py3",
            "onnx_main_py3.6": "onnx-main-py3.6",
             "onnx_ort1_py3.6": "onnx-ort1-py3.6",
             "onnx_ort2_py3.6": "onnx-ort2-py3.6",
@@ -129,7 +127,7 @@ class Conf:
             job_name = "caffe2_" + self.get_platform() + "_build"
 
         if not self.is_important:
-            job_def["filters"] = {"branches": {"only": ["master", r"/ci-all\/.*/"]}}
+            job_def["filters"] = {"branches": {"only": ["master", r"/ci-all\/.*/", r"/release\/.*/"]}}
         job_def.update(self.gen_workflow_params(phase))
         return {job_name : job_def}
```
```diff
@@ -8,7 +8,6 @@ CUDA_VERSIONS = [
 ]
 
 STANDARD_PYTHON_VERSIONS = [
-    "2.7",
     "3.5",
     "3.6",
     "3.7",
```
```diff
@@ -114,7 +114,7 @@ class Conf:
         if not self.is_important:
             # If you update this, update
             # caffe2_build_definitions.py too
-            job_def["filters"] = {"branches": {"only": ["master", r"/ci-all\/.*/"]}}
+            job_def["filters"] = {"branches": {"only": ["master", r"/ci-all\/.*/", r"/release\/.*/"]}}
         job_def.update(self.gen_workflow_params(phase))
 
         return {job_name : job_def}
```
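Many hunks in this diff add `/release\/.*/` next to `master` and `/ci-all\/.*/` in `only:` lists. CircleCI matches slash-delimited entries as regexes against the full branch name; a rough Python approximation of that matching:

```python
import re

def branch_allowed(branch, only):
    # Entries wrapped in slashes are regexes matched against the whole
    # branch name; everything else is an exact string comparison.
    for entry in only:
        if len(entry) > 1 and entry.startswith("/") and entry.endswith("/"):
            if re.fullmatch(entry[1:-1], branch):
                return True
        elif entry == branch:
            return True
    return False

only = ["master", r"/ci-all\/.*/", r"/release\/.*/"]
assert branch_allowed("release/1.5", only)
assert not branch_allowed("my-feature-branch", only)
```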
**`.circleci/config.yml`**: 3290 lines changed (diff suppressed because it is too large)
```diff
@@ -4,7 +4,7 @@ set -ex
 
 # Optionally install conda
 if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
-  BASE_URL="https://repo.continuum.io/miniconda"
+  BASE_URL="https://repo.anaconda.com/miniconda"
 
   MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1)
 
```
```diff
@@ -10,6 +10,11 @@ retry () {
 if [[ "$(uname)" == Darwin ]]; then
   # macos executor (builds and tests)
   workdir="/Users/distiller/project"
+elif [[ "$OSTYPE" == "msys" ]]; then
+  # windows executor (builds and tests)
+  rm -rf /c/w
+  ln -s "/c/Users/circleci/project" /c/w
+  workdir="/c/w"
 elif [[ -d "/home/circleci/project" ]]; then
   # machine executor (binary tests)
   workdir="/home/circleci/project"
@@ -19,8 +24,14 @@ else
 fi
 
 # It is very important that this stays in sync with binary_populate_env.sh
-export PYTORCH_ROOT="$workdir/pytorch"
-export BUILDER_ROOT="$workdir/builder"
+if [[ "$OSTYPE" == "msys" ]]; then
+  # We need to make the paths as short as possible on Windows
+  export PYTORCH_ROOT="$workdir/p"
+  export BUILDER_ROOT="$workdir/b"
+else
+  export PYTORCH_ROOT="$workdir/pytorch"
+  export BUILDER_ROOT="$workdir/builder"
+fi
 
 # Clone the Pytorch branch
 retry git clone https://github.com/pytorch/pytorch.git "$PYTORCH_ROOT"
```
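The msys branch above shortens the checkout roots to `$workdir/p` and `$workdir/b` because deeply nested Windows builds can exceed the legacy ~260-character MAX_PATH limit. A sketch of the same choice (reading `OSTYPE` from the environment is an assumption for illustration; the script checks a shell variable):

```python
import os

def binary_roots(workdir):
    # Keep paths as short as possible on Windows; elsewhere use the
    # descriptive directory names.
    if os.environ.get("OSTYPE") == "msys":
        return workdir + "/p", workdir + "/b"
    return workdir + "/pytorch", workdir + "/builder"

print(binary_roots("/c/w"))
```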
```diff
@@ -31,9 +31,9 @@ fi
 
 conda_sh="$workdir/install_miniconda.sh"
 if [[ "$(uname)" == Darwin ]]; then
-  curl --retry 3 -o "$conda_sh" https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+  curl --retry 3 -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
 else
-  curl --retry 3 -o "$conda_sh" https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+  curl --retry 3 -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
 fi
 chmod +x "$conda_sh"
 "$conda_sh" -b -p "$MINICONDA_ROOT"
```
```diff
@@ -2,11 +2,31 @@
 set -eux -o pipefail
 export TZ=UTC
 
+tagged_version() {
+  # Grabs version from either the env variable CIRCLE_TAG
+  # or the pytorch git described version
+  if [[ "$OSTYPE" == "msys" ]]; then
+    GIT_DESCRIBE="git --git-dir ${workdir}/p/.git describe"
+  else
+    GIT_DESCRIBE="git --git-dir ${workdir}/pytorch/.git describe"
+  fi
+  if [[ -n "${CIRCLE_TAG:-}" ]]; then
+    echo "${CIRCLE_TAG}"
+  elif ${GIT_DESCRIBE} --exact --tags >/dev/null; then
+    ${GIT_DESCRIBE} --tags
+  else
+    return 1
+  fi
+}
+
 # We need to write an envfile to persist these variables to following
 # steps, but the location of the envfile depends on the circleci executor
 if [[ "$(uname)" == Darwin ]]; then
   # macos executor (builds and tests)
   workdir="/Users/distiller/project"
+elif [[ "$OSTYPE" == "msys" ]]; then
+  # windows executor (builds and tests)
+  workdir="/c/w"
 elif [[ -d "/home/circleci/project" ]]; then
   # machine executor (binary tests)
   workdir="/home/circleci/project"
@@ -23,7 +43,15 @@ configs=($BUILD_ENVIRONMENT)
 export PACKAGE_TYPE="${configs[0]}"
 export DESIRED_PYTHON="${configs[1]}"
 export DESIRED_CUDA="${configs[2]}"
-export DESIRED_DEVTOOLSET="${configs[3]:-}"
+if [[ "${BUILD_FOR_SYSTEM:-}" == "windows" ]]; then
+  export DESIRED_DEVTOOLSET=""
+  export LIBTORCH_CONFIG="${configs[3]:-}"
+  if [[ "$LIBTORCH_CONFIG" == 'debug' ]]; then
+    export DEBUG=1
+  fi
+else
+  export DESIRED_DEVTOOLSET="${configs[3]:-}"
+fi
 if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
   export BUILD_PYTHONLESS=1
 fi
@@ -47,15 +75,17 @@ export DATE="$(date -u +%Y%m%d)"
 #TODO: We should be pulling semver version from the base version.txt
 BASE_BUILD_VERSION="1.5.0.dev$DATE"
 # Change BASE_BUILD_VERSION to git tag when on a git tag
-if git describe --tags --exact >/dev/null 2>/dev/null; then
+# Use 'git -C' to make doubly sure we're in the correct directory for checking
+# the git tag
+if tagged_version >/dev/null; then
   # Switch upload folder to 'test/' if we are on a tag
   PIP_UPLOAD_FOLDER='test/'
   # Grab git tag, remove prefixed v and remove everything after -
   # Used to clean up tags that are for release candidates like v1.5.0-rc1
   # Turns tag v1.5.0-rc1 -> v1.5.0
-  BASE_BUILD_VERSION="$(git describe --tags | sed -e 's/^v//' -e 's/-.*$//')"
+  BASE_BUILD_VERSION="$(tagged_version | sed -e 's/^v//' -e 's/-.*$//')"
 fi
-if [[ "$(uname)" == 'Darwin' ]] || [[ "$DESIRED_CUDA" == "cu101" ]] || [[ "$PACKAGE_TYPE" == conda ]]; then
+if [[ "$(uname)" == 'Darwin' ]] || [[ "$DESIRED_CUDA" == "cu102" ]] || [[ "$PACKAGE_TYPE" == conda ]]; then
   export PYTORCH_BUILD_VERSION="${BASE_BUILD_VERSION}"
 else
   export PYTORCH_BUILD_VERSION="${BASE_BUILD_VERSION}+$DESIRED_CUDA"
@@ -94,6 +124,10 @@ export DESIRED_CUDA="$DESIRED_CUDA"
 export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
 export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
 export DESIRED_DEVTOOLSET="$DESIRED_DEVTOOLSET"
+if [[ "${BUILD_FOR_SYSTEM:-}" == "windows" ]]; then
+  export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
+  export DEBUG="${DEBUG:-}"
+fi
 
 export DATE="$DATE"
 export NIGHTLIES_DATE_PREAMBLE=1.5.0.dev
@@ -113,8 +147,13 @@ export DOCKER_IMAGE="$DOCKER_IMAGE"
 
 export workdir="$workdir"
 export MAC_PACKAGE_WORK_DIR="$workdir"
-export PYTORCH_ROOT="$workdir/pytorch"
-export BUILDER_ROOT="$workdir/builder"
+if [[ "$OSTYPE" == "msys" ]]; then
+  export PYTORCH_ROOT="$workdir/p"
+  export BUILDER_ROOT="$workdir/b"
+else
+  export PYTORCH_ROOT="$workdir/pytorch"
+  export BUILDER_ROOT="$workdir/builder"
+fi
 export MINICONDA_ROOT="$workdir/miniconda"
 export PYTORCH_FINAL_PACKAGE_DIR="$workdir/final_pkgs"
```
**`.circleci/scripts/binary_windows_build.sh`** (new file, 33 lines)
```diff
@@ -0,0 +1,33 @@
+#!/bin/bash
+set -eux -o pipefail
+
+source "/c/w/env"
+mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
+
+export CUDA_VERSION="${DESIRED_CUDA/cu/}"
+export VC_YEAR=2017
+export USE_SCCACHE=1
+export SCCACHE_BUCKET=ossci-compiler-cache-windows
+export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
+
+set +x
+export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}
+export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}
+set -x
+
+if [[ "$CIRCLECI" == 'true' && -d "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019" ]]; then
+  rm -rf "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019"
+fi
+
+echo "Free space on filesystem before build:"
+df -h
+
+pushd "$BUILDER_ROOT"
+if [[ "$PACKAGE_TYPE" == 'conda' ]]; then
+  ./windows/internal/build_conda.bat
+elif [[ "$PACKAGE_TYPE" == 'wheel' || "$PACKAGE_TYPE" == 'libtorch' ]]; then
+  ./windows/internal/build_wheels.bat
+fi
+
+echo "Free space on filesystem after build:"
+df -h
```

**`.circleci/scripts/binary_windows_upload.sh`** (new file, 37 lines)
```diff
@@ -0,0 +1,37 @@
+#!/bin/bash
+set -eu -o pipefail
+set +x
+declare -x "AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}"
+declare -x "AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}"
+
+#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!
+# DO NOT TURN -x ON BEFORE THIS LINE
+#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!
+set -eux -o pipefail
+
+source "/env"
+
+# This gets set in binary_populate_env.sh, but lets have a sane default just in case
+PIP_UPLOAD_FOLDER=${PIP_UPLOAD_FOLDER:-nightly/}
+# TODO: Combine CONDA_UPLOAD_CHANNEL and PIP_UPLOAD_FOLDER into one variable
+#       The only difference is the trailing slash
+# Strip trailing slashes if there
+CONDA_UPLOAD_CHANNEL=$(echo "${PIP_UPLOAD_FOLDER}" | sed 's:/*$::')
+
+pushd /root/workspace/final_pkgs
+# Upload the package to the final location
+if [[ "$PACKAGE_TYPE" == conda ]]; then
+  retry conda install -yq anaconda-client
+  anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload  "$(ls)" -u "pytorch-${CONDA_UPLOAD_CHANNEL}" --label main --no-progress --force
+elif [[ "$PACKAGE_TYPE" == libtorch ]]; then
+  retry conda install -c conda-forge -yq awscli
+  s3_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
+  for pkg in $(ls); do
+    retry aws s3 cp "$pkg" "$s3_dir" --acl public-read
+  done
+else
+  retry conda install -c conda-forge -yq awscli
+  s3_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
+  retry aws s3 cp "$(ls)" "$s3_dir" --acl public-read
+fi
+
```
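The conda channel above is derived from the pip upload folder by stripping trailing slashes (`sed 's:/*$::'`). The same derivation in Python:

```python
def conda_upload_channel(pip_upload_folder="nightly/"):
    # "nightly/" -> "nightly", "test/" -> "test"; repeated trailing
    # slashes are also removed, matching sed 's:/*$::'.
    return pip_upload_folder.rstrip("/")

assert conda_upload_channel("nightly/") == "nightly"
assert conda_upload_channel("test///") == "test"
```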
```diff
@@ -72,10 +72,10 @@ time python tools/setup_helpers/generate_code.py \
 
 # Build the docs
 pushd docs/cpp
-pip install breathe>=4.13.0 bs4 lxml six
+pip install breathe==4.13.0 bs4 lxml six
 pip install --no-cache-dir -e "git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme"
 pip install exhale>=0.2.1
-pip install sphinx>=2.0
+pip install sphinx==2.4.4
 # Uncomment once it is fixed
 # pip install -r requirements.txt
 time make VERBOSE=1 html -j
```
```diff
@@ -52,3 +52,12 @@ binary_mac_params: &binary_mac_params
   environment:
     BUILD_ENVIRONMENT: << parameters.build_environment >>
 
+binary_windows_params: &binary_windows_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+    BUILD_FOR_SYSTEM: windows
+
```
```diff
@@ -275,3 +275,46 @@
           script="/Users/distiller/project/.circleci/scripts/binary_ios_upload.sh"
           cat "$script"
           source "$script"
+
+  binary_windows_build:
+    <<: *binary_windows_params
+    executor:
+      name: windows-cpu-with-nvidia-cuda
+    steps:
+    # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
+    - attach_scripts
+    - run:
+        <<: *binary_checkout
+    - run:
+        <<: *binary_populate_env
+    - run:
+        name: Build
+        no_output_timeout: "1h"
+        command: |
+          set -eux -o pipefail
+          script="/c/w/p/.circleci/scripts/binary_windows_build.sh"
+          cat "$script"
+          source "$script"
+    - persist_to_workspace:
+        root: "C:/w"
+        paths: final_pkgs
+
+  binary_windows_upload:
+    <<: *binary_windows_params
+    docker:
+      - image: continuumio/miniconda
+    steps:
+    # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
+    - attach_scripts
+    - run:
+        <<: *binary_checkout
+    - run:
+        <<: *binary_populate_env
+    - run:
+        name: Upload
+        no_output_timeout: "10m"
+        command: |
+          set -eux -o pipefail
+          script="/pytorch/.circleci/scripts/binary_windows_upload.sh"
+          cat "$script"
+          source "$script"
```
```diff
@@ -151,7 +151,7 @@
             # Install Anaconda if we need to
             if [ -n "${CAFFE2_USE_ANACONDA}" ]; then
               rm -rf ${TMPDIR}/anaconda
-              curl --retry 3 -o ${TMPDIR}/conda.sh https://repo.continuum.io/miniconda/Miniconda${ANACONDA_VERSION}-latest-MacOSX-x86_64.sh
+              curl --retry 3 -o ${TMPDIR}/conda.sh https://repo.anaconda.com/miniconda/Miniconda${ANACONDA_VERSION}-latest-MacOSX-x86_64.sh
               chmod +x ${TMPDIR}/conda.sh
               /bin/bash ${TMPDIR}/conda.sh -b -p ${TMPDIR}/anaconda
               rm -f ${TMPDIR}/conda.sh
```
```diff
@@ -20,16 +20,16 @@ jobs:
           export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE})
 
           # TODO We may want to move the rebase logic to a separate step after checkout
-          # Rebase to master only if in xenial_py3_6_gcc5_4 case
-          if [[ "${CIRCLE_BRANCH}" != "master" && "${BUILD_ENVIRONMENT}" == *"gcc5"* ]]; then
-            echo "Merge master branch into $CIRCLE_BRANCH before build in environment $BUILD_ENVIRONMENT"
+          # Rebase to release/1.5 only if in xenial_py3_6_gcc5_4 case
+          if [[ "${CIRCLE_BRANCH}" != "release/1.5" && "${BUILD_ENVIRONMENT}" == *"gcc5"* ]]; then
+            echo "Merge release/1.5 branch into $CIRCLE_BRANCH before build in environment $BUILD_ENVIRONMENT"
             set -x
             git config --global user.email "circleci.ossci@gmail.com"
             git config --global user.name "CircleCI"
             git config remote.origin.url https://github.com/pytorch/pytorch.git
-            git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master
-            git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet
-            export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/master`
+            git config --add remote.origin.fetch +refs/heads/release/1.5:refs/remotes/origin/release/1.5
+            git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/release/1.5:refs/remotes/origin/release/1.5 --depth=100 --quiet
+            export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/release/1.5`
             echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET}
             export GIT_COMMIT=${CIRCLE_SHA1}
             echo "GIT_COMMIT: " ${GIT_COMMIT}
@@ -38,7 +38,7 @@ jobs:
             git merge --allow-unrelated-histories --no-edit --no-ff ${GIT_MERGE_TARGET}
             set +x
           else
-            echo "Do NOT merge master branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT"
+            echo "Do NOT merge release/1.5 branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT"
           fi
 
           git submodule sync && git submodule update -q --init --recursive
```
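This step merges the tip of release/1.5, rather than master, into the branch under test. A hedged subprocess sketch of the same git sequence (error handling, user config, and fetch depth tuning omitted):

```python
import subprocess

def merge_release_branch(target="release/1.5"):
    # Fetch the release branch, resolve its head commit, and merge it
    # into the currently checked-out branch, as the CI step above does.
    def run(*cmd):
        return subprocess.run(cmd, check=True, capture_output=True, text=True)
    run("git", "fetch", "--tags", "https://github.com/pytorch/pytorch.git",
        "+refs/heads/%s:refs/remotes/origin/%s" % (target, target), "--depth=100")
    head = run("git", "log", "-n", "1", "--pretty=format:%H",
               "origin/" + target).stdout.strip()
    run("git", "merge", "--allow-unrelated-histories", "--no-edit", "--no-ff", head)
```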
```diff
@@ -15,6 +15,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
       - pytorch_windows_test:
           name: pytorch_windows_vs2017_14.11_py36_cuda10.1_test1
           test_name: pytorch-windows-test1
@@ -32,6 +33,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
       - pytorch_windows_test:
           name: pytorch_windows_vs2017_14.11_py36_cuda10.1_test2
           test_name: pytorch-windows-test2
@@ -49,6 +51,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
       - pytorch_windows_build:
           name: pytorch_windows_vs2017_14.16_py36_cuda10.1_build
           cuda_version: "10"
@@ -64,6 +67,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
       - pytorch_windows_test:
           name: pytorch_windows_vs2017_14.16_py36_cuda10.1_test1
           test_name: pytorch-windows-test1
@@ -81,6 +85,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
       - pytorch_windows_test:
           name: pytorch_windows_vs2017_14.16_py36_cuda10.1_test2
           test_name: pytorch-windows-test2
@@ -98,6 +103,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
       - pytorch_windows_build:
           name: pytorch_windows_vs2019_py36_cuda10.1_build
           cuda_version: "10"
```
```diff
@@ -7,12 +7,6 @@
       # pytorch-ci-hud to adjust the list of whitelisted builds
       # at https://github.com/ezyang/pytorch-ci-hud/blob/master/src/BuildHistoryDisplay.js
 
-      - binary_linux_build:
-          name: binary_linux_manywheel_2_7mu_cpu_devtoolset7_build
-          build_environment: "manywheel 2.7mu cpu devtoolset7"
-          requires:
-            - setup
-          docker_image: "pytorch/manylinux-cuda102"
       - binary_linux_build:
           name: binary_linux_manywheel_3_7m_cu102_devtoolset7_build
           build_environment: "manywheel 3.7m cu102 devtoolset7"
@@ -23,24 +17,21 @@
             branches:
               only:
                 - master
-      - binary_linux_build:
-          name: binary_linux_conda_2_7_cpu_devtoolset7_build
-          build_environment: "conda 2.7 cpu devtoolset7"
-          requires:
-            - setup
-          docker_image: "pytorch/conda-cuda"
+                - /ci-all\/.*/
+                - /release\/.*/
       # This binary build is currently broken, see https://github.com/pytorch/pytorch/issues/16710
       # - binary_linux_conda_3_6_cu90_devtoolset7_build
       # TODO rename to remove python version for libtorch
       - binary_linux_build:
-          name: binary_linux_libtorch_2_7m_cpu_devtoolset7_shared-with-deps_build
-          build_environment: "libtorch 2.7m cpu devtoolset7"
+          name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build
+          build_environment: "libtorch 3.7m cpu devtoolset7"
           requires:
             - setup
           libtorch_variant: "shared-with-deps"
           docker_image: "pytorch/manylinux-cuda102"
       - binary_linux_build:
-          name: binary_linux_libtorch_2_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build
-          build_environment: "libtorch 2.7m cpu gcc5.4_cxx11-abi"
+          name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build
+          build_environment: "libtorch 3.7m cpu gcc5.4_cxx11-abi"
           requires:
             - setup
           libtorch_variant: "shared-with-deps"
@@ -48,45 +39,51 @@
       # TODO we should test a libtorch cuda build, but they take too long
       # - binary_linux_libtorch_2_7m_cu90_devtoolset7_static-without-deps_build
       - binary_mac_build:
-          name: binary_macos_wheel_3_6_cpu_build
-          build_environment: "wheel 3.6 cpu"
-          requires:
-            - setup
-          filters:
-            branches:
-              only:
-                - master
-      - binary_mac_build:
-          name: binary_macos_conda_2_7_cpu_build
-          build_environment: "conda 2.7 cpu"
+          name: binary_macos_wheel_3_7_cpu_build
+          build_environment: "wheel 3.7 cpu"
           requires:
             - setup
           filters:
             branches:
               only:
                 - master
+                - /ci-all\/.*/
+                - /release\/.*/
       # This job has an average run time of 3 hours o.O
       # Now only running this on master to reduce overhead
       # TODO rename to remove python version for libtorch
       - binary_mac_build:
-          name: binary_macos_libtorch_2_7_cpu_build
-          build_environment: "libtorch 2.7 cpu"
+          name: binary_macos_libtorch_3_7_cpu_build
+          build_environment: "libtorch 3.7 cpu"
           requires:
             - setup
           filters:
             branches:
               only:
                 - master
-      - binary_linux_test:
-          name: binary_linux_manywheel_2_7mu_cpu_devtoolset7_test
-          build_environment: "manywheel 2.7mu cpu devtoolset7"
+                - /ci-all\/.*/
+                - /release\/.*/
+      - binary_windows_build:
+          name: binary_windows_libtorch_3_7_cpu_debug_build
+          build_environment: "libtorch 3.7 cpu debug"
+          requires:
+            - setup
+      - binary_windows_build:
+          name: binary_windows_libtorch_3_7_cpu_release_build
+          build_environment: "libtorch 3.7 cpu release"
+          requires:
+            - setup
+      - binary_windows_build:
+          name: binary_windows_wheel_3_7_cu102_build
+          build_environment: "wheel 3.7 cu102"
           requires:
             - setup
-            - binary_linux_manywheel_2_7mu_cpu_devtoolset7_build
-          docker_image: "pytorch/manylinux-cuda102"
           filters:
             branches:
               only:
                 - master
+                - /ci-all\/.*/
+                - /release\/.*/
       - binary_linux_test:
           name: binary_linux_manywheel_3_7m_cu102_devtoolset7_test
           build_environment: "manywheel 3.7m cu102 devtoolset7"
@@ -100,29 +97,25 @@
             branches:
               only:
                 - master
-      - binary_linux_test:
-          name: binary_linux_conda_2_7_cpu_devtoolset7_test
-          build_environment: "conda 2.7 cpu devtoolset7"
-          requires:
-            - setup
-            - binary_linux_conda_2_7_cpu_devtoolset7_build
-          docker_image: "pytorch/conda-cuda"
+                - /ci-all\/.*/
+                - /release\/.*/
       # This binary build is currently broken, see https://github.com/pytorch/pytorch/issues/16710
       # - binary_linux_conda_3_6_cu90_devtoolset7_test:
       # TODO rename to remove python version for libtorch
       - binary_linux_test:
-          name: binary_linux_libtorch_2_7m_cpu_devtoolset7_shared-with-deps_test
-          build_environment: "libtorch 2.7m cpu devtoolset7"
+          name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test
+          build_environment: "libtorch 3.7m cpu devtoolset7"
           requires:
             - setup
-            - binary_linux_libtorch_2_7m_cpu_devtoolset7_shared-with-deps_build
+            - binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build
           libtorch_variant: "shared-with-deps"
           docker_image: "pytorch/manylinux-cuda102"
       - binary_linux_test:
-          name: binary_linux_libtorch_2_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test
-          build_environment: "libtorch 2.7m cpu gcc5.4_cxx11-abi"
+          name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test
+          build_environment: "libtorch 3.7m cpu gcc5.4_cxx11-abi"
           requires:
             - setup
-            - binary_linux_libtorch_2_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build
+            - binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build
           libtorch_variant: "shared-with-deps"
           docker_image: "pytorch/pytorch-binary-docker-image-ubuntu16.04:latest"
```
```diff
@@ -20,21 +20,12 @@
       - docker_build_job:
           name: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
           image_name: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
-      - docker_build_job:
-          name: "pytorch-linux-xenial-cuda9-cudnn7-py2"
-          image_name: "pytorch-linux-xenial-cuda9-cudnn7-py2"
       - docker_build_job:
           name: "pytorch-linux-xenial-cuda9-cudnn7-py3"
           image_name: "pytorch-linux-xenial-cuda9-cudnn7-py3"
       - docker_build_job:
           name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
           image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
-      - docker_build_job:
-          name: "pytorch-linux-xenial-py2.7.9"
-          image_name: "pytorch-linux-xenial-py2.7.9"
-      - docker_build_job:
-          name: "pytorch-linux-xenial-py2.7"
-          image_name: "pytorch-linux-xenial-py2.7"
       - docker_build_job:
           name: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
           image_name: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
```
```diff
@@ -4,6 +4,8 @@
             branches:
               only:
                 - master
+                - /ci-all\/.*/
+                - /release\/.*/
           requires:
             - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build
 
@@ -13,6 +15,8 @@
             branches:
               only:
                 - master
+                - /ci-all\/.*/
+                - /release\/.*/
           requires:
             - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build
             - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build
```
```diff
@@ -7,10 +7,10 @@
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:f990c76a-a798-42bb-852f-5be5006f8026"
           resource_class: large
       - pytorch_linux_test:
-          name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test
+          name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_profiling_test
           requires:
             - setup
             - pytorch_linux_xenial_py3_6_gcc5_4_build
-          build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-ge_config_simple-test"
+          build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-ge_config_profiling-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:f990c76a-a798-42bb-852f-5be5006f8026"
           resource_class: large
```
```diff
@@ -31,6 +31,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
           build_environment: "pytorch-linux-xenial-py3-clang5-mobile-code-analysis"
           build_only: "1"
           # Use LLVM-DEV toolchain in android-ndk-r19c docker image
```

**`.github/workflows/lint.yml`** (42 lines changed, vendored)
```diff
@@ -67,7 +67,7 @@ jobs:
       - name: Run flake8
         run: |
           set -eux
-          pip install flake8 flake8-mypy flake8-bugbear flake8-comprehensions flake8-executable flake8-pyi mccabe pycodestyle pyflakes
+          pip install flake8==3.7.9 flake8-mypy flake8-bugbear flake8-comprehensions flake8-executable flake8-pyi mccabe pycodestyle==2.5.0 pyflakes==2.1.1
           flake8 --version
           flake8 --exit-zero > ${GITHUB_WORKSPACE}/flake8-output.txt
           cat ${GITHUB_WORKSPACE}/flake8-output.txt
@@ -81,44 +81,6 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-  flake8-py2:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Setup Python
-        uses: actions/setup-python@v1
-        with:
-          python-version: 2.x
-          architecture: x64
-      - name: Fetch PyTorch
-        uses: actions/checkout@v1
-      - name: Checkout PR tip
-        run: |
-          set -eux
-          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
-            # We are on a PR, so actions/checkout leaves us on a merge commit.
-            # Check out the actual tip of the branch.
-            git checkout ${{ github.event.pull_request.head.sha }}
-          fi
-          echo ::set-output name=commit_sha::$(git rev-parse HEAD)
-        id: get_pr_tip
-      - name: Run flake8
-        run: |
-          set -eux
-          pip install flake8
-          rm -rf .circleci tools/clang_format_new.py
-          flake8 --exit-zero > ${GITHUB_WORKSPACE}/flake8-output.txt
-          cat ${GITHUB_WORKSPACE}/flake8-output.txt
-      - name: Add annotations
-        uses: pytorch/add-annotations-github-action@master
-        with:
-          check_name: 'flake8-py2'
-          linter_output_path: 'flake8-output.txt'
-          commit_sha: ${{ steps.get_pr_tip.outputs.commit_sha }}
-          regex: '^(?<filename>.*?):(?<lineNumber>\d+):(?<columnNumber>\d+): (?<errorCode>\w\d+) (?<errorDesc>.*)'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
 
   clang-tidy:
     if: github.event_name == 'pull_request'
     runs-on: ubuntu-latest
@@ -198,6 +160,8 @@ jobs:
             -g"-torch/csrc/jit/export.cpp"            \
             -g"-torch/csrc/jit/import.cpp"            \
             -g"-torch/csrc/jit/netdef_converter.cpp"  \
+            -g"-torch/csrc/cuda/nccl.*"               \
+            -g"-torch/csrc/cuda/python_nccl.cpp"      \
             "$@" > ${GITHUB_WORKSPACE}/clang-tidy-output.txt
 
           cat ${GITHUB_WORKSPACE}/clang-tidy-output.txt
```
| @ -6,6 +6,69 @@ TEST_DIR="$ROOT_DIR/caffe2_tests" | ||||
| gtest_reports_dir="${TEST_DIR}/cpp" | ||||
| pytest_reports_dir="${TEST_DIR}/python" | ||||
|  | ||||
| # This is needed to work around ROCm using old docker images until | ||||
| # the transition to new images is complete. | ||||
| # TODO: Remove once ROCm CI is using new images. | ||||
| if [[ $BUILD_ENVIRONMENT == py3.6-devtoolset7-rocmrpm-centos* ]]; then | ||||
|   # This file is sourced multiple times, only install conda the first time. | ||||
|   # We must install conda where we have write access. | ||||
|   CONDA_DIR="$ROOT_DIR/conda" | ||||
|   if [[ ! -d $CONDA_DIR ]]; then | ||||
|     ANACONDA_PYTHON_VERSION=3.6 | ||||
|     BASE_URL="https://repo.anaconda.com/miniconda" | ||||
|     CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh" | ||||
|     mkdir $CONDA_DIR | ||||
|     pushd /tmp | ||||
|     wget -q "${BASE_URL}/${CONDA_FILE}" | ||||
|     chmod +x "${CONDA_FILE}" | ||||
|     ./"${CONDA_FILE}" -b -f -p "$CONDA_DIR" | ||||
|     popd | ||||
|     export PATH="$CONDA_DIR/bin:$PATH" | ||||
|     # Ensure we run conda in a directory that jenkins has write access to | ||||
|     pushd $CONDA_DIR | ||||
|     # Track latest conda update | ||||
|     conda update -n base conda | ||||
|     # Install correct Python version | ||||
|     conda install python="$ANACONDA_PYTHON_VERSION" | ||||
|  | ||||
|     conda_install() { | ||||
|       # Ensure that the install command don't upgrade/downgrade Python | ||||
|       # This should be called as | ||||
|       #   conda_install pkg1 pkg2 ... [-c channel] | ||||
|       conda install -q -y python="$ANACONDA_PYTHON_VERSION" $* | ||||
|     } | ||||
|  | ||||
|     # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README | ||||
|     conda_install numpy pyyaml mkl mkl-include setuptools cffi typing future six | ||||
|  | ||||
|     # TODO: This isn't working atm | ||||
|     conda_install nnpack -c killeent | ||||
|  | ||||
|     # Install some other packages | ||||
|  | ||||
|     # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by | ||||
|     # defaults installs the most recent networkx version, so we install this lower | ||||
|     # version explicitly before scikit-image pulls it in as a dependency | ||||
|     pip install networkx==2.0 | ||||
|  | ||||
|     # TODO: Why is scipy pinned | ||||
|     # numba & llvmlite is pinned because of https://github.com/numba/numba/issues/4368 | ||||
|     # scikit-learn is pinned because of | ||||
|     # https://github.com/scikit-learn/scikit-learn/issues/14485 (affects gcc 5.5 | ||||
|     # only) | ||||
|     pip install --progress-bar off pytest scipy==1.1.0 scikit-learn==0.20.3 scikit-image librosa>=0.6.2 psutil numba==0.46.0 llvmlite==0.30.0 | ||||
|  | ||||
|     # click - onnx | ||||
|     # hypothesis - tests | ||||
|     # jupyter - for tutorials | ||||
|     pip install --progress-bar off click hypothesis jupyter protobuf tabulate virtualenv mock typing-extensions | ||||
|  | ||||
|     popd | ||||
|   else | ||||
|     export PATH="$CONDA_DIR/bin:$PATH" | ||||
|   fi | ||||
| fi | ||||
|  | ||||
| # Figure out which Python to use | ||||
| PYTHON="$(which python)" | ||||
| if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then | ||||
|  | ||||
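The `=~ py((2|3)\.?[0-9]?\.?[0-9]?)` test above pulls the Python version out of `$BUILD_ENVIRONMENT`. The same match written in Python (the environment names are illustrative):

```python
import re

def python_version_from(build_environment):
    # Same pattern the bash test uses; group 1 holds the version string.
    m = re.search(r"py((2|3)\.?[0-9]?\.?[0-9]?)", build_environment)
    return m.group(1) if m else None

assert python_version_from("py3.6-devtoolset7-rocmrpm-centos7.5") == "3.6"
assert python_version_from("pytorch-linux-xenial-py3-clang5") == "3"
```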
```diff
@@ -144,7 +144,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
     # default pip version is too old(9.0.2), unable to support tag `manylinux2010`.
     # Fix the pip error: Couldn't find a version that satisfies the requirement
     sudo pip install --upgrade pip
-    pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.1.0.dev1228
+    pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.3.0.dev202005123
   fi
   "$ROOT_DIR/scripts/onnx/test.sh"
 fi
```
```diff
@@ -167,7 +167,7 @@ fi
 
 # Patch required to build xla
 if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
-  git clone --recursive https://github.com/pytorch/xla.git
+  git clone --recursive -b r1.5 https://github.com/pytorch/xla.git
   ./xla/scripts/apply_patches.sh
 fi
 
@@ -259,7 +259,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
   # XLA build requires Bazel
   # We use bazelisk to avoid updating Bazel version manually.
   sudo npm install -g @bazel/bazelisk
-  sudo ln -s "$(command -v bazelisk)" /usr/bin/bazel
+  sudo ln -sf "$(command -v bazelisk)" /usr/bin/bazel
 
   # Install bazels3cache for cloud cache
   sudo npm install -g bazels3cache
```
```diff
@@ -13,12 +13,12 @@ mkdir -p ${WORKSPACE_DIR}
 # If a local installation of conda doesn't exist, we download and install conda
 if [ ! -d "${WORKSPACE_DIR}/miniconda3" ]; then
   mkdir -p ${WORKSPACE_DIR}
-  curl --retry 3 https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o ${WORKSPACE_DIR}/miniconda3.sh
+  curl --retry 3 https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o ${WORKSPACE_DIR}/miniconda3.sh
   retry bash ${WORKSPACE_DIR}/miniconda3.sh -b -p ${WORKSPACE_DIR}/miniconda3
 fi
 export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
 source ${WORKSPACE_DIR}/miniconda3/bin/activate
-retry conda install -y mkl mkl-include numpy pyyaml setuptools cmake cffi ninja
+retry conda install -y mkl mkl-include numpy pyyaml=5.3 setuptools=46.0.0 cmake cffi ninja
 
 # The torch.hub tests make requests to GitHub.
 #
```
```diff
@@ -4,7 +4,7 @@
 source "$(dirname "${BASH_SOURCE[0]}")/macos-common.sh"
 
 conda install -y six
-pip install -q hypothesis "librosa>=0.6.2" psutil
+pip install -q hypothesis "librosa>=0.6.2" "numba<=0.49.1" psutil
 
 # TODO move this to docker
 pip install unittest-xml-reporting
```
```diff
@@ -20,7 +20,7 @@ if [ -n "${IN_CIRCLECI}" ]; then
     sudo apt-get install -y --allow-downgrades --allow-change-held-packages libnccl-dev=2.5.6-1+cuda10.1 libnccl2=2.5.6-1+cuda10.1
   fi
 
-  if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then
+  if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda10.1-cudnn7-py3* ]]; then
     # TODO: move this to Docker
     sudo apt-get update
     sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev
```
```diff
@@ -21,7 +21,7 @@ if [ -n "${IN_CIRCLECI}" ]; then
     sudo apt-get -qq install --allow-downgrades --allow-change-held-packages libnccl-dev=2.5.6-1+cuda10.1 libnccl2=2.5.6-1+cuda10.1
   fi
 
-  if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then
+  if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda10.1-cudnn7-py3* ]]; then
     # TODO: move this to Docker
     sudo apt-get -qq update
     sudo apt-get -qq install --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev
@@ -141,8 +141,8 @@ test_python_nn() {
   assert_git_not_dirty
 }
 
-test_python_ge_config_simple() {
-  time python test/run_test.py --include test_jit_simple --verbose --determine-from="$DETERMINE_FROM"
+test_python_ge_config_profiling() {
+  time python test/run_test.py --include test_jit_profiling test_jit_fuser_profiling --verbose --determine-from="$DETERMINE_FROM"
   assert_git_not_dirty
 }
 
@@ -152,7 +152,7 @@ test_python_ge_config_legacy() {
 }
 
 test_python_all_except_nn() {
-  time python test/run_test.py --exclude test_nn test_jit_simple test_jit_legacy test_jit_fuser_legacy --verbose --bring-to-front test_quantization test_quantized test_quantized_tensor test_quantized_nn_mods --determine-from="$DETERMINE_FROM"
+  time python test/run_test.py --exclude test_nn test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_profiling --verbose --bring-to-front test_quantization test_quantized test_quantized_tensor test_quantized_nn_mods --determine-from="$DETERMINE_FROM"
   assert_git_not_dirty
 }
 
@@ -244,7 +244,7 @@ test_backward_compatibility() {
   pushd test/backward_compatibility
   python dump_all_function_schemas.py --filename new_schemas.txt
   pip_uninstall torch
-  pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
+  pip_install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
   python check_backward_compatibility.py --new-schemas new_schemas.txt
   popd
   set +x
@@ -264,8 +264,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xla* || "${JOB_BASE_NAME}" == *xla* ]]; then
   test_xla
 elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_legacy* || "${JOB_BASE_NAME}" == *ge_config_legacy* ]]; then
   test_python_ge_config_legacy
-elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_simple* || "${JOB_BASE_NAME}" == *ge_config_simple* ]]; then
-  test_python_ge_config_simple
+elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_profiling* || "${JOB_BASE_NAME}" == *ge_config_profiling* ]]; then
+  test_python_ge_config_profiling
 elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
   # TODO: run some C++ tests
   echo "no-op at the moment"
```
| @ -22,7 +22,7 @@ call %INSTALLER_DIR%\install_miniconda3.bat | ||||
|  | ||||
|  | ||||
| :: Install ninja | ||||
| if "%REBUILD%"=="" ( pip install -q ninja ) | ||||
| if "%REBUILD%"=="" ( pip install -q "ninja==1.9.0" ) | ||||
|  | ||||
| git submodule sync --recursive | ||||
| git submodule update --init --recursive | ||||
|  | ||||
| @ -5,7 +5,7 @@ if "%BUILD_ENVIRONMENT%"=="" ( | ||||
| ) | ||||
| if "%REBUILD%"=="" ( | ||||
|   IF EXIST %CONDA_PARENT_DIR%\Miniconda3 ( rd /s /q %CONDA_PARENT_DIR%\Miniconda3 ) | ||||
|   curl --retry 3 -k https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe | ||||
|   curl --retry 3 -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe | ||||
|   %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 | ||||
| ) | ||||
| call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 | ||||
|  | ||||
| @ -13,7 +13,7 @@ if "%BUILD_ENVIRONMENT%"=="" ( | ||||
| ) | ||||
| if NOT "%BUILD_ENVIRONMENT%"=="" ( | ||||
|     IF EXIST %CONDA_PARENT_DIR%\Miniconda3 ( rd /s /q %CONDA_PARENT_DIR%\Miniconda3 ) | ||||
|     curl --retry 3 https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe | ||||
|     curl --retry 3 https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe | ||||
|     %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 | ||||
| ) | ||||
| call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 | ||||
| @ -23,8 +23,18 @@ if NOT "%BUILD_ENVIRONMENT%"=="" ( | ||||
|     call conda install -y -q python=3.6.7 numpy mkl cffi pyyaml boto3 protobuf numba==0.44.0 | ||||
|     call conda install -y -q -c conda-forge cmake | ||||
| ) | ||||
|  | ||||
| pushd . | ||||
| if "%VC_VERSION%" == "" ( | ||||
|     call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 | ||||
| ) else ( | ||||
|     call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 -vcvars_ver=%VC_VERSION% | ||||
| ) | ||||
| @echo on | ||||
| popd | ||||
|  | ||||
| :: The version is fixed to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 | ||||
| pip install ninja future "hypothesis==4.53.2" "librosa>=0.6.2" psutil pillow | ||||
| pip install "ninja==1.9.0" future "hypothesis==4.53.2" "librosa>=0.6.2" psutil pillow | ||||
| :: No need to install faulthandler since we only test Python >= 3.6 on Windows | ||||
| :: faulthandler is builtin since Python 3.3 | ||||
|  | ||||
| @ -34,15 +44,6 @@ goto cuda_build_end | ||||
|  | ||||
| :cuda_build_9 | ||||
|  | ||||
| pushd . | ||||
| if "%VC_VERSION%" == "" ( | ||||
|     call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 | ||||
| ) else ( | ||||
|     call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 -vcvars_ver=%VC_VERSION% | ||||
| ) | ||||
| @echo on | ||||
| popd | ||||
|  | ||||
| set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2 | ||||
| set CUDA_PATH_V9_2=%CUDA_PATH% | ||||
|  | ||||
| @ -50,15 +51,6 @@ goto cuda_build_common | ||||
|  | ||||
| :cuda_build_10 | ||||
|  | ||||
| pushd . | ||||
| if "%VC_VERSION%" == "" ( | ||||
|     call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 | ||||
| ) else ( | ||||
|     call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 -vcvars_ver=%VC_VERSION% | ||||
| ) | ||||
| @echo on | ||||
| popd | ||||
|  | ||||
| set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 | ||||
| set CUDA_PATH_V10_1=%CUDA_PATH% | ||||
|  | ||||
|  | ||||
| @ -1,3 +1,3 @@ | ||||
| call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat | ||||
| cd test && python run_test.py --exclude test_nn test_jit_simple test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="%1" && cd .. | ||||
| cd test && python run_test.py --exclude test_nn test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_profiling --verbose --determine-from="%1" && cd .. | ||||
| if ERRORLEVEL 1 exit /b 1 | ||||
|  | ||||
| @ -160,20 +160,18 @@ ENDIF(BLAS_FOUND) | ||||
|  | ||||
| IF(LAPACK_FOUND) | ||||
|   list(APPEND ATen_CPU_DEPENDENCY_LIBS ${LAPACK_LIBRARIES}) | ||||
|   if(USE_CUDA) | ||||
|   if(USE_CUDA AND MSVC) | ||||
|     # Although Lapack provides CPU (and thus, one might expect that ATen_cuda | ||||
|     # would not need this at all), some of our libraries (magma in particular) | ||||
|     # call into CPU BLAS/LAPACK implementations, and so it is very important | ||||
|     # we get the *right* implementation, because even if the symbols are the | ||||
|     # same, LAPACK implementations may have different calling conventions. | ||||
|     # This caused https://github.com/pytorch/pytorch/issues/7353 | ||||
|     # | ||||
|     # We do NOT do this on Linux, since we just rely on torch_cpu to | ||||
|     # provide all of the symbols we need | ||||
|     list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${LAPACK_LIBRARIES}) | ||||
|   endif() | ||||
|   if(USE_ROCM) | ||||
|     # It's not altogether clear that HIP behaves the same way, but it | ||||
|     # seems safer to assume that it needs it too | ||||
|     list(APPEND ATen_HIP_DEPENDENCY_LIBS ${LAPACK_LIBRARIES}) | ||||
|   endif() | ||||
| ENDIF(LAPACK_FOUND) | ||||
|  | ||||
| IF (UNIX AND NOT APPLE) | ||||
| @ -331,8 +329,12 @@ IF(USE_CUDA AND NOT USE_ROCM) | ||||
|   IF(USE_MAGMA) | ||||
|     list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${MAGMA_LIBRARIES}) | ||||
|     IF ($ENV{TH_BINARY_BUILD}) | ||||
|       list(APPEND ATen_CUDA_DEPENDENCY_LIBS | ||||
|         "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}") | ||||
|       IF (MSVC) | ||||
|         # Do not do this on Linux: see Note [Extra MKL symbols for MAGMA in torch_cpu] | ||||
|         # in caffe2/CMakeLists.txt | ||||
|         list(APPEND ATen_CUDA_DEPENDENCY_LIBS | ||||
|           "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}") | ||||
|       ENDIF(MSVC) | ||||
|     ENDIF($ENV{TH_BINARY_BUILD}) | ||||
|   ENDIF(USE_MAGMA) | ||||
|   IF ($ENV{ATEN_STATIC_CUDA}) | ||||
|  | ||||
| @ -125,13 +125,15 @@ void _parallel_run( | ||||
|   std::tie(num_tasks, chunk_size) = | ||||
|       internal::calc_num_tasks_and_chunk_size(begin, end, grain_size); | ||||
|  | ||||
|   std::atomic_flag err_flag = ATOMIC_FLAG_INIT; | ||||
|   std::exception_ptr eptr; | ||||
|   std::vector<std::shared_ptr<c10::ivalue::Future>> futures(num_tasks); | ||||
|   for (size_t task_id = 0; task_id < num_tasks; ++task_id) { | ||||
|     futures[task_id] = std::make_shared<c10::ivalue::Future>(c10::NoneType::get()); | ||||
|   } | ||||
|   auto task = [f, &eptr, &err_flag, &futures, begin, end, chunk_size] | ||||
|   struct { | ||||
|     std::atomic_flag err_flag = ATOMIC_FLAG_INIT; | ||||
|     std::exception_ptr eptr; | ||||
|     std::mutex mutex; | ||||
|     volatile size_t remaining; | ||||
|     std::condition_variable cv; | ||||
|   } state; | ||||
|  | ||||
|   auto task = [f, &state, begin, end, chunk_size] | ||||
|       (int /* unused */, size_t task_id) { | ||||
|     int64_t local_start = begin + task_id * chunk_size; | ||||
|     if (local_start < end) { | ||||
| @ -140,21 +142,30 @@ void _parallel_run( | ||||
|         ParallelRegionGuard guard(task_id); | ||||
|         f(local_start, local_end, task_id); | ||||
|       } catch (...) { | ||||
|         if (!err_flag.test_and_set()) { | ||||
|           eptr = std::current_exception(); | ||||
|         if (!state.err_flag.test_and_set()) { | ||||
|           state.eptr = std::current_exception(); | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|     futures[task_id]->markCompleted(); | ||||
|     { | ||||
|       std::unique_lock<std::mutex> lk(state.mutex); | ||||
|       if (--state.remaining == 0) { | ||||
|         state.cv.notify_one(); | ||||
|       } | ||||
|     } | ||||
|   }; | ||||
|   state.remaining = num_tasks; | ||||
|   _run_with_pool(task, num_tasks); | ||||
|  | ||||
|   // Wait for all tasks to finish. | ||||
|   for (size_t task_id = 0; task_id < num_tasks; ++task_id) { | ||||
|     futures[task_id]->wait(); | ||||
|   { | ||||
|     std::unique_lock<std::mutex> lk(state.mutex); | ||||
|     if (state.remaining != 0) { | ||||
|       state.cv.wait(lk); | ||||
|     } | ||||
|   } | ||||
|   if (eptr) { | ||||
|     std::rethrow_exception(eptr); | ||||
|   if (state.eptr) { | ||||
|     std::rethrow_exception(state.eptr); | ||||
|   } | ||||
| } | ||||
|  | ||||
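| The hunk above replaces the per-task future handshake in _parallel_run with a single countdown guarded by a mutex and condition variable: each worker decrements the shared count under the lock, the last one wakes the waiter, and an atomic_flag keeps only the first exception. A minimal standalone sketch of the same pattern, assuming plain std::thread workers in place of the ATen thread pool (run_tasks and every name in it are illustrative, not ATen API): | ||||
|  | ||||
| #include <atomic> | ||||
| #include <condition_variable> | ||||
| #include <exception> | ||||
| #include <functional> | ||||
| #include <mutex> | ||||
| #include <thread> | ||||
| #include <vector> | ||||
|  | ||||
| // Run f(task_id) on num_tasks threads and wait for all of them, | ||||
| // using a shared countdown instead of one future per task. | ||||
| void run_tasks(size_t num_tasks, const std::function<void(size_t)>& f) { | ||||
|   struct { | ||||
|     std::atomic_flag err_flag = ATOMIC_FLAG_INIT;  // first error wins | ||||
|     std::exception_ptr eptr; | ||||
|     std::mutex mutex; | ||||
|     size_t remaining; | ||||
|     std::condition_variable cv; | ||||
|   } state; | ||||
|   state.remaining = num_tasks; | ||||
|  | ||||
|   std::vector<std::thread> threads; | ||||
|   for (size_t id = 0; id < num_tasks; ++id) { | ||||
|     threads.emplace_back([&state, &f, id] { | ||||
|       try { | ||||
|         f(id); | ||||
|       } catch (...) { | ||||
|         if (!state.err_flag.test_and_set()) { | ||||
|           state.eptr = std::current_exception(); | ||||
|         } | ||||
|       } | ||||
|       std::unique_lock<std::mutex> lk(state.mutex); | ||||
|       if (--state.remaining == 0) { | ||||
|         state.cv.notify_one();  // last finisher wakes the waiter | ||||
|       } | ||||
|     }); | ||||
|   } | ||||
|   { | ||||
|     std::unique_lock<std::mutex> lk(state.mutex); | ||||
|     state.cv.wait(lk, [&state] { return state.remaining == 0; }); | ||||
|   } | ||||
|   for (auto& t : threads) { t.join(); } | ||||
|   if (state.eptr) { | ||||
|     std::rethrow_exception(state.eptr); | ||||
|   } | ||||
| } | ||||
|  | ||||
| The predicate overload of wait() used here re-checks the count on every wakeup, so it also tolerates spurious wakeups. | ||||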
|  | ||||
| @ -16,14 +16,6 @@ | ||||
| #include <numeric> | ||||
| #include <memory> | ||||
|  | ||||
| #if defined(__clang__) | ||||
| #define __ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero"))) | ||||
| #define __ubsan_ignore_vptr__ __attribute__((no_sanitize("vptr"))) | ||||
| #else | ||||
| #define __ubsan_ignore_float_divide_by_zero__ | ||||
| #define __ubsan_ignore_vptr__ | ||||
| #endif | ||||
|  | ||||
| #define AT_DISALLOW_COPY_AND_ASSIGN(TypeName) \ | ||||
|   TypeName(const TypeName&) = delete; \ | ||||
|   void operator=(const TypeName&) = delete | ||||
|  | ||||
| @ -27,14 +27,9 @@ using c10::KernelFunction; | ||||
|  | ||||
| namespace { | ||||
|  | ||||
| void variable_fallback_kernel(const OperatorHandle& op, Stack* stack) { | ||||
|     at::AutoNonVariableTypeMode _var_guard(true); | ||||
|     Dispatcher::singleton().callBoxed(op, stack); | ||||
| } | ||||
|  | ||||
| static auto registry = Dispatcher::singleton().registerBackendFallbackKernel( | ||||
| static auto registry = c10::Dispatcher::singleton().registerBackendFallbackKernel( | ||||
|     DispatchKey::VariableTensorId, | ||||
|     KernelFunction::makeFromBoxedFunction<&variable_fallback_kernel>() | ||||
|     KernelFunction::makeFallthrough() | ||||
| ); | ||||
|  | ||||
| } | ||||
|  | ||||
| @ -20,6 +20,10 @@ void registerCustomClass(at::ClassTypePtr class_type) { | ||||
| } | ||||
|  | ||||
| at::ClassTypePtr getCustomClass(const std::string& name) { | ||||
|   // BC hack so we can upgrade a binary internally | ||||
|   if (name == "__torch__.torch.classes.SentencePiece") { | ||||
|     return getCustomClass("__torch__.torch.classes.fb.SentencePiece"); | ||||
|   } | ||||
|   return customClasses().count(name) ? customClasses()[name] : nullptr; | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -15,6 +15,7 @@ | ||||
| #include <c10/util/math_compat.h> | ||||
| #include <ATen/native/cpu/zmath.h> | ||||
| #include <c10/util/TypeCast.h> | ||||
| #include <c10/macros/Macros.h> | ||||
|  | ||||
| #if defined(__GNUC__) | ||||
| #define __at_align32__ __attribute__((aligned(32))) | ||||
|  | ||||
| @ -145,7 +145,7 @@ private: | ||||
|  | ||||
| std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d); | ||||
|  | ||||
| class FilterDescriptor | ||||
| class TORCH_CUDA_API FilterDescriptor | ||||
|   : public Descriptor<cudnnFilterStruct, | ||||
|                       &cudnnCreateFilterDescriptor, | ||||
|                       &cudnnDestroyFilterDescriptor> | ||||
|  | ||||
| @ -550,7 +550,6 @@ FunctionOption = TypedDict('FunctionOption', { | ||||
|     'type_method_definition_dispatch': str, | ||||
|     'type_method_formals': List[str], | ||||
|     'variants': str, | ||||
|     'with_gil': bool, | ||||
|     'zero_dim_dispatch_when_scalar': str, | ||||
| }) | ||||
|  | ||||
|  | ||||
| @ -673,11 +673,11 @@ Tensor & leaky_relu_( | ||||
|   return at::leaky_relu_out(self, self, neg_val); | ||||
| } | ||||
|  | ||||
| // Note: leakyReLu backward calculation doesn't support in-place call with non-positive slope. | ||||
| // Note: leakyReLu backward calculation doesn't support in-place call with negative slope. | ||||
| // The reason is that for an in-place forward call, the forward result is saved into the autograd | ||||
| // node instead of the input itself; when calculating the backward gradient, there is no way to know | ||||
| // whether the original input for the current node was positive if the input slope is | ||||
| // non-positive. E.g. forward is 2, slope is -0.2: the original input for this node could be | ||||
| // negative. E.g. forward is 2, slope is -0.2: the original input for this node could be | ||||
| // either 2 or -10, so there is no way to get a correct backward gradient in this case. | ||||
| Tensor leaky_relu_backward( | ||||
|     const Tensor& grad_output, | ||||
| @ -685,11 +685,11 @@ Tensor leaky_relu_backward( | ||||
|     Scalar negval, | ||||
|     bool is_result) { | ||||
|   TORCH_CHECK( | ||||
|     !is_result || negval.to<double>() > 0.0, | ||||
|     "In-place leakyReLu backward calculation is triggered with a non-positive slope which is not supported. " | ||||
|     "This is caused by calling in-place forward function with a non-positive slope, " | ||||
|     !is_result || negval.to<double>() >= 0.0, | ||||
|     "In-place leakyReLu backward calculation is triggered with a negative slope which is not supported. " | ||||
|     "This is caused by calling in-place forward function with a negative slope, " | ||||
|     "please call out-of-place version instead. File an issue at https://github.com/pytorch/pytorch if you do " | ||||
|     "require supporting in-place leakRelu backward calculation with non-positive slope"); | ||||
|     "require supporting in-place leakRelu backward calculation with negative slope"); | ||||
|  | ||||
|   Tensor result; | ||||
|   auto iter = TensorIterator::binary_op(result, self_or_result, grad_output); | ||||
| @ -698,17 +698,34 @@ Tensor leaky_relu_backward( | ||||
| } | ||||
|  | ||||
| std::tuple<Tensor, Tensor> log_sigmoid_forward_cpu(const Tensor& input) { | ||||
|   auto result = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); | ||||
|   auto buffer = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); | ||||
|   // FIXME: do these actually need to be zeros_like or can they be empty_like? | ||||
|   auto result = at::zeros_like(input, at::MemoryFormat::Contiguous); | ||||
|   auto buffer = at::zeros_like(input, at::MemoryFormat::Contiguous); | ||||
|   log_sigmoid_cpu_stub(kCPU, result, buffer, input.contiguous()); | ||||
|   return std::make_tuple(result, buffer); | ||||
| } | ||||
|  | ||||
| std::tuple<Tensor&, Tensor&> log_sigmoid_forward_out_cpu(Tensor& result, Tensor& buffer, const Tensor& input) { | ||||
|   log_sigmoid_cpu_stub(kCPU, result, buffer, input); | ||||
|   result.resize_as_(input); | ||||
|   buffer.resize_as_(input, at::MemoryFormat::Contiguous); | ||||
|   TORCH_CHECK(buffer.is_contiguous(), "Contiguous buffer required for log_sigmoid with out parameter"); | ||||
|   Tensor result_tmp = result.is_contiguous() ? result : at::empty_like(result, at::MemoryFormat::Contiguous); | ||||
|   log_sigmoid_cpu_stub(kCPU, result_tmp, buffer, input.contiguous()); | ||||
|   if (!result.is_contiguous()) { | ||||
|     result.copy_(result_tmp); | ||||
|   } | ||||
|   return std::forward_as_tuple(result, buffer); | ||||
| } | ||||
|  | ||||
| Tensor & log_sigmoid_out(Tensor & output, const Tensor & self) { | ||||
|   Tensor buffer = at::empty({0}, self.options()); | ||||
|   return std::get<0>(at::log_sigmoid_forward_out(output, buffer, self)); | ||||
| } | ||||
|  | ||||
| Tensor log_sigmoid(const Tensor & self) { | ||||
|   return std::get<0>(at::log_sigmoid_forward(self)); | ||||
| } | ||||
|  | ||||
| Tensor log_sigmoid_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& buffer) { | ||||
|   Tensor grad_input; | ||||
|   auto iter = at::TensorIterator(); | ||||
|  | ||||
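| The reworked log_sigmoid_forward_out_cpu shows a common out-variant pattern: the CPU stub only writes dense, contiguous memory, so when the caller's result tensor is non-contiguous the kernel runs on a contiguous temporary that is copied back at the end. A rough standalone analogue of that control flow, assuming a toy Buffer type in place of at::Tensor: | ||||
|  | ||||
| #include <cstddef> | ||||
| #include <vector> | ||||
|  | ||||
| // Toy stand-in for a tensor that may or may not be contiguous. | ||||
| struct Buffer { | ||||
|   std::vector<float> data; | ||||
|   bool contiguous; | ||||
| }; | ||||
|  | ||||
| // A kernel that can only write contiguous output. | ||||
| void negate_contig_only(Buffer& out, const Buffer& in) { | ||||
|   out.data.resize(in.data.size()); | ||||
|   for (size_t i = 0; i < in.data.size(); ++i) { | ||||
|     out.data[i] = -in.data[i]; | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Out-variant wrapper: route a non-contiguous result through a temporary. | ||||
| Buffer& negate_out(Buffer& result, const Buffer& input) { | ||||
|   if (result.contiguous) { | ||||
|     negate_contig_only(result, input); | ||||
|     return result; | ||||
|   } | ||||
|   Buffer tmp{{}, /*contiguous=*/true}; | ||||
|   negate_contig_only(tmp, input); | ||||
|   result.data = tmp.data;  // models result.copy_(result_tmp) | ||||
|   return result; | ||||
| } | ||||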
| @ -138,6 +138,10 @@ Tensor true_divide(const Tensor& self, const Tensor& divisor) { | ||||
|   return iter.output(); | ||||
| } | ||||
|  | ||||
| Tensor& true_divide_(Tensor& self, const Tensor& divisor) { | ||||
|   return native::true_divide_out(self, self, divisor); | ||||
| } | ||||
|  | ||||
| Tensor& floor_divide_out(Tensor& result, const Tensor& self, const Tensor& other) { | ||||
|   auto iter = TensorIterator::binary_op(result, self, other, | ||||
|     /*check_mem_overlap=*/true); | ||||
| @ -731,7 +735,11 @@ Tensor& fmod_(Tensor& self, Scalar other) { | ||||
| } | ||||
|  | ||||
| Tensor true_divide(const Tensor& self, Scalar divisor) { | ||||
|   return at::true_divide(self, wrapped_scalar_tensor(divisor)); // redispatch! | ||||
|   return self.true_divide(wrapped_scalar_tensor(divisor)); // redispatch! | ||||
| } | ||||
|  | ||||
| Tensor& true_divide_(Tensor& self, Scalar divisor) { | ||||
|   return self.true_divide_(wrapped_scalar_tensor(divisor)); // redispatch! | ||||
| } | ||||
|  | ||||
| } | ||||
|  | ||||
| @ -609,7 +609,7 @@ at::Tensor _convolution( | ||||
|     auto weight_view = at::_unsafe_view(weight, -1); | ||||
|     auto out = input*weight_view[0]; | ||||
|     if (bias.defined()) | ||||
|       out = out + bias[0]; | ||||
|       out.add_(bias[0]); | ||||
|     return out.view(o); | ||||
|   } | ||||
|  | ||||
| @ -639,7 +639,7 @@ at::Tensor _convolution( | ||||
|             input.contiguous(cudnn_memory_format), weight, | ||||
|             padding, stride, dilation, params.groups, params.benchmark, params.deterministic); | ||||
|         if (bias.defined()) { | ||||
|           output = output + reshape_bias(input.dim(), bias); | ||||
|           output.add_(reshape_bias(input.dim(), bias)); | ||||
|         } | ||||
|  | ||||
|       } else if (params.use_miopen(input, bias.defined())){ | ||||
| @ -662,14 +662,14 @@ at::Tensor _convolution( | ||||
|           input.contiguous(cudnn_memory_format), weight, | ||||
|           params.padding, params.output_padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic); | ||||
|       if (bias.defined()) { | ||||
|         output = output + reshape_bias(input.dim(), bias); | ||||
|         output.add_(reshape_bias(input.dim(), bias)); | ||||
|       } | ||||
|     } else { | ||||
|       output = at::cudnn_convolution( | ||||
|           input.contiguous(cudnn_memory_format), weight, | ||||
|           params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic); | ||||
|       if (bias.defined()) { | ||||
|         output = output + reshape_bias(input.dim(), bias); | ||||
|         output.add_(reshape_bias(input.dim(), bias)); | ||||
|       } | ||||
|     } | ||||
|   } else if (params.use_miopen(input, bias.defined())) { | ||||
|  | ||||
| @ -70,8 +70,8 @@ struct CAFFE2_API DispatchStub<rT (*)(Args...), T> { | ||||
|       // they will still compute the same value for cpu_dispatch_ptr. | ||||
|       if (!cpu_dispatch_ptr.load(std::memory_order_relaxed)) { | ||||
|         FnPtr tmp_cpu_dispatch_ptr = nullptr; | ||||
|         cpu_dispatch_ptr.compare_exchange_weak( | ||||
|             tmp_cpu_dispatch_ptr, choose_cpu_impl(), std::memory_order_relaxed); | ||||
|         while(!cpu_dispatch_ptr.compare_exchange_weak( | ||||
|             tmp_cpu_dispatch_ptr, choose_cpu_impl(), std::memory_order_relaxed)); | ||||
|       } | ||||
|       return (*cpu_dispatch_ptr)(std::forward<ArgTypes>(args)...); | ||||
|     } else if (device_type == DeviceType::CUDA) { | ||||
|  | ||||
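| The dispatch-stub change above turns a one-shot compare_exchange_weak into a retry loop: the weak form of CAS is allowed to fail spuriously (report failure even though the stored value matched), so a single attempt could leave the pointer unset on some architectures. Because choose_cpu_impl() is deterministic, a losing thread re-storing the same value is harmless. A minimal sketch of the idiom, with stand-in names rather than the ATen types: | ||||
|  | ||||
| #include <atomic> | ||||
|  | ||||
| using FnPtr = void (*)(); | ||||
| void cpu_impl() {} | ||||
| FnPtr choose_cpu_impl() { return &cpu_impl; }  // deterministic choice | ||||
|  | ||||
| std::atomic<FnPtr> cpu_dispatch_ptr{nullptr}; | ||||
|  | ||||
| FnPtr get_call_ptr() { | ||||
|   if (!cpu_dispatch_ptr.load(std::memory_order_relaxed)) { | ||||
|     FnPtr tmp = nullptr; | ||||
|     // Retry until the CAS reports success: weak CAS may fail spuriously. | ||||
|     // If another thread stored first, tmp picks up its (identical) value | ||||
|     // and the next iteration succeeds immediately. | ||||
|     while (!cpu_dispatch_ptr.compare_exchange_weak( | ||||
|         tmp, choose_cpu_impl(), std::memory_order_relaxed)) { | ||||
|     } | ||||
|   } | ||||
|   return cpu_dispatch_ptr.load(std::memory_order_relaxed); | ||||
| } | ||||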
| @ -31,15 +31,6 @@ Tensor nll_loss2d(const Tensor & self, const Tensor & target, const Tensor & wei | ||||
|   return std::get<0>(at::nll_loss2d_forward(self, target, weight, reduction, ignore_index)); | ||||
| } | ||||
|  | ||||
| Tensor & log_sigmoid_out(Tensor & output, const Tensor & self) { | ||||
|   Tensor buffer = at::empty({0}, self.options()); | ||||
|   return std::get<0>(at::log_sigmoid_forward_out(output, buffer, self)); | ||||
| } | ||||
|  | ||||
| Tensor log_sigmoid(const Tensor & self) { | ||||
|   return std::get<0>(at::log_sigmoid_forward(self)); | ||||
| } | ||||
|  | ||||
| Tensor & thnn_conv2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding) { | ||||
|   Tensor finput = at::empty({0}, self.options()); | ||||
|   Tensor fgrad_input = at::empty({0}, self.options()); | ||||
|  | ||||
| @ -533,7 +533,7 @@ Tensor frobenius_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { | ||||
|     return at::norm(self, 2, dim, keepdim, self.scalar_type()); | ||||
|   } | ||||
|   if (self.is_complex()){ | ||||
|     return at::sqrt(at::sum((self.conj() * self).real(), dim, keepdim)); | ||||
|     return at::sqrt(at::sum(at::real(self.conj() * self), dim, keepdim)); | ||||
|   } else { | ||||
|     return at::sqrt(at::sum((self * self), dim, keepdim)); | ||||
|   } | ||||
| @ -553,7 +553,7 @@ Tensor &frobenius_norm_out( | ||||
|     return at::norm_out(result, self, 2, dim, keepdim, self.scalar_type()); | ||||
|   } | ||||
|   if (self.is_complex()){ | ||||
|     return at::sqrt_out(result, at::sum((self.conj() * self).real(), dim, keepdim)); | ||||
|     return at::sqrt_out(result, at::sum(at::real(self.conj() * self), dim, keepdim)); | ||||
|   } else { | ||||
|     return at::sqrt_out(result, at::sum((self * self), dim, keepdim)); | ||||
|   } | ||||
|  | ||||
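| These frobenius_norm hunks swap the tensor method real() for the free function at::real, but the underlying identity is unchanged: for a complex entry z, conj(z) * z = |z|^2 is real, so the norm is the square root of the summed squared magnitudes. A standalone check with std::complex (plain C++, not ATen code): | ||||
|  | ||||
| #include <cmath> | ||||
| #include <complex> | ||||
| #include <cstdio> | ||||
| #include <vector> | ||||
|  | ||||
| // Frobenius norm of a complex vector: sqrt(sum(real(conj(z) * z))). | ||||
| double frobenius_norm(const std::vector<std::complex<double>>& v) { | ||||
|   double sum = 0.0; | ||||
|   for (const auto& z : v) { | ||||
|     sum += std::real(std::conj(z) * z);  // == std::norm(z) == |z|^2 | ||||
|   } | ||||
|   return std::sqrt(sum); | ||||
| } | ||||
|  | ||||
| int main() { | ||||
|   std::vector<std::complex<double>> v{{3.0, 4.0}, {0.0, 0.0}}; | ||||
|   std::printf("%f\n", frobenius_norm(v));  // |3+4i| = 5 | ||||
|   return 0; | ||||
| } | ||||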
| @ -799,7 +799,7 @@ static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim, | ||||
|  | ||||
|   if (at::isComplexType(self.scalar_type())){ | ||||
|     ScalarType dtype = c10::toValueType(get_dtype(result, self, {}, true)); | ||||
|     Tensor real_in = self.real().to(dtype); | ||||
|     Tensor real_in = at::real(self).to(dtype); | ||||
|     Tensor real_out = at::empty({0}, self.options().dtype(dtype)); | ||||
|     auto iter = make_reduction("std or var", real_out, real_in, dim, keepdim, dtype); | ||||
|     if (iter.numel() == 0) { | ||||
| @ -807,7 +807,7 @@ static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim, | ||||
|     } else { | ||||
|       std_var_stub(iter.device_type(), iter, unbiased, false); | ||||
|     } | ||||
|     Tensor imag_in = self.imag().to(dtype); | ||||
|     Tensor imag_in = at::imag(self).to(dtype); | ||||
|     Tensor imag_out = at::empty({0}, self.options().dtype(dtype)); | ||||
|     iter = make_reduction("std or var", imag_out, imag_in, dim, keepdim, dtype); | ||||
|     if (iter.numel() == 0) { | ||||
| @ -845,7 +845,7 @@ static std::tuple<Tensor&,Tensor&> std_var_mean_out(const char* fname, Tensor &r | ||||
|            "."); | ||||
|   if (at::isComplexType(self.scalar_type())){ | ||||
|     ScalarType dtype = c10::toValueType(get_dtype(result1, self, {}, true)); | ||||
|     Tensor real_in = self.real().to(dtype); | ||||
|     Tensor real_in = at::real(self).to(dtype); | ||||
|     Tensor real_out_var = at::empty({0}, self.options().dtype(dtype)); | ||||
|     Tensor real_out_mean = at::empty({0}, self.options().dtype(dtype)); | ||||
|     auto iter = make_reduction(fname, real_out_var, real_out_mean, real_in, dim, keepdim, dtype); | ||||
| @ -855,7 +855,7 @@ static std::tuple<Tensor&,Tensor&> std_var_mean_out(const char* fname, Tensor &r | ||||
|     } else { | ||||
|       std_var_stub(iter.device_type(), iter, unbiased, false); | ||||
|     } | ||||
|     Tensor imag_in = self.imag().to(dtype); | ||||
|     Tensor imag_in = at::imag(self).to(dtype); | ||||
|     Tensor imag_out_var = at::empty({0}, self.options().dtype(dtype)); | ||||
|     Tensor imag_out_mean = at::empty({0}, self.options().dtype(dtype)); | ||||
|     iter = make_reduction(fname, imag_out_var, imag_out_mean, imag_in, dim, keepdim, dtype); | ||||
|  | ||||
| @ -85,6 +85,7 @@ inline void setStrided( | ||||
|     IntArrayRef size, | ||||
|     IntArrayRef stride, | ||||
|     int64_t storage_offset) { | ||||
|   TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape"); | ||||
|   auto* self_ = self.unsafeGetTensorImpl(); | ||||
|   checkInBoundsForStorage(size, stride, storage_offset, self_->storage()); | ||||
|  | ||||
| @ -93,7 +94,6 @@ inline void setStrided( | ||||
|   self_->set_storage_offset(storage_offset); | ||||
|  | ||||
|   /* size and stride */ | ||||
|   AT_ASSERT(size.size() == stride.size()); | ||||
|   if (self_->sizes() == size && self_->strides() == stride) { | ||||
|     return; | ||||
|   } | ||||
|  | ||||
| @ -130,6 +130,28 @@ static Tensor reshape_indexer(const Tensor& index, int64_t dims_before, int64_t | ||||
|   return index.reshape(shape); | ||||
| } | ||||
|  | ||||
| // checks whether index.dtype == int64 | ||||
| // and self.dtype == src.dtype if src is a Tensor | ||||
| static void scatter_gather_dtype_check( | ||||
|   const std::string& method_name, | ||||
|   const Tensor& self, | ||||
|   const Tensor& index, | ||||
|   const c10::optional<const Tensor>& src_opt = c10::nullopt | ||||
| ) { | ||||
|   TORCH_CHECK( | ||||
|     index.scalar_type() == at::ScalarType::Long, | ||||
|     method_name, "(): Expected dtype int64 for index" | ||||
|   ); | ||||
|  | ||||
|   if (src_opt.has_value()) { | ||||
|     auto src = src_opt.value(); | ||||
|     TORCH_CHECK( | ||||
|       self.scalar_type() == src.scalar_type(), | ||||
|       method_name, "(): Expected self.dtype to be equal to src.dtype" | ||||
|     ); | ||||
|   } | ||||
| } | ||||
|  | ||||
| AdvancedIndex::AdvancedIndex(const Tensor& src, TensorList indices_list) | ||||
| { | ||||
|   int64_t element_size_bytes = src.element_size(); | ||||
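|  | ||||
| scatter_gather_dtype_check folds two validations behind one helper by making the src operand optional: the scalar-src overloads (gather, scatter with a Scalar) simply omit it, and only the index dtype is checked. A hedged C++17 sketch of the same shape, using std::optional and a toy dtype enum instead of the ATen types: | ||||
|  | ||||
| #include <optional> | ||||
| #include <stdexcept> | ||||
| #include <string> | ||||
|  | ||||
| enum class Dtype { Float, Long }; | ||||
| struct FakeTensor { Dtype dtype; };  // illustrative stand-in | ||||
|  | ||||
| void dtype_check( | ||||
|     const std::string& method_name, | ||||
|     const FakeTensor& self, | ||||
|     const FakeTensor& index, | ||||
|     const std::optional<FakeTensor>& src = std::nullopt) { | ||||
|   if (index.dtype != Dtype::Long) { | ||||
|     throw std::runtime_error(method_name + "(): Expected dtype int64 for index"); | ||||
|   } | ||||
|   // Only a tensor-valued src takes part in the self/src dtype match. | ||||
|   if (src.has_value() && self.dtype != src->dtype) { | ||||
|     throw std::runtime_error(method_name + "(): Expected self.dtype to equal src.dtype"); | ||||
|   } | ||||
| } | ||||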
| @ -493,40 +515,48 @@ Tensor index_fill(const Tensor & self, int64_t dim, const Tensor & index, const | ||||
| } | ||||
|  | ||||
| Tensor & gather_out_cpu(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index, bool sparse_grad) { | ||||
|   scatter_gather_dtype_check("gather_out_cpu", self, index, result); | ||||
|   result.resize_(index.sizes()); | ||||
|   gather_stub(result.device().type(), result, self, dim, index); | ||||
|   return result; | ||||
| } | ||||
|  | ||||
| Tensor gather_cpu(const Tensor & self, int64_t dim, const Tensor & index, bool sparse_grad) { | ||||
|   scatter_gather_dtype_check("gather_cpu", self, index); | ||||
|   Tensor result = at::empty({0}, self.options()); | ||||
|   return gather_out_cpu(result, self, dim, index, sparse_grad); | ||||
| } | ||||
|  | ||||
| Tensor & scatter_cpu_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) { | ||||
|   scatter_gather_dtype_check("scatter_cpu", self, index, src); | ||||
|   scatter_stub(self.device().type(), self, dim, index, src); | ||||
|   return self; | ||||
| } | ||||
|  | ||||
| Tensor & scatter_fill_cpu_(Tensor & self, int64_t dim, const Tensor & index, Scalar src) { | ||||
|   scatter_gather_dtype_check("scatter_fill_cpu", self, index); | ||||
|   scatter_fill_stub(self.device().type(), self, dim, index, src); | ||||
|   return self; | ||||
| } | ||||
|  | ||||
| Tensor scatter(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { | ||||
|   scatter_gather_dtype_check("scatter", self, index, source); | ||||
|   return self.clone(at::MemoryFormat::Preserve).scatter_(dim, index, source); | ||||
| } | ||||
|  | ||||
| Tensor scatter(const Tensor & self, int64_t dim, const Tensor & index, Scalar source) { | ||||
|   scatter_gather_dtype_check("scatter", self, index); | ||||
|   return self.clone(at::MemoryFormat::Preserve).scatter_(dim, index, source); | ||||
| } | ||||
|  | ||||
| Tensor & scatter_add_cpu_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) { | ||||
|   scatter_gather_dtype_check("scatter_add_cpu", self, index, src); | ||||
|   scatter_add_stub(self.device().type(), self, dim, index, src); | ||||
|   return self; | ||||
| } | ||||
|  | ||||
| Tensor scatter_add(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { | ||||
|   scatter_gather_dtype_check("scatter_add", self, index, source); | ||||
|   return self.clone(at::MemoryFormat::Preserve).scatter_add_(dim, index, source); | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -33,7 +33,7 @@ static inline Tensor to_impl(const Tensor& self, const TensorOptions& options, b | ||||
|     if (self.is_non_overlapping_and_dense()) { | ||||
|       // Copy all strides | ||||
|       auto r = at::empty_strided(self.sizes(), self.strides(), options.memory_format(c10::nullopt)); | ||||
|       r.copy_(self); | ||||
|       r.copy_(self, non_blocking); | ||||
|       return r; | ||||
|     } else { | ||||
|       memory_format = self.suggest_memory_format(); | ||||
|  | ||||
| @ -99,7 +99,7 @@ Tensor _dim_arange(const Tensor& like, int64_t dim) { | ||||
|  | ||||
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
| Tensor empty_cpu(IntArrayRef size, const TensorOptions& options_, c10::optional<c10::MemoryFormat> optional_memory_format) { | ||||
|  | ||||
|   TORCH_CHECK(!isComplexType(at::typeMetaToScalarType(options_.dtype())), "Complex dtype not supported."); | ||||
|   TORCH_CHECK( | ||||
|     !(options_.has_memory_format() && optional_memory_format.has_value()), | ||||
|     "Cannot set memory_format both in TensorOptions and explicit argument; please delete " | ||||
|  | ||||
| @ -638,7 +638,7 @@ void TensorIterator::narrow(int dim, int64_t start, int64_t size) { | ||||
|   for (auto& op : operands_) { | ||||
|     op.data = ((char*)op.data) + op.stride_bytes[dim] * start; | ||||
|   } | ||||
|   if (size == 1) { | ||||
|   if (size == 1 && !is_reduction_) { | ||||
|     coalesce_dimensions(); | ||||
|   } | ||||
| } | ||||
| @ -891,10 +891,13 @@ std::unique_ptr<TensorIterator> TensorIterator::split(int dim) { | ||||
| } | ||||
|  | ||||
| int TensorIterator::get_dim_to_split() const { | ||||
|   TORCH_INTERNAL_ASSERT(ndim() >= 1 && shape()[ndim() - 1] >= 2); | ||||
|   TORCH_INTERNAL_ASSERT(ndim() >= 1); | ||||
|   int64_t max_extent = -1; | ||||
|   int dim_to_split = -1; | ||||
|   for (int dim = ndim() - 1; dim >= 0; dim--) { | ||||
|     if (shape_[dim] == 0) { | ||||
|       continue; | ||||
|     } | ||||
|     int64_t size = shape_[dim]; | ||||
|     for (auto& op : operands_) { | ||||
|       int64_t extent = (size - 1) * op.stride_bytes[dim]; | ||||
|  | ||||
| @ -98,6 +98,15 @@ Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) { | ||||
|         "output memory locations. Found overlap in input tensor ", i); | ||||
|   } | ||||
|  | ||||
|   // Dtypes should be the same | ||||
|   const auto first_in_cat = tensors[0]; | ||||
|   for (int64_t i = 1; i < tensors.size(); i++) { | ||||
|     TORCH_CHECK(first_in_cat.dtype() == tensors[i].dtype(), | ||||
|               "Expected object of scalar type ", first_in_cat.dtype(), | ||||
|               " but got scalar type ", tensors[i].dtype(), | ||||
|               " for sequence element ", i, "."); | ||||
|   } | ||||
|  | ||||
|   auto should_skip = [](const Tensor& t) { return t.numel() == 0 && t.dim() == 1; }; | ||||
|   for (auto const &tensor : tensors) { | ||||
|     if (should_skip(tensor)) { | ||||
|  | ||||
| @ -73,11 +73,17 @@ Tensor& abs_(Tensor& self) { return unary_op_impl_(self, at::abs_out); } | ||||
| Tensor& angle_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, angle_stub); } | ||||
| Tensor angle(const Tensor& self) { return unary_op_impl(self, at::angle_out); } | ||||
|  | ||||
| Tensor& real_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, real_stub); } | ||||
| Tensor real(const Tensor& self) { return unary_op_impl(self, at::real_out); } | ||||
| Tensor real(const Tensor& self) { | ||||
|   TORCH_CHECK(!self.is_complex(), "real is not yet implemented for complex tensors."); | ||||
|   return self; | ||||
| } | ||||
|  | ||||
| Tensor& imag_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, imag_stub); } | ||||
| Tensor imag(const Tensor& self) { return unary_op_impl(self, at::imag_out); } | ||||
| Tensor imag(const Tensor& self) { | ||||
|   TORCH_CHECK(false, "imag is not yet implemented."); | ||||
|  | ||||
|   // Note: unreachable | ||||
|   return at::zeros_like(self); | ||||
| } | ||||
|  | ||||
| Tensor& conj_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, conj_stub); } | ||||
| Tensor conj(const Tensor& self) { return unary_op_impl(self, at::conj_out); } | ||||
|  | ||||
| @ -7,6 +7,7 @@ | ||||
| #include <ATen/native/TensorIterator.h> | ||||
| #include <ATen/native/BinaryOps.h> | ||||
| #include <ATen/native/cpu/Loops.h> | ||||
| #include <c10/macros/Macros.h> | ||||
|  | ||||
| namespace at { namespace native { | ||||
| namespace { | ||||
|  | ||||
| @ -87,6 +87,10 @@ static void max_kernel_impl( | ||||
|     Tensor& max_indices, | ||||
|     const Tensor& self, | ||||
|     c10::optional<int64_t> dim) { | ||||
|  | ||||
|   TORCH_CHECK(max.scalar_type() == self.scalar_type() && max_indices.scalar_type() == kLong, | ||||
|     "Expect dtype ", self.scalar_type(), "and torch.long, but got ", max.scalar_type(), "and", max_indices.scalar_type()); | ||||
|  | ||||
|   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(ScalarType::Bool, self.scalar_type(), "max", [&] { | ||||
|     Reduction<scalar_t, int64_t>::apply(max, max_indices, self, dim, true); | ||||
|   }); | ||||
| @ -97,6 +101,10 @@ static void min_kernel_impl( | ||||
|     Tensor& min_indices, | ||||
|     const Tensor& self, | ||||
|     c10::optional<int64_t> dim) { | ||||
|  | ||||
|   TORCH_CHECK(min.scalar_type() == self.scalar_type() && min_indices.scalar_type() == kLong, | ||||
|     "Expect dtype ", self.scalar_type(), "and torch.long, but got ", min.scalar_type(), "and", min_indices.scalar_type()); | ||||
|  | ||||
|   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(ScalarType::Bool, self.scalar_type(), "min", [&] { | ||||
|     Reduction<scalar_t, int64_t>::apply(min, min_indices, self, dim, false); | ||||
|   }); | ||||
|  | ||||
| @ -4,7 +4,7 @@ | ||||
| #include <ATen/native/cuda/zmath.cuh> | ||||
| #include <ATen/native/TensorIterator.h> | ||||
| #include <ATen/native/BinaryOps.h> | ||||
|  | ||||
| #include <c10/macros/Macros.h> | ||||
|  | ||||
| // NOTE: CUDA on Windows requires that the enclosing function | ||||
| // of a __device__ lambda not have internal linkage. | ||||
| @ -69,7 +69,6 @@ void remainder_kernel_cuda(TensorIterator& iter) { | ||||
|     AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "remainder_cuda", [&]() { | ||||
|       using thrust_t = typename ztype_cuda<scalar_t>::thrust_t; | ||||
|       gpu_kernel_with_scalars(iter, []GPU_LAMBDA(thrust_t a, thrust_t b) -> thrust_t { | ||||
|         CUDA_KERNEL_ASSERT(b != 0); | ||||
|         thrust_t r = a % b; | ||||
|         if ((r != 0) && ((r < 0) != (b < 0))) { | ||||
|           r += b; | ||||
|  | ||||
| @ -358,7 +358,7 @@ void max_pool2d_with_indices_out_cuda_template( | ||||
|  | ||||
|   Tensor input = input_.contiguous(memory_format); | ||||
|  | ||||
|   const int64_t in_stride_n = input.stride(-4); | ||||
|   const int64_t in_stride_n = input_.ndimension() == 4 ? input.stride(-4) : 0; | ||||
|   const int64_t in_stride_c = input.stride(-3); | ||||
|   const int64_t in_stride_h = input.stride(-2); | ||||
|   const int64_t in_stride_w = input.stride(-1); | ||||
| @ -506,7 +506,7 @@ void max_pool2d_with_indices_backward_out_cuda_template( | ||||
|   const int64_t inputHeight = input.size(-2); | ||||
|   const int64_t inputWidth = input.size(-1); | ||||
|  | ||||
|   const int64_t in_stride_n = input.stride(-4); | ||||
|   const int64_t in_stride_n = input.ndimension() == 4 ? input.stride(-4) : 0; | ||||
|   const int64_t in_stride_c = input.stride(-3); | ||||
|   const int64_t in_stride_h = input.stride(-2); | ||||
|   const int64_t in_stride_w = input.stride(-1); | ||||
|  | ||||
| @ -54,7 +54,7 @@ __global__ void EmbeddingBag_updateOutputKernel( | ||||
|       scalar_t *weightFeat = weight + featureDim * weight_stride1; | ||||
|       int64_t begin = bag == 0 ? 0 : offsets[bag]; // forces first offset to be 0 instead of asserting on it | ||||
|       int64_t end = (bag < numBags - 1) ? (offsets[bag + 1]) : numIndices; | ||||
|       assert(end >= begin); | ||||
|       CUDA_KERNEL_ASSERT(end >= begin); | ||||
|  | ||||
|       accscalar_t weightFeatSum = 0; | ||||
|       scalar_t weightFeatMax; | ||||
|  | ||||
| @ -192,13 +192,13 @@ void index_put_accum_kernel(Tensor & self, TensorList indices, const Tensor & va | ||||
|   if (num_indices > 0 && sliceSize > 0) { | ||||
|       const bool permuted = !src.is_contiguous(); | ||||
|       auto src_ = permuted ? src.contiguous() : src; | ||||
|       linearIndex = linearIndex.view(-1); | ||||
|       linearIndex = linearIndex.reshape(-1); | ||||
|       auto sorted_indices = at::empty_like(linearIndex, LEGACY_CONTIGUOUS_MEMORY_FORMAT); | ||||
|       auto orig_indices = at::empty_like(linearIndex, LEGACY_CONTIGUOUS_MEMORY_FORMAT); | ||||
|       using device_ptr = thrust::device_ptr<int64_t>; | ||||
|       const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||||
|  | ||||
|       linearIndex.div_(sliceSize); | ||||
|       linearIndex.floor_divide_(sliceSize); | ||||
|       { | ||||
|       sorted_indices.copy_(linearIndex); | ||||
|       auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); | ||||
|  | ||||
| @ -35,13 +35,13 @@ __global__ void renormRowsL1(scalar_t* dist, long rows, long cols) { | ||||
|     scalar_t sum = static_cast<scalar_t>(0); | ||||
|     for (int64_t col = threadIdx.x; col < cols; col += blockDim.x) { | ||||
|       val = dist[row * cols + col]; | ||||
|       CUDA_ALWAYS_ASSERT(!THCNumerics<scalar_t>::lt(val, zero)); // ! < 0 for NaN handling | ||||
|       CUDA_KERNEL_ASSERT(!THCNumerics<scalar_t>::lt(val, zero)); // ! < 0 for NaN handling | ||||
|       sum = sum + val; | ||||
|     } | ||||
|  | ||||
|     sum = reduceBlock(smem, blockDim.x, sum, ReduceAdd<scalar_t>(), zero); | ||||
|     if (threadIdx.x == 0) { | ||||
|       CUDA_ALWAYS_ASSERT(!THCNumerics<scalar_t>::lt(val, zero)); // ! < 0 for NaN handling | ||||
|       CUDA_KERNEL_ASSERT(!THCNumerics<scalar_t>::lt(val, zero)); // ! < 0 for NaN handling | ||||
|       smem[0] = sum; | ||||
|     } | ||||
|     __syncthreads(); | ||||
| @ -61,7 +61,7 @@ void renormRows(Tensor& t) { | ||||
|   int64_t cols = t.size(1); | ||||
|  | ||||
|   auto props = at::cuda::getCurrentDeviceProperties(); | ||||
|   CUDA_ALWAYS_ASSERT(props != NULL); | ||||
|   CUDA_KERNEL_ASSERT(props != NULL); | ||||
|   int numSM = props->multiProcessorCount; | ||||
|   int maxThreads = props->maxThreadsPerBlock; | ||||
|  | ||||
| @ -84,7 +84,7 @@ __device__ int binarySearchForMultinomial(scalar_t* cumdist, | ||||
|   int start = 0; | ||||
|   int end = size; | ||||
|   // cumdist[size - 1] = 0 => all zero prob dist | ||||
|   CUDA_ALWAYS_ASSERT(cumdist[size - 1] > static_cast<scalar_t>(0)); | ||||
|   CUDA_KERNEL_ASSERT(cumdist[size - 1] > static_cast<scalar_t>(0)); | ||||
|  | ||||
|   while (end - start > 0) { | ||||
|     int mid = start + (end - start) / 2; | ||||
| @ -124,36 +124,33 @@ sampleMultinomialWithReplacement(std::pair<uint64_t, uint64_t> seeds, | ||||
|   // search due to divergence. It seems possible to compute multiple | ||||
|   // values and limit divergence though later on. | ||||
|  | ||||
|   // global index formula for 1D grid of 2D blocks | ||||
|   int idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; | ||||
|   // global index formula for 2D grid of 1D blocks | ||||
|   int idx = blockIdx.y * gridDim.x * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; | ||||
|  | ||||
|   curandStatePhilox4_32_10_t state; | ||||
|   curand_init(seeds.first, idx, seeds.second, &state); | ||||
|  | ||||
|   // The block determines the distribution for which we generate a point | ||||
|   for (int64_t curDist = blockIdx.x; | ||||
|   for (int64_t curDist = blockIdx.y; | ||||
|        curDist < distributions; | ||||
|        curDist += gridDim.x) { | ||||
|     for (int sampleBase = 0; | ||||
|          sampleBase < totalSamples; sampleBase += blockDim.y) { | ||||
|       // The warp determines the sample | ||||
|       int sample = sampleBase + threadIdx.y; | ||||
|        curDist += gridDim.y) { | ||||
|     for (int sample = blockIdx.x*blockDim.x + threadIdx.x; | ||||
|          sample < totalSamples; sample += blockDim.x*gridDim.x) { | ||||
|  | ||||
|       // All threads participate in this | ||||
|       // we are losing 3 out of 4 generated numbers but it's ok | ||||
|       // this kernel is not very efficient anyway | ||||
|       auto rand = curand_uniform4(&state); | ||||
|       scalar_t r = static_cast<scalar_t>(rand.x); | ||||
|  | ||||
|       if (threadIdx.x == 0 && sample < totalSamples) { | ||||
|         // Find the bucket that a uniform sample lies in | ||||
|         int choice = binarySearchForMultinomial<scalar_t>( | ||||
|             normDistPrefixSum + curDist * categories, | ||||
|             normDist + curDist * categories, | ||||
|             categories, | ||||
|             r); | ||||
|       // Find the bucket that a uniform sample lies in | ||||
|       int choice = binarySearchForMultinomial<scalar_t>( | ||||
|           normDistPrefixSum + curDist * categories, | ||||
|           normDist + curDist * categories, | ||||
|           categories, | ||||
|           r); | ||||
|  | ||||
|       dest[curDist * totalSamples + sample] = choice; | ||||
|  | ||||
|         // Torch indices are 1-based | ||||
|         dest[curDist * totalSamples + sample] = choice; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| } | ||||
| @ -180,17 +177,14 @@ sampleMultinomialWithoutReplacement(std::pair<uint64_t, uint64_t> seeds, | ||||
|  | ||||
|   // The block and warp determines the distribution for which we | ||||
|   // generate a point | ||||
|   for (int64_t curDistBase = blockIdx.x * blockDim.y; | ||||
|        curDistBase < distributions; | ||||
|        curDistBase += gridDim.x * blockDim.y) { | ||||
|     // The warp determines the distribution | ||||
|     int64_t curDist = curDistBase + threadIdx.y; | ||||
|   for (int64_t curDist = blockIdx.x * blockDim.y + threadIdx.y; | ||||
|        curDist < distributions; | ||||
|        curDist += gridDim.x * blockDim.y) { | ||||
|  | ||||
|     // All threads must participate in this | ||||
|     auto rand = curand_uniform4(&state); | ||||
|     scalar_t r = static_cast<scalar_t>(rand.x); | ||||
|  | ||||
|     if (threadIdx.x == 0 && curDist < distributions) { | ||||
|     if (threadIdx.x == 0) { | ||||
|       // Find the bucket that a uniform sample lies in | ||||
|       int choice = binarySearchForMultinomial<scalar_t>( | ||||
|           normDistPrefixSum + curDist * categories, | ||||
| @ -240,9 +234,9 @@ sampleMultinomialOnce(int64_t* dest, | ||||
|     scalar_t val; | ||||
|     for (int cat = threadIdx.x; cat < categories; cat += blockDim.x) { | ||||
|       val = dist[curDist * stride_dist + cat * stride_categories]; | ||||
|       CUDA_ALWAYS_ASSERT(val >= zero); | ||||
|       CUDA_ALWAYS_ASSERT(!THCNumerics<scalar_t>::isinf(val)); | ||||
|       CUDA_ALWAYS_ASSERT(!THCNumerics<scalar_t>::isnan(val)); | ||||
|       CUDA_KERNEL_ASSERT(val >= zero); | ||||
|       CUDA_KERNEL_ASSERT(!THCNumerics<scalar_t>::isinf(val)); | ||||
|       CUDA_KERNEL_ASSERT(!THCNumerics<scalar_t>::isnan(val)); | ||||
|       sum = sum + static_cast<accscalar_t>(val); | ||||
|     } | ||||
|  | ||||
| @ -252,8 +246,8 @@ sampleMultinomialOnce(int64_t* dest, | ||||
|     // Broadcast sum and sample value | ||||
|     if (threadIdx.x == 0) { | ||||
|       // Make sure the sum of our distribution didn't overflow | ||||
|       CUDA_ALWAYS_ASSERT(!THCNumerics<accscalar_t>::isinf(sum)); | ||||
|       CUDA_ALWAYS_ASSERT(sum > accZero); | ||||
|       CUDA_KERNEL_ASSERT(!THCNumerics<accscalar_t>::isinf(sum)); | ||||
|       CUDA_KERNEL_ASSERT(sum > accZero); | ||||
|  | ||||
|       asmem[0] = sum; | ||||
|       smem[0] = sampled[curDist]; | ||||
| @ -363,7 +357,7 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n | ||||
|   AT_DISPATCH_FLOATING_TYPES_AND_HALF(self_v.scalar_type(), "multinomial_kernel_cuda", [&] { | ||||
|     using accscalar_t = at::acc_type<scalar_t, true>; | ||||
|     auto props = at::cuda::getCurrentDeviceProperties(); | ||||
|     CUDA_ALWAYS_ASSERT(props != NULL); | ||||
|     CUDA_KERNEL_ASSERT(props != NULL); | ||||
|     int numSM = props->multiProcessorCount; | ||||
|     int maxThreads = props->maxThreadsPerBlock; | ||||
|     int maxShared = props->sharedMemPerBlock; | ||||
| @ -415,26 +409,27 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n | ||||
|       std::pair<uint64_t, uint64_t> rng_engine_inputs; | ||||
|  | ||||
|       if (with_replacement) { | ||||
|         // Binary search is warp divergent (so effectively we're running | ||||
|         // with just a single thread), but for better utilization, | ||||
|         // we need each block to have at least 4 warps. | ||||
|         dim3 block(128); | ||||
|  | ||||
|         // Each block row (fixed blockIdx.y) strides over distributions, | ||||
|         // while its x-blocks generate that distribution's samples concurrently. | ||||
|         int grid_y = std::min<int>(numDist, at::cuda::getCurrentDeviceProperties()->maxGridSize[1]); | ||||
|         dim3 grid((n_sample-1)/block.x+1, grid_y); | ||||
|         { | ||||
|           // See Note [Acquire lock when using random generators] | ||||
|           std::lock_guard<std::mutex> lock(gen->mutex_); | ||||
|  | ||||
|           // each thread will utilize one random, however, since we have to use | ||||
|           // each thread generates one sample for each of its ceil(numDist / gridDim.y) distributions; however, since we have to use | ||||
|           // curand_uniform4 (See Note [Register spilling in curand call for CUDA < 10]), | ||||
|           // offset is 4. | ||||
|           rng_engine_inputs = gen->philox_engine_inputs(4); | ||||
|           // offset is 4 times that. | ||||
|           auto offset = ((numDist-1)/grid.y+1)*4; | ||||
|           rng_engine_inputs = gen->philox_engine_inputs(offset); | ||||
|         } | ||||
|         // Sample with replacement | ||||
|  | ||||
|         // Binary search is warp divergent (so effectively we're running | ||||
|         // with just a single thread), but for better utilization, | ||||
|         // we need each block to have at least 4 warps. | ||||
|         dim3 block(32, 4); | ||||
|  | ||||
|         // Each warp in a block will generate a sample from one | ||||
|         // distribution concurrently. | ||||
|         dim3 grid(numDist < MAX_NUM_BLOCKS ? numDist : MAX_NUM_BLOCKS); | ||||
|  | ||||
|         sampleMultinomialWithReplacement | ||||
|             <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>( | ||||
|             rng_engine_inputs, | ||||
| @ -470,10 +465,11 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n | ||||
|             // See Note [Acquire lock when using random generators] | ||||
|             std::lock_guard<std::mutex> lock(gen->mutex_); | ||||
|  | ||||
|             // each thread will utilize one random, however, since we have to use | ||||
|             // each thread will utilize ceil(distributions / (gridDim.x*blockDim.y)) randoms; however, since we have to use | ||||
|             // curand_uniform4 (See Note [Register spilling in curand call for CUDA < 10]), | ||||
|             // offset is 4. | ||||
|             rng_engine_inputs = gen->philox_engine_inputs(4); | ||||
|             // offset is 4 times that. | ||||
|             auto offset = ((numDist-1)/(grid.x*block.y)+1)*4; | ||||
|             rng_engine_inputs = gen->philox_engine_inputs(offset); | ||||
|           } | ||||
|  | ||||
|           // The kernel can only draw one sample before we have to | ||||
|  | ||||
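| Both launch sites above size the Philox offset to the worst case a single thread can consume instead of hard-coding 4: each curand_uniform4 call burns four values from the stream, and a thread may make one call per distribution (or sample) it strides over, so the next consumer of the generator should start its subsequence at least ceil(count / workers) * 4 values later. A small self-contained sketch of that computation (illustrative names, count assumed >= 1 as in the diff's formula): | ||||
|  | ||||
| #include <cstdint> | ||||
|  | ||||
| // Ceiling division: how many items each of `workers` must cover (n >= 1). | ||||
| constexpr uint64_t ceil_div(uint64_t n, uint64_t workers) { | ||||
|   return (n - 1) / workers + 1; | ||||
| } | ||||
|  | ||||
| // Each curand_uniform4 call consumes 4 values from the Philox stream, | ||||
| // so advance the offset by 4 per draw a thread may perform. | ||||
| constexpr uint64_t philox_offset(uint64_t num_dist, uint64_t workers) { | ||||
|   return ceil_div(num_dist, workers) * 4; | ||||
| } | ||||
|  | ||||
| static_assert(philox_offset(10, 4) == 12, "ceil(10/4) = 3 draws -> offset 12"); | ||||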
| @ -431,13 +431,12 @@ __global__ void batch_norm_backward_reduce_kernel( | ||||
|     const GenericPackedTensorAccessor<input_scalar_t, 3, DefaultPtrTraits, index_t> grad_output, | ||||
|     GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> mean, | ||||
|     GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> invstd, | ||||
|     GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> mean_dy, | ||||
|     GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> mean_dy_xmu, | ||||
|     GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> sum_dy, | ||||
|     GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> sum_dy_xmu, | ||||
|     GenericPackedTensorAccessor<stat_scalar_t, 1, DefaultPtrTraits, index_t> grad_weight, | ||||
|     GenericPackedTensorAccessor<stat_scalar_t, 1, DefaultPtrTraits, index_t> grad_bias) { | ||||
|  | ||||
|   index_t plane = blockIdx.x; | ||||
|   index_t N = input.size(0) * input.size(2); | ||||
|  | ||||
|   stat_accscalar_t r_mean = mean[plane]; | ||||
|   stat_accscalar_t factor = invstd[plane]; | ||||
| @ -446,7 +445,6 @@ __global__ void batch_norm_backward_reduce_kernel( | ||||
|   Float2<input_scalar_t, stat_accscalar_t> res = reduce<Float2<input_scalar_t, stat_accscalar_t>, GradOp<input_scalar_t, stat_accscalar_t, | ||||
|                                                                                    GenericPackedTensorAccessor<input_scalar_t, 3, DefaultPtrTraits, index_t>>>(g, grad_output, plane); | ||||
|  | ||||
|   stat_accscalar_t norm = stat_accscalar_t(1) / N; | ||||
|   if (threadIdx.x == 0) { | ||||
|     if (grad_weight.size(0) > 0) { | ||||
|       grad_weight[plane] = static_cast<stat_scalar_t>(res.v2 * factor); | ||||
| @ -454,11 +452,11 @@ __global__ void batch_norm_backward_reduce_kernel( | ||||
|     if (grad_bias.size(0) > 0) { | ||||
|       grad_bias[plane] = static_cast<stat_scalar_t>(res.v1); | ||||
|     } | ||||
|     if (mean_dy.size(0) > 0) { | ||||
|       mean_dy[plane] = static_cast<stat_accscalar_t>(res.v1 * norm); | ||||
|     if (sum_dy.size(0) > 0) { | ||||
|       sum_dy[plane] = static_cast<stat_accscalar_t>(res.v1); | ||||
|     } | ||||
|     if (mean_dy_xmu.size(0) > 0) { | ||||
|       mean_dy_xmu[plane] = static_cast<stat_accscalar_t>(res.v2 * norm); | ||||
|     if (sum_dy_xmu.size(0) > 0) { | ||||
|       sum_dy_xmu[plane] = static_cast<stat_accscalar_t>(res.v2); | ||||
|     } | ||||
|   } | ||||
| } | ||||
| @ -740,16 +738,16 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce_cuda_templ | ||||
|  | ||||
|   using stat_accscalar_t = at::acc_type<stat_scalar_t, true>; | ||||
|   int64_t n_input = input_.size(1); | ||||
|   Tensor mean_dy_; | ||||
|   Tensor mean_dy_xmu_; | ||||
|   Tensor sum_dy_; | ||||
|   Tensor sum_dy_xmu_; | ||||
|   Tensor grad_weight_; | ||||
|   Tensor grad_bias_; | ||||
|   auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions | ||||
|   auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes()); | ||||
|  | ||||
|   if (input_g) { | ||||
|     mean_dy_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); | ||||
|     mean_dy_xmu_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); | ||||
|     sum_dy_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); | ||||
|     sum_dy_xmu_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); | ||||
|   } | ||||
|   if (weight_g) { | ||||
|     grad_weight_ = at::empty({n_input}, weight_.options()); | ||||
| @ -764,8 +762,8 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce_cuda_templ | ||||
|   auto grad_bias = packed_accessor_or_dummy<stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_bias_); | ||||
|   auto mean = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_); | ||||
|   auto invstd = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(invstd_); | ||||
|   auto mean_dy = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_dy_); | ||||
|   auto mean_dy_xmu = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_dy_xmu_); | ||||
|   auto sum_dy = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_); | ||||
|   auto sum_dy_xmu = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_xmu_); | ||||
|  | ||||
|   auto batch_size = input_reshaped.size(0); | ||||
|   auto feature_size = input_reshaped.size(2); | ||||
| @ -778,10 +776,10 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce_cuda_templ | ||||
|   const dim3 grid(n_input); | ||||
|  | ||||
|   batch_norm_backward_reduce_kernel<input_scalar_t, stat_scalar_t, stat_accscalar_t, index_t> <<<grid, block, 0, stream>>> | ||||
|     (input, grad_output, mean, invstd, mean_dy, mean_dy_xmu, grad_weight, grad_bias); | ||||
|     (input, grad_output, mean, invstd, sum_dy, sum_dy_xmu, grad_weight, grad_bias); | ||||
|   AT_CUDA_CHECK(cudaGetLastError()); | ||||
|  | ||||
|   return std::make_tuple(mean_dy_, mean_dy_xmu_, grad_weight_, grad_bias_); | ||||
|   return std::make_tuple(sum_dy_, sum_dy_xmu_, grad_weight_, grad_bias_); | ||||
| } | ||||
|  | ||||
| template<typename input_scalar_t, typename stat_scalar_t, typename index_t> | ||||
|  | ||||
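| With this change the reduce kernel hands back the raw reductions sum_dy and sum_dy_xmu instead of pre-dividing by the per-kernel element count N; the means are recoverable as sum_dy / N, so the division moves to whichever consumer knows the right count. For reference, the standard batch-norm backward expressions these two sums feed (a textbook derivation, not text from this diff), writing invstd as \sigma^{-1}: | ||||
|  | ||||
| \frac{\partial L}{\partial x_i} | ||||
|   = \gamma \, \sigma^{-1} \left( | ||||
|       \frac{\partial L}{\partial y_i} | ||||
|       - \frac{1}{N}\,\texttt{sum\_dy} | ||||
|       - \frac{(x_i - \mu)\,\sigma^{-2}}{N}\,\texttt{sum\_dy\_xmu} | ||||
|     \right), | ||||
| \qquad | ||||
| \frac{\partial L}{\partial \gamma} = \sigma^{-1}\,\texttt{sum\_dy\_xmu}, | ||||
| \qquad | ||||
| \frac{\partial L}{\partial \beta} = \texttt{sum\_dy} | ||||
|  | ||||
| where \texttt{sum\_dy} = \sum_j \partial L/\partial y_j and \texttt{sum\_dy\_xmu} = \sum_j (\partial L/\partial y_j)(x_j - \mu), matching grad_weight = res.v2 * factor and grad_bias = res.v1 in the kernel. | ||||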
| @ -307,6 +307,15 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) { | ||||
|                 "tensor ", i); | ||||
|   } | ||||
|  | ||||
|   // Dtypes should be the same | ||||
|   const auto first_in_cat = inputs[0]; | ||||
|   for (int64_t i = 1; i < inputs.size(); i++) { | ||||
|     TORCH_CHECK(first_in_cat.dtype() == inputs[i].dtype(), | ||||
|               "Expected object of scalar type ", first_in_cat.dtype(), | ||||
|               " but got scalar type ", inputs[i].dtype(), | ||||
|               " for sequence element ", i, "."); | ||||
|   } | ||||
|  | ||||
|   for (int i = 0; i < inputs.size(); i++) | ||||
|   { | ||||
|     if (should_skip(inputs[i])) { | ||||
| @ -325,6 +334,12 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) { | ||||
|   TORCH_CHECK(inputs.size() > 0, "invalid number of inputs ", inputs.size()); | ||||
|   TORCH_CHECK(dimension >= 0, "invalid dimension ", dimension); | ||||
|  | ||||
|   for (const Tensor& t: inputs) { | ||||
|     TORCH_CHECK(t.device() == notSkippedTensor->device(), | ||||
|                 "All input tensors must be on the same device. Received ", | ||||
|                 t.device(), " and ", notSkippedTensor->device()); | ||||
|   } | ||||
|  | ||||
|   c10::MemoryFormat memory_format = compute_output_memory_format(inputs); | ||||
|  | ||||
|   std::vector<int64_t> size(notSkippedTensor->sizes().vec()); | ||||
| @ -355,17 +370,11 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) { | ||||
|   // 4. The number of dimensions is <= 4 | ||||
|   // 5. All input tensors are contiguous (output tensor may be non-contig) | ||||
|   // 6. All input tensors can use 32-bit indexing | ||||
|   // 7. All input tensors are on the same device | ||||
|  | ||||
|   const bool all32BitIndexable = std::all_of(inputs.begin(), inputs.end(), | ||||
|     [] (const Tensor& t) { | ||||
|       return at::cuda::detail::canUse32BitIndexMath(t); | ||||
|     }); | ||||
|   Device firstDevice = notSkippedTensor->device(); | ||||
|   const bool allSameDevice = std::all_of(inputs.begin(), inputs.end(), | ||||
|     [firstDevice](const Tensor& t) { | ||||
|       return t.device() == firstDevice; | ||||
|     }); | ||||
|   const bool allContiguous = std::all_of(inputs.begin(), inputs.end(), | ||||
|     [=](const Tensor& t) { | ||||
|       return !t.defined() || t.is_contiguous(memory_format); | ||||
| @ -375,8 +384,7 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) { | ||||
|       out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && | ||||
|       at::cuda::detail::canUse32BitIndexMath(out) && | ||||
|       allContiguous && | ||||
|       all32BitIndexable && | ||||
|       allSameDevice) { | ||||
|       all32BitIndexable) { | ||||
|  | ||||
|     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( | ||||
|         at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, | ||||
|  | ||||
| @ -65,7 +65,7 @@ struct TopKTypeConfig<int16_t> { | ||||
|   typedef uint32_t RadixType; | ||||
|  | ||||
|   static inline __device__ RadixType convert(int16_t v) { | ||||
|     assert(sizeof(short) == 2); | ||||
|     static_assert(sizeof(short) == 2, ""); | ||||
|     return 32768u + v; | ||||
|   } | ||||
|  | ||||
| @ -79,7 +79,7 @@ struct TopKTypeConfig<int32_t> { | ||||
|   typedef uint32_t RadixType; | ||||
|  | ||||
|   static inline __device__ RadixType convert(int32_t v) { | ||||
|     assert(sizeof(int) == 4); | ||||
|     static_assert(sizeof(int) == 4, ""); | ||||
|     return 2147483648u + v; | ||||
|   } | ||||
|  | ||||
| @ -93,7 +93,7 @@ struct TopKTypeConfig<int64_t> { | ||||
|   typedef uint64_t RadixType; | ||||
|  | ||||
|   static inline __device__ RadixType convert(int64_t v) { | ||||
|     assert(sizeof(int64_t) == 8); | ||||
|     static_assert(sizeof(int64_t) == 8, ""); | ||||
|     return 9223372036854775808ull + v; | ||||
|   } | ||||
|  | ||||
| @ -125,7 +125,7 @@ struct TopKTypeConfig<at::Half> { | ||||
|   static inline __device__ RadixType convert(at::Half v) { | ||||
| #if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) | ||||
|     RadixType x = __half_as_ushort(v); | ||||
|     RadixType mask = -((x >> 15)) | 0x8000; | ||||
|     RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000; | ||||
|     return (v == v) ? (x ^ mask) : 0xffff; | ||||
| #else | ||||
|     assert(false); | ||||
| @ -135,7 +135,7 @@ struct TopKTypeConfig<at::Half> { | ||||
|  | ||||
|   static inline __device__ at::Half deconvert(RadixType v) { | ||||
| #if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) | ||||
|     RadixType mask = ((v >> 15) - 1) | 0x8000; | ||||
|     RadixType mask = (v & 0x00008000) ? 0x00008000 : 0x0000ffff; | ||||
|     return __ushort_as_half(v ^ mask); | ||||
| #else | ||||
|     assert(false); | ||||
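
Both the old arithmetic mask and the new ternary compute the classic float radix key, with the ternary keeping the mask confined to the low 16 bits: non-negative values get only the sign bit flipped, negative values get all bits flipped, so unsigned comparison matches IEEE ordering. A hedged scalar sketch of the same transform for float32 (the function name is illustrative):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    uint32_t float_radix_key(float f) {
      uint32_t x;
      std::memcpy(&x, &f, sizeof(x));  // portable bit-cast
      uint32_t mask = (x & 0x80000000u) ? 0xffffffffu   // negative: flip everything
                                        : 0x80000000u;  // non-negative: flip sign bit
      return x ^ mask;
    }

    int main() {
      assert(float_radix_key(-1.0f) < float_radix_key(-0.5f));
      assert(float_radix_key(-0.5f) < float_radix_key(0.0f));
      assert(float_radix_key(0.0f)  < float_radix_key(2.0f));
      return 0;
    }
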
|  | ||||
| @ -44,6 +44,7 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { | ||||
| } | ||||
|  | ||||
| Tensor empty_cuda(IntArrayRef size, const TensorOptions& options, c10::optional<MemoryFormat> optional_memory_format) { | ||||
|   TORCH_CHECK(!isComplexType(at::typeMetaToScalarType(options.dtype())), "Complex dtype not supported."); | ||||
|   AT_ASSERT(options.device().type() == at::DeviceType::CUDA); | ||||
|   TORCH_INTERNAL_ASSERT(impl::variable_excluded_from_dispatch()); | ||||
|   TORCH_CHECK(!options.pinned_memory(), "Only dense CPU tensors can be pinned"); | ||||
|  | ||||
| @ -238,18 +238,12 @@ | ||||
|  | ||||
| - func: real(Tensor self) -> Tensor | ||||
|   use_c10_dispatcher: full | ||||
|   variants: function, method | ||||
|   supports_named_tensor: True | ||||
|  | ||||
| - func: real.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   variants: function | ||||
|   supports_named_tensor: True | ||||
|  | ||||
| - func: imag(Tensor self) -> Tensor | ||||
|   use_c10_dispatcher: full | ||||
|   variants: function, method | ||||
|   supports_named_tensor: True | ||||
|  | ||||
| - func: imag.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   variants: function | ||||
|   supports_named_tensor: True | ||||
|  | ||||
| - func: conj(Tensor self) -> Tensor | ||||
| @ -2872,7 +2866,7 @@ | ||||
|  | ||||
| - func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor | ||||
|   use_c10_dispatcher: full | ||||
|   variants: function | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     CPU: true_divide | ||||
|     CUDA: true_divide | ||||
| @ -2880,6 +2874,15 @@ | ||||
|     SparseCUDA: true_divide_sparse | ||||
|   supports_named_tensor: True | ||||
|  | ||||
| - func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) | ||||
|   variants: method | ||||
|   dispatch: | ||||
|     CPU: true_divide_ | ||||
|     CUDA: true_divide_ | ||||
|     SparseCPU: true_divide_sparse_ | ||||
|     SparseCUDA: true_divide_sparse_ | ||||
|   supports_named_tensor: True | ||||
|  | ||||
| - func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   dispatch: | ||||
|     CPU: true_divide_out | ||||
| @ -2890,7 +2893,11 @@ | ||||
|  | ||||
| - func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor | ||||
|   use_c10_dispatcher: full | ||||
|   variants: function | ||||
|   variants: function, method | ||||
|   supports_named_tensor: True | ||||
|  | ||||
| - func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) | ||||
|   variants: method | ||||
|   supports_named_tensor: True | ||||
|  | ||||
| - func: trunc(Tensor self) -> Tensor | ||||
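
With the method variants registered above, true division is callable directly on a Tensor, including in place; a brief sketch against the post-patch C++ API:

    #include <torch/torch.h>

    int main() {
      auto a = torch::tensor({3, 4}, torch::kInt);
      auto b = torch::tensor({2, 2}, torch::kInt);
      auto c = a.true_divide(b);  // true division even for integral inputs: {1.5, 2.0}
      auto f = a.to(torch::kFloat);
      f.true_divide_(b);          // new in-place method variant
      auto s = f.true_divide(2);  // new Scalar overload, now also a method
      return 0;
    }
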
|  | ||||
| @ -272,6 +272,10 @@ SparseTensor& true_divide_out_sparse_scalar( | ||||
|   return true_divide_out_sparse_zerodim(result, dividend, wrapped_scalar_tensor(divisor)); | ||||
| } | ||||
|  | ||||
| Tensor& true_divide_sparse_(Tensor& self, const Tensor& divisor) { | ||||
|   return true_divide_out_sparse_zerodim(self, self, divisor); | ||||
| } | ||||
|  | ||||
| // -------------------------------------------------------------------- | ||||
| // floor_divide(SparseTensor, Scalar) | ||||
| // -------------------------------------------------------------------- | ||||
|  | ||||
| @ -138,7 +138,7 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { | ||||
|       // broadcasting logic; instead, it will blast the elements from one | ||||
|       // to the other so long as the numel is the same | ||||
|       indicesSlice.copy_(indices1D); | ||||
|       indices1D.div_(self.size(d)); | ||||
|       indices1D.floor_divide_(self.size(d)); | ||||
|       indicesSlice.add_(indices1D, -self.size(d)); | ||||
|     } | ||||
|   } | ||||
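
The switch from div_ to floor_divide_ pins down flooring semantics for this integer index arithmetic as div_ migrates toward true division; illustratively:

    #include <torch/torch.h>

    int main() {
      auto indices1D = torch::tensor({5, 7}, torch::kLong);
      indices1D.floor_divide_(2);  // {2, 3}: flooring division, as the old integer div_ behaved here
      return 0;
    }
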
|  | ||||
| @ -14,7 +14,7 @@ namespace xnnpack { | ||||
| namespace { | ||||
| torch::jit::class_<XNNPackLinearOpContext> register_xnnpack_linear_op_context_class() { | ||||
|   static auto register_linear_op_context_class = | ||||
|       torch::jit::class_<XNNPackLinearOpContext>("XNNPackLinearOpContext") | ||||
|       torch::jit::class_<XNNPackLinearOpContext>("xnnpack", "XNNPackLinearOpContext") | ||||
|           .def_pickle( | ||||
|               [](const c10::intrusive_ptr<XNNPackLinearOpContext>& op_context) | ||||
|                   -> SerializationTypeLinearPrePack { // __getstate__ | ||||
| @ -38,7 +38,7 @@ torch::jit::class_<XNNPackLinearOpContext> register_xnnpack_linear_op_context_cl | ||||
|  | ||||
| torch::jit::class_<XNNPackConv2dOpContext> register_xnnpack_conv2d_op_context_class() { | ||||
|   static auto register_conv2d_op_context_class = | ||||
|       torch::jit::class_<XNNPackConv2dOpContext>("XNNPackConv2dOpContext") | ||||
|       torch::jit::class_<XNNPackConv2dOpContext>("xnnpack", "XNNPackConv2dOpContext") | ||||
|           .def_pickle( | ||||
|               [](const c10::intrusive_ptr<XNNPackConv2dOpContext>& op_context) | ||||
|                   -> SerializationTypeConv2dPrePack { // __getstate__ | ||||
| @ -74,25 +74,25 @@ static auto registry = | ||||
|   // Registering under the _xnnpack namespace for now. As we add more backends requiring similar functionality, | ||||
|   // we can refactor the code and use a better namespace. | ||||
|     torch::RegisterOperators() | ||||
|         .op("_xnnpack::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.XNNPackLinearOpContext", | ||||
|         .op("_xnnpack::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.xnnpack.XNNPackLinearOpContext", | ||||
|             torch::RegisterOperators::options() | ||||
|             .aliasAnalysis(at::AliasAnalysisKind::PURE_FUNCTION) | ||||
|             .kernel<internal::linear::LinearPrePack>( | ||||
|                 DispatchKey::CPUTensorId)) | ||||
|         .op("_xnnpack::linear_packed(Tensor X, __torch__.torch.classes.XNNPackLinearOpContext W_prepack) -> Tensor Y", | ||||
|         .op("_xnnpack::linear_packed(Tensor X, __torch__.torch.classes.xnnpack.XNNPackLinearOpContext W_prepack) -> Tensor Y", | ||||
|             torch::RegisterOperators::options() | ||||
|             .aliasAnalysis(at::AliasAnalysisKind::PURE_FUNCTION) | ||||
|             .kernel<internal::linear::LinearPacked>( | ||||
|                 DispatchKey::CPUTensorId)) | ||||
|         .op("_xnnpack::conv2d_prepack(Tensor W, Tensor? B, int[2] stride, " | ||||
|             "int[2] padding, int[2] dilation, int groups) " | ||||
|             "-> __torch__.torch.classes.XNNPackConv2dOpContext", | ||||
|             "-> __torch__.torch.classes.xnnpack.XNNPackConv2dOpContext", | ||||
|             torch::RegisterOperators::options() | ||||
|             .aliasAnalysis(at::AliasAnalysisKind::PURE_FUNCTION) | ||||
|             .kernel<internal::convolution2d::Conv2dPrePack>( | ||||
|                 DispatchKey::CPUTensorId)) | ||||
|         .op("_xnnpack::conv2d_packed(Tensor X, " | ||||
|             "__torch__.torch.classes.XNNPackConv2dOpContext W_prepack) -> Tensor Y", | ||||
|             "__torch__.torch.classes.xnnpack.XNNPackConv2dOpContext W_prepack) -> Tensor Y", | ||||
|             torch::RegisterOperators::options() | ||||
|             .aliasAnalysis(at::AliasAnalysisKind::PURE_FUNCTION) | ||||
|             .kernel<internal::convolution2d::Conv2dPacked>( | ||||
|  | ||||
| @ -423,6 +423,85 @@ class CAFFE2_API Tensor { | ||||
|  | ||||
|   // ~~~~~ Autograd API ~~~~~ | ||||
|  | ||||
|   /// \fn bool is_leaf() const; | ||||
|   /// | ||||
|   /// All Tensors that have `requires_grad()` which is ``false`` will be leaf Tensors by convention. | ||||
|   /// | ||||
|   /// For Tensors that have `requires_grad()` which is ``true``, they will be leaf Tensors if they were | ||||
|   /// created by the user. This means that they are not the result of an operation and so | ||||
|   /// `grad_fn()` is `nullptr`. | ||||
|   /// | ||||
|   /// Only leaf Tensors will have their `grad()` populated during a call to `backward()`. | ||||
|   /// To get `grad()` populated for non-leaf Tensors, you can use `retain_grad()`. | ||||
|   /// | ||||
|   /// Example: | ||||
|   /// @code | ||||
|   /// auto a = torch::rand(10, torch::requires_grad()); | ||||
|   /// std::cout << a.is_leaf() << std::endl; // prints `true` | ||||
|   /// | ||||
|   /// auto b = torch::rand(10, torch::requires_grad()).to(torch::kCUDA); | ||||
|   /// std::cout << b.is_leaf() << std::endl; // prints `false` | ||||
|   /// // b was created by the operation that cast a cpu Tensor into a cuda Tensor | ||||
|   /// | ||||
|   /// auto c = torch::rand(10, torch::requires_grad()) + 2; | ||||
|   /// std::cout << c.is_leaf() << std::endl; // prints `false` | ||||
|   /// // c was created by the addition operation | ||||
|   /// | ||||
|   /// auto d = torch::rand(10).cuda(); | ||||
|   /// std::cout << d.is_leaf() << std::endl; // prints `true` | ||||
|   /// // d does not require gradients and so has no operation creating it (that is tracked by the autograd engine) | ||||
|   /// | ||||
|   /// auto e = torch::rand(10).cuda().requires_grad_(); | ||||
|   /// std::cout << e.is_leaf() << std::endl; // prints `true` | ||||
|   /// // e requires gradients and has no operations creating it | ||||
|   /// | ||||
|   /// auto f = torch::rand(10, torch::device(torch::kCUDA).requires_grad(true)); | ||||
|   /// std::cout << f.is_leaf() << std::endl; // prints `true` | ||||
|   /// // f requires grad, has no operation creating it | ||||
|   /// @endcode | ||||
|  | ||||
|   /// \fn void backward(const Tensor & gradient={}, bool keep_graph=false, bool create_graph=false) const; | ||||
|   /// | ||||
|   /// Computes the gradient of current tensor with respect to graph leaves. | ||||
|   /// | ||||
|   /// The graph is differentiated using the chain rule. If the tensor is | ||||
|   /// non-scalar (i.e. its data has more than one element) and requires | ||||
|   /// gradient, the function additionally requires specifying ``gradient``. | ||||
|   /// It should be a tensor of matching type and location, that contains | ||||
|   /// the gradient of the differentiated function w.r.t. this Tensor. | ||||
|   /// | ||||
|   /// This function accumulates gradients in the leaves - you might need to | ||||
|   /// zero them before calling it. | ||||
|   /// | ||||
|   /// \param gradient Gradient w.r.t. the | ||||
|   ///     tensor. If it is a tensor, it will be automatically converted | ||||
|   ///     to a Tensor that does not require grad unless ``create_graph`` is True. | ||||
|   ///     None values can be specified for scalar Tensors or ones that | ||||
|   ///     don't require grad. If a None value would be acceptable then | ||||
|   ///     this argument is optional. | ||||
|   /// \param keep_graph If ``false``, the graph used to compute | ||||
|   ///     the grads will be freed. Note that in nearly all cases setting | ||||
|   ///     this option to True is not needed and often can be worked around | ||||
|   ///     in a much more efficient way. Defaults to the value of | ||||
|   ///     ``create_graph``. | ||||
|   /// \param create_graph If ``true``, graph of the derivative will | ||||
|   ///     be constructed, allowing to compute higher order derivative | ||||
|   ///     products. Defaults to ``false``. | ||||
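
A usage sketch for the gradient argument, in the style of the @code examples above (values are illustrative):

    auto x = torch::tensor({1., 2., 3.}, torch::requires_grad());
    auto y = x * 2;
    y.backward(torch::ones_like(y));     // non-scalar y needs an explicit gradient
    std::cout << x.grad() << std::endl;  // prints 2, 2, 2
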
|  | ||||
|   /// \fn Tensor detach() const; | ||||
|   /// | ||||
|   /// Returns a new Tensor, detached from the current graph. | ||||
|   /// The result will never require gradient. | ||||
|  | ||||
|   /// \fn Tensor & detach_() const; | ||||
|   /// | ||||
|   /// Detaches the Tensor from the graph that created it, making it a leaf. | ||||
|   /// Views cannot be detached in-place. | ||||
|  | ||||
|   /// \fn void retain_grad() const; | ||||
|   /// | ||||
|   /// Enables .grad() for non-leaf Tensors. | ||||
|  | ||||
|   Tensor& set_requires_grad(bool requires_grad) { | ||||
|     impl_->set_requires_grad(requires_grad); | ||||
|     return *this; | ||||
| @ -431,9 +510,16 @@ class CAFFE2_API Tensor { | ||||
|     return impl_->requires_grad(); | ||||
|   } | ||||
|  | ||||
|   /// Return a mutable reference to the gradient. This is conventionally | ||||
|   /// used as `t.grad() = x` to set a gradient to a completely new tensor. | ||||
|   Tensor& grad() { | ||||
|     return impl_->grad(); | ||||
|   } | ||||
|  | ||||
|   /// This function returns an undefined tensor by default and returns a defined tensor | ||||
|   /// the first time a call to `backward()` computes gradients for this Tensor. | ||||
|   /// The attribute will then contain the gradients computed and future calls | ||||
|   /// to `backward()` will accumulate (add) gradients into it. | ||||
|   const Tensor& grad() const { | ||||
|     return impl_->grad(); | ||||
|   } | ||||
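
A short sketch exercising both accessors (values are illustrative):

    auto w = torch::randn({3}, torch::requires_grad());
    (w * w).sum().backward();
    std::cout << w.grad() << std::endl;  // populated by the first backward() call
    w.grad() = torch::zeros_like(w);     // mutable overload: assign a fresh gradient
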
| @ -505,11 +591,38 @@ class CAFFE2_API Tensor { | ||||
|   template <typename T> | ||||
|   using hook_return_var_t = std::enable_if_t<std::is_same<typename std::result_of<T&(Tensor)>::type, Tensor>::value, unsigned>; | ||||
|  | ||||
|   // Returns the index of the hook in the list which can be used to remove hook | ||||
|   // Register a hook with no return value | ||||
|   /// Registers a backward hook. | ||||
|   /// | ||||
|   /// The hook will be called every time a gradient with respect to the Tensor is computed. | ||||
|   /// The hook should have one of the following signatures: | ||||
|   /// ``` | ||||
|   /// hook(Tensor grad) -> Tensor | ||||
|   /// ``` | ||||
|   /// ``` | ||||
|   /// hook(Tensor grad) -> void | ||||
|   /// ``` | ||||
|   /// The hook should not modify its argument, but it can optionally return a new gradient | ||||
|   /// which will be used in place of `grad`. | ||||
|   /// | ||||
|   /// This function returns the index of the hook in the list, which can be used to remove the hook. | ||||
|   /// | ||||
|   /// Example: | ||||
|   /// @code | ||||
|   /// auto v = torch::tensor({0., 0., 0.}, torch::requires_grad()); | ||||
|   /// auto h = v.register_hook([](torch::Tensor grad){ return grad * 2; }); // double the gradient | ||||
|   /// v.backward(torch::tensor({1., 2., 3.})); | ||||
|   /// // This prints: | ||||
|   /// // ``` | ||||
|   /// //  2 | ||||
|   /// //  4 | ||||
|   /// //  6 | ||||
|   /// // [ CPUFloatType{3} ] | ||||
|   /// // ``` | ||||
|   /// std::cout << v.grad() << std::endl; | ||||
|   /// v.remove_hook(h);  // removes the hook | ||||
|   /// @endcode | ||||
|   template <typename T> | ||||
|   hook_return_void_t<T> register_hook(T&& hook) const; | ||||
|   // Register a hook with variable return value | ||||
|   template <typename T> | ||||
|   hook_return_var_t<T> register_hook(T&& hook) const; | ||||
|  | ||||
| @ -518,7 +631,7 @@ private: | ||||
|  | ||||
| public: | ||||
|  | ||||
|   // Remove hook at given position | ||||
|   /// Remove hook at given position | ||||
|   void remove_hook(unsigned pos) const; | ||||
|  | ||||
|   // View Variables | ||||
|  | ||||
| @ -69,12 +69,6 @@ | ||||
| # define TH_UNUSED | ||||
| #endif | ||||
|  | ||||
| #if defined(__clang__) | ||||
| #define __ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero"))) | ||||
| #else | ||||
| #define __ubsan_ignore_float_divide_by_zero__ | ||||
| #endif | ||||
|  | ||||
| #ifndef M_PI | ||||
| # define M_PI 3.14159265358979323846 | ||||
| #endif | ||||
|  | ||||
| @ -9,7 +9,7 @@ set(extra_src) | ||||
| # loop over all types | ||||
| foreach(THC_TYPE Byte Char Short Int Long Half Float Double) | ||||
|    # loop over files which need to be split between types (because of long compile times) | ||||
|    foreach(THC_FILE TensorSort TensorMathPointwise TensorMathReduce TensorMasked) | ||||
|    foreach(THC_FILE TensorSort TensorMathPointwise TensorMathReduce TensorMasked TensorTopK) | ||||
|       if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu") | ||||
|          FILE(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu" | ||||
|            "#include <THC/THC${THC_FILE}.cuh>\n#include <THC/THCTensor.hpp>\n\n#include <THC/generic/THC${THC_FILE}.cu>\n#include <THC/THCGenerate${THC_TYPE}Type.h>\n") | ||||
| @ -56,7 +56,6 @@ set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} | ||||
|   ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorIndex.cu | ||||
|   ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorRandom.cu | ||||
|   ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorScatterGather.cu | ||||
|   ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorTopK.cu | ||||
|   ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorSort.cu | ||||
|   ${CMAKE_CURRENT_SOURCE_DIR}/THCSortUtils.cu | ||||
|   ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMode.cu | ||||
|  | ||||
| @ -73,7 +73,7 @@ TensorInfo<T, IndexType>::TensorInfo(T* p, | ||||
| template <typename T, typename IndexType> | ||||
| void | ||||
| TensorInfo<T, IndexType>::reduceDim(int dim) { | ||||
|   assert(dim < dims && dim >= 0); | ||||
|   TORCH_INTERNAL_ASSERT(dim < dims && dim >= 0); | ||||
|   sizes[dim] = 1; | ||||
| } | ||||
|  | ||||
| @ -81,7 +81,7 @@ template <typename T, typename IndexType> | ||||
| int | ||||
| TensorInfo<T, IndexType>::collapseDims(const int excludeDim) { | ||||
|  | ||||
|   assert(excludeDim >= -1 && excludeDim < dims); | ||||
|   TORCH_INTERNAL_ASSERT(excludeDim >= -1 && excludeDim < dims); | ||||
|  | ||||
|   int stopDim = (excludeDim == -1) ? dims : excludeDim; | ||||
|   int newIndex = -1; | ||||
|  | ||||
| @ -1,19 +0,0 @@ | ||||
| #include <THC/THC.h> | ||||
| #include <THC/THCReduceApplyUtils.cuh> | ||||
| #include <THC/THCTensorCopy.h> | ||||
| #include <THC/THCTensorMath.h> | ||||
| #include <THC/THCAsmUtils.cuh> | ||||
| #include <THC/THCScanUtils.cuh> | ||||
| #include <THC/THCTensorTypeUtils.cuh> | ||||
| #include <THC/THCTensorMathReduce.cuh> | ||||
| #include <ATen/WrapDimUtils.h> | ||||
| #include <algorithm> // for std::min | ||||
|  | ||||
| #if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__ | ||||
| #include <thrust/system/cuda/execution_policy.h> | ||||
| #endif | ||||
|  | ||||
| #include <THC/THCTensorTopK.cuh> | ||||
|  | ||||
| #include <THC/generic/THCTensorTopK.cu> | ||||
| #include <THC/THCGenerateAllTypes.h> | ||||
| @ -1,6 +1,21 @@ | ||||
| #ifndef THC_TENSOR_TOPK_CUH | ||||
| #define THC_TENSOR_TOPK_CUH | ||||
|  | ||||
| #include <THC/THC.h> | ||||
| #include <THC/THCReduceApplyUtils.cuh> | ||||
| #include <THC/THCTensorCopy.h> | ||||
| #include <THC/THCTensorMath.h> | ||||
| #include <THC/THCAsmUtils.cuh> | ||||
| #include <THC/THCScanUtils.cuh> | ||||
| #include <THC/THCTensorTypeUtils.cuh> | ||||
| #include <THC/THCTensorMathReduce.cuh> | ||||
| #include <ATen/WrapDimUtils.h> | ||||
| #include <algorithm> // for std::min | ||||
|  | ||||
| #if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__ | ||||
| #include <thrust/system/cuda/execution_policy.h> | ||||
| #endif | ||||
|  | ||||
| #include <c10/macros/Macros.h> | ||||
| #include <ATen/native/cuda/SortingRadixSelect.cuh> | ||||
|  | ||||
| @ -52,6 +67,7 @@ __global__ void gatherTopK(TensorInfo<T, IndexType> input, | ||||
|     inputSliceStart, outputSliceSize, | ||||
|     inputSliceSize, inputWithinSliceStride, | ||||
|     smem, &topKValue); | ||||
|   const auto topKConverted = at::native::TopKTypeConfig<T>::convert(topKValue); | ||||
|  | ||||
|   // Every value that is strictly less/greater than `pattern` | ||||
|   // (depending on sort dir) in sorted int format is in the top-K. | ||||
| @ -74,11 +90,12 @@ __global__ void gatherTopK(TensorInfo<T, IndexType> input, | ||||
|     bool inRange = (i < inputSliceSize); | ||||
|     T v = | ||||
|       inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert<int, T>::to(0); | ||||
|     const auto convertedV = at::native::TopKTypeConfig<T>::convert(v); | ||||
|     bool hasTopK; | ||||
|     if (Order) { | ||||
|       hasTopK = inRange && (THCNumerics<T>::gt(v, topKValue)); | ||||
|       hasTopK = inRange && (convertedV > topKConverted); | ||||
|     } else { | ||||
|       hasTopK = inRange && (THCNumerics<T>::lt(v, topKValue)); | ||||
|       hasTopK = inRange && (convertedV < topKConverted); | ||||
|     } | ||||
|  | ||||
|     int index; | ||||
| @ -111,7 +128,8 @@ __global__ void gatherTopK(TensorInfo<T, IndexType> input, | ||||
|     bool inRange = (i < inputSliceSize); | ||||
|     T v = | ||||
|       inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert<int, T>::to(0); | ||||
|     bool hasTopK = inRange && (THCNumerics<T>::eq(v, topKValue)); | ||||
|     const auto convertedV = at::native::TopKTypeConfig<T>::convert(v); | ||||
|     bool hasTopK = inRange && (convertedV == topKConverted); | ||||
|  | ||||
|     int index; | ||||
|     int carry; | ||||
|  | ||||
							
								
								
									
aten/src/THC/generated/THCTensorTopKByte.cu (new file, 5 lines)
							| @ -0,0 +1,5 @@ | ||||
| #include <THC/THCTensorTopK.cuh> | ||||
| #include <THC/THCTensor.hpp> | ||||
|  | ||||
| #include <THC/generic/THCTensorTopK.cu> | ||||
| #include <THC/THCGenerateByteType.h> | ||||
							
								
								
									
aten/src/THC/generated/THCTensorTopKChar.cu (new file, 5 lines)
							| @ -0,0 +1,5 @@ | ||||
| #include <THC/THCTensorTopK.cuh> | ||||
| #include <THC/THCTensor.hpp> | ||||
|  | ||||
| #include <THC/generic/THCTensorTopK.cu> | ||||
| #include <THC/THCGenerateCharType.h> | ||||
							
								
								
									
aten/src/THC/generated/THCTensorTopKDouble.cu (new file, 5 lines)
							| @ -0,0 +1,5 @@ | ||||
| #include <THC/THCTensorTopK.cuh> | ||||
| #include <THC/THCTensor.hpp> | ||||
|  | ||||
| #include <THC/generic/THCTensorTopK.cu> | ||||
| #include <THC/THCGenerateDoubleType.h> | ||||
							
								
								
									
aten/src/THC/generated/THCTensorTopKFloat.cu (new file, 5 lines)
							| @ -0,0 +1,5 @@ | ||||
| #include <THC/THCTensorTopK.cuh> | ||||
| #include <THC/THCTensor.hpp> | ||||
|  | ||||
| #include <THC/generic/THCTensorTopK.cu> | ||||
| #include <THC/THCGenerateFloatType.h> | ||||
							
								
								
									
aten/src/THC/generated/THCTensorTopKHalf.cu (new file, 5 lines)
							| @ -0,0 +1,5 @@ | ||||
| #include <THC/THCTensorTopK.cuh> | ||||
| #include <THC/THCTensor.hpp> | ||||
|  | ||||
| #include <THC/generic/THCTensorTopK.cu> | ||||
| #include <THC/THCGenerateHalfType.h> | ||||
							
								
								
									
aten/src/THC/generated/THCTensorTopKInt.cu (new file, 5 lines)
							| @ -0,0 +1,5 @@ | ||||
| #include <THC/THCTensorTopK.cuh> | ||||
| #include <THC/THCTensor.hpp> | ||||
|  | ||||
| #include <THC/generic/THCTensorTopK.cu> | ||||
| #include <THC/THCGenerateIntType.h> | ||||
							
								
								
									
aten/src/THC/generated/THCTensorTopKLong.cu (new file, 5 lines)
							| @ -0,0 +1,5 @@ | ||||
| #include <THC/THCTensorTopK.cuh> | ||||
| #include <THC/THCTensor.hpp> | ||||
|  | ||||
| #include <THC/generic/THCTensorTopK.cu> | ||||
| #include <THC/THCGenerateLongType.h> | ||||
							
								
								
									
aten/src/THC/generated/THCTensorTopKShort.cu (new file, 5 lines)
							| @ -0,0 +1,5 @@ | ||||
| #include <THC/THCTensorTopK.cuh> | ||||
| #include <THC/THCTensor.hpp> | ||||
|  | ||||
| #include <THC/generic/THCTensorTopK.cu> | ||||
| #include <THC/THCGenerateShortType.h> | ||||
| @ -269,7 +269,7 @@ void THCTensor_(mode)(THCState *state, | ||||
|         break; | ||||
|       case 1: | ||||
|       default: | ||||
|         assert(false); | ||||
|         TORCH_INTERNAL_ASSERT(false); | ||||
|     } | ||||
|     THCudaCheck(cudaGetLastError()); | ||||
|  | ||||
|  | ||||
| @ -101,7 +101,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state, | ||||
|       /* Nothing to do, data already sorted */          \ | ||||
|       break;                                            \ | ||||
|       default:                                          \ | ||||
|       assert(false);                                    \ | ||||
|       TORCH_INTERNAL_ASSERT(false);                                    \ | ||||
|     }                                                   \ | ||||
|   } | ||||
|  | ||||
|  | ||||
| @ -23,6 +23,14 @@ | ||||
|  | ||||
| #include "c10/macros/Export.h" | ||||
|  | ||||
| #if defined(__clang__) | ||||
|   #define __ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero"))) | ||||
|   #define __ubsan_ignore_float_cast_overflow__ __attribute__((no_sanitize("float-cast-overflow"))) | ||||
| #else | ||||
|   #define __ubsan_ignore_float_divide_by_zero__ | ||||
|   #define __ubsan_ignore_float_cast_overflow__ | ||||
| #endif | ||||
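
These attributes mark functions whose floating-point behavior is intentional (IEEE-754 defines division by zero, and float-to-integer overflow is deliberately tolerated in some casts, as in static_cast_with_inter_type below) so that clang's UBSan does not flag them; an illustrative use:

    // Illustrative: num / 0.0f yields +/-inf or NaN per IEEE-754; the attribute
    // suppresses only UBSan's float-divide-by-zero check, and only under clang.
    __ubsan_ignore_float_divide_by_zero__ inline float ratio(float num, float den) {
      return num / den;
    }
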
|  | ||||
| // Disable the copy and assignment operator for a class. Note that this will | ||||
| // disable the usage of the class in std containers. | ||||
| #define C10_DISABLE_COPY_AND_ASSIGN(classname) \ | ||||
| @ -196,25 +204,29 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; | ||||
| #define __func__ __FUNCTION__ | ||||
| #endif | ||||
|  | ||||
| // CUDA_KERNEL_ASSERT is a macro that wraps an assert() call inside cuda | ||||
| // kernels. This is not supported by Apple platforms so we special case it. | ||||
| // See http://docs.nvidia.com/cuda/cuda-c-programming-guide/#assertion | ||||
| #if defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__) | ||||
| #define CUDA_KERNEL_ASSERT(...) | ||||
| #else // __APPLE__ | ||||
| #define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__) | ||||
| #endif // __APPLE__ | ||||
|  | ||||
| // CUDA_ALWAYS_ASSERT is similar to CUDA_KERNEL_ASSERT but checks the assertion | ||||
| // CUDA_KERNEL_ASSERT checks the assertion | ||||
| // even when NDEBUG is defined. This is useful for important assertions in CUDA | ||||
| // code that must hold even when building Release. | ||||
| #if defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__) | ||||
| // Those platforms do not support assert() | ||||
| #define CUDA_ALWAYS_ASSERT(cond) | ||||
| #define CUDA_KERNEL_ASSERT(cond) | ||||
| #elif defined(_MSC_VER) | ||||
| // TODO: This should be defined but I don't have the environment to properly | ||||
| // test it. See e.g., https://github.com/pytorch/pytorch/pull/32719#discussion_r379918384 | ||||
| #define CUDA_ALWAYS_ASSERT(cond) | ||||
| #if defined(NDEBUG) | ||||
| extern "C" { | ||||
|   C10_IMPORT | ||||
| #if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) || defined(__HIP__) | ||||
|     __host__ __device__ | ||||
| #endif // __CUDA_ARCH__ | ||||
|  void _wassert( | ||||
|     wchar_t const* _Message, | ||||
|     wchar_t const* _File, | ||||
|     unsigned _Line); | ||||
| } | ||||
| #endif | ||||
| #define CUDA_KERNEL_ASSERT(cond)                                                                 \ | ||||
|   if (C10_UNLIKELY(!(cond))) {                                                                   \ | ||||
|     (void)(_wassert(_CRT_WIDE(#cond), _CRT_WIDE(__FILE__), static_cast<unsigned>(__LINE__)), 0); \ | ||||
|   } | ||||
| #else // __APPLE__, _MSC_VER | ||||
| #if defined(NDEBUG) | ||||
| extern "C" { | ||||
| @ -233,7 +245,7 @@ __host__ __device__ | ||||
|         const char* function) throw(); | ||||
| } | ||||
| #endif // NDEBUG | ||||
| #define CUDA_ALWAYS_ASSERT(cond)                                         \ | ||||
| #define CUDA_KERNEL_ASSERT(cond)                                         \ | ||||
|   if (C10_UNLIKELY(!(cond))) {                                           \ | ||||
|     __assert_fail(#cond, __FILE__, static_cast<unsigned int>(__LINE__),  \ | ||||
|                   __func__);                                             \ | ||||
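
A minimal CUDA sketch of the macro in use (the kernel is illustrative); where it expands to __assert_fail or _wassert, the check stays live even under NDEBUG:

    #include <c10/macros/Macros.h>

    __global__ void checked_copy(const float* src, float* dst, int n) {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      CUDA_KERNEL_ASSERT(n >= 0);  // still checked in Release (NDEBUG) builds
      if (i < n) {
        dst[i] = src[i];
      }
    }
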
|  | ||||
| @ -66,24 +66,44 @@ void Error::AppendMessage(const std::string& new_msg) { | ||||
| namespace Warning { | ||||
|  | ||||
| namespace { | ||||
|   WarningHandler* getHandler() { | ||||
|   WarningHandler* getBaseHandler() { | ||||
|     static WarningHandler base_warning_handler_ = WarningHandler(); | ||||
|     return &base_warning_handler_; | ||||
|   }; | ||||
|   static thread_local WarningHandler* warning_handler_ = getHandler(); | ||||
|  | ||||
|   class ThreadWarningHandler { | ||||
|     public: | ||||
|       ThreadWarningHandler() = delete; | ||||
|  | ||||
|       static WarningHandler* get_handler() { | ||||
|         if (!warning_handler_) { | ||||
|           warning_handler_ = getBaseHandler(); | ||||
|         } | ||||
|         return warning_handler_; | ||||
|       } | ||||
|  | ||||
|       static void set_handler(WarningHandler* handler) { | ||||
|         warning_handler_ = handler; | ||||
|       } | ||||
|  | ||||
|     private: | ||||
|       static thread_local WarningHandler* warning_handler_; | ||||
|   }; | ||||
|  | ||||
|   thread_local WarningHandler* ThreadWarningHandler::warning_handler_ = nullptr; | ||||
|  | ||||
| } | ||||
|  | ||||
| void warn(SourceLocation source_location, const std::string& msg) { | ||||
|   warning_handler_->process(source_location, msg); | ||||
|   ThreadWarningHandler::get_handler()->process(source_location, msg); | ||||
| } | ||||
|  | ||||
| void set_warning_handler(WarningHandler* handler) noexcept(true) { | ||||
|   warning_handler_ = handler; | ||||
|   ThreadWarningHandler::set_handler(handler); | ||||
| } | ||||
|  | ||||
| WarningHandler* get_warning_handler() noexcept(true) { | ||||
|   return warning_handler_; | ||||
|   return ThreadWarningHandler::get_handler(); | ||||
| } | ||||
|  | ||||
| } // namespace Warning | ||||
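
Because the handler slot is now thread_local, a handler installed on one thread no longer affects others, and resetting it to nullptr falls back to the shared base handler via get_handler() above. A hedged sketch (the handler type is hypothetical):

    #include <c10/util/Exception.h>

    // Hypothetical handler that counts warnings instead of printing them.
    struct CountingHandler : public c10::WarningHandler {
      int count = 0;
      void process(const c10::SourceLocation& loc, const std::string& msg) override {
        ++count;
      }
    };

    void worker() {
      CountingHandler h;
      c10::Warning::set_warning_handler(&h);
      TORCH_WARN("seen only by this thread's handler");
      c10::Warning::set_warning_handler(nullptr);  // revert to the base handler
    }
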
|  | ||||
| @ -67,7 +67,7 @@ struct maybe_real<true, src_t> { | ||||
|  | ||||
| template <typename dest_t, typename src_t> | ||||
| struct static_cast_with_inter_type { | ||||
|   C10_HOST_DEVICE static inline dest_t apply(src_t src) { | ||||
|   C10_HOST_DEVICE __ubsan_ignore_float_cast_overflow__ static inline dest_t apply(src_t src) { | ||||
|     constexpr bool real = needs_real<dest_t, src_t>::value; | ||||
|     return static_cast<dest_t>( | ||||
|       static_cast<inter_copy_type_t<dest_t>>(maybe_real<real, src_t>::apply(src))); | ||||
|  | ||||
| @ -748,7 +748,7 @@ if (NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) | ||||
|       target_include_directories(torch_cuda PUBLIC "${NVTOOLEXT_HOME}/include") | ||||
|       # -INCLUDE is used to ensure torch_cuda is linked against in a project that relies on it. | ||||
|       # Related issue: https://github.com/pytorch/pytorch/issues/31611 | ||||
|       target_link_libraries(torch_cuda INTERFACE "-INCLUDE:\"?warp_size@cuda@at@@YAHXZ\"") | ||||
|       target_link_libraries(torch_cuda INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ") | ||||
|  | ||||
|     elseif(APPLE) | ||||
|       set(TORCH_CUDA_LIBRARIES | ||||
| @ -949,6 +949,31 @@ if (USE_OPENMP AND OPENMP_FOUND) | ||||
|   target_link_libraries(torch_cpu PRIVATE ${OpenMP_CXX_LIBRARIES}) | ||||
| endif() | ||||
|  | ||||
| if ($ENV{TH_BINARY_BUILD}) | ||||
|   if (NOT MSVC AND USE_CUDA AND NOT APPLE) | ||||
|     # Note [Extra MKL symbols for MAGMA in torch_cpu] | ||||
|     # | ||||
|     # When we build CUDA libraries and link against MAGMA, MAGMA makes use of | ||||
|     # some BLAS symbols in its CPU fallbacks when it has no GPU versions | ||||
|     # of kernels.  Previously, we ensured the BLAS symbols were filled in by | ||||
|     # MKL by linking torch_cuda with BLAS, but when we are statically linking | ||||
|     # against MKL (when we do wheel builds), this actually ends up pulling in a | ||||
|     # decent chunk of MKL into torch_cuda, inflating our torch_cuda binary | ||||
|     # size by 8M.  torch_cpu exposes most of the MKL symbols we need, but | ||||
|     # empirically we determined that there are four which it doesn't provide.  If | ||||
|     # we link torch_cpu with these --undefined symbols, we can ensure they | ||||
|     # do get pulled in, and then we can avoid statically linking in MKL to | ||||
|     # torch_cuda at all! | ||||
|     # | ||||
|     # We aren't really optimizing for binary size on Windows (and this link | ||||
|     # line doesn't work on Windows), so don't do it there. | ||||
|     # | ||||
|     # These linker commands do not work on OS X, do not attempt this there. | ||||
|     # (It shouldn't matter anyway, though, because OS X has dropped CUDA support) | ||||
|     set_target_properties(torch_cpu PROPERTIES LINK_FLAGS "-Wl,--undefined=mkl_lapack_slaed0 -Wl,--undefined=mkl_lapack_dlaed0 -Wl,--undefined=mkl_lapack_dormql -Wl,--undefined=mkl_lapack_sormql") | ||||
|   endif() | ||||
| endif() | ||||
|  | ||||
| target_link_libraries(torch_cpu PUBLIC c10) | ||||
| target_link_libraries(torch_cpu PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) | ||||
| target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_LIBS}) | ||||
|  | ||||
| @ -261,15 +261,6 @@ CAFFE2_CUDA_API const char* curandGetErrorString(curandStatus_t error); | ||||
|     for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \ | ||||
|          j += blockDim.y * gridDim.y) | ||||
|  | ||||
| // CUDA_KERNEL_ASSERT is a macro that wraps an assert() call inside cuda | ||||
| // kernels. This is not supported by Apple platforms so we special case it. | ||||
| // See http://docs.nvidia.com/cuda/cuda-c-programming-guide/#assertion | ||||
| #if defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__) | ||||
| #define CUDA_KERNEL_ASSERT(...) | ||||
| #else // __APPLE__ | ||||
| #define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__) | ||||
| #endif // __APPLE__ | ||||
|  | ||||
| // The following helper functions are here so that you can write a kernel call | ||||
| // when you are not particularly interested in maxing out the kernels' | ||||
| // performance. Usually, this will give you a reasonable speed, but if you | ||||
|  | ||||
| @ -1,6 +1,8 @@ | ||||
| #include "caffe2/operators/fused_rowwise_nbitfake_conversion_ops.h" | ||||
| #include <fp16.h> | ||||
| #ifdef __AVX__ | ||||
| #include <immintrin.h> | ||||
| #endif | ||||
| #include "c10/util/Registry.h" | ||||
|  | ||||
| namespace caffe2 { | ||||
|  | ||||
| @ -50,8 +50,13 @@ __global__ void ReluCUDAKernel<half2>(const int N, const half2* X, half2* Y) { | ||||
|     Y[i] = __hmul2(__hgt2(__ldg(X + i), kZero), __ldg(X + i)); | ||||
| #else | ||||
|     const float2 xx = __half22float2(X[i]); | ||||
|     Y[i] = | ||||
|         __floats2half2_rn(xx.x > 0.0f ? xx.x : 0.0f, xx.y > 0.0f ? xx.y : 0.0f); | ||||
|     // The explicit casts to float are needed because the expression may otherwise be ambiguous on ROCm, | ||||
|     // sometimes triggering: | ||||
|     // | ||||
|     //   error: conditional expression is ambiguous; 'const hip_impl::Scalar_accessor<float, Native_vec_, 0>' can be | ||||
|     //   converted to 'float' and vice versa | ||||
|     Y[i] = __floats2half2_rn(xx.x > 0.0f ? static_cast<float>(xx.x) : 0.0f, | ||||
|                              xx.y > 0.0f ? static_cast<float>(xx.y) : 0.0f); | ||||
| #endif | ||||
|   } | ||||
| } | ||||
| @ -100,8 +105,14 @@ __global__ void ReluGradientCUDAKernel<half2>( | ||||
| #else | ||||
|     const float2 dy = __half22float2(dY[i]); | ||||
|     const float2 yy = __half22float2(Y[i]); | ||||
|     dX[i] = | ||||
|         __floats2half2_rn(yy.x > 0.0f ? dy.x : 0.0f, yy.y > 0.0f ? dy.y : 0.0f); | ||||
|     // The explicit casts to float are needed because the expression may otherwise be ambiguous on ROCm, | ||||
|     // sometimes triggering: | ||||
|     // | ||||
|     //   error: conditional expression is ambiguous; 'const hip_impl::Scalar_accessor<float, Native_vec_, 1>' can be | ||||
|     //   converted to 'float' and vice versa | ||||
|     dX[i] = __floats2half2_rn(yy.x > 0.0f ? static_cast<float>(dy.x) : 0.0f, | ||||
|                               yy.y > 0.0f ? static_cast<float>(dy.y) : 0.0f); | ||||
| #endif | ||||
|   } | ||||
| } | ||||
|  | ||||
| @ -76,7 +76,7 @@ struct TopKTypeConfig<short> { | ||||
|   typedef unsigned int RadixType; | ||||
|  | ||||
|   static inline __device__ RadixType convert(short v) { | ||||
|     CUDA_KERNEL_ASSERT(sizeof(short) == 2); | ||||
|     static_assert(sizeof(short) == 2, ""); | ||||
|     return 32768u + v; | ||||
|   } | ||||
|  | ||||
| @ -90,7 +90,7 @@ struct TopKTypeConfig<int> { | ||||
|   typedef unsigned int RadixType; | ||||
|  | ||||
|   static inline __device__ RadixType convert(int v) { | ||||
|     CUDA_KERNEL_ASSERT(sizeof(int) == 4); | ||||
|     static_assert(sizeof(int) == 4, ""); | ||||
|     return 2147483648u + v; | ||||
|   } | ||||
|  | ||||
| @ -104,6 +104,7 @@ struct TopKTypeConfig<long> { | ||||
|   typedef unsigned long long int RadixType; | ||||
|  | ||||
|   static inline __device__ RadixType convert(long v) { | ||||
|     // static_assert fails on Windows, so leave it as CUDA_KERNEL_ASSERT. | ||||
|     CUDA_KERNEL_ASSERT(sizeof(long) == 8); | ||||
|     return 9223372036854775808ull + v; | ||||
|   } | ||||
|  | ||||
							
								
								
									
cmake/External/nccl.cmake (vendored, 40 lines changed)
							| @ -15,6 +15,7 @@ if (NOT __NCCL_INCLUDED) | ||||
|     # this second replacement is needed when there are multiple archs | ||||
|     string(REPLACE ";-gencode" " -gencode" NVCC_GENCODE "${NVCC_GENCODE}") | ||||
|  | ||||
|     set(__NCCL_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/nccl") | ||||
|     ExternalProject_Add(nccl_external | ||||
|       SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/nccl/nccl | ||||
|       BUILD_IN_SOURCE 1 | ||||
| @ -30,20 +31,49 @@ if (NOT __NCCL_INCLUDED) | ||||
|         "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" | ||||
|         "NVCC=${CUDA_NVCC_EXECUTABLE}" | ||||
|         "NVCC_GENCODE=${NVCC_GENCODE}" | ||||
|         "BUILDDIR=${CMAKE_CURRENT_BINARY_DIR}/nccl" | ||||
|         "BUILDDIR=${__NCCL_BUILD_DIR}" | ||||
|         "VERBOSE=0" | ||||
|         "-j" | ||||
|       BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/nccl/lib/libnccl_static.a" | ||||
|         BUILD_BYPRODUCTS "${__NCCL_BUILD_DIR}/lib/libnccl_static.a" | ||||
|       INSTALL_COMMAND "" | ||||
|       ) | ||||
|  | ||||
|     # Detect objcopy version | ||||
|     execute_process (COMMAND "${CMAKE_OBJCOPY}" "--version" OUTPUT_VARIABLE OBJCOPY_VERSION_STR) | ||||
|     string(REGEX REPLACE "GNU objcopy version ([0-9])\\.([0-9]+).*" "\\1" OBJCOPY_VERSION_MAJOR ${OBJCOPY_VERSION_STR}) | ||||
|     string(REGEX REPLACE "GNU objcopy version ([0-9])\\.([0-9]+).*" "\\2" OBJCOPY_VERSION_MINOR ${OBJCOPY_VERSION_STR}) | ||||
|  | ||||
|     if ((${OBJCOPY_VERSION_MAJOR} GREATER 2) OR ((${OBJCOPY_VERSION_MAJOR} EQUAL 2) AND (${OBJCOPY_VERSION_MINOR} GREATER 27))) | ||||
|       message(WARNING "Enabling NCCL library slimming") | ||||
|       add_custom_command( | ||||
|         OUTPUT "${__NCCL_BUILD_DIR}/lib/libnccl_slim_static.a" | ||||
|         DEPENDS nccl_external | ||||
|         COMMAND "${CMAKE_COMMAND}" -E make_directory "${__NCCL_BUILD_DIR}/objects" | ||||
|         COMMAND cd objects | ||||
|         COMMAND "${CMAKE_AR}" x "${__NCCL_BUILD_DIR}/lib/libnccl_static.a" | ||||
|         COMMAND for obj in all_gather_* all_reduce_* broadcast_* reduce_*.o$<SEMICOLON> do "${CMAKE_OBJCOPY}" --remove-relocations .nvFatBinSegment --remove-section __nv_relfatbin $$obj$<SEMICOLON> done | ||||
|        COMMAND "${CMAKE_AR}" cr "${__NCCL_BUILD_DIR}/lib/libnccl_slim_static.a" "*.o" | ||||
|         COMMAND cd - | ||||
|         COMMAND "${CMAKE_COMMAND}" -E remove_directory "${__NCCL_BUILD_DIR}/objects" | ||||
|         WORKING_DIRECTORY "${__NCCL_BUILD_DIR}" | ||||
|         COMMENT "Slimming NCCL" | ||||
|       ) | ||||
|       add_custom_target(nccl_slim_external DEPENDS "${__NCCL_BUILD_DIR}/lib/libnccl_slim_static.a") | ||||
|       set(__NCCL_LIBRARY_DEP nccl_slim_external) | ||||
|       set(NCCL_LIBRARIES ${__NCCL_BUILD_DIR}/lib/libnccl_slim_static.a) | ||||
|     else() | ||||
|       message(WARNING "Objcopy version is too old to support NCCL library slimming") | ||||
|       set(__NCCL_LIBRARY_DEP nccl_external) | ||||
|       set(NCCL_LIBRARIES ${__NCCL_BUILD_DIR}/lib/libnccl_static.a) | ||||
|     endif() | ||||
|  | ||||
|  | ||||
|     set(NCCL_FOUND TRUE) | ||||
|     add_library(__caffe2_nccl INTERFACE) | ||||
|     # The following old-style variables are set so that other libs, such as Gloo, | ||||
|     # can still use it. | ||||
|     set(NCCL_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/nccl/include) | ||||
|     set(NCCL_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/nccl/lib/libnccl_static.a) | ||||
|     add_dependencies(__caffe2_nccl nccl_external) | ||||
|     set(NCCL_INCLUDE_DIRS ${__NCCL_BUILD_DIR}/include) | ||||
|     add_dependencies(__caffe2_nccl ${__NCCL_LIBRARY_DEP}) | ||||
|     target_link_libraries(__caffe2_nccl INTERFACE ${NCCL_LIBRARIES}) | ||||
|     target_include_directories(__caffe2_nccl INTERFACE ${NCCL_INCLUDE_DIRS}) | ||||
|   endif() | ||||
|  | ||||
| @ -56,6 +56,10 @@ INPUT                  = ../../../aten/src/ATen/ATen.h \ | ||||
|                          ../../../c10/cuda/CUDAStream.h \ | ||||
|                          ../../../torch/csrc/api/include \ | ||||
|                          ../../../torch/csrc/api/src \ | ||||
|                          ../../../torch/csrc/autograd/autograd.h \ | ||||
|                          ../../../torch/csrc/autograd/custom_function.h \ | ||||
|                          ../../../torch/csrc/autograd/function.h \ | ||||
|                          ../../../torch/csrc/autograd/variable.h \ | ||||
|                          ../../../torch/csrc/autograd/generated/variable_factories.h \ | ||||
|                          ../../../torch/csrc/jit/runtime/custom_operator.h \ | ||||
|                          ../../../torch/csrc/jit/serialization/import.h \ | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff.