Mirror of https://github.com/pytorch/pytorch.git, synced 2025-11-04 08:00:58 +08:00

Compare commits: mlazos/dat ... v1.5.0-rc4

122 Commits
| SHA1 |
|---|
| 9a51e477ac |
| 375566fb78 |
| dfdc788076 |
| 9e6ef814cc |
| 31461800f6 |
| e741839b0e |
| 8eb39c9cfd |
| b5e4c0993d |
| 6bc6832bda |
| 593594839c |
| cf65c8ef15 |
| ca0dc1fcdc |
| b58f89b2e4 |
| 87b6685c6b |
| f746f1b746 |
| 1379415150 |
| 7d638d2596 |
| bad005d331 |
| 16d8a52407 |
| a33b264588 |
| 3a67e00889 |
| 6bd039551d |
| b6c3058d61 |
| ed908b4fbc |
| b66e0af58b |
| bf8a5ede96 |
| c2bc5c56c5 |
| db3c3ed662 |
| 9de4770bbd |
| 911a2a6b63 |
| 60375bcfdf |
| 63dcd9eccc |
| e8236d2ed4 |
| 0058b1bb7e |
| 419283e291 |
| 0e6f6ba218 |
| ec8dbaf920 |
| 7e168d134f |
| 6daae58871 |
| fee0ff1bf6 |
| deaf3b65cf |
| dca9c2501d |
| 842cd47416 |
| a30b49085c |
| 82626f8ad9 |
| 27fddfda4f |
| 7ecf6a1c10 |
| beb07a44c4 |
| a01c3bd1fe |
| ffd010f8a0 |
| 8ad59f03a8 |
| ed3640df68 |
| fb88942f6c |
| 5d05c51887 |
| df5986fbf3 |
| 165403f614 |
| fbf18c34ff |
| 84f806c821 |
| 94139a7d95 |
| 75e36186b2 |
| f4a0b406dd |
| e884e720f0 |
| dacdbc22d1 |
| 2a789cd0e0 |
| f9b010f399 |
| 55614ff306 |
| b12579da53 |
| 920e3eb761 |
| bec01e755a |
| 6a880e1bc9 |
| fa86e32a4e |
| 5aabaf2b18 |
| 4a707e8f95 |
| db127b21eb |
| 45313cd9e1 |
| df531973e1 |
| 9e3c577caa |
| 5357b8e4d9 |
| 0f23d23db4 |
| 7c24280a3f |
| 7100f0be13 |
| f7f611c2ec |
| acb982d0b0 |
| aa8b7ad989 |
| 2d403ed8be |
| c25a664f77 |
| ab660ae394 |
| 3c476a8858 |
| 651fa88645 |
| 565c3400b4 |
| 3e332778b4 |
| f598738920 |
| 4c6bfa0187 |
| 6f25003682 |
| 752c129fa1 |
| fb59a9caca |
| 4d30dbdd35 |
| b7f4a1a397 |
| afda1dc943 |
| d506ae882b |
| 36e5abe531 |
| 6e6f62230e |
| 5d15577e6c |
| 6aa5298c5c |
| f3df13725b |
| 4eee3caa11 |
| 4d96463130 |
| 246b824644 |
| 5ca9014318 |
| 48590d6a9b |
| 75148df1f5 |
| b89eb7c654 |
| 8877885454 |
| e2184ba083 |
| 8ef47ad2f0 |
| 6725b6f503 |
| bcd3f6da1a |
| 0b3d2f7b7d |
| f522651a7e |
| 01c8ef2757 |
| 7cfe68ce3a |
| 6f3120c6b9 |
@@ -466,7 +466,7 @@ But if you want to try, then I’d recommend
 # Always install miniconda 3, even if building for Python <3
 new_conda="~/my_new_conda"
 conda_sh="$new_conda/install_miniconda.sh"
-curl -o "$conda_sh" https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+curl -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
 chmod +x "$conda_sh"
 "$conda_sh" -b -p "$MINICONDA_ROOT"
 rm -f "$conda_sh"
@@ -34,8 +34,6 @@ def get_processor_arch_name(cuda_version):
 
 LINUX_PACKAGE_VARIANTS = OrderedDict(
     manywheel=[
-        "2.7m",
-        "2.7mu",
         "3.5m",
         "3.6m",
         "3.7m",
@@ -43,7 +41,7 @@ LINUX_PACKAGE_VARIANTS = OrderedDict(
     ],
     conda=dimensions.STANDARD_PYTHON_VERSIONS,
     libtorch=[
-        "2.7m",
+        "3.7m",
     ],
 )
 
@@ -53,11 +51,21 @@ CONFIG_TREE_DATA = OrderedDict(
         wheel=dimensions.STANDARD_PYTHON_VERSIONS,
         conda=dimensions.STANDARD_PYTHON_VERSIONS,
         libtorch=[
-            "2.7",
+            "3.7",
         ],
     )),
+    windows=(dimensions.CUDA_VERSIONS, OrderedDict(
+        wheel=dimensions.STANDARD_PYTHON_VERSIONS,
+        conda=dimensions.STANDARD_PYTHON_VERSIONS,
+        libtorch=[
+            "3.7",
+        ],
+    )),
 )
 
+CONFIG_TREE_DATA_NO_WINDOWS = CONFIG_TREE_DATA.copy()
+CONFIG_TREE_DATA_NO_WINDOWS.pop("windows")
+
 # GCC config variants:
 #
 # All the nightlies (except libtorch with new gcc ABI) are built with devtoolset7,
@@ -74,6 +82,11 @@ LINUX_GCC_CONFIG_VARIANTS = OrderedDict(
     ],
 )
 
+WINDOWS_LIBTORCH_CONFIG_VARIANTS = [
+    "debug",
+    "release",
+]
+
 
 class TopLevelNode(ConfigNode):
     def __init__(self, node_name, config_tree_data, smoke):
@@ -108,6 +121,8 @@ class PackageFormatConfigNode(ConfigNode):
     def get_children(self):
         if self.find_prop("os_name") == "linux":
             return [LinuxGccConfigNode(self, v) for v in LINUX_GCC_CONFIG_VARIANTS[self.find_prop("package_format")]]
+        elif self.find_prop("os_name") == "windows" and self.find_prop("package_format") == "libtorch":
+            return [WindowsLibtorchConfigNode(self, v) for v in WINDOWS_LIBTORCH_CONFIG_VARIANTS]
         else:
             return [ArchConfigNode(self, v) for v in self.find_prop("cuda_versions")]
 
@@ -129,6 +144,16 @@ class LinuxGccConfigNode(ConfigNode):
         return [ArchConfigNode(self, v) for v in cuda_versions]
 
 
+class WindowsLibtorchConfigNode(ConfigNode):
+    def __init__(self, parent, libtorch_config_variant):
+        super(WindowsLibtorchConfigNode, self).__init__(parent, "LIBTORCH_CONFIG_VARIANT=" + str(libtorch_config_variant))
+
+        self.props["libtorch_config_variant"] = libtorch_config_variant
+
+    def get_children(self):
+        return [ArchConfigNode(self, v) for v in self.find_prop("cuda_versions")]
+
+
 class ArchConfigNode(ConfigNode):
     def __init__(self, parent, cu):
         super(ArchConfigNode, self).__init__(parent, get_processor_arch_name(cu))
@@ -6,7 +6,7 @@ import cimodel.lib.miniutils as miniutils
 
 
 class Conf(object):
-    def __init__(self, os, cuda_version, pydistro, parms, smoke, libtorch_variant, gcc_config_variant):
+    def __init__(self, os, cuda_version, pydistro, parms, smoke, libtorch_variant, gcc_config_variant, libtorch_config_variant):
 
         self.os = os
         self.cuda_version = cuda_version
@@ -15,11 +15,14 @@ class Conf(object):
         self.smoke = smoke
         self.libtorch_variant = libtorch_variant
         self.gcc_config_variant = gcc_config_variant
+        self.libtorch_config_variant = libtorch_config_variant
 
     def gen_build_env_parms(self):
         elems = [self.pydistro] + self.parms + [binary_build_data.get_processor_arch_name(self.cuda_version)]
         if self.gcc_config_variant is not None:
             elems.append(str(self.gcc_config_variant))
+        if self.libtorch_config_variant is not None:
+            elems.append(str(self.libtorch_config_variant))
         return elems
 
     def gen_docker_image(self):
@@ -67,9 +70,14 @@ class Conf(object):
             job_def["requires"].append("update_s3_htmls_for_nightlies_devtoolset7")
             job_def["filters"] = {"branches": {"only": "postnightly"}}
         else:
+            filter_branches = ["nightly"]
+            # we only want to add the release branch filter if we aren't
+            # uploading
+            if phase not in ["upload"]:
+                filter_branches.append(r"/release\/.*/")
             job_def["filters"] = {
                 "branches": {
-                    "only": "nightly"
+                    "only": filter_branches
                 },
                 # Will run on tags like v1.5.0-rc1, etc.
                 "tags": {
@@ -105,11 +113,18 @@ class Conf(object):
 
 def get_root(smoke, name):
 
-    return binary_build_data.TopLevelNode(
-        name,
-        binary_build_data.CONFIG_TREE_DATA,
-        smoke,
-    )
+    if smoke:
+        return binary_build_data.TopLevelNode(
+            name,
+            binary_build_data.CONFIG_TREE_DATA_NO_WINDOWS,
+            smoke,
+        )
+    else:
+        return binary_build_data.TopLevelNode(
+            name,
+            binary_build_data.CONFIG_TREE_DATA,
+            smoke,
+        )
 
 
 def gen_build_env_list(smoke):
@@ -127,6 +142,7 @@ def gen_build_env_list(smoke):
             c.find_prop("smoke"),
             c.find_prop("libtorch_variant"),
             c.find_prop("gcc_config_variant"),
            c.find_prop("libtorch_config_variant"),
        )
        newlist.append(conf)
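Aside: the hunks above extend PyTorch's cimodel config-tree pattern, in which each ConfigNode contributes one property (OS, package format, libtorch config variant, arch) and get_children() fans out into the next dimension, so a leaf's full build configuration is recovered by walking find_prop up through its ancestors. A minimal standalone sketch of that lookup idea, with simplified names that are not the actual cimodel API:

class Node:
    def __init__(self, parent, props):
        self.parent = parent
        self.props = props

    def find_prop(self, key):
        # Walk toward the root until some ancestor defined the property.
        node = self
        while node is not None:
            if key in node.props:
                return node.props[key]
            node = node.parent
        return None

root = Node(None, {"os_name": "windows"})
fmt = Node(root, {"package_format": "libtorch"})
leaf = Node(fmt, {"libtorch_config_variant": "debug"})
assert leaf.find_prop("os_name") == "windows"  # inherited from the root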
@@ -4,7 +4,6 @@ from cimodel.lib.conf_tree import Ver
 
 CONFIG_TREE_DATA = [
     (Ver("ubuntu", "16.04"), [
-        ([Ver("gcc", "5")], [XImportant("onnx_py2")]),
         ([Ver("clang", "7")], [XImportant("onnx_main_py3.6"),
                                XImportant("onnx_ort1_py3.6"),
                                XImportant("onnx_ort2_py3.6")]),
@@ -33,8 +33,7 @@ class Conf:
     # TODO: Eventually we can probably just remove the cudnn7 everywhere.
     def get_cudnn_insertion(self):
 
-        omit = self.language == "onnx_py2" \
-            or self.language == "onnx_main_py3.6" \
+        omit = self.language == "onnx_main_py3.6" \
             or self.language == "onnx_ort1_py3.6" \
             or self.language == "onnx_ort2_py3.6" \
             or set(self.compiler_names).intersection({"android", "mkl", "clang"}) \
@@ -71,11 +70,10 @@ class Conf:
     def gen_docker_image(self):
 
         lang_substitutions = {
-            "onnx_py2": "py2",
             "onnx_main_py3.6": "py3.6",
             "onnx_ort1_py3.6": "py3.6",
             "onnx_ort2_py3.6": "py3.6",
-            "cmake": "py2",
+            "cmake": "py3",
         }
 
         lang = miniutils.override(self.language, lang_substitutions)
@@ -85,7 +83,7 @@ class Conf:
     def gen_workflow_params(self, phase):
         parameters = OrderedDict()
         lang_substitutions = {
-            "onnx_py2": "onnx-py2",
+            "onnx_py3": "onnx-py3",
             "onnx_main_py3.6": "onnx-main-py3.6",
             "onnx_ort1_py3.6": "onnx-ort1-py3.6",
             "onnx_ort2_py3.6": "onnx-ort2-py3.6",
@@ -129,7 +127,7 @@ class Conf:
             job_name = "caffe2_" + self.get_platform() + "_build"
 
         if not self.is_important:
-            job_def["filters"] = {"branches": {"only": ["master", r"/ci-all\/.*/"]}}
+            job_def["filters"] = {"branches": {"only": ["master", r"/ci-all\/.*/", r"/release\/.*/"]}}
         job_def.update(self.gen_workflow_params(phase))
         return {job_name : job_def}
@@ -8,7 +8,6 @@ CUDA_VERSIONS = [
 ]
 
 STANDARD_PYTHON_VERSIONS = [
-    "2.7",
     "3.5",
     "3.6",
     "3.7",
@@ -114,7 +114,7 @@ class Conf:
         if not self.is_important:
             # If you update this, update
             # caffe2_build_definitions.py too
-            job_def["filters"] = {"branches": {"only": ["master", r"/ci-all\/.*/"]}}
+            job_def["filters"] = {"branches": {"only": ["master", r"/ci-all\/.*/", r"/release\/.*/"]}}
         job_def.update(self.gen_workflow_params(phase))
 
         return {job_name : job_def}
.circleci/config.yml (3286 lines changed)

File diff suppressed because it is too large
@@ -4,7 +4,7 @@ set -ex
 
 # Optionally install conda
 if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
-  BASE_URL="https://repo.continuum.io/miniconda"
+  BASE_URL="https://repo.anaconda.com/miniconda"
 
   MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1)
 
@@ -10,6 +10,11 @@ retry () {
 if [[ "$(uname)" == Darwin ]]; then
   # macos executor (builds and tests)
   workdir="/Users/distiller/project"
+elif [[ "$OSTYPE" == "msys" ]]; then
+  # windows executor (builds and tests)
+  rm -rf /c/w
+  ln -s "/c/Users/circleci/project" /c/w
+  workdir="/c/w"
 elif [[ -d "/home/circleci/project" ]]; then
   # machine executor (binary tests)
   workdir="/home/circleci/project"
@@ -19,8 +24,14 @@ else
 fi
 
 # It is very important that this stays in sync with binary_populate_env.sh
-export PYTORCH_ROOT="$workdir/pytorch"
-export BUILDER_ROOT="$workdir/builder"
+if [[ "$OSTYPE" == "msys" ]]; then
+  # We need to make the paths as short as possible on Windows
+  export PYTORCH_ROOT="$workdir/p"
+  export BUILDER_ROOT="$workdir/b"
+else
+  export PYTORCH_ROOT="$workdir/pytorch"
+  export BUILDER_ROOT="$workdir/builder"
+fi
 
 # Clone the Pytorch branch
 retry git clone https://github.com/pytorch/pytorch.git "$PYTORCH_ROOT"
@@ -31,9 +31,9 @@ fi
 
 conda_sh="$workdir/install_miniconda.sh"
 if [[ "$(uname)" == Darwin ]]; then
-  curl --retry 3 -o "$conda_sh" https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+  curl --retry 3 -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
 else
-  curl --retry 3 -o "$conda_sh" https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+  curl --retry 3 -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
 fi
 chmod +x "$conda_sh"
 "$conda_sh" -b -p "$MINICONDA_ROOT"
@@ -2,11 +2,31 @@
 set -eux -o pipefail
 export TZ=UTC
 
+tagged_version() {
+  # Grabs version from either the env variable CIRCLE_TAG
+  # or the pytorch git described version
+  if [[ "$OSTYPE" == "msys" ]]; then
+    GIT_DESCRIBE="git --git-dir ${workdir}/p/.git describe"
+  else
+    GIT_DESCRIBE="git --git-dir ${workdir}/pytorch/.git describe"
+  fi
+  if [[ -n "${CIRCLE_TAG:-}" ]]; then
+    echo "${CIRCLE_TAG}"
+  elif ${GIT_DESCRIBE} --exact --tags >/dev/null; then
+    ${GIT_DESCRIBE} --tags
+  else
+    return 1
+  fi
+}
+
 # We need to write an envfile to persist these variables to following
 # steps, but the location of the envfile depends on the circleci executor
 if [[ "$(uname)" == Darwin ]]; then
   # macos executor (builds and tests)
   workdir="/Users/distiller/project"
+elif [[ "$OSTYPE" == "msys" ]]; then
+  # windows executor (builds and tests)
+  workdir="/c/w"
 elif [[ -d "/home/circleci/project" ]]; then
   # machine executor (binary tests)
   workdir="/home/circleci/project"
@@ -23,7 +43,15 @@ configs=($BUILD_ENVIRONMENT)
 export PACKAGE_TYPE="${configs[0]}"
 export DESIRED_PYTHON="${configs[1]}"
 export DESIRED_CUDA="${configs[2]}"
-export DESIRED_DEVTOOLSET="${configs[3]:-}"
+if [[ "${BUILD_FOR_SYSTEM:-}" == "windows" ]]; then
+  export DESIRED_DEVTOOLSET=""
+  export LIBTORCH_CONFIG="${configs[3]:-}"
+  if [[ "$LIBTORCH_CONFIG" == 'debug' ]]; then
+    export DEBUG=1
+  fi
+else
+  export DESIRED_DEVTOOLSET="${configs[3]:-}"
+fi
 if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
   export BUILD_PYTHONLESS=1
 fi
@@ -47,15 +75,17 @@ export DATE="$(date -u +%Y%m%d)"
 #TODO: We should be pulling semver version from the base version.txt
 BASE_BUILD_VERSION="1.5.0.dev$DATE"
 # Change BASE_BUILD_VERSION to git tag when on a git tag
-if git describe --tags --exact >/dev/null 2>/dev/null; then
+# Use 'git -C' to make doubly sure we're in the correct directory for checking
+# the git tag
+if tagged_version >/dev/null; then
   # Switch upload folder to 'test/' if we are on a tag
   PIP_UPLOAD_FOLDER='test/'
   # Grab git tag, remove prefixed v and remove everything after -
   # Used to clean up tags that are for release candidates like v1.5.0-rc1
   # Turns tag v1.5.0-rc1 -> v1.5.0
-  BASE_BUILD_VERSION="$(git describe --tags | sed -e 's/^v//' -e 's/-.*$//')"
+  BASE_BUILD_VERSION="$(tagged_version | sed -e 's/^v//' -e 's/-.*$//')"
 fi
-if [[ "$(uname)" == 'Darwin' ]] || [[ "$DESIRED_CUDA" == "cu101" ]] || [[ "$PACKAGE_TYPE" == conda ]]; then
+if [[ "$(uname)" == 'Darwin' ]] || [[ "$DESIRED_CUDA" == "cu102" ]] || [[ "$PACKAGE_TYPE" == conda ]]; then
   export PYTORCH_BUILD_VERSION="${BASE_BUILD_VERSION}"
 else
   export PYTORCH_BUILD_VERSION="${BASE_BUILD_VERSION}+$DESIRED_CUDA"
@@ -94,6 +124,10 @@ export DESIRED_CUDA="$DESIRED_CUDA"
 export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
 export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
 export DESIRED_DEVTOOLSET="$DESIRED_DEVTOOLSET"
+if [[ "${BUILD_FOR_SYSTEM:-}" == "windows" ]]; then
+  export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
+  export DEBUG="${DEBUG:-}"
+fi
 
 export DATE="$DATE"
 export NIGHTLIES_DATE_PREAMBLE=1.5.0.dev
@@ -113,8 +147,13 @@ export DOCKER_IMAGE="$DOCKER_IMAGE"
 
 export workdir="$workdir"
 export MAC_PACKAGE_WORK_DIR="$workdir"
-export PYTORCH_ROOT="$workdir/pytorch"
-export BUILDER_ROOT="$workdir/builder"
+if [[ "$OSTYPE" == "msys" ]]; then
+  export PYTORCH_ROOT="$workdir/p"
+  export BUILDER_ROOT="$workdir/b"
+else
+  export PYTORCH_ROOT="$workdir/pytorch"
+  export BUILDER_ROOT="$workdir/builder"
+fi
 export MINICONDA_ROOT="$workdir/miniconda"
 export PYTORCH_FINAL_PACKAGE_DIR="$workdir/final_pkgs"
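Aside: the tag-cleanup pipeline in the hunk above is easy to sanity-check outside CI. A small Python equivalent of the two sed expressions (illustrative only; the build scripts themselves stay in bash):

import re

def clean_tag(tag):
    # Mirrors sed -e 's/^v//' -e 's/-.*$//': strip the leading "v",
    # then drop any release-candidate suffix after the first "-".
    return re.sub(r"-.*$", "", re.sub(r"^v", "", tag))

assert clean_tag("v1.5.0-rc1") == "1.5.0"
assert clean_tag("v1.5.0") == "1.5.0"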
.circleci/scripts/binary_windows_build.sh (new file, 33 lines)

@@ -0,0 +1,33 @@
+#!/bin/bash
+set -eux -o pipefail
+
+source "/c/w/env"
+mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
+
+export CUDA_VERSION="${DESIRED_CUDA/cu/}"
+export VC_YEAR=2017
+export USE_SCCACHE=1
+export SCCACHE_BUCKET=ossci-compiler-cache-windows
+export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
+
+set +x
+export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}
+export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}
+set -x
+
+if [[ "$CIRCLECI" == 'true' && -d "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019" ]]; then
+  rm -rf "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019"
+fi
+
+echo "Free space on filesystem before build:"
+df -h
+
+pushd "$BUILDER_ROOT"
+if [[ "$PACKAGE_TYPE" == 'conda' ]]; then
+  ./windows/internal/build_conda.bat
+elif [[ "$PACKAGE_TYPE" == 'wheel' || "$PACKAGE_TYPE" == 'libtorch' ]]; then
+  ./windows/internal/build_wheels.bat
+fi
+
+echo "Free space on filesystem after build:"
+df -h
.circleci/scripts/binary_windows_upload.sh (new file, 37 lines)

@@ -0,0 +1,37 @@
+#!/bin/bash
+set -eu -o pipefail
+set +x
+declare -x "AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}"
+declare -x "AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}"
+
+#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!
+# DO NOT TURN -x ON BEFORE THIS LINE
+#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!
+set -eux -o pipefail
+
+source "/env"
+
+# This gets set in binary_populate_env.sh, but lets have a sane default just in case
+PIP_UPLOAD_FOLDER=${PIP_UPLOAD_FOLDER:-nightly/}
+# TODO: Combine CONDA_UPLOAD_CHANNEL and PIP_UPLOAD_FOLDER into one variable
+#       The only difference is the trailing slash
+# Strip trailing slashes if there
+CONDA_UPLOAD_CHANNEL=$(echo "${PIP_UPLOAD_FOLDER}" | sed 's:/*$::')
+
+pushd /root/workspace/final_pkgs
+# Upload the package to the final location
+if [[ "$PACKAGE_TYPE" == conda ]]; then
+  retry conda install -yq anaconda-client
+  anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload  "$(ls)" -u "pytorch-${CONDA_UPLOAD_CHANNEL}" --label main --no-progress --force
+elif [[ "$PACKAGE_TYPE" == libtorch ]]; then
+  retry conda install -c conda-forge -yq awscli
+  s3_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
+  for pkg in $(ls); do
+    retry aws s3 cp "$pkg" "$s3_dir" --acl public-read
+  done
+else
+  retry conda install -c conda-forge -yq awscli
+  s3_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${DESIRED_CUDA}/"
+  retry aws s3 cp "$(ls)" "$s3_dir" --acl public-read
+fi
@@ -72,10 +72,10 @@ time python tools/setup_helpers/generate_code.py \
 
 # Build the docs
 pushd docs/cpp
-pip install breathe>=4.13.0 bs4 lxml six
+pip install breathe==4.13.0 bs4 lxml six
 pip install --no-cache-dir -e "git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme"
 pip install exhale>=0.2.1
-pip install sphinx>=2.0
+pip install sphinx==2.4.4
 # Uncomment once it is fixed
 # pip install -r requirements.txt
 time make VERBOSE=1 html -j
@@ -52,3 +52,12 @@ binary_mac_params: &binary_mac_params
   environment:
     BUILD_ENVIRONMENT: << parameters.build_environment >>
 
+binary_windows_params: &binary_windows_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+    BUILD_FOR_SYSTEM: windows
+
@@ -275,3 +275,46 @@
           script="/Users/distiller/project/.circleci/scripts/binary_ios_upload.sh"
           cat "$script"
           source "$script"
+
+  binary_windows_build:
+    <<: *binary_windows_params
+    executor:
+      name: windows-cpu-with-nvidia-cuda
+    steps:
+    # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
+    - attach_scripts
+    - run:
+        <<: *binary_checkout
+    - run:
+        <<: *binary_populate_env
+    - run:
+        name: Build
+        no_output_timeout: "1h"
+        command: |
+          set -eux -o pipefail
+          script="/c/w/p/.circleci/scripts/binary_windows_build.sh"
+          cat "$script"
+          source "$script"
+    - persist_to_workspace:
+        root: "C:/w"
+        paths: final_pkgs
+
+  binary_windows_upload:
+    <<: *binary_windows_params
+    docker:
+      - image: continuumio/miniconda
+    steps:
+    # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
+    - attach_scripts
+    - run:
+        <<: *binary_checkout
+    - run:
+        <<: *binary_populate_env
+    - run:
+        name: Upload
+        no_output_timeout: "10m"
+        command: |
+          set -eux -o pipefail
+          script="/pytorch/.circleci/scripts/binary_windows_upload.sh"
+          cat "$script"
+          source "$script"
@@ -151,7 +151,7 @@
             # Install Anaconda if we need to
             if [ -n "${CAFFE2_USE_ANACONDA}" ]; then
               rm -rf ${TMPDIR}/anaconda
-              curl --retry 3 -o ${TMPDIR}/conda.sh https://repo.continuum.io/miniconda/Miniconda${ANACONDA_VERSION}-latest-MacOSX-x86_64.sh
+              curl --retry 3 -o ${TMPDIR}/conda.sh https://repo.anaconda.com/miniconda/Miniconda${ANACONDA_VERSION}-latest-MacOSX-x86_64.sh
               chmod +x ${TMPDIR}/conda.sh
               /bin/bash ${TMPDIR}/conda.sh -b -p ${TMPDIR}/anaconda
               rm -f ${TMPDIR}/conda.sh
@@ -20,16 +20,16 @@ jobs:
           export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE})
 
           # TODO We may want to move the rebase logic to a separate step after checkout
-          # Rebase to master only if in xenial_py3_6_gcc5_4 case
-          if [[ "${CIRCLE_BRANCH}" != "master" && "${BUILD_ENVIRONMENT}" == *"gcc5"* ]]; then
-            echo "Merge master branch into $CIRCLE_BRANCH before build in environment $BUILD_ENVIRONMENT"
+          # Rebase to release/1.5 only if in xenial_py3_6_gcc5_4 case
+          if [[ "${CIRCLE_BRANCH}" != "release/1.5" && "${BUILD_ENVIRONMENT}" == *"gcc5"* ]]; then
+            echo "Merge release/1.5 branch into $CIRCLE_BRANCH before build in environment $BUILD_ENVIRONMENT"
             set -x
             git config --global user.email "circleci.ossci@gmail.com"
             git config --global user.name "CircleCI"
             git config remote.origin.url https://github.com/pytorch/pytorch.git
-            git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master
-            git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet
-            export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/master`
+            git config --add remote.origin.fetch +refs/heads/release/1.5:refs/remotes/origin/release/1.5
+            git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/release/1.5:refs/remotes/origin/release/1.5 --depth=100 --quiet
+            export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/release/1.5`
             echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET}
             export GIT_COMMIT=${CIRCLE_SHA1}
             echo "GIT_COMMIT: " ${GIT_COMMIT}
@@ -38,7 +38,7 @@ jobs:
             git merge --allow-unrelated-histories --no-edit --no-ff ${GIT_MERGE_TARGET}
             set +x
           else
-            echo "Do NOT merge master branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT"
+            echo "Do NOT merge release/1.5 branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT"
           fi
 
           git submodule sync && git submodule update -q --init --recursive
@@ -15,6 +15,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
       - pytorch_windows_test:
           name: pytorch_windows_vs2017_14.11_py36_cuda10.1_test1
           test_name: pytorch-windows-test1
@@ -32,6 +33,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
       - pytorch_windows_test:
           name: pytorch_windows_vs2017_14.11_py36_cuda10.1_test2
           test_name: pytorch-windows-test2
@@ -49,6 +51,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
       - pytorch_windows_build:
           name: pytorch_windows_vs2017_14.16_py36_cuda10.1_build
           cuda_version: "10"
@@ -64,6 +67,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
       - pytorch_windows_test:
           name: pytorch_windows_vs2017_14.16_py36_cuda10.1_test1
           test_name: pytorch-windows-test1
@@ -81,6 +85,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
       - pytorch_windows_test:
           name: pytorch_windows_vs2017_14.16_py36_cuda10.1_test2
           test_name: pytorch-windows-test2
@@ -98,6 +103,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
       - pytorch_windows_build:
           name: pytorch_windows_vs2019_py36_cuda10.1_build
           cuda_version: "10"
@@ -7,12 +7,6 @@
       # pytorch-ci-hud to adjust the list of whitelisted builds
       # at https://github.com/ezyang/pytorch-ci-hud/blob/master/src/BuildHistoryDisplay.js
 
-      - binary_linux_build:
-          name: binary_linux_manywheel_2_7mu_cpu_devtoolset7_build
-          build_environment: "manywheel 2.7mu cpu devtoolset7"
-          requires:
-            - setup
-          docker_image: "pytorch/manylinux-cuda102"
       - binary_linux_build:
           name: binary_linux_manywheel_3_7m_cu102_devtoolset7_build
           build_environment: "manywheel 3.7m cu102 devtoolset7"
@@ -23,24 +17,21 @@
             branches:
               only:
                 - master
+                - /ci-all\/.*/
+                - /release\/.*/
-      - binary_linux_build:
-          name: binary_linux_conda_2_7_cpu_devtoolset7_build
-          build_environment: "conda 2.7 cpu devtoolset7"
-          requires:
-            - setup
-          docker_image: "pytorch/conda-cuda"
       # This binary build is currently broken, see https://github.com/pytorch/pytorch/issues/16710
       # - binary_linux_conda_3_6_cu90_devtoolset7_build
+      # TODO rename to remove python version for libtorch
       - binary_linux_build:
-          name: binary_linux_libtorch_2_7m_cpu_devtoolset7_shared-with-deps_build
-          build_environment: "libtorch 2.7m cpu devtoolset7"
+          name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build
+          build_environment: "libtorch 3.7m cpu devtoolset7"
           requires:
             - setup
          libtorch_variant: "shared-with-deps"
          docker_image: "pytorch/manylinux-cuda102"
       - binary_linux_build:
-          name: binary_linux_libtorch_2_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build
-          build_environment: "libtorch 2.7m cpu gcc5.4_cxx11-abi"
+          name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build
+          build_environment: "libtorch 3.7m cpu gcc5.4_cxx11-abi"
           requires:
             - setup
           libtorch_variant: "shared-with-deps"
@@ -48,45 +39,51 @@
       # TODO we should test a libtorch cuda build, but they take too long
       # - binary_linux_libtorch_2_7m_cu90_devtoolset7_static-without-deps_build
       - binary_mac_build:
-          name: binary_macos_wheel_3_6_cpu_build
-          build_environment: "wheel 3.6 cpu"
+          name: binary_macos_wheel_3_7_cpu_build
+          build_environment: "wheel 3.7 cpu"
           requires:
             - setup
           filters:
             branches:
               only:
                 - master
+                - /ci-all\/.*/
+                - /release\/.*/
-      - binary_mac_build:
-          name: binary_macos_conda_2_7_cpu_build
-          build_environment: "conda 2.7 cpu"
-          requires:
-            - setup
-          filters:
-            branches:
-              only:
-                - master
       # This job has an average run time of 3 hours o.O
       # Now only running this on master to reduce overhead
+      # TODO rename to remove python version for libtorch
       - binary_mac_build:
-          name: binary_macos_libtorch_2_7_cpu_build
-          build_environment: "libtorch 2.7 cpu"
+          name: binary_macos_libtorch_3_7_cpu_build
+          build_environment: "libtorch 3.7 cpu"
           requires:
             - setup
           filters:
             branches:
              only:
                - master
+                - /ci-all\/.*/
+                - /release\/.*/
+      - binary_windows_build:
+          name: binary_windows_libtorch_3_7_cpu_debug_build
+          build_environment: "libtorch 3.7 cpu debug"
+          requires:
+            - setup
+      - binary_windows_build:
+          name: binary_windows_libtorch_3_7_cpu_release_build
+          build_environment: "libtorch 3.7 cpu release"
+          requires:
+            - setup
+      - binary_windows_build:
+          name: binary_windows_wheel_3_7_cu102_build
+          build_environment: "wheel 3.7 cu102"
+          requires:
+            - setup
+          filters:
+            branches:
+              only:
+                - master
+                - /ci-all\/.*/
+                - /release\/.*/
-      - binary_linux_test:
-          name: binary_linux_manywheel_2_7mu_cpu_devtoolset7_test
-          build_environment: "manywheel 2.7mu cpu devtoolset7"
-          requires:
-            - setup
-            - binary_linux_manywheel_2_7mu_cpu_devtoolset7_build
-          docker_image: "pytorch/manylinux-cuda102"
       - binary_linux_test:
           name: binary_linux_manywheel_3_7m_cu102_devtoolset7_test
           build_environment: "manywheel 3.7m cu102 devtoolset7"
@@ -100,29 +97,25 @@
             branches:
               only:
                 - master
+                - /ci-all\/.*/
+                - /release\/.*/
-      - binary_linux_test:
-          name: binary_linux_conda_2_7_cpu_devtoolset7_test
-          build_environment: "conda 2.7 cpu devtoolset7"
-          requires:
-            - setup
-            - binary_linux_conda_2_7_cpu_devtoolset7_build
-          docker_image: "pytorch/conda-cuda"
       # This binary build is currently broken, see https://github.com/pytorch/pytorch/issues/16710
       # - binary_linux_conda_3_6_cu90_devtoolset7_test:
+      # TODO rename to remove python version for libtorch
       - binary_linux_test:
-          name: binary_linux_libtorch_2_7m_cpu_devtoolset7_shared-with-deps_test
-          build_environment: "libtorch 2.7m cpu devtoolset7"
+          name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test
+          build_environment: "libtorch 3.7m cpu devtoolset7"
           requires:
             - setup
-            - binary_linux_libtorch_2_7m_cpu_devtoolset7_shared-with-deps_build
+            - binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build
           libtorch_variant: "shared-with-deps"
           docker_image: "pytorch/manylinux-cuda102"
       - binary_linux_test:
-          name: binary_linux_libtorch_2_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test
-          build_environment: "libtorch 2.7m cpu gcc5.4_cxx11-abi"
+          name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test
+          build_environment: "libtorch 3.7m cpu gcc5.4_cxx11-abi"
           requires:
             - setup
-            - binary_linux_libtorch_2_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build
+            - binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build
           libtorch_variant: "shared-with-deps"
           docker_image: "pytorch/pytorch-binary-docker-image-ubuntu16.04:latest"
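Aside: the /release\/.*/ entries added throughout these workflow filters are CircleCI branch patterns; a value wrapped in slashes is treated as a regular expression matched against the full branch name. A quick illustrative check of what that pattern accepts (plain Python, only to demonstrate the regex; CircleCI applies the matching itself):

import re

pattern = re.compile(r"release/.*")  # the /release\/.*/ filter, unwrapped

assert pattern.fullmatch("release/1.5") is not None   # release branches now run these jobs
assert pattern.fullmatch("feature/foo") is None       # other branches stay filtered out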
@@ -20,21 +20,12 @@
       - docker_build_job:
           name: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
           image_name: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
-      - docker_build_job:
-          name: "pytorch-linux-xenial-cuda9-cudnn7-py2"
-          image_name: "pytorch-linux-xenial-cuda9-cudnn7-py2"
-      - docker_build_job:
-          name: "pytorch-linux-xenial-cuda9-cudnn7-py3"
-          image_name: "pytorch-linux-xenial-cuda9-cudnn7-py3"
       - docker_build_job:
           name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
           image_name: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
-      - docker_build_job:
-          name: "pytorch-linux-xenial-py2.7.9"
-          image_name: "pytorch-linux-xenial-py2.7.9"
-      - docker_build_job:
-          name: "pytorch-linux-xenial-py2.7"
-          image_name: "pytorch-linux-xenial-py2.7"
       - docker_build_job:
           name: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
           image_name: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
@@ -4,6 +4,8 @@
             branches:
               only:
                 - master
+                - /ci-all\/.*/
+                - /release\/.*/
           requires:
             - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build
 
@@ -13,6 +15,8 @@
             branches:
               only:
                 - master
+                - /ci-all\/.*/
+                - /release\/.*/
           requires:
             - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build
             - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build
@@ -31,6 +31,7 @@
               only:
                 - master
                 - /ci-all\/.*/
+                - /release\/.*/
           build_environment: "pytorch-linux-xenial-py3-clang5-mobile-code-analysis"
           build_only: "1"
           # Use LLVM-DEV toolchain in android-ndk-r19c docker image
.github/workflows/lint.yml (vendored, 40 lines changed)

@@ -81,44 +81,6 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-  flake8-py2:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Setup Python
-        uses: actions/setup-python@v1
-        with:
-          python-version: 2.x
-          architecture: x64
-      - name: Fetch PyTorch
-        uses: actions/checkout@v1
-      - name: Checkout PR tip
-        run: |
-          set -eux
-          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
-            # We are on a PR, so actions/checkout leaves us on a merge commit.
-            # Check out the actual tip of the branch.
-            git checkout ${{ github.event.pull_request.head.sha }}
-          fi
-          echo ::set-output name=commit_sha::$(git rev-parse HEAD)
-        id: get_pr_tip
-      - name: Run flake8
-        run: |
-          set -eux
-          pip install flake8
-          rm -rf .circleci tools/clang_format_new.py
-          flake8 --exit-zero > ${GITHUB_WORKSPACE}/flake8-output.txt
-          cat ${GITHUB_WORKSPACE}/flake8-output.txt
-      - name: Add annotations
-        uses: pytorch/add-annotations-github-action@master
-        with:
-          check_name: 'flake8-py2'
-          linter_output_path: 'flake8-output.txt'
-          commit_sha: ${{ steps.get_pr_tip.outputs.commit_sha }}
-          regex: '^(?<filename>.*?):(?<lineNumber>\d+):(?<columnNumber>\d+): (?<errorCode>\w\d+) (?<errorDesc>.*)'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-
   clang-tidy:
     if: github.event_name == 'pull_request'
     runs-on: ubuntu-latest
@@ -198,6 +160,8 @@ jobs:
             -g"-torch/csrc/jit/export.cpp"            \
             -g"-torch/csrc/jit/import.cpp"            \
             -g"-torch/csrc/jit/netdef_converter.cpp"  \
+            -g"-torch/csrc/cuda/nccl.*"               \
+            -g"-torch/csrc/cuda/python_nccl.cpp"      \
             "$@" > ${GITHUB_WORKSPACE}/clang-tidy-output.txt
 
           cat ${GITHUB_WORKSPACE}/clang-tidy-output.txt
@@ -167,7 +167,7 @@ fi
 
 # Patch required to build xla
 if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
-  git clone --recursive https://github.com/pytorch/xla.git
+  git clone --recursive -b r1.5 https://github.com/pytorch/xla.git
   ./xla/scripts/apply_patches.sh
 fi
@@ -13,12 +13,12 @@ mkdir -p ${WORKSPACE_DIR}
 # If a local installation of conda doesn't exist, we download and install conda
 if [ ! -d "${WORKSPACE_DIR}/miniconda3" ]; then
   mkdir -p ${WORKSPACE_DIR}
-  curl --retry 3 https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o ${WORKSPACE_DIR}/miniconda3.sh
+  curl --retry 3 https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o ${WORKSPACE_DIR}/miniconda3.sh
   retry bash ${WORKSPACE_DIR}/miniconda3.sh -b -p ${WORKSPACE_DIR}/miniconda3
 fi
 export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
 source ${WORKSPACE_DIR}/miniconda3/bin/activate
-retry conda install -y mkl mkl-include numpy pyyaml setuptools cmake cffi ninja
+retry conda install -y mkl mkl-include numpy pyyaml=5.3 setuptools=46.0.0 cmake cffi ninja
 
 # The torch.hub tests make requests to GitHub.
 #
@@ -20,7 +20,7 @@ if [ -n "${IN_CIRCLECI}" ]; then
     sudo apt-get install -y --allow-downgrades --allow-change-held-packages libnccl-dev=2.5.6-1+cuda10.1 libnccl2=2.5.6-1+cuda10.1
   fi
 
-  if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then
+  if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda10.1-cudnn7-py3* ]]; then
     # TODO: move this to Docker
     sudo apt-get update
     sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev
 
@@ -21,7 +21,7 @@ if [ -n "${IN_CIRCLECI}" ]; then
     sudo apt-get -qq install --allow-downgrades --allow-change-held-packages libnccl-dev=2.5.6-1+cuda10.1 libnccl2=2.5.6-1+cuda10.1
   fi
 
-  if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then
+  if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda10.1-cudnn7-py3* ]]; then
     # TODO: move this to Docker
     sudo apt-get -qq update
     sudo apt-get -qq install --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev
@@ -244,7 +244,7 @@ test_backward_compatibility() {
   pushd test/backward_compatibility
   python dump_all_function_schemas.py --filename new_schemas.txt
   pip_uninstall torch
-  pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
+  pip_install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
   python check_backward_compatibility.py --new-schemas new_schemas.txt
   popd
   set +x
@@ -5,7 +5,7 @@ if "%BUILD_ENVIRONMENT%"=="" (
 )
 if "%REBUILD%"=="" (
   IF EXIST %CONDA_PARENT_DIR%\Miniconda3 ( rd /s /q %CONDA_PARENT_DIR%\Miniconda3 )
-  curl --retry 3 -k https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
+  curl --retry 3 -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
   %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
 )
 call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
 
@@ -13,7 +13,7 @@ if "%BUILD_ENVIRONMENT%"=="" (
 )
 if NOT "%BUILD_ENVIRONMENT%"=="" (
     IF EXIST %CONDA_PARENT_DIR%\Miniconda3 ( rd /s /q %CONDA_PARENT_DIR%\Miniconda3 )
-    curl --retry 3 https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
+    curl --retry 3 https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
     %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
 )
 call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
@@ -160,20 +160,18 @@ ENDIF(BLAS_FOUND)
 
 IF(LAPACK_FOUND)
   list(APPEND ATen_CPU_DEPENDENCY_LIBS ${LAPACK_LIBRARIES})
-  if(USE_CUDA)
+  if(USE_CUDA AND MSVC)
     # Although Lapack provides CPU (and thus, one might expect that ATen_cuda
     # would not need this at all), some of our libraries (magma in particular)
     # backend to CPU BLAS/LAPACK implementations, and so it is very important
     # we get the *right* implementation, because even if the symbols are the
     # same, LAPACK implementions may have different calling conventions.
     # This caused https://github.com/pytorch/pytorch/issues/7353
     #
+    # We do NOT do this on Linux, since we just rely on torch_cpu to
+    # provide all of the symbols we need
     list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${LAPACK_LIBRARIES})
   endif()
   if(USE_ROCM)
     # It's not altogether clear that HIP behaves the same way, but it
     # seems safer to assume that it needs it too
     list(APPEND ATen_HIP_DEPENDENCY_LIBS ${LAPACK_LIBRARIES})
   endif()
 ENDIF(LAPACK_FOUND)
 
 IF (UNIX AND NOT APPLE)
@@ -331,8 +329,12 @@ IF(USE_CUDA AND NOT USE_ROCM)
   IF(USE_MAGMA)
     list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${MAGMA_LIBRARIES})
     IF ($ENV{TH_BINARY_BUILD})
-      list(APPEND ATen_CUDA_DEPENDENCY_LIBS
-        "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}")
+      IF (MSVC)
+        # Do not do this on Linux: see Note [Extra MKL symbols for MAGMA in torch_cpu]
+        # in caffe2/CMakeLists.txt
+        list(APPEND ATen_CUDA_DEPENDENCY_LIBS
+          "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}")
+      ENDIF(MSVC)
     ENDIF($ENV{TH_BINARY_BUILD})
   ENDIF(USE_MAGMA)
   IF ($ENV{ATEN_STATIC_CUDA})
@@ -125,13 +125,15 @@ void _parallel_run(
  std::tie(num_tasks, chunk_size) =
      internal::calc_num_tasks_and_chunk_size(begin, end, grain_size);

-  std::atomic_flag err_flag = ATOMIC_FLAG_INIT;
-  std::exception_ptr eptr;
-  std::vector<std::shared_ptr<c10::ivalue::Future>> futures(num_tasks);
-  for (size_t task_id = 0; task_id < num_tasks; ++task_id) {
-    futures[task_id] = std::make_shared<c10::ivalue::Future>(c10::NoneType::get());
-  }
-  auto task = [f, &eptr, &err_flag, &futures, begin, end, chunk_size]
+  struct {
+    std::atomic_flag err_flag = ATOMIC_FLAG_INIT;
+    std::exception_ptr eptr;
+    std::mutex mutex;
+    volatile size_t remaining;
+    std::condition_variable cv;
+  } state;
+
+  auto task = [f, &state, begin, end, chunk_size]
      (int /* unused */, size_t task_id) {
    int64_t local_start = begin + task_id * chunk_size;
    if (local_start < end) {
@@ -140,21 +142,30 @@ void _parallel_run(
        ParallelRegionGuard guard(task_id);
        f(local_start, local_end, task_id);
      } catch (...) {
-        if (!err_flag.test_and_set()) {
-          eptr = std::current_exception();
+        if (!state.err_flag.test_and_set()) {
+          state.eptr = std::current_exception();
        }
      }
    }
-    futures[task_id]->markCompleted();
+    {
+      std::unique_lock<std::mutex> lk(state.mutex);
+      if (--state.remaining == 0) {
+        state.cv.notify_one();
+      }
+    }
  };
+  state.remaining = num_tasks;
  _run_with_pool(task, num_tasks);

  // Wait for all tasks to finish.
-  for (size_t task_id = 0; task_id < num_tasks; ++task_id) {
-    futures[task_id]->wait();
+  {
+    std::unique_lock<std::mutex> lk(state.mutex);
+    if (state.remaining != 0) {
+      state.cv.wait(lk);
+    }
  }
-  if (eptr) {
-    std::rethrow_exception(eptr);
+  if (state.eptr) {
+    std::rethrow_exception(state.eptr);
  }
}

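The hunk above replaces one heap-allocated future per task with a single shared counter: each worker decrements `remaining` under the mutex, and the last one signals the condition variable the launcher is waiting on. A minimal standalone sketch of that pattern (hypothetical names; plain std::thread stands in for the ATen thread pool):

#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

int main() {
  struct {
    std::mutex mutex;
    std::condition_variable cv;
    size_t remaining;
  } state;

  const size_t num_tasks = 4;
  state.remaining = num_tasks;

  std::vector<std::thread> pool;
  for (size_t task_id = 0; task_id < num_tasks; ++task_id) {
    pool.emplace_back([&state] {
      // ... the real code runs f(local_start, local_end, task_id) here ...
      std::unique_lock<std::mutex> lk(state.mutex);
      if (--state.remaining == 0) {
        state.cv.notify_one();  // last finisher wakes the waiting launcher
      }
    });
  }

  {
    std::unique_lock<std::mutex> lk(state.mutex);
    state.cv.wait(lk, [&] { return state.remaining == 0; });
  }
  for (auto& t : pool) { t.join(); }
  return 0;
}
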
@@ -16,14 +16,6 @@
#include <numeric>
#include <memory>

-#if defined(__clang__)
-#define __ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero")))
-#define __ubsan_ignore_vptr__ __attribute__((no_sanitize("vptr")))
-#else
-#define __ubsan_ignore_float_divide_by_zero__
-#define __ubsan_ignore_vptr__
-#endif
-
#define AT_DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName&) = delete; \
  void operator=(const TypeName&) = delete

@@ -20,6 +20,10 @@ void registerCustomClass(at::ClassTypePtr class_type) {
}

at::ClassTypePtr getCustomClass(const std::string& name) {
+  // BC hack so we can upgrade a binary internally
+  if (name == "__torch__.torch.classes.SentencePiece") {
+    return getCustomClass("__torch__.torch.classes.fb.SentencePiece");
+  }
  return customClasses().count(name) ? customClasses()[name] : nullptr;
}

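The redirect above is a small backward-compatibility shim: a legacy serialized class name is transparently forwarded to its new, namespaced name before the registry lookup. The same pattern in a standalone toy (hypothetical names):

#include <string>
#include <unordered_map>

std::unordered_map<std::string, int>& registry() {
  static std::unordered_map<std::string, int> r;
  return r;
}

int* lookup(const std::string& name) {
  if (name == "legacy.OldName") {
    // BC hack: old binaries still ask for the pre-namespace name.
    return lookup("ns.NewName");
  }
  auto it = registry().find(name);
  return it == registry().end() ? nullptr : &it->second;
}

int main() {
  registry()["ns.NewName"] = 1;
  return lookup("legacy.OldName") != nullptr ? 0 : 1;
}
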
@@ -15,6 +15,7 @@
#include <c10/util/math_compat.h>
#include <ATen/native/cpu/zmath.h>
#include <c10/util/TypeCast.h>
+#include <c10/macros/Macros.h>

#if defined(__GNUC__)
#define __at_align32__ __attribute__((aligned(32)))

@@ -145,7 +145,7 @@ private:

std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d);

-class FilterDescriptor
+class TORCH_CUDA_API FilterDescriptor
  : public Descriptor<cudnnFilterStruct,
                      &cudnnCreateFilterDescriptor,
                      &cudnnDestroyFilterDescriptor>

@@ -698,17 +698,34 @@ Tensor leaky_relu_backward(
}

std::tuple<Tensor, Tensor> log_sigmoid_forward_cpu(const Tensor& input) {
-  auto result = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  auto buffer = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  // FIXME: do these actually need to be zeros_like or can they be empty_like?
+  auto result = at::zeros_like(input, at::MemoryFormat::Contiguous);
+  auto buffer = at::zeros_like(input, at::MemoryFormat::Contiguous);
  log_sigmoid_cpu_stub(kCPU, result, buffer, input.contiguous());
  return std::make_tuple(result, buffer);
}

std::tuple<Tensor&, Tensor&> log_sigmoid_forward_out_cpu(Tensor& result, Tensor& buffer, const Tensor& input) {
-  log_sigmoid_cpu_stub(kCPU, result, buffer, input);
+  result.resize_as_(input);
+  buffer.resize_as_(input, at::MemoryFormat::Contiguous);
+  TORCH_CHECK(buffer.is_contiguous(), "Contiguous buffer required for log_sigmoid with out parameter");
+  Tensor result_tmp = result.is_contiguous() ? result : at::empty_like(result, at::MemoryFormat::Contiguous);
+  log_sigmoid_cpu_stub(kCPU, result_tmp, buffer, input.contiguous());
+  if (!result.is_contiguous()) {
+    result.copy_(result_tmp);
+  }
  return std::forward_as_tuple(result, buffer);
}

+Tensor & log_sigmoid_out(Tensor & output, const Tensor & self) {
+  Tensor buffer = at::empty({0}, self.options());
+  return std::get<0>(at::log_sigmoid_forward_out(output, buffer, self));
+}
+
+Tensor log_sigmoid(const Tensor & self) {
+  return std::get<0>(at::log_sigmoid_forward(self));
+}
+
Tensor log_sigmoid_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& buffer) {
  Tensor grad_input;
  auto iter = at::TensorIterator();

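The rewritten out-variant above follows a common ATen pattern: when the kernel requires contiguous memory but the caller's `out` tensor is strided, compute into a contiguous temporary and copy back. A hedged sketch of that pattern as a standalone helper (hypothetical function name; at::log_sigmoid is only a stand-in for the stub call):

#include <ATen/ATen.h>

at::Tensor& contiguous_out_helper(at::Tensor& result, const at::Tensor& input) {
  result.resize_as_(input);
  at::Tensor result_tmp = result.is_contiguous()
      ? result
      : at::empty_like(result, at::MemoryFormat::Contiguous);
  result_tmp.copy_(at::log_sigmoid(input.contiguous()));
  if (!result.is_contiguous()) {
    result.copy_(result_tmp);  // scatter back into the strided output
  }
  return result;
}
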
@@ -138,6 +138,10 @@ Tensor true_divide(const Tensor& self, const Tensor& divisor) {
  return iter.output();
}

+Tensor& true_divide_(Tensor& self, const Tensor& divisor) {
+  return native::true_divide_out(self, self, divisor);
+}
+
Tensor& floor_divide_out(Tensor& result, const Tensor& self, const Tensor& other) {
  auto iter = TensorIterator::binary_op(result, self, other,
    /*check_mem_overlap=*/true);
@@ -731,7 +735,11 @@ Tensor& fmod_(Tensor& self, Scalar other) {
}

Tensor true_divide(const Tensor& self, Scalar divisor) {
-  return at::true_divide(self, wrapped_scalar_tensor(divisor)); // redispatch!
+  return self.true_divide(wrapped_scalar_tensor(divisor)); // redispatch!
}

+Tensor& true_divide_(Tensor& self, Scalar divisor) {
+  return self.true_divide_(wrapped_scalar_tensor(divisor)); // redispatch!
+}
+
}

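These hunks wire up the method and in-place variants of true_divide, which always performs floating ("true") division even for integer inputs. A hedged usage sketch (libtorch):

#include <torch/torch.h>
#include <iostream>

int main() {
  auto t = torch::tensor({5, 3}, torch::kLong);
  auto q = t.true_divide(2);  // method variant: float result [2.5, 1.5]
  std::cout << q << std::endl;

  auto f = torch::tensor({5.0, 3.0});
  f.true_divide_(2);          // in-place variant (float tensor, so the
                              // result type is representable in place)
  std::cout << f << std::endl;
  return 0;
}
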
@@ -70,8 +70,8 @@ struct CAFFE2_API DispatchStub<rT (*)(Args...), T> {
      // they will still compute the same value for cpu_dispatch_ptr.
      if (!cpu_dispatch_ptr.load(std::memory_order_relaxed)) {
        FnPtr tmp_cpu_dispatch_ptr = nullptr;
-        cpu_dispatch_ptr.compare_exchange_weak(
-            tmp_cpu_dispatch_ptr, choose_cpu_impl(), std::memory_order_relaxed);
+        while(!cpu_dispatch_ptr.compare_exchange_weak(
+            tmp_cpu_dispatch_ptr, choose_cpu_impl(), std::memory_order_relaxed));
      }
      return (*cpu_dispatch_ptr)(std::forward<ArgTypes>(args)...);
    } else if (device_type == DeviceType::CUDA) {

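The added while-loop matters because compare_exchange_weak may fail spuriously even when the stored value equals the expected one; without the retry, a spurious failure could leave cpu_dispatch_ptr null and the dereference below would crash. A standalone toy illustrating the idiom (hypothetical names, not the ATen code):

#include <atomic>
#include <cassert>

using FnPtr = int (*)(int);

int doubled(int x) { return x * 2; }
FnPtr choose_impl() { return &doubled; }

std::atomic<FnPtr> dispatch_ptr{nullptr};

int call(int x) {
  if (!dispatch_ptr.load(std::memory_order_relaxed)) {
    FnPtr expected = nullptr;
    // Retry on spurious failure; every racing thread installs the same
    // pointer, so the loop terminates quickly.
    while (!dispatch_ptr.compare_exchange_weak(
        expected, choose_impl(), std::memory_order_relaxed)) {
    }
  }
  return (*dispatch_ptr.load(std::memory_order_relaxed))(x);
}

int main() {
  assert(call(21) == 42);
  return 0;
}
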
@@ -31,15 +31,6 @@ Tensor nll_loss2d(const Tensor & self, const Tensor & target, const Tensor & wei
  return std::get<0>(at::nll_loss2d_forward(self, target, weight, reduction, ignore_index));
}

-Tensor & log_sigmoid_out(Tensor & output, const Tensor & self) {
-  Tensor buffer = at::empty({0}, self.options());
-  return std::get<0>(at::log_sigmoid_forward_out(output, buffer, self));
-}
-
-Tensor log_sigmoid(const Tensor & self) {
-  return std::get<0>(at::log_sigmoid_forward(self));
-}
-
Tensor & thnn_conv2d_out(Tensor & output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding) {
  Tensor finput = at::empty({0}, self.options());
  Tensor fgrad_input = at::empty({0}, self.options());

@@ -533,7 +533,7 @@ Tensor frobenius_norm(const Tensor& self, IntArrayRef dim, bool keepdim) {
    return at::norm(self, 2, dim, keepdim, self.scalar_type());
  }
  if (self.is_complex()){
-    return at::sqrt(at::sum((self.conj() * self).real(), dim, keepdim));
+    return at::sqrt(at::sum(at::real(self.conj() * self), dim, keepdim));
  } else {
    return at::sqrt(at::sum((self * self), dim, keepdim));
  }
@@ -553,7 +553,7 @@ Tensor &frobenius_norm_out(
    return at::norm_out(result, self, 2, dim, keepdim, self.scalar_type());
  }
  if (self.is_complex()){
-    return at::sqrt_out(result, at::sum((self.conj() * self).real(), dim, keepdim));
+    return at::sqrt_out(result, at::sum(at::real(self.conj() * self), dim, keepdim));
  } else {
    return at::sqrt_out(result, at::sum((self * self), dim, keepdim));
  }

@@ -799,7 +799,7 @@ static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim,

  if (at::isComplexType(self.scalar_type())){
    ScalarType dtype = c10::toValueType(get_dtype(result, self, {}, true));
-    Tensor real_in = self.real().to(dtype);
+    Tensor real_in = at::real(self).to(dtype);
    Tensor real_out = at::empty({0}, self.options().dtype(dtype));
    auto iter = make_reduction("std or var", real_out, real_in, dim, keepdim, dtype);
    if (iter.numel() == 0) {
@@ -807,7 +807,7 @@ static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim,
    } else {
      std_var_stub(iter.device_type(), iter, unbiased, false);
    }
-    Tensor imag_in = self.imag().to(dtype);
+    Tensor imag_in = at::imag(self).to(dtype);
    Tensor imag_out = at::empty({0}, self.options().dtype(dtype));
    iter = make_reduction("std or var", imag_out, imag_in, dim, keepdim, dtype);
    if (iter.numel() == 0) {
@@ -845,7 +845,7 @@ static std::tuple<Tensor&,Tensor&> std_var_mean_out(const char* fname, Tensor &r
           ".");
  if (at::isComplexType(self.scalar_type())){
    ScalarType dtype = c10::toValueType(get_dtype(result1, self, {}, true));
-    Tensor real_in = self.real().to(dtype);
+    Tensor real_in = at::real(self).to(dtype);
    Tensor real_out_var = at::empty({0}, self.options().dtype(dtype));
    Tensor real_out_mean = at::empty({0}, self.options().dtype(dtype));
    auto iter = make_reduction(fname, real_out_var, real_out_mean, real_in, dim, keepdim, dtype);
@@ -855,7 +855,7 @@ static std::tuple<Tensor&,Tensor&> std_var_mean_out(const char* fname, Tensor &r
    } else {
      std_var_stub(iter.device_type(), iter, unbiased, false);
    }
-    Tensor imag_in = self.imag().to(dtype);
+    Tensor imag_in = at::imag(self).to(dtype);
    Tensor imag_out_var = at::empty({0}, self.options().dtype(dtype));
    Tensor imag_out_mean = at::empty({0}, self.options().dtype(dtype));
    iter = make_reduction(fname, imag_out_var, imag_out_mean, imag_in, dim, keepdim, dtype);

@@ -33,7 +33,7 @@ static inline Tensor to_impl(const Tensor& self, const TensorOptions& options, b
    if (self.is_non_overlapping_and_dense()) {
      // Copy all strides
      auto r = at::empty_strided(self.sizes(), self.strides(), options.memory_format(c10::nullopt));
-      r.copy_(self);
+      r.copy_(self, non_blocking);
      return r;
    } else {
      memory_format = self.suggest_memory_format();

@@ -99,7 +99,7 @@ Tensor _dim_arange(const Tensor& like, int64_t dim) {

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tensor empty_cpu(IntArrayRef size, const TensorOptions& options_, c10::optional<c10::MemoryFormat> optional_memory_format) {
-
+  TORCH_CHECK(!isComplexType(at::typeMetaToScalarType(options_.dtype())), "Complex dtype not supported.");
  TORCH_CHECK(
    !(options_.has_memory_format() && optional_memory_format.has_value()),
    "Cannot set memory_format both in TensorOptions and explicit argument; please delete "

@@ -98,6 +98,15 @@ Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) {
        "output memory locations. Found overlap in input tensor ", i);
  }

+  // Dtypes should be the same
+  const auto first_in_cat = tensors[0];
+  for (int64_t i = 1; i < tensors.size(); i++) {
+    TORCH_CHECK(first_in_cat.dtype() == tensors[i].dtype(),
+              "Expected object of scalar type ", first_in_cat.dtype(),
+              " but got scalar type ", tensors[i].dtype(),
+              " for sequence element ", i, ".");
+  }
+
  auto should_skip = [](const Tensor& t) { return t.numel() == 0 && t.dim() == 1; };
  for (auto const &tensor : tensors) {
    if (should_skip(tensor)) {

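With this check in place, concatenating tensors of different dtypes fails fast with a descriptive error instead of reading mismatched memory. An illustrative libtorch snippet (assumes the check above is present, as in this release):

#include <torch/torch.h>
#include <iostream>

int main() {
  auto a = torch::ones({2}, torch::kFloat);
  auto b = torch::ones({2}, torch::kDouble);
  try {
    auto c = torch::cat({a, b}, 0);  // mixed dtypes
  } catch (const c10::Error& e) {
    std::cout << "cat rejected mixed dtypes, as expected" << std::endl;
  }
  return 0;
}
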
@@ -73,11 +73,17 @@ Tensor& abs_(Tensor& self) { return unary_op_impl_(self, at::abs_out); }
Tensor& angle_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, angle_stub); }
Tensor angle(const Tensor& self) { return unary_op_impl(self, at::angle_out); }

-Tensor& real_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, real_stub); }
-Tensor real(const Tensor& self) { return unary_op_impl(self, at::real_out); }
+Tensor real(const Tensor& self) {
+  TORCH_CHECK(!self.is_complex(), "real is not yet implemented for complex tensors.");
+  return self;
+}

-Tensor& imag_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, imag_stub); }
-Tensor imag(const Tensor& self) { return unary_op_impl(self, at::imag_out); }
+Tensor imag(const Tensor& self) {
+  TORCH_CHECK(false, "imag is not yet implemented.");
+
+  // Note: unreachable
+  return at::zeros_like(self);
+}

Tensor& conj_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, conj_stub); }
Tensor conj(const Tensor& self) { return unary_op_impl(self, at::conj_out); }

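After this change at::real is an identity on non-complex tensors, and both functions reject complex input until complex support is complete. A small usage sketch (libtorch):

#include <torch/torch.h>
#include <cassert>

int main() {
  auto t = torch::tensor({1.0, -2.0});
  auto r = at::real(t);  // identity for non-complex input
  assert(r.equal(t));
  return 0;
}
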
@@ -7,6 +7,7 @@
#include <ATen/native/TensorIterator.h>
#include <ATen/native/BinaryOps.h>
#include <ATen/native/cpu/Loops.h>
+#include <c10/macros/Macros.h>

namespace at { namespace native {
namespace {

@@ -4,7 +4,7 @@
#include <ATen/native/cuda/zmath.cuh>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/BinaryOps.h>
-
+#include <c10/macros/Macros.h>

// NOTE: CUDA on Windows requires that the enclosing function
// of a __device__ lambda not have internal linkage.
@@ -69,7 +69,6 @@ void remainder_kernel_cuda(TensorIterator& iter) {
    AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "remainder_cuda", [&]() {
      using thrust_t = typename ztype_cuda<scalar_t>::thrust_t;
      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(thrust_t a, thrust_t b) -> thrust_t {
-        CUDA_KERNEL_ASSERT(b != 0);
        thrust_t r = a % b;
        if ((r != 0) && ((r < 0) != (b < 0))) {
          r += b;

@@ -358,7 +358,7 @@ void max_pool2d_with_indices_out_cuda_template(

  Tensor input = input_.contiguous(memory_format);

-  const int64_t in_stride_n = input.stride(-4);
+  const int64_t in_stride_n = input_.ndimension() == 4 ? input.stride(-4) : 0;
  const int64_t in_stride_c = input.stride(-3);
  const int64_t in_stride_h = input.stride(-2);
  const int64_t in_stride_w = input.stride(-1);
@@ -506,7 +506,7 @@ void max_pool2d_with_indices_backward_out_cuda_template(
  const int64_t inputHeight = input.size(-2);
  const int64_t inputWidth = input.size(-1);

-  const int64_t in_stride_n = input.stride(-4);
+  const int64_t in_stride_n = input.ndimension() == 4 ? input.stride(-4) : 0;
  const int64_t in_stride_c = input.stride(-3);
  const int64_t in_stride_h = input.stride(-2);
  const int64_t in_stride_w = input.stride(-1);

@@ -198,7 +198,7 @@ void index_put_accum_kernel(Tensor & self, TensorList indices, const Tensor & va
      using device_ptr = thrust::device_ptr<int64_t>;
      const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-      linearIndex.div_(sliceSize);
+      linearIndex.floor_divide_(sliceSize);
      {
      sorted_indices.copy_(linearIndex);
      auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA());

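Switching the in-place division on the index tensor to floor_divide_ keeps the arithmetic integral while plain div_ on integer tensors was being migrated toward true (floating) division; for the non-negative linear indices used here, floor and truncating division coincide. A hedged libtorch illustration:

#include <torch/torch.h>
#include <iostream>

int main() {
  auto linear_index = torch::tensor({0, 5, 10, 15}, torch::kLong);
  linear_index.floor_divide_(5);  // stays integral: [0, 1, 2, 3]
  std::cout << linear_index << std::endl;
  return 0;
}
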
@@ -431,13 +431,12 @@ __global__ void batch_norm_backward_reduce_kernel(
    const GenericPackedTensorAccessor<input_scalar_t, 3, DefaultPtrTraits, index_t> grad_output,
    GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> mean,
    GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> invstd,
-    GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> mean_dy,
-    GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> mean_dy_xmu,
+    GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> sum_dy,
+    GenericPackedTensorAccessor<stat_accscalar_t, 1, DefaultPtrTraits, index_t> sum_dy_xmu,
    GenericPackedTensorAccessor<stat_scalar_t, 1, DefaultPtrTraits, index_t> grad_weight,
    GenericPackedTensorAccessor<stat_scalar_t, 1, DefaultPtrTraits, index_t> grad_bias) {

  index_t plane = blockIdx.x;
  index_t N = input.size(0) * input.size(2);

  stat_accscalar_t r_mean = mean[plane];
  stat_accscalar_t factor = invstd[plane];
@@ -446,7 +445,6 @@ __global__ void batch_norm_backward_reduce_kernel(
  Float2<input_scalar_t, stat_accscalar_t> res = reduce<Float2<input_scalar_t, stat_accscalar_t>, GradOp<input_scalar_t, stat_accscalar_t,
                                                                                   GenericPackedTensorAccessor<input_scalar_t, 3, DefaultPtrTraits, index_t>>>(g, grad_output, plane);

-  stat_accscalar_t norm = stat_accscalar_t(1) / N;
  if (threadIdx.x == 0) {
    if (grad_weight.size(0) > 0) {
      grad_weight[plane] = static_cast<stat_scalar_t>(res.v2 * factor);
@@ -454,11 +452,11 @@ __global__ void batch_norm_backward_reduce_kernel(
    if (grad_bias.size(0) > 0) {
      grad_bias[plane] = static_cast<stat_scalar_t>(res.v1);
    }
-    if (mean_dy.size(0) > 0) {
-      mean_dy[plane] = static_cast<stat_accscalar_t>(res.v1 * norm);
+    if (sum_dy.size(0) > 0) {
+      sum_dy[plane] = static_cast<stat_accscalar_t>(res.v1);
    }
-    if (mean_dy_xmu.size(0) > 0) {
-      mean_dy_xmu[plane] = static_cast<stat_accscalar_t>(res.v2 * norm);
+    if (sum_dy_xmu.size(0) > 0) {
+      sum_dy_xmu[plane] = static_cast<stat_accscalar_t>(res.v2);
    }
  }
}
@@ -740,16 +738,16 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce_cuda_templ

  using stat_accscalar_t = at::acc_type<stat_scalar_t, true>;
  int64_t n_input = input_.size(1);
-  Tensor mean_dy_;
-  Tensor mean_dy_xmu_;
+  Tensor sum_dy_;
+  Tensor sum_dy_xmu_;
  Tensor grad_weight_;
  Tensor grad_bias_;
  auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions
  auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes());

  if (input_g) {
-    mean_dy_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-    mean_dy_xmu_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+    sum_dy_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+    sum_dy_xmu_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  }
  if (weight_g) {
    grad_weight_ = at::empty({n_input}, weight_.options());
@@ -764,8 +762,8 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce_cuda_templ
  auto grad_bias = packed_accessor_or_dummy<stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_bias_);
  auto mean = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_);
  auto invstd = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(invstd_);
-  auto mean_dy = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_dy_);
-  auto mean_dy_xmu = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_dy_xmu_);
+  auto sum_dy = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_);
+  auto sum_dy_xmu = packed_accessor_or_dummy<stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_xmu_);

  auto batch_size = input_reshaped.size(0);
  auto feature_size = input_reshaped.size(2);
@@ -778,10 +776,10 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce_cuda_templ
  const dim3 grid(n_input);

  batch_norm_backward_reduce_kernel<input_scalar_t, stat_scalar_t, stat_accscalar_t, index_t> <<<grid, block, 0, stream>>>
-    (input, grad_output, mean, invstd, mean_dy, mean_dy_xmu, grad_weight, grad_bias);
+    (input, grad_output, mean, invstd, sum_dy, sum_dy_xmu, grad_weight, grad_bias);
  AT_CUDA_CHECK(cudaGetLastError());

-  return std::make_tuple(mean_dy_, mean_dy_xmu_, grad_weight_, grad_bias_);
+  return std::make_tuple(sum_dy_, sum_dy_xmu_, grad_weight_, grad_bias_);
}

template<typename input_scalar_t, typename stat_scalar_t, typename index_t>

@@ -307,6 +307,15 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
                "tensor ", i);
  }

+  // Dtypes should be the same
+  const auto first_in_cat = inputs[0];
+  for (int64_t i = 1; i < inputs.size(); i++) {
+    TORCH_CHECK(first_in_cat.dtype() == inputs[i].dtype(),
+              "Expected object of scalar type ", first_in_cat.dtype(),
+              " but got scalar type ", inputs[i].dtype(),
+              " for sequence element ", i, ".");
+  }
+
  for (int i = 0; i < inputs.size(); i++)
  {
    if (should_skip(inputs[i])) {
@@ -325,6 +334,12 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
  TORCH_CHECK(inputs.size() > 0, "invalid number of inputs ", inputs.size());
  TORCH_CHECK(dimension >= 0, "invalid dimension ", dimension);

+  for (const Tensor& t: inputs) {
+    TORCH_CHECK(t.device() == notSkippedTensor->device(),
+                "All input tensors must be on the same device. Received ",
+                t.device(), " and ", notSkippedTensor->device());
+  }
+
  c10::MemoryFormat memory_format = compute_output_memory_format(inputs);

  std::vector<int64_t> size(notSkippedTensor->sizes().vec());
@@ -355,17 +370,11 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
  // 4. The number of dimensions is <= 4
  // 5. All input tensors are contiguous (output tensor may be non-contig)
  // 6. All input tensors can use 32-bit indexing
-  // 7. All input tensors are on the same device

  const bool all32BitIndexable = std::all_of(inputs.begin(), inputs.end(),
    [] (const Tensor& t) {
      return at::cuda::detail::canUse32BitIndexMath(t);
    });
-  Device firstDevice = notSkippedTensor->device();
-  const bool allSameDevice = std::all_of(inputs.begin(), inputs.end(),
-    [firstDevice](const Tensor& t) {
-      return t.device() == firstDevice;
-    });
  const bool allContiguous = std::all_of(inputs.begin(), inputs.end(),
    [=](const Tensor& t) {
      return !t.defined() || t.is_contiguous(memory_format);
@@ -375,8 +384,7 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
      out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS &&
      at::cuda::detail::canUse32BitIndexMath(out) &&
      allContiguous &&
-      all32BitIndexable &&
-      allSameDevice) {
+      all32BitIndexable) {

    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
        at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16,

@@ -125,7 +125,7 @@ struct TopKTypeConfig<at::Half> {
  static inline __device__ RadixType convert(at::Half v) {
#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)
    RadixType x = __half_as_ushort(v);
-    RadixType mask = -((x >> 15)) | 0x8000;
+    RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000;
    return (v == v) ? (x ^ mask) : 0xffff;
#else
    assert(false);
@@ -135,7 +135,7 @@ struct TopKTypeConfig<at::Half> {

  static inline __device__ at::Half deconvert(RadixType v) {
#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__)
-    RadixType mask = ((v >> 15) - 1) | 0x8000;
+    RadixType mask = (v & 0x00008000) ? 0x00008000 : 0x0000ffff;
    return __ushort_as_half(v ^ mask);
#else
    assert(false);

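Both replacements implement the standard order-preserving radix mapping for IEEE floating point: flip only the sign bit of non-negative values and flip all bits of negative values, so unsigned integer comparison agrees with floating-point order. A 32-bit float analogue in plain C++ (illustrative only; the hunk above does the same with 16 bits):

#include <cassert>
#include <cstdint>
#include <cstring>

uint32_t to_radix(float f) {
  uint32_t x;
  std::memcpy(&x, &f, sizeof(x));
  // Negative: flip every bit (reverses their order); non-negative: flip
  // just the sign bit (moves them above all negatives).
  uint32_t mask = (x & 0x80000000u) ? 0xffffffffu : 0x80000000u;
  return x ^ mask;
}

int main() {
  assert(to_radix(-2.0f) < to_radix(-1.0f));
  assert(to_radix(-1.0f) < to_radix(0.0f));
  assert(to_radix(0.0f) < to_radix(1.0f));
  assert(to_radix(1.0f) < to_radix(2.0f));
  return 0;
}
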
@@ -44,6 +44,7 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) {
}

Tensor empty_cuda(IntArrayRef size, const TensorOptions& options, c10::optional<MemoryFormat> optional_memory_format) {
+  TORCH_CHECK(!isComplexType(at::typeMetaToScalarType(options.dtype())), "Complex dtype not supported.");
  AT_ASSERT(options.device().type() == at::DeviceType::CUDA);
  TORCH_INTERNAL_ASSERT(impl::variable_excluded_from_dispatch());
  TORCH_CHECK(!options.pinned_memory(), "Only dense CPU tensors can be pinned");

@@ -238,18 +238,12 @@

- func: real(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  supports_named_tensor: True

-- func: real.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  supports_named_tensor: True

- func: imag(Tensor self) -> Tensor
  use_c10_dispatcher: full
  variants: function, method
  supports_named_tensor: True

-- func: imag.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  supports_named_tensor: True

- func: conj(Tensor self) -> Tensor

@@ -2872,7 +2866,7 @@

- func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor
  use_c10_dispatcher: full
-  variants: function
+  variants: function, method
  dispatch:
    CPU: true_divide
    CUDA: true_divide
@@ -2880,6 +2874,15 @@
    SparseCUDA: true_divide_sparse
  supports_named_tensor: True

+- func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+  variants: method
+  dispatch:
+    CPU: true_divide_
+    CUDA: true_divide_
+    SparseCPU: true_divide_sparse_
+    SparseCUDA: true_divide_sparse_
+  supports_named_tensor: True
+
- func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: true_divide_out
@@ -2890,7 +2893,11 @@

- func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor
  use_c10_dispatcher: full
-  variants: function
+  variants: function, method
  supports_named_tensor: True

+- func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+  variants: method
+  supports_named_tensor: True
+
- func: trunc(Tensor self) -> Tensor

@@ -272,6 +272,10 @@ SparseTensor& true_divide_out_sparse_scalar(
  return true_divide_out_sparse_zerodim(result, dividend, wrapped_scalar_tensor(divisor));
}

+Tensor& true_divide_sparse_(Tensor& self, const Tensor& divisor) {
+  return true_divide_out_sparse_zerodim(self, self, divisor);
+}
+
// --------------------------------------------------------------------
// floor_divide(SparseTensor, Scalar)
// --------------------------------------------------------------------

@@ -138,7 +138,7 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) {
      // broadcasting logic; instead, it will blast the elements from one
      // to the other so long as the numel is the same
      indicesSlice.copy_(indices1D);
-      indices1D.div_(self.size(d));
+      indices1D.floor_divide_(self.size(d));
      indicesSlice.add_(indices1D, -self.size(d));
    }
  }

@@ -14,7 +14,7 @@ namespace xnnpack {
namespace {
torch::jit::class_<XNNPackLinearOpContext> register_xnnpack_linear_op_context_class() {
  static auto register_linear_op_context_class =
-      torch::jit::class_<XNNPackLinearOpContext>("XNNPackLinearOpContext")
+      torch::jit::class_<XNNPackLinearOpContext>("xnnpack", "XNNPackLinearOpContext")
          .def_pickle(
              [](const c10::intrusive_ptr<XNNPackLinearOpContext>& op_context)
                  -> SerializationTypeLinearPrePack { // __getstate__
@@ -38,7 +38,7 @@ torch::jit::class_<XNNPackLinearOpContext> register_xnnpack_linear_op_context_cl

torch::jit::class_<XNNPackConv2dOpContext> register_xnnpack_conv2d_op_context_class() {
  static auto register_conv2d_op_context_class =
-      torch::jit::class_<XNNPackConv2dOpContext>("XNNPackConv2dOpContext")
+      torch::jit::class_<XNNPackConv2dOpContext>("xnnpack", "XNNPackConv2dOpContext")
          .def_pickle(
              [](const c10::intrusive_ptr<XNNPackConv2dOpContext>& op_context)
                  -> SerializationTypeConv2dPrePack { // __getstate__
@@ -74,25 +74,25 @@ static auto registry =
  // Registering under _xnnpack namespace for now. As we add more backend requiring similar functionality
  // We can refactor the code and use a better namespace.
    torch::RegisterOperators()
-        .op("_xnnpack::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.XNNPackLinearOpContext",
+        .op("_xnnpack::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.xnnpack.XNNPackLinearOpContext",
            torch::RegisterOperators::options()
            .aliasAnalysis(at::AliasAnalysisKind::PURE_FUNCTION)
            .kernel<internal::linear::LinearPrePack>(
                DispatchKey::CPUTensorId))
-        .op("_xnnpack::linear_packed(Tensor X, __torch__.torch.classes.XNNPackLinearOpContext W_prepack) -> Tensor Y",
+        .op("_xnnpack::linear_packed(Tensor X, __torch__.torch.classes.xnnpack.XNNPackLinearOpContext W_prepack) -> Tensor Y",
            torch::RegisterOperators::options()
            .aliasAnalysis(at::AliasAnalysisKind::PURE_FUNCTION)
            .kernel<internal::linear::LinearPacked>(
                DispatchKey::CPUTensorId))
        .op("_xnnpack::conv2d_prepack(Tensor W, Tensor? B, int[2] stride, "
            "int[2] padding, int[2] dilation, int groups) "
-            "-> __torch__.torch.classes.XNNPackConv2dOpContext",
+            "-> __torch__.torch.classes.xnnpack.XNNPackConv2dOpContext",
            torch::RegisterOperators::options()
            .aliasAnalysis(at::AliasAnalysisKind::PURE_FUNCTION)
            .kernel<internal::convolution2d::Conv2dPrePack>(
                DispatchKey::CPUTensorId))
        .op("_xnnpack::conv2d_packed(Tensor X, "
-            "__torch__.torch.classes.XNNPackConv2dOpContext W_prepack) -> Tensor Y",
+            "__torch__.torch.classes.xnnpack.XNNPackConv2dOpContext W_prepack) -> Tensor Y",
            torch::RegisterOperators::options()
            .aliasAnalysis(at::AliasAnalysisKind::PURE_FUNCTION)
            .kernel<internal::convolution2d::Conv2dPacked>(

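The rename to a two-argument registration gives the class a proper namespace, so its TorchScript type name becomes __torch__.torch.classes.xnnpack.<ClassName> rather than living at the top level. A hedged sketch of the same pattern for a hypothetical class (the class name, namespace, and exact 1.5-era header path are assumptions):

#include <torch/custom_class.h>

struct MyOpContext : torch::CustomClassHolder {
  int64_t value = 0;  // hypothetical payload
};

// Registers __torch__.torch.classes.demo.MyOpContext
static auto registration =
    torch::jit::class_<MyOpContext>("demo", "MyOpContext")
        .def(torch::jit::init<>());
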
@@ -423,6 +423,85 @@ class CAFFE2_API Tensor {

  // ~~~~~ Autograd API ~~~~~

+  /// \fn bool is_leaf() const;
+  ///
+  /// All Tensors that have `requires_grad()` which is ``false`` will be leaf Tensors by convention.
+  ///
+  /// For Tensors that have `requires_grad()` which is ``true``, they will be leaf Tensors if they were
+  /// created by the user. This means that they are not the result of an operation and so
+  /// `grad_fn()` is `nullptr`.
+  ///
+  /// Only leaf Tensors will have their `grad()` populated during a call to `backward()`.
+  /// To get `grad()` populated for non-leaf Tensors, you can use `retain_grad()`.
+  ///
+  /// Example:
+  /// @code
+  /// auto a = torch::rand(10, torch::requires_grad());
+  /// std::cout << a.is_leaf() << std::endl; // prints `true`
+  ///
+  /// auto b = torch::rand(10, torch::requires_grad()).to(torch::kCUDA);
+  /// std::cout << b.is_leaf() << std::endl; // prints `false`
+  /// // b was created by the operation that cast a cpu Tensor into a cuda Tensor
+  ///
+  /// auto c = torch::rand(10, torch::requires_grad()) + 2;
+  /// std::cout << c.is_leaf() << std::endl; // prints `false`
+  /// // c was created by the addition operation
+  ///
+  /// auto d = torch::rand(10).cuda();
+  /// std::cout << d.is_leaf() << std::endl; // prints `true`
+  /// // d does not require gradients and so has no operation creating it (that is tracked by the autograd engine)
+  ///
+  /// auto e = torch::rand(10).cuda().requires_grad_();
+  /// std::cout << e.is_leaf() << std::endl; // prints `true`
+  /// // e requires gradients and has no operations creating it
+  ///
+  /// auto f = torch::rand(10, torch::device(torch::kCUDA).requires_grad(true));
+  /// std::cout << f.is_leaf() << std::endl; // prints `true`
+  /// // f requires grad, has no operation creating it
+  /// @endcode
+
+  /// \fn void backward(const Tensor & gradient={}, bool keep_graph=false, bool create_graph=false) const;
+  ///
+  /// Computes the gradient of current tensor with respect to graph leaves.
+  ///
+  /// The graph is differentiated using the chain rule. If the tensor is
+  /// non-scalar (i.e. its data has more than one element) and requires
+  /// gradient, the function additionally requires specifying ``gradient``.
+  /// It should be a tensor of matching type and location, that contains
+  /// the gradient of the differentiated function w.r.t. this Tensor.
+  ///
+  /// This function accumulates gradients in the leaves - you might need to
+  /// zero them before calling it.
+  ///
+  /// \param gradient Gradient w.r.t. the
+  ///     tensor. If it is a tensor, it will be automatically converted
+  ///     to a Tensor that does not require grad unless ``create_graph`` is True.
+  ///     None values can be specified for scalar Tensors or ones that
+  ///     don't require grad. If a None value would be acceptable then
+  ///     this argument is optional.
+  /// \param keep_graph If ``false``, the graph used to compute
+  ///     the grads will be freed. Note that in nearly all cases setting
+  ///     this option to True is not needed and often can be worked around
+  ///     in a much more efficient way. Defaults to the value of
+  ///     ``create_graph``.
+  /// \param create_graph If ``true``, graph of the derivative will
+  ///     be constructed, allowing to compute higher order derivative
+  ///     products. Defaults to ``false``.
+
+  /// \fn Tensor detach() const;
+  ///
+  /// Returns a new Tensor, detached from the current graph.
+  /// The result will never require gradient.
+
+  /// \fn Tensor & detach_() const;
+  ///
+  /// Detaches the Tensor from the graph that created it, making it a leaf.
+  /// Views cannot be detached in-place.
+
+  /// \fn void retain_grad() const;
+  ///
+  /// Enables .grad() for non-leaf Tensors.
+
  Tensor& set_requires_grad(bool requires_grad) {
    impl_->set_requires_grad(requires_grad);
    return *this;
@@ -431,9 +510,16 @@ class CAFFE2_API Tensor {
    return impl_->requires_grad();
  }

+  /// Return a mutable reference to the gradient. This is conventionally
+  /// used as `t.grad() = x` to set a gradient to a completely new tensor.
  Tensor& grad() {
    return impl_->grad();
  }

+  /// This function returns an undefined tensor by default and returns a defined tensor
+  /// the first time a call to `backward()` computes gradients for this Tensor.
+  /// The attribute will then contain the gradients computed and future calls
+  /// to `backward()` will accumulate (add) gradients into it.
  const Tensor& grad() const {
    return impl_->grad();
  }
@@ -505,11 +591,38 @@ class CAFFE2_API Tensor {
  template <typename T>
  using hook_return_var_t = std::enable_if_t<std::is_same<typename std::result_of<T&(Tensor)>::type, Tensor>::value, unsigned>;

-  // Returns the index of the hook in the list which can be used to remove hook
-  // Register a hook with no return value
+  /// Registers a backward hook.
+  ///
+  /// The hook will be called every time a gradient with respect to the Tensor is computed.
+  /// The hook should have one of the following signatures:
+  /// ```
+  /// hook(Tensor grad) -> Tensor
+  /// ```
+  /// ```
+  /// hook(Tensor grad) -> void
+  /// ```
+  /// The hook should not modify its argument, but it can optionally return a new gradient
+  /// which will be used in place of `grad`.
+  ///
+  /// This function returns the index of the hook in the list which can be used to remove hook.
+  ///
+  /// Example:
+  /// @code
+  /// auto v = torch::tensor({0., 0., 0.}, torch::requires_grad());
+  /// auto h = v.register_hook([](torch::Tensor grad){ return grad * 2; }); // double the gradient
+  /// v.backward(torch::tensor({1., 2., 3.}));
+  /// // This prints:
+  /// // ```
+  /// //  2
+  /// //  4
+  /// //  6
+  /// // [ CPUFloatType{3} ]
+  /// // ```
+  /// std::cout << v.grad() << std::endl;
+  /// v.remove_hook(h);  // removes the hook
+  /// @endcode
  template <typename T>
  hook_return_void_t<T> register_hook(T&& hook) const;
  // Register a hook with variable return value
  template <typename T>
  hook_return_var_t<T> register_hook(T&& hook) const;

@@ -518,7 +631,7 @@ private:

public:

-  // Remove hook at given position
+  /// Remove hook at given position
  void remove_hook(unsigned pos) const;

  // View Variables

@@ -69,12 +69,6 @@
# define TH_UNUSED
#endif

-#if defined(__clang__)
-#define __ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero")))
-#else
-#define __ubsan_ignore_float_divide_by_zero__
-#endif
-
#ifndef M_PI
# define M_PI 3.14159265358979323846
#endif

@@ -9,7 +9,7 @@ set(extra_src)
# loop over all types
foreach(THC_TYPE Byte Char Short Int Long Half Float Double)
   # loop over files which need to be split between types (because of long compile times)
-   foreach(THC_FILE TensorSort TensorMathPointwise TensorMathReduce TensorMasked)
+   foreach(THC_FILE TensorSort TensorMathPointwise TensorMathReduce TensorMasked TensorTopK)
      if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu")
         FILE(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu"
           "#include <THC/THC${THC_FILE}.cuh>\n#include <THC/THCTensor.hpp>\n\n#include <THC/generic/THC${THC_FILE}.cu>\n#include <THC/THCGenerate${THC_TYPE}Type.h>\n")
@@ -56,7 +56,6 @@ set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS}
  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorIndex.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorRandom.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorScatterGather.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorTopK.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorSort.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/THCSortUtils.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMode.cu

@ -1,19 +0,0 @@
 | 
			
		||||
#include <THC/THC.h>
 | 
			
		||||
#include <THC/THCReduceApplyUtils.cuh>
 | 
			
		||||
#include <THC/THCTensorCopy.h>
 | 
			
		||||
#include <THC/THCTensorMath.h>
 | 
			
		||||
#include <THC/THCAsmUtils.cuh>
 | 
			
		||||
#include <THC/THCScanUtils.cuh>
 | 
			
		||||
#include <THC/THCTensorTypeUtils.cuh>
 | 
			
		||||
#include <THC/THCTensorMathReduce.cuh>
 | 
			
		||||
#include <ATen/WrapDimUtils.h>
 | 
			
		||||
#include <algorithm> // for std::min
 | 
			
		||||
 | 
			
		||||
#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__
 | 
			
		||||
#include <thrust/system/cuda/execution_policy.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#include <THC/THCTensorTopK.cuh>
 | 
			
		||||
 | 
			
		||||
#include <THC/generic/THCTensorTopK.cu>
 | 
			
		||||
#include <THC/THCGenerateAllTypes.h>
 | 
			
		||||
@@ -1,6 +1,21 @@
#ifndef THC_TENSOR_TOPK_CUH
#define THC_TENSOR_TOPK_CUH

+#include <THC/THC.h>
+#include <THC/THCReduceApplyUtils.cuh>
+#include <THC/THCTensorCopy.h>
+#include <THC/THCTensorMath.h>
+#include <THC/THCAsmUtils.cuh>
+#include <THC/THCScanUtils.cuh>
+#include <THC/THCTensorTypeUtils.cuh>
+#include <THC/THCTensorMathReduce.cuh>
+#include <ATen/WrapDimUtils.h>
+#include <algorithm> // for std::min
+
+#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__
+#include <thrust/system/cuda/execution_policy.h>
+#endif
+
#include <c10/macros/Macros.h>
#include <ATen/native/cuda/SortingRadixSelect.cuh>

@@ -52,6 +67,7 @@ __global__ void gatherTopK(TensorInfo<T, IndexType> input,
    inputSliceStart, outputSliceSize,
    inputSliceSize, inputWithinSliceStride,
    smem, &topKValue);
+  const auto topKConverted = at::native::TopKTypeConfig<T>::convert(topKValue);

  // Every value that is strictly less/greater than `pattern`
  // (depending on sort dir) in sorted int format is in the top-K.
@@ -74,11 +90,12 @@ __global__ void gatherTopK(TensorInfo<T, IndexType> input,
    bool inRange = (i < inputSliceSize);
    T v =
      inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert<int, T>::to(0);
+    const auto convertedV = at::native::TopKTypeConfig<T>::convert(v);
    bool hasTopK;
    if (Order) {
-      hasTopK = inRange && (THCNumerics<T>::gt(v, topKValue));
+      hasTopK = inRange && (convertedV > topKConverted);
    } else {
-      hasTopK = inRange && (THCNumerics<T>::lt(v, topKValue));
+      hasTopK = inRange && (convertedV < topKConverted);
    }

    int index;
@@ -111,7 +128,8 @@ __global__ void gatherTopK(TensorInfo<T, IndexType> input,
    bool inRange = (i < inputSliceSize);
    T v =
      inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : ScalarConvert<int, T>::to(0);
-    bool hasTopK = inRange && (THCNumerics<T>::eq(v, topKValue));
+    const auto convertedV = at::native::TopKTypeConfig<T>::convert(v);
+    bool hasTopK = inRange && (convertedV == topKConverted);

    int index;
    int carry;

aten/src/THC/generated/THCTensorTopKByte.cu (new file)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateByteType.h>

aten/src/THC/generated/THCTensorTopKChar.cu (new file)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateCharType.h>

aten/src/THC/generated/THCTensorTopKDouble.cu (new file)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateDoubleType.h>

aten/src/THC/generated/THCTensorTopKFloat.cu (new file)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateFloatType.h>

aten/src/THC/generated/THCTensorTopKHalf.cu (new file)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateHalfType.h>

aten/src/THC/generated/THCTensorTopKInt.cu (new file)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateIntType.h>

aten/src/THC/generated/THCTensorTopKLong.cu (new file)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateLongType.h>

aten/src/THC/generated/THCTensorTopKShort.cu (new file)
@@ -0,0 +1,5 @@
+#include <THC/THCTensorTopK.cuh>
+#include <THC/THCTensor.hpp>
+
+#include <THC/generic/THCTensorTopK.cu>
+#include <THC/THCGenerateShortType.h>

@@ -23,6 +23,14 @@
 #include "c10/macros/Export.h"
 
+#if defined(__clang__)
+  #define __ubsan_ignore_float_divide_by_zero__ __attribute__((no_sanitize("float-divide-by-zero")))
+  #define __ubsan_ignore_float_cast_overflow__ __attribute__((no_sanitize("float-cast-overflow")))
+#else
+  #define __ubsan_ignore_float_divide_by_zero__
+  #define __ubsan_ignore_float_cast_overflow__
+#endif
+
 // Disable the copy and assignment operator for a class. Note that this will
 // disable the usage of the class in std containers.
 #define C10_DISABLE_COPY_AND_ASSIGN(classname) \
@@ -66,24 +66,44 @@ void Error::AppendMessage(const std::string& new_msg) {
 namespace Warning {
 
 namespace {
-  WarningHandler* getHandler() {
+  WarningHandler* getBaseHandler() {
     static WarningHandler base_warning_handler_ = WarningHandler();
     return &base_warning_handler_;
   };
-  static thread_local WarningHandler* warning_handler_ = getHandler();
+
+  class ThreadWarningHandler {
+    public:
+      ThreadWarningHandler() = delete;
+
+      static WarningHandler* get_handler() {
+        if (!warning_handler_) {
+          warning_handler_ = getBaseHandler();
+        }
+        return warning_handler_;
+      }
+
+      static void set_handler(WarningHandler* handler) {
+        warning_handler_ = handler;
+      }
+
+    private:
+      static thread_local WarningHandler* warning_handler_;
+  };
+
+  thread_local WarningHandler* ThreadWarningHandler::warning_handler_ = nullptr;
+
 }
 
 void warn(SourceLocation source_location, const std::string& msg) {
-  warning_handler_->process(source_location, msg);
+  ThreadWarningHandler::get_handler()->process(source_location, msg);
 }
 
 void set_warning_handler(WarningHandler* handler) noexcept(true) {
-  warning_handler_ = handler;
+  ThreadWarningHandler::set_handler(handler);
 }
 
 WarningHandler* get_warning_handler() noexcept(true) {
-  return warning_handler_;
+  return ThreadWarningHandler::get_handler();
 }
 
 } // namespace Warning
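The effect of this change is that ``set_warning_handler`` now installs a handler for the calling thread only, with a lazy fall-back to the process-wide default. The same pattern in a few lines of Python (an illustration of the design, not a PyTorch API)::

    import threading

    def _base_handler(msg):
        print("Warning:", msg)      # process-wide default

    _tls = threading.local()        # per-thread override slot

    def set_warning_handler(handler):
        _tls.handler = handler      # visible only in the current thread

    def warn(msg):
        # Fall back to the base handler if this thread never set one.
        handler = getattr(_tls, "handler", None) or _base_handler
        handler(msg)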
@@ -67,7 +67,7 @@ struct maybe_real<true, src_t> {
 
 template <typename dest_t, typename src_t>
 struct static_cast_with_inter_type {
-  C10_HOST_DEVICE static inline dest_t apply(src_t src) {
+  C10_HOST_DEVICE __ubsan_ignore_float_cast_overflow__ static inline dest_t apply(src_t src) {
     constexpr bool real = needs_real<dest_t, src_t>::value;
     return static_cast<dest_t>(
       static_cast<inter_copy_type_t<dest_t>>(maybe_real<real, src_t>::apply(src)));
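``float-cast-overflow`` fires when a floating-point value is converted to a narrower type that cannot represent it, which is undefined behavior in C++; the attribute added above opts this one cast helper out of the sanitizer instead of letting sanitized builds abort. NumPy makes the underlying phenomenon easy to observe, since it saturates to ``inf`` rather than trapping::

    import numpy as np

    x = np.float64(1e300)   # far beyond float32's ~3.4e38 maximum
    print(np.float32(x))    # inf: the value overflows the narrower type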
@@ -748,7 +748,7 @@ if (NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
       target_include_directories(torch_cuda PUBLIC "${NVTOOLEXT_HOME}/include")
       # -INCLUDE is used to ensure torch_cuda is linked against in a project that relies on it.
       # Related issue: https://github.com/pytorch/pytorch/issues/31611
-      target_link_libraries(torch_cuda INTERFACE "-INCLUDE:\"?warp_size@cuda@at@@YAHXZ\"")
+      target_link_libraries(torch_cuda INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ")
 
     elseif(APPLE)
       set(TORCH_CUDA_LIBRARIES
@@ -949,6 +949,31 @@ if (USE_OPENMP AND OPENMP_FOUND)
   target_link_libraries(torch_cpu PRIVATE ${OpenMP_CXX_LIBRARIES})
 endif()
 
+if ($ENV{TH_BINARY_BUILD})
+  if (NOT MSVC AND USE_CUDA AND NOT APPLE)
+    # Note [Extra MKL symbols for MAGMA in torch_cpu]
+    #
+    # When we build CUDA libraries and link against MAGMA, MAGMA makes use of
+    # some BLAS symbols in its CPU fallbacks when it has no GPU versions
+    # of kernels.  Previously, we ensured the BLAS symbols were filled in by
+    # MKL by linking torch_cuda with BLAS, but when we are statically linking
+    # against MKL (when we do wheel builds), this actually ends up pulling in a
+    # decent chunk of MKL into torch_cuda, inflating our torch_cuda binary
+    # size by 8M.  torch_cpu exposes most of the MKL symbols we need, but
+    # empirically we determined that there are four which it doesn't provide.  If
+    # we link torch_cpu with these --undefined symbols, we can ensure they
+    # do get pulled in, and then we can avoid statically linking in MKL to
+    # torch_cuda at all!
+    #
+    # We aren't really optimizing for binary size on Windows (and this link
+    # line doesn't work on Windows), so don't do it there.
+    #
+    # These linker commands do not work on OS X, do not attempt this there.
+    # (It shouldn't matter anyway, though, because OS X has dropped CUDA support)
+    set_target_properties(torch_cpu PROPERTIES LINK_FLAGS "-Wl,--undefined=mkl_lapack_slaed0 -Wl,--undefined=mkl_lapack_dlaed0 -Wl,--undefined=mkl_lapack_dormql -Wl,--undefined=mkl_lapack_sormql")
+  endif()
+endif()
+
 target_link_libraries(torch_cpu PUBLIC c10)
 target_link_libraries(torch_cpu PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS})
 target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_LIBS})
@@ -1,6 +1,8 @@
 #include "caffe2/operators/fused_rowwise_nbitfake_conversion_ops.h"
 #include <fp16.h>
+#ifdef __AVX__
 #include <immintrin.h>
+#endif
 #include "c10/util/Registry.h"
 
 namespace caffe2 {
@@ -50,8 +50,13 @@ __global__ void ReluCUDAKernel<half2>(const int N, const half2* X, half2* Y) {
     Y[i] = __hmul2(__hgt2(__ldg(X + i), kZero), __ldg(X + i));
 #else
     const float2 xx = __half22float2(X[i]);
-    Y[i] =
-        __floats2half2_rn(xx.x > 0.0f ? xx.x : 0.0f, xx.y > 0.0f ? xx.y : 0.0f);
+    // There is an explicit cast to float here because it may otherwise cause
+    // ambiguity on ROCm, sometimes triggering:
+    //
+    //   error: conditional expression is ambiguous; 'const hip_impl::Scalar_accessor<float, Native_vec_, 0>' can be
+    //   converted to 'float' and vice versa
+    Y[i] = __floats2half2_rn(xx.x > 0.0f ? static_cast<float>(xx.x) : 0.0f,
+                             xx.y > 0.0f ? static_cast<float>(xx.y) : 0.0f);
 #endif
   }
 }
@@ -100,8 +105,14 @@ __global__ void ReluGradientCUDAKernel<half2>(
 #else
     const float2 dy = __half22float2(dY[i]);
     const float2 yy = __half22float2(Y[i]);
-    dX[i] =
-        __floats2half2_rn(yy.x > 0.0f ? dy.x : 0.0f, yy.y > 0.0f ? dy.y : 0.0f);
+    // There is an explicit cast to float here because it may otherwise cause
+    // ambiguity on ROCm, sometimes triggering:
+    //
+    //   error: conditional expression is ambiguous; 'const hip_impl::Scalar_accessor<float, Native_vec_, 1>' can be
+    //   converted to 'float' and vice versa
+
+    dX[i] = __floats2half2_rn(yy.x > 0.0f ? static_cast<float>(dy.x) : 0.0f,
+                              yy.y > 0.0f ? static_cast<float>(dy.y) : 0.0f);
 #endif
   }
 }
cmake/External/nccl.cmake (vendored)
@@ -15,6 +15,7 @@ if (NOT __NCCL_INCLUDED)
     # this second replacement is needed when there are multiple archs
     string(REPLACE ";-gencode" " -gencode" NVCC_GENCODE "${NVCC_GENCODE}")
 
+    set(__NCCL_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/nccl")
     ExternalProject_Add(nccl_external
       SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/nccl/nccl
       BUILD_IN_SOURCE 1
@@ -30,20 +31,49 @@ if (NOT __NCCL_INCLUDED)
         "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}"
         "NVCC=${CUDA_NVCC_EXECUTABLE}"
         "NVCC_GENCODE=${NVCC_GENCODE}"
-        "BUILDDIR=${CMAKE_CURRENT_BINARY_DIR}/nccl"
+        "BUILDDIR=${__NCCL_BUILD_DIR}"
         "VERBOSE=0"
         "-j"
-      BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/nccl/lib/libnccl_static.a"
+      BUILD_BYPRODUCTS "${__NCCL_BUILD_DIR}/lib/libnccl_static.a"
       INSTALL_COMMAND ""
       )
 
+    # Detect objcopy version
+    execute_process (COMMAND "${CMAKE_OBJCOPY}" "--version" OUTPUT_VARIABLE OBJCOPY_VERSION_STR)
+    string(REGEX REPLACE "GNU objcopy version ([0-9])\\.([0-9]+).*" "\\1" OBJCOPY_VERSION_MAJOR ${OBJCOPY_VERSION_STR})
+    string(REGEX REPLACE "GNU objcopy version ([0-9])\\.([0-9]+).*" "\\2" OBJCOPY_VERSION_MINOR ${OBJCOPY_VERSION_STR})
+
+    if ((${OBJCOPY_VERSION_MAJOR} GREATER 2) OR ((${OBJCOPY_VERSION_MAJOR} EQUAL 2) AND (${OBJCOPY_VERSION_MINOR} GREATER 27)))
+      message(WARNING "Enabling NCCL library slimming")
+      add_custom_command(
+        OUTPUT "${__NCCL_BUILD_DIR}/lib/libnccl_slim_static.a"
+        DEPENDS nccl_external
+        COMMAND "${CMAKE_COMMAND}" -E make_directory "${__NCCL_BUILD_DIR}/objects"
+        COMMAND cd objects
+        COMMAND "${CMAKE_AR}" x "${__NCCL_BUILD_DIR}/lib/libnccl_static.a"
+        COMMAND for obj in all_gather_* all_reduce_* broadcast_* reduce_*.o$<SEMICOLON> do "${CMAKE_OBJCOPY}" --remove-relocations .nvFatBinSegment --remove-section __nv_relfatbin $$obj$<SEMICOLON> done
+        COMMAND "${CMAKE_AR}" cr "${__NCCL_BUILD_DIR}/lib/libnccl_slim_static.a" "*.o"
+        COMMAND cd -
+        COMMAND "${CMAKE_COMMAND}" -E remove_directory "${__NCCL_BUILD_DIR}/objects"
+        WORKING_DIRECTORY "${__NCCL_BUILD_DIR}"
+        COMMENT "Slimming NCCL"
+      )
+      add_custom_target(nccl_slim_external DEPENDS "${__NCCL_BUILD_DIR}/lib/libnccl_slim_static.a")
+      set(__NCCL_LIBRARY_DEP nccl_slim_external)
+      set(NCCL_LIBRARIES ${__NCCL_BUILD_DIR}/lib/libnccl_slim_static.a)
+    else()
+      message(WARNING "Objcopy version is too old to support NCCL library slimming")
+      set(__NCCL_LIBRARY_DEP nccl_external)
+      set(NCCL_LIBRARIES ${__NCCL_BUILD_DIR}/lib/libnccl_static.a)
+    endif()
+
     set(NCCL_FOUND TRUE)
     add_library(__caffe2_nccl INTERFACE)
     # The following old-style variables are set so that other libs, such as Gloo,
     # can still use it.
-    set(NCCL_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/nccl/include)
-    set(NCCL_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/nccl/lib/libnccl_static.a)
-    add_dependencies(__caffe2_nccl nccl_external)
+    set(NCCL_INCLUDE_DIRS ${__NCCL_BUILD_DIR}/include)
+    add_dependencies(__caffe2_nccl ${__NCCL_LIBRARY_DEP})
     target_link_libraries(__caffe2_nccl INTERFACE ${NCCL_LIBRARIES})
     target_include_directories(__caffe2_nccl INTERFACE ${NCCL_INCLUDE_DIRS})
   endif()
@@ -56,6 +56,10 @@ INPUT                  = ../../../aten/src/ATen/ATen.h \
                          ../../../c10/cuda/CUDAStream.h \
                          ../../../torch/csrc/api/include \
                          ../../../torch/csrc/api/src \
+                         ../../../torch/csrc/autograd/autograd.h \
+                         ../../../torch/csrc/autograd/custom_function.h \
+                         ../../../torch/csrc/autograd/function.h \
+                         ../../../torch/csrc/autograd/variable.h \
                          ../../../torch/csrc/autograd/generated/variable_factories.h \
                          ../../../torch/csrc/jit/runtime/custom_operator.h \
                          ../../../torch/csrc/jit/serialization/import.h \
@@ -281,7 +281,9 @@ change one property, this is quite practical.
 In conclusion, we can now compare how ``TensorOptions`` defaults, together with
 the abbreviated API for creating ``TensorOptions`` using free functions, allow
 tensor creation in C++ with the same convenience as in Python. Compare this
-call in Python::
+call in Python:
+
+.. code-block:: python
 
   torch.randn(3, 4, dtype=torch.float32, device=torch.device('cuda', 1), requires_grad=True)
docs/cpp/source/notes/tensor_indexing.rst (new file, 99 lines)
@@ -0,0 +1,99 @@
+Tensor Indexing API
+===================
+
+Indexing a tensor in the PyTorch C++ API works very similar to the Python API.
+All index types such as ``None`` / ``...`` / integer / boolean / slice / tensor
+are available in the C++ API, making translation from Python indexing code to C++
+very simple. The main difference is that, instead of using the ``[]``-operator
+similar to the Python API syntax, in the C++ API the indexing methods are:
+
+- ``torch::Tensor::index`` (`link <https://pytorch.org/cppdocs/api/classat_1_1_tensor.html#_CPPv4NK2at6Tensor5indexE8ArrayRefIN2at8indexing11TensorIndexEE>`_)
+- ``torch::Tensor::index_put_`` (`link <https://pytorch.org/cppdocs/api/classat_1_1_tensor.html#_CPPv4N2at6Tensor10index_put_E8ArrayRefIN2at8indexing11TensorIndexEERK6Tensor>`_)
+
+It's also important to note that index types such as ``None`` / ``Ellipsis`` / ``Slice``
+live in the ``torch::indexing`` namespace, and it's recommended to put ``using namespace torch::indexing``
+before any indexing code for convenient use of those index types.
+
+Here are some examples of translating Python indexing code to C++:
+
+Getter
+------
+
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| Python                                                   | C++  (assuming ``using namespace torch::indexing``)                                  |
++==========================================================+======================================================================================+
+| ``tensor[None]``                                         | ``tensor.index({None})``                                                             |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| ``tensor[Ellipsis, ...]``                                | ``tensor.index({Ellipsis, "..."})``                                                  |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| ``tensor[1, 2]``                                         | ``tensor.index({1, 2})``                                                             |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| ``tensor[True, False]``                                  | ``tensor.index({true, false})``                                                      |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| ``tensor[1::2]``                                         | ``tensor.index({Slice(1, None, 2)})``                                                |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| ``tensor[torch.tensor([1, 2])]``                         | ``tensor.index({torch::tensor({1, 2})})``                                            |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| ``tensor[..., 0, True, 1::2, torch.tensor([1, 2])]``     | ``tensor.index({"...", 0, true, Slice(1, None, 2), torch::tensor({1, 2})})``         |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+
+Setter
+------
+
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| Python                                                   | C++  (assuming ``using namespace torch::indexing``)                                  |
++==========================================================+======================================================================================+
+| ``tensor[None] = 1``                                     | ``tensor.index_put_({None}, 1)``                                                     |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| ``tensor[Ellipsis, ...] = 1``                            | ``tensor.index_put_({Ellipsis, "..."}, 1)``                                          |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| ``tensor[1, 2] = 1``                                     | ``tensor.index_put_({1, 2}, 1)``                                                     |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| ``tensor[True, False] = 1``                              | ``tensor.index_put_({true, false}, 1)``                                              |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| ``tensor[1::2] = 1``                                     | ``tensor.index_put_({Slice(1, None, 2)}, 1)``                                        |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| ``tensor[torch.tensor([1, 2])] = 1``                     | ``tensor.index_put_({torch::tensor({1, 2})}, 1)``                                    |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+| ``tensor[..., 0, True, 1::2, torch.tensor([1, 2])] = 1`` | ``tensor.index_put_({"...", 0, true, Slice(1, None, 2), torch::tensor({1, 2})}, 1)`` |
++----------------------------------------------------------+--------------------------------------------------------------------------------------+
+
+
+Translating between Python/C++ index types
+------------------------------------------
+
+The one-to-one translation between Python and C++ index types is as follows:
+
++-------------------------+------------------------------------------------------------------------+
+| Python                  | C++ (assuming ``using namespace torch::indexing``)                     |
++=========================+========================================================================+
+| ``None``                | ``None``                                                               |
++-------------------------+------------------------------------------------------------------------+
+| ``Ellipsis``            | ``Ellipsis``                                                           |
++-------------------------+------------------------------------------------------------------------+
+| ``...``                 | ``"..."``                                                              |
++-------------------------+------------------------------------------------------------------------+
+| ``123``                 | ``123``                                                                |
++-------------------------+------------------------------------------------------------------------+
+| ``True``                | ``true``                                                               |
++-------------------------+------------------------------------------------------------------------+
+| ``False``               | ``false``                                                              |
++-------------------------+------------------------------------------------------------------------+
+| ``:`` or ``::``         | ``Slice()`` or ``Slice(None, None)`` or ``Slice(None, None, None)``    |
++-------------------------+------------------------------------------------------------------------+
+| ``1:`` or ``1::``       | ``Slice(1, None)`` or ``Slice(1, None, None)``                         |
++-------------------------+------------------------------------------------------------------------+
+| ``:3`` or ``:3:``       | ``Slice(None, 3)`` or ``Slice(None, 3, None)``                         |
++-------------------------+------------------------------------------------------------------------+
+| ``::2``                 | ``Slice(None, None, 2)``                                               |
++-------------------------+------------------------------------------------------------------------+
+| ``1:3``                 | ``Slice(1, 3)``                                                        |
++-------------------------+------------------------------------------------------------------------+
+| ``1::2``                | ``Slice(1, None, 2)``                                                  |
++-------------------------+------------------------------------------------------------------------+
+| ``:3:2``                | ``Slice(None, 3, 2)``                                                  |
++-------------------------+------------------------------------------------------------------------+
+| ``1:3:2``               | ``Slice(1, 3, 2)``                                                     |
++-------------------------+------------------------------------------------------------------------+
+| ``torch.tensor([1, 2])``| ``torch::tensor({1, 2})``                                              |
++-------------------------+------------------------------------------------------------------------+
@@ -1,4 +1,4 @@
-sphinx
+sphinx==2.4.4
 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
 sphinxcontrib.katex
 matplotlib
@@ -13,6 +13,13 @@ use ``torch.float16`` (``half``). Some operations, like linear layers and convolutions,
 are much faster in ``float16``. Other operations, like reductions, often require the dynamic
 range of ``float32``. Networks running in mixed precision try to match each operation to its appropriate datatype.
 
+.. warning::
+    :class:`torch.cuda.amp.GradScaler` is not a complete implementation of automatic mixed precision.
+    :class:`GradScaler` is only useful if you manually run regions of your model in ``float16``.
+    If you aren't sure how to choose op precision manually, the master branch and nightly pip/conda
+    builds include a context manager that chooses op precision automatically wherever it's enabled.
+    See the `master documentation <https://pytorch.org/docs/master/amp.html>`_ for details.
+
 .. contents:: :local:
 
 .. _gradient-scaling:
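For reference, the gradient-scaling recipe that :class:`GradScaler` supports looks like the following sketch (``model``, ``optimizer``, ``loss_fn``, and ``data`` are assumed to exist, and any ``float16`` regions are chosen manually, per the warning above)::

    import torch

    scaler = torch.cuda.amp.GradScaler()

    for input, target in data:
        optimizer.zero_grad()
        output = model(input)          # regions you chose to run in float16
        loss = loss_fn(output, target)
        scaler.scale(loss).backward()  # scale the loss to avoid float16 gradient underflow
        scaler.step(optimizer)         # unscales gradients; skips the step on inf/nan
        scaler.update()                # adjusts the scale factor for the next iteration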
@@ -395,6 +395,8 @@ of 16
 .. autofunction:: all_gather_multigpu
 
 
+.. _distributed-launch:
+
 Launch utility
 --------------
@@ -16,7 +16,6 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
    :caption: Notes
 
    notes/*
-   PyTorch on XLA Devices <http://pytorch.org/xla/>
 
 .. toctree::
    :maxdepth: 1
@@ -46,7 +45,7 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
    onnx
    optim
    quantization
-   rpc
+   rpc/index.rst
    torch.random <random>
    sparse
    storage
@@ -62,24 +61,15 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
    name_inference
    torch.__config__ <__config__>
 
 .. toctree::
-   :glob:
-   :maxdepth: 2
-   :caption: torchvision Reference
-
-   torchvision/index
-
-.. toctree::
    :maxdepth: 1
-   :caption: torchaudio Reference
+   :caption: Libraries
 
+   PyTorch on XLA Devices <http://pytorch.org/xla/>
+   PyTorch Elastic (torchelastic) <https://pytorch.org/elastic/>
    torchaudio <https://pytorch.org/audio>
-
-.. toctree::
-   :maxdepth: 1
-   :caption: torchtext Reference
-
    torchtext <https://pytorch.org/text>
+   torchvision/index
 
 .. toctree::
    :glob:
@@ -790,21 +790,6 @@ New API:
 
     m = torch.jit.script(MyModule())
 
-Python 2
-""""""""
-If you are stuck on Python 2 and cannot use the class annotation syntax, you can use the ``__annotations__`` class member to directly apply type annotations.
-
-.. testcode::
-
-    from typing import Dict
-
-    class MyModule(torch.jit.ScriptModule):
-        __annotations__ = {'my_dict': Dict[str, int]}
-
-        def __init__(self):
-            super(MyModule, self).__init__()
-            self.my_dict = {}
-            self.my_int = 20
-
 Constants
 ^^^^^^^^^
@@ -185,13 +185,10 @@ MyPy-style type annotations using the types listed above.
 
     ...
 
-  In our examples, we use comment-based type hints to ensure Python 2
-  compatibility as well.
-
 
 An empty list is assumed to be ``List[Tensor]`` and empty dicts
 ``Dict[str, Tensor]``. To instantiate an empty list or dict of other types,
-use `Python 3 type hints`_. If you are on Python 2, you can use ``torch.jit.annotate``.
+use `Python 3 type hints`_.
 
 Example (type annotations for Python 3):
@@ -217,31 +214,6 @@ Example (type annotations for Python 3):
 
     x = torch.jit.script(EmptyDataStructures())
 
 
-Example (``torch.jit.annotate`` for Python 2):
-
-.. testcode::
-
-    import torch
-    import torch.nn as nn
-    from typing import Dict, List, Tuple
-
-    class EmptyDataStructures(torch.nn.Module):
-        def __init__(self):
-            super(EmptyDataStructures, self).__init__()
-
-        def forward(self, x):
-            # type: (Tensor) -> Tuple[List[Tuple[int, float]], Dict[str, int]]
-
-            # This annotates the list to be a `List[Tuple[int, float]]`
-            my_list = torch.jit.annotate(List[Tuple[int, float]], [])
-            for i in range(10):
-                my_list.append((i, float(x.item())))
-
-            my_dict = torch.jit.annotate(Dict[str, int], {})
-            return my_list, my_dict
-
-    x = torch.jit.script(EmptyDataStructures())
-
-
 Optional Type Refinement
@@ -856,28 +828,8 @@ Supported constant Python types are
 * tuples containing supported types
 * ``torch.nn.ModuleList`` which can be used in a TorchScript for loop
 
-.. note::
-    If you are on Python 2, you can mark an attribute as a constant by adding
-    its name to the ``__constants__`` property of the class:
-
-    .. testcode::
-
-        import torch
-        import torch.nn as nn
-
-        class Foo(nn.Module):
-            __constants__ = ['a']
-
-            def __init__(self):
-                super(Foo, self).__init__()
-                self.a = 1 + 4
-
-            def forward(self, input):
-                return self.a + input
-
-        f = torch.jit.script(Foo())
-
-    |
-
 .. _module attributes:
@@ -924,32 +876,3 @@ Example:
 
     f = torch.jit.script(Foo({'hi': 2}))
 
-
-.. note::
-    If you are on Python 2, you can mark an attribute's type by adding it to
-    the ``__annotations__`` class property as a dictionary of attribute name to
-    type:
-
-    .. testcode::
-
-        from typing import List, Dict
-
-        class Foo(nn.Module):
-            __annotations__ = {'words': List[str], 'some_dict': Dict[str, int]}
-
-            def __init__(self, a_dict):
-                super(Foo, self).__init__()
-                self.words = []
-                self.some_dict = a_dict
-
-                # `int`s can be inferred
-                self.my_int = 10
-
-            def forward(self, input):
-                # type: (str) -> int
-                self.words.append(input)
-                return self.some_dict[input] + self.my_int
-
-        f = torch.jit.script(Foo({'hi': 2}))
-
-    |
@@ -30,9 +30,7 @@ Sharing CUDA tensors
 --------------------
 
 Sharing CUDA tensors between processes is supported only in Python 3, using
-a ``spawn`` or ``forkserver`` start methods. :mod:`python:multiprocessing` in
-Python 2 can only create subprocesses using ``fork``, and it's not supported
-by the CUDA runtime.
+a ``spawn`` or ``forkserver`` start method.
 
 Unlike CPU tensors, the sending process is required to keep the original tensor
 as long as the receiving process retains a copy of the tensor. The refcounting is
@@ -187,7 +187,7 @@ mentioning all of them as in required by :meth:`~Tensor.permute`.
     # Move the F (dim 5) and E dimension (dim 4) to the front while keeping
     # the rest in the same order
     >>> tensor.permute(5, 4, 0, 1, 2, 3)
-    >>> named_tensor.align_to('F', 'E', ...)  # Use '...' instead in Python 2
+    >>> named_tensor.align_to('F', 'E', ...)
 
 Use :meth:`~Tensor.flatten` and :meth:`~Tensor.unflatten` to flatten and unflatten
 dimensions, respectively. These methods are more verbose than :meth:`~Tensor.view`
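For a concrete view of what the ``align_to`` call above does (runnable as-is)::

    import torch

    t = torch.randn(1, 2, 3, 4, 5, 6, names=('A', 'B', 'C', 'D', 'E', 'F'))
    # F and E move to the front; '...' keeps the rest in their original order.
    print(t.align_to('F', 'E', ...).names)   # ('F', 'E', 'A', 'B', 'C', 'D')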
@@ -317,4 +317,3 @@ operators, see :ref:`name_inference_reference-doc`.
 
       .. warning::
           The named tensor API is experimental and subject to change.
-
@@ -5,6 +5,13 @@ Automatic Mixed Precision examples
 
 .. currentmodule:: torch.cuda.amp
 
+.. warning::
+    :class:`torch.cuda.amp.GradScaler` is not a complete implementation of automatic mixed precision.
+    :class:`GradScaler` is only useful if you manually run regions of your model in ``float16``.
+    If you aren't sure how to choose op precision manually, the master branch and nightly pip/conda
+    builds include a context manager that chooses op precision automatically wherever it's enabled.
+    See the `master documentation <https://pytorch.org/docs/master/amp.html>`_ for details.
+
 .. contents:: :local:
 
 .. _gradient-scaling-examples:
@@ -306,20 +306,30 @@ to overlap data transfers with computation.
 You can make the :class:`~torch.utils.data.DataLoader` return batches placed in
 pinned memory by passing ``pin_memory=True`` to its constructor.
 
-.. _cuda-nn-dataparallel-instead:
+.. _cuda-nn-ddp-instead:
 
-Use nn.DataParallel instead of multiprocessing
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Use nn.parallel.DistributedDataParallel instead of multiprocessing or nn.DataParallel
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Most use cases involving batched inputs and multiple GPUs should default to
-using :class:`~torch.nn.DataParallel` to utilize more than one GPU. Even with
-the GIL, a single Python process can saturate multiple GPUs.
-
-As of version 0.1.9, large numbers of GPUs (8+) might not be fully utilized.
-However, this is a known issue that is under active development. As always,
-test your use case.
+using :class:`~torch.nn.parallel.DistributedDataParallel` to utilize more
+than one GPU.
 
 There are significant caveats to using CUDA models with
 :mod:`~torch.multiprocessing`; unless care is taken to meet the data handling
 requirements exactly, it is likely that your program will have incorrect or
 undefined behavior.
 
+It is recommended to use :class:`~torch.nn.parallel.DistributedDataParallel`,
+instead of :class:`~torch.nn.DataParallel`, to do multi-GPU training, even if
+there is only a single node.
+
+The difference between :class:`~torch.nn.parallel.DistributedDataParallel` and
+:class:`~torch.nn.DataParallel` is: :class:`~torch.nn.parallel.DistributedDataParallel`
+uses multiprocessing, where a process is created for each GPU, while
+:class:`~torch.nn.DataParallel` uses multithreading. By using multiprocessing,
+each GPU has its own dedicated process, which avoids the performance overhead
+caused by the GIL of the Python interpreter.
+
+If you use :class:`~torch.nn.parallel.DistributedDataParallel`, you can use the
+`torch.distributed.launch` utility to launch your program; see :ref:`distributed-launch`.
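A minimal single-node launch sketch matching the recommendation above (hypothetical script; the ``--local_rank`` argument is injected per process by ``torch.distributed.launch``)::

    import argparse
    import torch
    import torch.distributed as dist
    import torch.nn as nn
    from torch.nn.parallel import DistributedDataParallel as DDP

    # Launch with: python -m torch.distributed.launch --nproc_per_node=<num_gpus> script.py
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()

    dist.init_process_group(backend="nccl", init_method="env://")
    torch.cuda.set_device(args.local_rank)

    model = nn.Linear(10, 10).cuda(args.local_rank)    # one process per GPU
    model = DDP(model, device_ids=[args.local_rank])   # gradients are all-reduced across processes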
@@ -27,10 +27,7 @@ others that require asynchronous operation.
 CUDA in multiprocessing
 -----------------------
 
-The CUDA runtime does not support the ``fork`` start method. However,
-:mod:`python:multiprocessing` in Python 2 can only create subprocesses using
-``fork``. So Python 3 and either ``spawn`` or ``forkserver`` start method are
-required to use CUDA in subprocesses.
+The CUDA runtime does not support the ``fork`` start method. In Python 3, either the ``spawn`` or ``forkserver`` start method is required to use CUDA in subprocesses.
 
 .. note::
   The start method can be set via either creating a context with
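A minimal sketch of CUDA-safe subprocesses using :func:`torch.multiprocessing.spawn`, which uses the ``spawn`` start method by default::

    import torch
    import torch.multiprocessing as mp

    def worker(rank):
        # Safe: the child was started with "spawn", so the CUDA runtime
        # initializes fresh in the subprocess instead of inheriting fork state.
        x = torch.ones(2, device="cuda")
        print(rank, x.sum().item())

    if __name__ == "__main__":
        mp.spawn(worker, nprocs=2)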
@@ -45,7 +42,7 @@ the consumer process has references to the tensor, and the refcounting can not
 save you if the consumer process exits abnormally via a fatal signal. See
 :ref:`this section <multiprocessing-cuda-sharing-details>`.
 
-See also: :ref:`cuda-nn-dataparallel-instead`
+See also: :ref:`cuda-nn-ddp-instead`
 
 
 Best practices and tips
@@ -151,11 +151,6 @@ Package not found in win-32 channel.
 PyTorch doesn't work on 32-bit systems. Please use Windows and
 Python 64-bit version.
 
-Why are there no Python 2 packages for Windows?
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Because it's not stable enough. There are some issues that need to
-be solved before we officially release it. You can build it by yourself.
-
 Import error
 ^^^^^^^^^^^^
@@ -290,4 +285,3 @@ tensors cannot succeed, there are two alternatives for this.
 
 2. Share CPU tensors instead. Make sure your custom
 :class:`~torch.utils.data.DataSet` returns CPU tensors.
-
docs/source/rpc/index.rst (new file, 24 lines)
@@ -0,0 +1,24 @@
+.. _rpc-index:
+
+Distributed RPC Framework
+==============================
+
+The distributed RPC framework provides mechanisms for multi-machine model training through a set of primitives to allow for remote communication, and a higher-level API to automatically differentiate models split across several machines.
+
+-  :ref:`distributed-rpc-framework`
+
+Design Notes
+------------
+The distributed autograd design note covers the design of the RPC-based distributed autograd framework that is useful for applications such as model parallel training.
+
+-  :ref:`distributed-autograd-design`
+
+The RRef design note covers the design of the :ref:`rref` (Remote REFerence) protocol used to refer to values on remote workers by the framework.
+
+-  :ref:`remote-reference-protocol`
+
+Tutorials
+---------
+The RPC tutorial introduces users to the RPC framework and provides two example applications using :ref:`torch.distributed.rpc<distributed-rpc-framework>` APIs.
+
+-  `Getting started with Distributed RPC Framework <https://pytorch.org/tutorials/intermediate/rpc_tutorial.html>`__
@@ -8,6 +8,8 @@ training through a set of primitives to allow for remote communication, and a
 higher-level API to automatically differentiate models split across several
 machines.
 
+.. warning ::
+    APIs in the RPC package are stable. There are multiple ongoing work items to improve performance and error handling, which will ship in future releases.
 
 
 Basics
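A toy two-worker use of the stable API (hypothetical script: every process runs it with its own ``RANK``, and ``MASTER_ADDR``/``MASTER_PORT`` must be set in the environment)::

    import os
    import torch
    import torch.distributed.rpc as rpc

    def add(a, b):
        return a + b

    rank = int(os.environ["RANK"])
    rpc.init_rpc(f"worker{rank}", rank=rank, world_size=2)

    if rank == 0:
        # Run `add` on worker1 and fetch the result back.
        fut = rpc.rpc_async("worker1", add, args=(torch.ones(2), torch.ones(2)))
        print(fut.wait())   # tensor([2., 2.])

    rpc.shutdown()          # blocks until outstanding RPCs drain on all workers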
@@ -210,3 +210,25 @@ Example::
     (1, 5)
 
 For more information on ``torch.sparse_coo`` tensors, see :ref:`sparse-docs`.
+
+torch.memory_format
+-------------------
+
+.. class:: torch.memory_format
+
+A :class:`torch.memory_format` is an object representing the memory format on which a :class:`torch.Tensor` is
+or will be allocated.
+
+Possible values are:
+
+- ``torch.contiguous_format``:
+  Tensor is or will be allocated in dense non-overlapping memory. Strides represented by values in decreasing order.
+
+- ``torch.channels_last``:
+  Tensor is or will be allocated in dense non-overlapping memory. Strides represented by values in
+  ``strides[0] > strides[2] > strides[3] > strides[1] == 1`` aka NHWC order.
+
+- ``torch.preserve_format``:
+  Used in functions like ``clone`` to preserve the memory format of the input tensor. If the input tensor is
+  allocated in dense non-overlapping memory, the output tensor strides will be copied from the input.
+  Otherwise the output strides will follow ``torch.contiguous_format``.
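A short demonstration of the three values (strides shown for a 2x3x4x4 tensor)::

    import torch

    x = torch.randn(2, 3, 4, 4)                        # NCHW, torch.contiguous_format
    y = x.contiguous(memory_format=torch.channels_last)
    print(y.stride())                                  # (48, 1, 12, 3): s[0] > s[2] > s[3] > s[1] == 1
    z = y.clone(memory_format=torch.preserve_format)   # clone keeps the channels_last layout
    print(z.stride() == y.stride())                    # True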
@@ -49,8 +49,10 @@ For reference, here’s a full list of view ops in PyTorch:

- Basic slicing and indexing op, e.g. ``tensor[0, 2:, 1:7:2]`` returns a view of base ``tensor``, see note below.
- :meth:`~torch.Tensor.as_strided`
- :meth:`~torch.Tensor.detach`
- :meth:`~torch.Tensor.diagonal`
- :meth:`~torch.Tensor.expand`
- :meth:`~torch.Tensor.expand_as`
- :meth:`~torch.Tensor.narrow`
- :meth:`~torch.Tensor.permute`
- :meth:`~torch.Tensor.select`
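What makes these ops "views" is that the result shares storage with its base, so a write through either is visible through the other::

    import torch

    base = torch.arange(6)
    v = base.view(2, 3)        # no copy: v shares base's storage
    v[0, 0] = 100
    print(base[0])             # tensor(100): the write is visible through base
    print(v.narrow(1, 0, 2))   # narrow, from the list above, is also a view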
Some files were not shown because too many files have changed in this diff.