Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-31 20:34:54 +08:00)

Compare commits: v1.9.0-rc3 ... v1.8.0 (32 commits)
| SHA1 |
|---|
| 37c1f4a7fe |
| 49b74a52a4 |
| 11c78e9cb3 |
| d6943ea58d |
| 02b61b49ea |
| d553478c98 |
| 63333e2a25 |
| 8e7eebfc9a |
| f8afb8bdd0 |
| 0851cc42b0 |
| 804f7b6018 |
| 32758d30b3 |
| bcb64a8084 |
| f07991d396 |
| c458cd4852 |
| f7c4afc0f4 |
| 20554c00b6 |
| 3464d64f08 |
| c6972eb3ac |
| 25562d3d41 |
| cd63c37bc6 |
| c79decdbba |
| c307a3f336 |
| f071020756 |
| 4f436f8570 |
| ae11589710 |
| 9e5bcc1020 |
| fa8578241d |
| 1368809532 |
| 4073248fc2 |
| 75153cb730 |
| 5bb69b080c |
| @ -52,6 +52,14 @@ CONFIG_TREE_DATA = OrderedDict( | ||||
|             "3.7", | ||||
|         ], | ||||
|     )), | ||||
|     macos_arm64=([None], OrderedDict( | ||||
|         wheel=[ | ||||
|             "3.8", | ||||
|         ], | ||||
|         conda=[ | ||||
|             "3.8", | ||||
|         ], | ||||
|     )), | ||||
|     # Skip CUDA-9.2 builds on Windows | ||||
|     windows=( | ||||
|         [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS], | ||||
|  | ||||
| @ -164,7 +164,7 @@ def gen_build_env_list(smoke): | ||||
|             c.find_prop("gpu"), | ||||
|             c.find_prop("package_format"), | ||||
|             [c.find_prop("pyver")], | ||||
|             c.find_prop("smoke"), | ||||
|             c.find_prop("smoke") and not (c.find_prop("os_name") == "macos_arm64"),  # don't test arm64 | ||||
|             c.find_prop("libtorch_variant"), | ||||
|             c.find_prop("gcc_config_variant"), | ||||
|             c.find_prop("libtorch_config_variant"), | ||||
| @ -216,7 +216,9 @@ def get_jobs(toplevel_key, smoke): | ||||
|     configs = gen_build_env_list(smoke) | ||||
|     phase = "build" if toplevel_key == "binarybuilds" else "test" | ||||
|     for build_config in configs: | ||||
|         jobs_list.append(build_config.gen_workflow_job(phase, nightly=True)) | ||||
|         # don't test for macos_arm64 as it's cross compiled | ||||
|         if phase != "test" or build_config.os != "macos_arm64": | ||||
|             jobs_list.append(build_config.gen_workflow_job(phase, nightly=True)) | ||||
|  | ||||
|     return jobs_list | ||||
|  | ||||
|  | ||||
| @ -3,7 +3,7 @@ PHASES = ["build", "test"] | ||||
| CUDA_VERSIONS = [ | ||||
|     "101", | ||||
|     "102", | ||||
|     "112", | ||||
|     "111", | ||||
| ] | ||||
|  | ||||
| ROCM_VERSIONS = [ | ||||
|  | ||||
										
											
(File diff suppressed because it is too large.)
											
										
									
								
							| @ -7,6 +7,10 @@ source /env | ||||
| # Defaults here so they can be changed in one place | ||||
| export MAX_JOBS=${MAX_JOBS:-$(( $(nproc) - 2 ))} | ||||
|  | ||||
| if [[ "${DESIRED_CUDA}" == "cu111" ]]; then | ||||
|   export BUILD_SPLIT_CUDA="ON" | ||||
| fi | ||||
|  | ||||
| # Parse the parameters | ||||
| if [[ "$PACKAGE_TYPE" == 'conda' ]]; then | ||||
|   build_script='conda/build_pytorch.sh' | ||||
|  | ||||
| @ -15,6 +15,10 @@ else | ||||
|   export VC_YEAR=2019 | ||||
| fi | ||||
|  | ||||
| if [[ "${DESIRED_CUDA}" == "cu111" ]]; then | ||||
|   export BUILD_SPLIT_CUDA="ON" | ||||
| fi | ||||
|  | ||||
| set +x | ||||
| export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-} | ||||
| export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-} | ||||
|  | ||||
| @ -111,11 +111,11 @@ commands: | ||||
|                 git config --global user.email "circleci.ossci@gmail.com" | ||||
|                 git config --global user.name "CircleCI" | ||||
|                 git config remote.origin.url https://github.com/pytorch/pytorch.git | ||||
|                 git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master | ||||
|                 git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet | ||||
|                 git config --add remote.origin.fetch +refs/heads/release/1.8:refs/remotes/origin/release/1.8 | ||||
|                 git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/release/1.8:refs/remotes/origin/release/1.8 --depth=100 --quiet | ||||
|                 # PRs generated from ghstack has format CIRCLE_PR_BASE_BRANCH=gh/xxx/1234/base | ||||
|                 if [[ "${CIRCLE_PR_BASE_BRANCH}" == "gh/"* ]]; then | ||||
|                   CIRCLE_PR_BASE_BRANCH=master | ||||
|                   CIRCLE_PR_BASE_BRANCH=release/1.8 | ||||
|                 fi | ||||
|                 export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/$CIRCLE_PR_BASE_BRANCH` | ||||
|                 echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET} | ||||
|  | ||||
| @ -198,6 +198,44 @@ | ||||
|         root: /Users/distiller/project | ||||
|         paths: final_pkgs | ||||
|  | ||||
|     - store_artifacts: | ||||
|         path: /Users/distiller/project/final_pkgs | ||||
|  | ||||
|   binary_macos_arm64_build: | ||||
|     <<: *binary_mac_params | ||||
|     macos: | ||||
|       xcode: "12.3.0" | ||||
|     steps: | ||||
|     # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml | ||||
|     - checkout | ||||
|     - run: | ||||
|         <<: *binary_checkout | ||||
|     - run: | ||||
|         <<: *binary_populate_env | ||||
|     - brew_update | ||||
|     - run: | ||||
|         <<: *binary_install_miniconda | ||||
|  | ||||
|     - run: | ||||
|         name: Build | ||||
|         no_output_timeout: "90m" | ||||
|         command: | | ||||
|           # Do not set -u here; there is some problem with CircleCI | ||||
|           # variable expansion with PROMPT_COMMAND | ||||
|           set -ex -o pipefail | ||||
|           export CROSS_COMPILE_ARM64=1 | ||||
|           script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_build.sh" | ||||
|           cat "$script" | ||||
|           source "$script" | ||||
|  | ||||
|     - persist_to_workspace: | ||||
|         root: /Users/distiller/project | ||||
|         paths: final_pkgs | ||||
|  | ||||
|     - store_artifacts: | ||||
|         path: /Users/distiller/project/final_pkgs | ||||
|  | ||||
|  | ||||
|   binary_ios_build: | ||||
|     <<: *pytorch_ios_params | ||||
|     macos: | ||||
|  | ||||
							
								
								
									
.github/workflows/lint.yml (vendored, 5 lines changed)
							| @ -93,9 +93,12 @@ jobs: | ||||
|           check_name: 'flake8-py3' | ||||
|           linter_output_path: 'flake8-output.txt' | ||||
|           commit_sha: ${{ steps.get_pr_tip.outputs.commit_sha }} | ||||
|           regex: '^(?<filename>.*?):(?<lineNumber>\d+):(?<columnNumber>\d+): (?<errorCode>\w\d+) (?<errorDesc>.*)' | ||||
|           regex: '^(?<filename>.*?):(?<lineNumber>\d+):(?<columnNumber>\d+): (?<errorCode>\w+\d+) (?<errorDesc>.*)' | ||||
|         env: | ||||
|           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||||
|       - name: Catch any other warnings | ||||
|         run: | | ||||
|           [ ! -s flake8-output.txt ] | ||||
|  | ||||
|   clang-tidy: | ||||
|     if: github.event_name == 'pull_request' | ||||
|  | ||||
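The regex tweak above widens the `errorCode` group from `\w\d+` to `\w+\d+` so that flake8 codes with multi-letter prefixes also get annotated. A quick sanity check of that reading, rewritten with Python's `(?P<...>)` named-group syntax (the workflow's own regex engine may differ):

```python
import re

# Same patterns as in the workflow, converted to Python named-group syntax.
old = r'^(?P<filename>.*?):(?P<lineNumber>\d+):(?P<columnNumber>\d+): (?P<errorCode>\w\d+) (?P<errorDesc>.*)'
new = r'^(?P<filename>.*?):(?P<lineNumber>\d+):(?P<columnNumber>\d+): (?P<errorCode>\w+\d+) (?P<errorDesc>.*)'

# Hypothetical flake8 line with a multi-letter error-code prefix.
line = "torch/foo.py:12:1: EXE001 shebang is present but the file is not executable"

print(bool(re.match(old, line)))  # False: a single \w cannot cover the "EXE" prefix
print(bool(re.match(new, line)))  # True
```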
							
								
								
									
.gitmodules (vendored, 2 lines changed)
							| @ -121,7 +121,7 @@ | ||||
| [submodule "third_party/XNNPACK"] | ||||
|     ignore = dirty | ||||
|     path = third_party/XNNPACK | ||||
|     url = https://github.com/google/XNNPACK.git | ||||
|     url = https://github.com/malfet/XNNPACK.git | ||||
| [submodule "third_party/fmt"] | ||||
|     ignore = dirty | ||||
|     path = third_party/fmt | ||||
|  | ||||
| @ -182,7 +182,7 @@ fi | ||||
|  | ||||
| # Patch required to build xla | ||||
| if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then | ||||
|   git clone --recursive https://github.com/pytorch/xla.git | ||||
|   git clone --recursive -b r1.8 https://github.com/pytorch/xla.git | ||||
|   ./xla/scripts/apply_patches.sh | ||||
| fi | ||||
|  | ||||
|  | ||||
| @ -54,7 +54,7 @@ function file_diff_from_base() { | ||||
|   set +e | ||||
|   git fetch origin master --quiet | ||||
|   set -e | ||||
|   git diff --name-only "$(git merge-base origin/master HEAD)" > "$1" | ||||
|   git diff --name-only "$(git merge-base origin/release/1.8 HEAD)" > "$1" | ||||
| } | ||||
|  | ||||
| function get_bazel() { | ||||
|  | ||||
| @ -300,7 +300,7 @@ test_backward_compatibility() { | ||||
|   pushd test/backward_compatibility | ||||
|   python -m venv venv | ||||
|   . venv/bin/activate | ||||
|   pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html | ||||
|   pip_install --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_test.html | ||||
|   pip show torch | ||||
|   python dump_all_function_schemas.py --filename nightly_schemas.txt | ||||
|   deactivate | ||||
|  | ||||
| @ -11,7 +11,6 @@ | ||||
| #include <ATen/DeviceGuard.h> | ||||
| #include <ATen/DimVector.h> | ||||
| #include <ATen/Dispatch.h> | ||||
| #include <ATen/DynamicLibrary.h> | ||||
| #include <ATen/Formatting.h> | ||||
| #include <ATen/Functions.h> | ||||
| #include <ATen/NamedTensor.h> | ||||
|  | ||||
| @ -25,9 +25,16 @@ static void* checkDL(void* x) { | ||||
|  | ||||
|   return x; | ||||
| } | ||||
| DynamicLibrary::DynamicLibrary(const char* name) { | ||||
| DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) { | ||||
|   // NOLINTNEXTLINE(hicpp-signed-bitwise) | ||||
|   handle = checkDL(dlopen(name, RTLD_LOCAL | RTLD_NOW)); | ||||
|   handle = dlopen(name, RTLD_LOCAL | RTLD_NOW); | ||||
|   if (!handle) { | ||||
|     if (alt_name) { | ||||
|       handle = checkDL(dlopen(alt_name, RTLD_LOCAL | RTLD_NOW)); | ||||
|     } else { | ||||
|         AT_ERROR("Error in dlopen or dlsym: ", dlerror()); | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| void* DynamicLibrary::sym(const char* name) { | ||||
| @ -45,7 +52,7 @@ DynamicLibrary::~DynamicLibrary() { | ||||
|  | ||||
| // Windows | ||||
|  | ||||
| DynamicLibrary::DynamicLibrary(const char* name) { | ||||
| DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) { | ||||
|   // NOLINTNEXTLINE(hicpp-signed-bitwise) | ||||
|   HMODULE theModule; | ||||
|   bool reload = true; | ||||
|  | ||||
| @ -8,7 +8,7 @@ namespace at { | ||||
| struct DynamicLibrary { | ||||
|   AT_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary); | ||||
|  | ||||
|   TORCH_API DynamicLibrary(const char* name); | ||||
|   TORCH_API DynamicLibrary(const char* name, const char* alt_name = nullptr); | ||||
|  | ||||
|   TORCH_API void* sym(const char* name); | ||||
|  | ||||
|  | ||||
| @ -23,10 +23,17 @@ at::DynamicLibrary& getNVRTCLibrary() { | ||||
|   constexpr auto minor = ( CUDA_VERSION / 10 ) % 10; | ||||
| #if defined(_WIN32) | ||||
|   auto libname = std::string("nvrtc64_") + std::to_string(major) + std::to_string(minor) + "_0.dll"; | ||||
|   std::string alt_libname; | ||||
| #else | ||||
|   static auto libname = std::string("libnvrtc.so.") + std::to_string(major) + "." + std::to_string(minor); | ||||
|   static auto lib_version = std::to_string(major) + "." + std::to_string(minor); | ||||
|   static auto libname = std::string("libnvrtc.so.") + lib_version; | ||||
| #ifdef NVRTC_SHORTHASH | ||||
|   static auto alt_libname = std::string("libnvrtc-") + C10_STRINGIZE(NVRTC_SHORTHASH) + ".so." + lib_version; | ||||
| #else | ||||
|   std::string alt_libname; | ||||
| #endif | ||||
|   static at::DynamicLibrary lib(libname.c_str()); | ||||
| #endif | ||||
|   static at::DynamicLibrary lib(libname.c_str(), alt_libname.empty() ? nullptr : alt_libname.c_str()); | ||||
|   return lib; | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -238,7 +238,12 @@ auto ConvParams::use_mkldnn(const at::Tensor& input, const at::Tensor& weight) c | ||||
|      (groups > 1 | ||||
|       || (weight.size(-1) > 3 && weight.size(-2) > 3) | ||||
|       || input.size(0) > 1 | ||||
|       || input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480)); // for some case, native is faster | ||||
|       || input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480) // for some cases, native is faster | ||||
|       // OneDNN < 1.8.1 produces incorrect results in this case (see #50042) | ||||
|       // TODO(VitalyFedyunin): Remove this patch after OneDNN 1.8.1 merged in | ||||
|       && !(groups == 24 && weight.size(0) == 24 && weight.size(1) == 1) | ||||
|       );  | ||||
|  | ||||
| #endif | ||||
|   return false; | ||||
| } | ||||
|  | ||||
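For reference, the convolution the workaround above excludes from MKLDNN is the 24-group depthwise case, i.e. `groups == 24` with a weight of shape `(24, 1, kH, kW)`. A minimal illustration of a module that falls into that bucket (sizes are arbitrary):

```python
import torch

# Depthwise convolution matching the excluded case in use_mkldnn():
# groups == 24, weight.size(0) == 24, weight.size(1) == 1.
conv = torch.nn.Conv2d(in_channels=24, out_channels=24, kernel_size=3, groups=24)
print(conv.weight.shape)   # torch.Size([24, 1, 3, 3])

x = torch.randn(1, 24, 32, 32)
print(conv(x).shape)       # torch.Size([1, 24, 30, 30]), computed on the native path per the patch
```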
| @ -26,7 +26,7 @@ static void upsample_bicubic2d_out_frame( | ||||
|         const scalar_t* in = &idata[output_y * input_width + output_x]; | ||||
|         scalar_t* out = &odata[output_y * output_width + output_x]; | ||||
|  | ||||
|         for (int64_t c = 0; c < channels; ++c) { | ||||
|         for (int64_t c = 0; c < channels * nbatch; ++c) { | ||||
|           out[0] = in[0]; | ||||
|           in += input_width * input_height; | ||||
|           out += output_width * output_height; | ||||
|  | ||||
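The loop change above affects the "same size" copy path of the CPU bicubic kernel: previously only `channels` planes were copied, so batch elements beyond the first were left untouched. A small check of the behaviour the fix should restore (shapes are illustrative):

```python
import torch
import torch.nn.functional as F

# An identity-size bicubic resize exercises the copy path in
# upsample_bicubic2d_out_frame; with the fix all channels * nbatch planes are copied.
x = torch.randn(2, 3, 8, 8)
y = F.interpolate(x, size=(8, 8), mode="bicubic", align_corners=False)
print(torch.allclose(x, y))  # expected True for every batch element, not just the first
```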
| @ -19,6 +19,27 @@ namespace { | ||||
|  | ||||
| using namespace vec256; | ||||
|  | ||||
| // Note: Explicit implementation of copysign for Half and BFloat16 | ||||
| // is needed to work around a g++-7/8 crash on aarch64, but it also makes | ||||
| // copysign faster for the half-precision types | ||||
| template<typename T> | ||||
| T copysign(T a, T b) { | ||||
|   return std::copysign(a, b); | ||||
| } | ||||
|  | ||||
| // Implement copysign for half precision floats using bit ops | ||||
| // Sign is the most significant bit for both half and bfloat16 types | ||||
| template<> | ||||
| c10::Half copysign(c10::Half a, c10::Half b) { | ||||
|   return c10::Half((a.x&0x7fff) | (b.x&0x8000), c10::Half::from_bits()); | ||||
| } | ||||
|  | ||||
| template<> | ||||
| c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) { | ||||
|    return c10::BFloat16((a.x&0x7fff) | (b.x&0x8000), c10::BFloat16::from_bits()); | ||||
| } | ||||
|  | ||||
|  | ||||
| // Note: Undefined behavior when performing addition is intentionally | ||||
| // ignored. | ||||
| void add_kernel(TensorIteratorBase& iter, Scalar alpha_scalar) { | ||||
| @ -180,7 +201,7 @@ void div_floor_kernel(TensorIterator& iter) { | ||||
|                 floordiv += scalar_t(1.0); | ||||
|               } | ||||
|             } else { | ||||
|               floordiv = std::copysign(scalar_t(0), a / b); | ||||
|               floordiv = copysign(scalar_t(0), a / b); | ||||
|             } | ||||
|             return floordiv; | ||||
|           }); | ||||
| @ -889,23 +910,6 @@ void heaviside_kernel(TensorIterator& iter) { | ||||
|   }); | ||||
| } | ||||
|  | ||||
| template<typename T> | ||||
| T copysign(T a, T b) { | ||||
|   return std::copysign(a, b); | ||||
| } | ||||
|  | ||||
| // Implement copysign for half precision floats using bit ops | ||||
| // Sign is the most significant bit for both half and bfloat16 types | ||||
| template<> | ||||
| c10::Half copysign(c10::Half a, c10::Half b) { | ||||
|   return c10::Half((a.x&0x7fff) | (b.x&0x8000), c10::Half::from_bits()); | ||||
| } | ||||
|  | ||||
| template<> | ||||
| c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) { | ||||
|    return c10::BFloat16((a.x&0x7fff) | (b.x&0x8000), c10::BFloat16::from_bits()); | ||||
| } | ||||
|  | ||||
| void copysign_kernel(TensorIterator& iter) { | ||||
|   AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cpu", [&]() { | ||||
|     cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { | ||||
|  | ||||
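The specializations above rely on the sign being the most significant bit of both `Half` and `BFloat16`. A standalone NumPy sketch of the same bit trick for `float16` (illustration only, not PyTorch code):

```python
import numpy as np

def copysign_fp16(a: np.float16, b: np.float16) -> np.float16:
    # Keep the magnitude bits of `a` (0x7fff) and take the sign bit of `b` (0x8000),
    # mirroring the Half/BFloat16 specializations above.
    ua = a.view(np.uint16)
    ub = b.view(np.uint16)
    return np.uint16((ua & 0x7FFF) | (ub & 0x8000)).view(np.float16)

print(copysign_fp16(np.float16(2.5), np.float16(-0.0)))  # -2.5
```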
| @ -113,31 +113,46 @@ __global__ void upsample_trilinear3d_out_frame( | ||||
| template <typename scalar_t, typename accscalar_t> | ||||
| C10_LAUNCH_BOUNDS_1(1024) | ||||
| __global__ void upsample_trilinear3d_backward_out_frame( | ||||
|     const size_t nc_, | ||||
|     const int depth1, | ||||
|     const int height1, | ||||
|     const int width1, | ||||
|     const int depth2, | ||||
|     const int height2, | ||||
|     const int width2, | ||||
|     const int num_kernels, | ||||
|     const accscalar_t rdepth, | ||||
|     const accscalar_t rheight, | ||||
|     const accscalar_t rwidth, | ||||
|     const bool align_corners, | ||||
|     scalar_t* __restrict__ idata, | ||||
|     const scalar_t* __restrict__ odata) { | ||||
|   const size_t i_numel = nc_ * depth1 * height1 * width1; | ||||
|   const size_t o_numel = nc_ * depth2 * height2 * width2; | ||||
|     PackedTensorAccessor64<scalar_t, 5> idata, | ||||
|     const PackedTensorAccessor64<scalar_t, 5> odata, | ||||
|     scalar_t* idata_ptr) { | ||||
|   int index = threadIdx.x + blockIdx.x * blockDim.x; | ||||
|  | ||||
|   for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < o_numel; index += blockDim.x * gridDim.x) { | ||||
|     size_t index_temp = index; | ||||
|     const int w2 = index_temp % width2;   // 0:width2-1 | ||||
|     index_temp /= width2; | ||||
|     const int h2 = index_temp % height2;  // 0:height2-1 | ||||
|     index_temp /= height2; | ||||
|     const int t2 = index_temp % depth2;   // 0:depth2-1 | ||||
|     const int nc = index_temp / depth2; | ||||
|   const int batchsize = idata.size(0); | ||||
|   const int channels = idata.size(1); | ||||
|   const int depth1 = idata.size(2); | ||||
|   const int height1 = idata.size(3); | ||||
|   const int width1 = idata.size(4); | ||||
|   const int depth2 = odata.size(2); | ||||
|   const int height2 = odata.size(3); | ||||
|   const int width2 = odata.size(4); | ||||
|  | ||||
|   const size_t i_numel = batchsize * channels * depth1 * height1 * width1; | ||||
|  | ||||
|   if (index < num_kernels) { | ||||
|     const int w2 = (index % (height2 * width2)) % width2; // 0:width2-1 | ||||
|     const int h2 = (index % (height2 * width2)) / width2; // 0:height2-1 | ||||
|     const int t2 = index / (height2 * width2); // 0:depth2-1 | ||||
|     // special case: just copy | ||||
|     if (depth1 == depth2 && height1 == height2 && width1 == width2) { | ||||
|       const int t1 = t2; | ||||
|       const int h1 = h2; | ||||
|       const int w1 = w2; | ||||
|  | ||||
|       for (int n = 0; n < batchsize; n++) { | ||||
|         for (int c = 0; c < channels; ++c) { | ||||
|           const scalar_t val = odata[n][c][t1][h1][w1]; | ||||
|           idata[n][c][t2][h2][w2] = val; | ||||
|         } | ||||
|       } | ||||
|       return; | ||||
|     } | ||||
|     // | ||||
|     const accscalar_t t1r = area_pixel_compute_source_index<accscalar_t>( | ||||
|         rdepth, t2, align_corners, /*cubic=*/false); | ||||
|     const int t1 = t1r; | ||||
| @ -159,55 +174,60 @@ __global__ void upsample_trilinear3d_backward_out_frame( | ||||
|     const accscalar_t w1lambda = w1r - w1; | ||||
|     const accscalar_t w0lambda = static_cast<accscalar_t>(1) - w1lambda; | ||||
|     // | ||||
|     const scalar_t d2val = odata[index]; | ||||
|     fastAtomicAdd( | ||||
|       idata, | ||||
|       idx_3d(nc, depth1, height1, width1, t1, h1, w1), | ||||
|       i_numel, | ||||
|       static_cast<scalar_t>(t0lambda * h0lambda * w0lambda * d2val), | ||||
|       true); | ||||
|     fastAtomicAdd( | ||||
|       idata, | ||||
|       idx_3d(nc, depth1, height1, width1, t1, h1, w1 + w1p), | ||||
|       i_numel, | ||||
|       static_cast<scalar_t>(t0lambda * h0lambda * w1lambda * d2val), | ||||
|       true); | ||||
|     fastAtomicAdd( | ||||
|       idata, | ||||
|       idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1), | ||||
|       i_numel, | ||||
|       static_cast<scalar_t>(t0lambda * h1lambda * w0lambda * d2val), | ||||
|       true); | ||||
|     fastAtomicAdd( | ||||
|       idata, | ||||
|       idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1 + w1p), | ||||
|       i_numel, | ||||
|       static_cast<scalar_t>(t0lambda * h1lambda * w1lambda * d2val), | ||||
|       true); | ||||
|     fastAtomicAdd( | ||||
|       idata, | ||||
|       idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1), | ||||
|       i_numel, | ||||
|       static_cast<scalar_t>(t1lambda * h0lambda * w0lambda * d2val), | ||||
|       true); | ||||
|     fastAtomicAdd( | ||||
|       idata, | ||||
|       idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1 + w1p), | ||||
|       i_numel, | ||||
|       static_cast<scalar_t>(t1lambda * h0lambda * w1lambda * d2val), | ||||
|       true); | ||||
|     fastAtomicAdd( | ||||
|       idata, | ||||
|       idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1), | ||||
|       i_numel, | ||||
|       static_cast<scalar_t>(t1lambda * h1lambda * w0lambda * d2val), | ||||
|       true); | ||||
|     fastAtomicAdd( | ||||
|       idata, | ||||
|       idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1 + w1p), | ||||
|       i_numel, | ||||
|       static_cast<scalar_t>(t1lambda * h1lambda * w1lambda * d2val), | ||||
|       true); | ||||
|     for (int n = 0; n < batchsize; n++) { | ||||
|       for (int c = 0; c < channels; ++c) { | ||||
|         const scalar_t d2val = odata[n][c][t2][h2][w2]; | ||||
|         const size_t nc = n * channels + c; | ||||
|         fastAtomicAdd( | ||||
|           idata_ptr, | ||||
|           idx_3d(nc, depth1, height1, width1, t1, h1, w1), | ||||
|           i_numel, | ||||
|           static_cast<scalar_t>(t0lambda * h0lambda * w0lambda * d2val), | ||||
|           true); | ||||
|         fastAtomicAdd( | ||||
|           idata_ptr, | ||||
|           idx_3d(nc, depth1, height1, width1, t1, h1, w1 + w1p), | ||||
|           i_numel, | ||||
|           static_cast<scalar_t>(t0lambda * h0lambda * w1lambda * d2val), | ||||
|           true); | ||||
|         fastAtomicAdd( | ||||
|           idata_ptr, | ||||
|           idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1), | ||||
|           i_numel, | ||||
|           static_cast<scalar_t>(t0lambda * h1lambda * w0lambda * d2val), | ||||
|           true); | ||||
|         fastAtomicAdd( | ||||
|           idata_ptr, | ||||
|           idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1 + w1p), | ||||
|           i_numel, | ||||
|           static_cast<scalar_t>(t0lambda * h1lambda * w1lambda * d2val), | ||||
|           true); | ||||
|         fastAtomicAdd( | ||||
|           idata_ptr, | ||||
|           idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1), | ||||
|           i_numel, | ||||
|           static_cast<scalar_t>(t1lambda * h0lambda * w0lambda * d2val), | ||||
|           true); | ||||
|         fastAtomicAdd( | ||||
|           idata_ptr, | ||||
|           idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1 + w1p), | ||||
|           i_numel, | ||||
|           static_cast<scalar_t>(t1lambda * h0lambda * w1lambda * d2val), | ||||
|           true); | ||||
|         fastAtomicAdd( | ||||
|           idata_ptr, | ||||
|           idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1), | ||||
|           i_numel, | ||||
|           static_cast<scalar_t>(t1lambda * h1lambda * w0lambda * d2val), | ||||
|           true); | ||||
|         fastAtomicAdd( | ||||
|           idata_ptr, | ||||
|           idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1 + w1p), | ||||
|           i_numel, | ||||
|           static_cast<scalar_t>(t1lambda * h1lambda * w1lambda * d2val), | ||||
|           true); | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| @ -350,21 +370,20 @@ static void upsample_trilinear3d_backward_out_cuda_template( | ||||
|   // so it has to be initialized to zero. | ||||
|   grad_input.zero_(); | ||||
|  | ||||
|   // const size_t num_kernels = nbatch * channels * output_depth * output_height * output_width; | ||||
|   const size_t num_kernels = grad_output.numel(); | ||||
|   const int num_kernels = output_depth * output_height * output_width; | ||||
|   const int num_threads = std::min( | ||||
|       at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); | ||||
|   cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||||
|  | ||||
|   if (num_kernels > 0) { | ||||
|   AT_DISPATCH_FLOATING_TYPES_AND_HALF( | ||||
|       grad_output.scalar_type(), | ||||
|       "upsample_trilinear3d_backward_out_frame", | ||||
|       [&] { | ||||
|         using accscalar_t = at::acc_type<scalar_t, true>; | ||||
|  | ||||
|         auto idata = grad_input.data_ptr<scalar_t>(); | ||||
|         auto odata = grad_output.data_ptr<scalar_t>(); | ||||
|         auto idata = grad_input.packed_accessor64<scalar_t, 5>(); | ||||
|         auto odata = grad_output.packed_accessor64<scalar_t, 5>(); | ||||
|         scalar_t* idata_ptr = grad_input.data_ptr<scalar_t>(); | ||||
|  | ||||
|         const accscalar_t rdepth = area_pixel_compute_scale<accscalar_t>( | ||||
|             input_depth, output_depth, align_corners, scales_d); | ||||
| @ -374,26 +393,20 @@ static void upsample_trilinear3d_backward_out_cuda_template( | ||||
|             input_width, output_width, align_corners, scales_w); | ||||
|  | ||||
|         upsample_trilinear3d_backward_out_frame<scalar_t, accscalar_t> | ||||
|             <<<cuda::ATenCeilDiv(num_kernels, static_cast<size_t>(num_threads)), | ||||
|             <<<cuda::ATenCeilDiv(num_kernels, num_threads), | ||||
|                num_threads, | ||||
|                0, | ||||
|                stream>>>( | ||||
|                 nbatch * channels, | ||||
|                 input_depth, | ||||
|                 input_height, | ||||
|                 input_width, | ||||
|                 output_depth, | ||||
|                 output_height, | ||||
|                 output_width, | ||||
|                 num_kernels, | ||||
|                 rdepth, | ||||
|                 rheight, | ||||
|                 rwidth, | ||||
|                 align_corners, | ||||
|                 idata, | ||||
|                 odata); | ||||
|                 odata, | ||||
|                 idata_ptr); | ||||
|         C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
|       }); | ||||
|   } | ||||
| } | ||||
|  | ||||
| } // namespace | ||||
|  | ||||
| @ -133,7 +133,9 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) { | ||||
|     ASSERT_EQ(buffer1[i].z, buffer2[i].z); | ||||
|     ASSERT_EQ(buffer1[i].w, buffer2[i].w); | ||||
|   } | ||||
| // Skipping this part until https://github.com/pytorch/pytorch/issues/51863 is resolved | ||||
|  | ||||
| #if 0 | ||||
|   // unaligned | ||||
|   for (int i = 0; i < 16; i++) { | ||||
|     for (int j = 0; j < 16; j++) { | ||||
| @ -151,4 +153,5 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) { | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| #endif | ||||
| } | ||||
|  | ||||
| @ -16,7 +16,7 @@ int32_t driver_version() { | ||||
|   return driver_version; | ||||
| } | ||||
|  | ||||
| int device_count_impl() { | ||||
| int device_count_impl(bool fail_if_no_driver) { | ||||
|   int count; | ||||
|   auto err = cudaGetDeviceCount(&count); | ||||
|   if (err == cudaSuccess) { | ||||
| @ -34,6 +34,11 @@ int device_count_impl() { | ||||
|     case cudaErrorInsufficientDriver: { | ||||
|       auto version = driver_version(); | ||||
|       if (version <= 0) { | ||||
|         if (!fail_if_no_driver) { | ||||
|           // No CUDA driver means no devices | ||||
|           count = 0; | ||||
|           break; | ||||
|         } | ||||
|         TORCH_CHECK( | ||||
|             false, | ||||
|             "Found no NVIDIA driver on your system. Please check that you " | ||||
| @ -95,9 +100,9 @@ DeviceIndex device_count() noexcept { | ||||
|   // initialize number of devices only once | ||||
|   static int count = []() { | ||||
|     try { | ||||
|       auto result = device_count_impl(); | ||||
|       auto result = device_count_impl(/*fail_if_no_driver=*/false); | ||||
|       TORCH_INTERNAL_ASSERT(result <= std::numeric_limits<DeviceIndex>::max(), "Too many CUDA devices, DeviceIndex overflowed"); | ||||
|       return device_count_impl(); | ||||
|       return result; | ||||
|     } catch (const c10::Error& ex) { | ||||
|       // We don't want to fail, but still log the warning | ||||
|       // msg() returns the message without the stack trace | ||||
| @ -110,7 +115,7 @@ DeviceIndex device_count() noexcept { | ||||
|  | ||||
| DeviceIndex device_count_ensure_non_zero() { | ||||
|   // Call the implementation every time to throw the exception | ||||
|   int count = device_count_impl(); | ||||
|   int count = device_count_impl(/*fail_if_no_driver=*/true); | ||||
|   // Zero gpus doesn't produce a warning in `device_count` but we fail here | ||||
|   TORCH_CHECK(count, "No CUDA GPUs are available"); | ||||
|   return static_cast<DeviceIndex>(count); | ||||
|  | ||||
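In user-facing terms, the change above lets `torch.cuda.device_count()` report zero devices (with a warning) on a machine without an NVIDIA driver, while paths that genuinely need a GPU still fail. A rough sketch of the expected behaviour on such a driver-less machine (an assumption about the observable effect, not a test from the patch):

```python
import torch

# Without an NVIDIA driver, device_count() now just returns 0 instead of raising
# from device_count_impl().
print(torch.cuda.device_count())   # 0

# Anything that actually needs a device still raises.
try:
    torch.zeros(1, device="cuda")
except RuntimeError as err:
    print(err)                     # e.g. "No CUDA GPUs are available" or a driver error
```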
| @ -590,6 +590,10 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) | ||||
|       list(APPEND Caffe2_GPU_SRCS | ||||
|         ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp) | ||||
|     endif() | ||||
|     set_source_files_properties( | ||||
|       ${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp | ||||
|       PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}" | ||||
|     ) | ||||
|   endif() | ||||
|  | ||||
|   if(USE_ROCM) | ||||
| @ -741,6 +745,10 @@ file(WRITE ${DUMMY_EMPTY_FILE} ${DUMMY_FILE_CONTENT}) | ||||
| # Wrapper library for people who link against torch and expect both CPU and CUDA support | ||||
| # Contains "torch_cpu" and "torch_cuda" | ||||
| add_library(torch ${DUMMY_EMPTY_FILE}) | ||||
| if(BUILD_SPLIT_CUDA) | ||||
|   # When we split torch_cuda, we want a dummy torch_cuda library that contains both parts | ||||
|   add_library(torch_cuda ${DUMMY_EMPTY_FILE}) | ||||
| endif() | ||||
| if(HAVE_SOVERSION) | ||||
|   set_target_properties(torch PROPERTIES | ||||
|       VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) | ||||
| @ -1233,11 +1241,12 @@ endif() | ||||
|  | ||||
| caffe2_interface_library(torch_cpu torch_cpu_library) | ||||
|  | ||||
| if(BUILD_SPLIT_CUDA) | ||||
|   caffe2_interface_library(torch_cuda_cu torch_cuda_cu_library) | ||||
|   caffe2_interface_library(torch_cuda_cpp torch_cuda_cpp_library) | ||||
| elseif(USE_CUDA) | ||||
| if(USE_CUDA) | ||||
|   caffe2_interface_library(torch_cuda torch_cuda_library) | ||||
|   if(BUILD_SPLIT_CUDA) | ||||
|     caffe2_interface_library(torch_cuda_cu torch_cuda_cu_library) | ||||
|     caffe2_interface_library(torch_cuda_cpp torch_cuda_cpp_library) | ||||
|   endif() | ||||
| elseif(USE_ROCM) | ||||
|   caffe2_interface_library(torch_hip torch_hip_library) | ||||
| endif() | ||||
| @ -1245,22 +1254,26 @@ endif() | ||||
| caffe2_interface_library(torch torch_library) | ||||
|  | ||||
| install(TARGETS torch_cpu torch_cpu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") | ||||
| if(BUILD_SPLIT_CUDA) | ||||
|   install(TARGETS torch_cuda_cu torch_cuda_cu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") | ||||
|   install(TARGETS torch_cuda_cpp torch_cuda_cpp_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") | ||||
| elseif(USE_CUDA) | ||||
|  | ||||
| if(USE_CUDA) | ||||
|   install(TARGETS torch_cuda torch_cuda_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") | ||||
|   if(BUILD_SPLIT_CUDA) | ||||
|     install(TARGETS torch_cuda_cu torch_cuda_cu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") | ||||
|     install(TARGETS torch_cuda_cpp torch_cuda_cpp_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") | ||||
|   endif() | ||||
| elseif(USE_ROCM) | ||||
|   install(TARGETS torch_hip torch_hip_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") | ||||
| endif() | ||||
| install(TARGETS torch torch_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") | ||||
|  | ||||
| target_link_libraries(torch PUBLIC torch_cpu_library) | ||||
| if(BUILD_SPLIT_CUDA) | ||||
|   target_link_libraries(torch PUBLIC torch_cuda_cu_library) | ||||
|   target_link_libraries(torch PUBLIC torch_cuda_cpp_library) | ||||
| elseif(USE_CUDA) | ||||
|  | ||||
| if(USE_CUDA) | ||||
|   target_link_libraries(torch PUBLIC torch_cuda_library) | ||||
|   if(BUILD_SPLIT_CUDA) | ||||
|     target_link_libraries(torch_cuda PUBLIC torch_cuda_cu_library) | ||||
|     target_link_libraries(torch_cuda PUBLIC torch_cuda_cpp_library) | ||||
|   endif() | ||||
| elseif(USE_ROCM) | ||||
|   target_link_libraries(torch PUBLIC torch_hip_library) | ||||
| endif() | ||||
|  | ||||
| @ -188,6 +188,20 @@ find_library(CUDA_CUDA_LIB cuda | ||||
| find_library(CUDA_NVRTC_LIB nvrtc | ||||
|     PATHS ${CUDA_TOOLKIT_ROOT_DIR} | ||||
|     PATH_SUFFIXES lib lib64 lib/x64) | ||||
|   if(CUDA_NVRTC_LIB AND NOT CUDA_NVRTC_SHORTHASH) | ||||
|  execute_process( | ||||
|     COMMAND "${PYTHON_EXECUTABLE}" -c | ||||
|     "import hashlib;hash=hashlib.sha256();hash.update(open('${CUDA_NVRTC_LIB}','rb').read());print(hash.hexdigest()[:8])" | ||||
|     RESULT_VARIABLE _retval | ||||
|     OUTPUT_VARIABLE CUDA_NVRTC_SHORTHASH) | ||||
|   if(NOT _retval EQUAL 0) | ||||
|     message(WARNING "Failed to compute shorthash for libnvrtc.so") | ||||
|     set(CUDA_NVRTC_SHORTHASH "XXXXXXXX") | ||||
|   else() | ||||
|     string(STRIP "${CUDA_NVRTC_SHORTHASH}" CUDA_NVRTC_SHORTHASH) | ||||
|     message(STATUS "${CUDA_NVRTC_LIB} shorthash is ${CUDA_NVRTC_SHORTHASH}") | ||||
|   endif() | ||||
| endif() | ||||
|  | ||||
| # Create new style imported libraries. | ||||
| # Several of these libraries have a hardcoded path if CAFFE2_STATIC_LINK_CUDA | ||||
| @ -338,6 +352,12 @@ if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32) | ||||
|       set_property( | ||||
|         TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublasLt_static.a") | ||||
|       # Add explicit dependency to cudart_static to fix | ||||
|       # libcublasLt_static.a.o): undefined reference to symbol 'cudaStreamWaitEvent' | ||||
|       # error adding symbols: DSO missing from command line | ||||
|       set_property( | ||||
|         TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES | ||||
|         "${CUDA_cudart_static_LIBRARY}" rt dl) | ||||
|     endif() | ||||
| else() | ||||
|     set_property( | ||||
|  | ||||
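The first CMake hunk above bakes a short hash of the discovered `libnvrtc` into the build (`NVRTC_SHORTHASH`) so that the lazy NVRTC loader can fall back to the hashed library name (`libnvrtc-<hash>.so.X.Y`, see the `alt_libname` change earlier). The hash is simply the first eight hex characters of the file's SHA-256; an equivalent standalone Python snippet (the library path is a placeholder):

```python
import hashlib

def nvrtc_shorthash(path: str) -> str:
    # Matches the inline Python one-liner in the CMake hunk above:
    # SHA-256 of the library file, truncated to 8 hex characters.
    with open(path, "rb") as f:
        return hashlib.sha256(f.read()).hexdigest()[:8]

print(nvrtc_shorthash("/usr/local/cuda/lib64/libnvrtc.so"))  # placeholder path
```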
							
								
								
									
docs/source/ddp_comm_hooks.rst (new file, 74 lines)
							| @ -0,0 +1,74 @@ | ||||
| DDP Communication Hooks | ||||
| ======================= | ||||
|  | ||||
| DDP communication hook is a generic interface to control how to communicate | ||||
| gradients across workers by overriding the vanilla allreduce in | ||||
| `DistributedDataParallel <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel>`_. | ||||
| A few built-in communication hooks are provided, | ||||
| and users can easily apply any of these hooks to optimize communication. | ||||
| Besides, the hook interface can also support user-defined communication | ||||
| strategies for more advanced use cases. | ||||
|  | ||||
| .. warning :: | ||||
|     DDP communication hook is experimental and subject to change. | ||||
|  | ||||
| .. warning :: | ||||
|     DDP communication hooks can only support single process single device mode | ||||
|     on NCCL backend. | ||||
|  | ||||
| How to Use a Communication Hook? | ||||
| -------------------------------- | ||||
|  | ||||
| To use a communication hook, the user just needs to let the DDP model register | ||||
| the hook before the training loop as below. | ||||
|  | ||||
| :func:`torch.nn.parallel.DistributedDataParallel.register_comm_hook`. | ||||
|     :noindex: | ||||
|  | ||||
| Default Communication Hooks | ||||
| --------------------------- | ||||
|  | ||||
| Default communication hooks are simple **stateless** hooks, so the input state | ||||
| in ``register_comm_hook`` is either a process group or ``None``. | ||||
|  | ||||
| .. automodule:: torch.distributed.algorithms.ddp_comm_hooks.default_hooks | ||||
|     :members: | ||||
|  | ||||
| PowerSGD Communication Hook | ||||
| --------------------------- | ||||
|  | ||||
| PowerSGD (`Vogels et al., NeurIPS 2019 <https://arxiv.org/abs/1905.13727>`_) | ||||
| is a gradient compression algorithm, which can provide very high compression | ||||
| rates and accelerate bandwidth-bound distributed training. | ||||
| This algorithm needs to maintain both some hyperparameters and the internal | ||||
| state. Therefore, PowerSGD communication hook is a **stateful** hook, | ||||
| and the user needs to provide a state object defined as below. | ||||
|  | ||||
| PowerSGD State | ||||
| ^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| .. currentmodule:: torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook | ||||
| .. autoclass:: PowerSGDState | ||||
|  | ||||
| PowerSGD Hooks | ||||
| ^^^^^^^^^^^^^^^^ | ||||
|  | ||||
| .. warning :: | ||||
|     PowerSGD typically requires extra memory of the same size as the model's | ||||
|     gradients to enable error feedback, which can compensate for biased | ||||
|     compressed communication and improve accuracy. | ||||
|  | ||||
| .. warning :: | ||||
|     The current implementation may cause gradient overflow for FP16 input. | ||||
|  | ||||
| .. autofunction:: powerSGD_hook | ||||
| .. autofunction:: batched_powerSGD_hook | ||||
|  | ||||
| Acknowledgements | ||||
| ---------------- | ||||
|  | ||||
| Many thanks to PowerSGD paper author **Thijs Vogels** for the code review on | ||||
| PowerSGD communication hook, as well as the | ||||
| `comparison experiments <https://observablehq.com/@tvogels/powersgd-benchmark>`_, | ||||
| which show that the performance of PowerSGD communication hook is on par with | ||||
| the implementation in the original `paper <https://arxiv.org/abs/1905.13727>`_. | ||||
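To make the registration step described above concrete, here is a minimal sketch, assuming a process group is already initialized and the model is wrapped in DDP; the hook and state names are taken from the modules documented above, while the model itself is a placeholder:

```python
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
from torch.distributed.algorithms.ddp_comm_hooks import powerSGD_hook as powerSGD

# Assumes torch.distributed.init_process_group(...) was already called on each worker.
model = DDP(torch.nn.Linear(10, 10).cuda(), device_ids=[0])

# A stateless default hook (state=None): compress gradients to FP16 before allreduce.
model.register_comm_hook(state=None, hook=default_hooks.fp16_compress_hook)

# Alternatively (a model can register only one hook), the stateful PowerSGD hook,
# which needs a PowerSGDState holding its hyperparameters and error-feedback buffers:
# state = powerSGD.PowerSGDState(process_group=None, matrix_approximation_rank=1)
# model.register_comm_hook(state, powerSGD.powerSGD_hook)
```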
| @ -58,16 +58,16 @@ distributed (NCCL only when building with CUDA). MPI is an optional backend that | ||||
| included if you build PyTorch from source. (e.g.building PyTorch on a host that has MPI | ||||
| installed.) | ||||
|  | ||||
| .. warning :: | ||||
|     As of PyTorch v1.7, Windows support for the distributed package only covers collective | ||||
|     communications with Gloo backend, `FileStore`, and `DistributedDataParallel`. Therefore, | ||||
|     the `init_method` argument in :func:`init_process_group` must point to a file. This works | ||||
|     for both local and shared file systems: | ||||
| .. note :: | ||||
|     As of PyTorch v1.8, Windows supports all collective communications backends but NCCL. | ||||
|     If the `init_method` argument of :func:`init_process_group` points to a file, it must | ||||
|     adhere to the following schema: | ||||
|  | ||||
|     - Local file system, ``init_method="file:///d:/tmp/some_file"`` | ||||
|     - Shared file system, ``init_method="file://////{machine_name}/{share_folder_name}/some_file"`` | ||||
|  | ||||
|     Similarly, if you directly pass in a `store` argument, it must be a ``FileStore`` instance. | ||||
|     Same as on the Linux platform, you can enable TcpStore by setting the environment | ||||
|     variables MASTER_ADDR and MASTER_PORT. | ||||
|  | ||||
| Which backend to use? | ||||
| ^^^^^^^^^^^^^^^^^^^^^ | ||||
|  | ||||
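Following the note above, a minimal sketch of initializing the Gloo backend on Windows through a file-based store (the path, rank, and world size are placeholders):

```python
import torch.distributed as dist

# File-based rendezvous on a local drive, using the schema described above.
dist.init_process_group(
    backend="gloo",
    init_method="file:///d:/tmp/some_file",   # or a shared-folder path
    rank=0,
    world_size=1,
)

# Alternatively, set MASTER_ADDR and MASTER_PORT in the environment and use
# init_method="env://" to rendezvous through a TCP store instead.
```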
| @ -176,6 +176,15 @@ Probability distributions - torch.distributions | ||||
|     :undoc-members: | ||||
|     :show-inheritance: | ||||
|  | ||||
| :hidden:`LKJCholesky` | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
| .. currentmodule:: torch.distributions.lkj_cholesky | ||||
| .. autoclass:: LKJCholesky | ||||
|     :members: | ||||
|     :undoc-members: | ||||
|     :show-inheritance: | ||||
|  | ||||
| :hidden:`Laplace` | ||||
| ~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|  | ||||
|  | ||||
| @ -71,6 +71,7 @@ Features described in this documentation are classified by release status: | ||||
|    onnx | ||||
|    optim | ||||
|    complex_numbers | ||||
|    ddp_comm_hooks | ||||
|    pipeline | ||||
|    quantization | ||||
|    rpc | ||||
|  | ||||
| @ -484,6 +484,7 @@ Sparse tensor functions | ||||
| +++++++++++++++++++++++ | ||||
|  | ||||
| .. autofunction:: torch.sparse_coo_tensor | ||||
|    :noindex: | ||||
| .. autofunction:: torch.sparse.sum | ||||
| .. autofunction:: torch.sparse.addmm | ||||
| .. autofunction:: torch.sparse.mm | ||||
|  | ||||
| @ -563,5 +563,4 @@ Utilities | ||||
|     promote_types | ||||
|     use_deterministic_algorithms | ||||
|     are_deterministic_algorithms_enabled | ||||
|     vmap | ||||
|     _assert | ||||
|  | ||||
							
								
								
									
setup.py (45 lines changed)
							| @ -552,6 +552,50 @@ class build_ext(setuptools.command.build_ext.build_ext): | ||||
|             with open('compile_commands.json', 'w') as f: | ||||
|                 f.write(new_contents) | ||||
|  | ||||
| class concat_license_files(): | ||||
|     """Merge LICENSE and LICENSES_BUNDLED.txt as a context manager | ||||
|  | ||||
|     LICENSE is the main PyTorch license, LICENSES_BUNDLED.txt is auto-generated | ||||
|     from all the licenses found in ./third_party/. We concatenate them so there | ||||
|     is a single license file in the sdist and wheels with all of the necessary | ||||
|     licensing info. | ||||
|     """ | ||||
|     def __init__(self): | ||||
|         self.f1 = 'LICENSE' | ||||
|         self.f2 = 'third_party/LICENSES_BUNDLED.txt' | ||||
|  | ||||
|     def __enter__(self): | ||||
|         """Concatenate files""" | ||||
|         with open(self.f1, 'r') as f1: | ||||
|             self.bsd_text = f1.read() | ||||
|  | ||||
|         with open(self.f1, 'a') as f1: | ||||
|             with open(self.f2, 'r') as f2: | ||||
|                 self.bundled_text = f2.read() | ||||
|                 f1.write('\n\n') | ||||
|                 f1.write(self.bundled_text) | ||||
|  | ||||
|     def __exit__(self, exception_type, exception_value, traceback): | ||||
|         """Restore content of f1""" | ||||
|         with open(self.f1, 'w') as f: | ||||
|             f.write(self.bsd_text) | ||||
|  | ||||
|  | ||||
| try: | ||||
|     from wheel.bdist_wheel import bdist_wheel | ||||
| except ImportError: | ||||
|     # This is useful when wheel is not installed and bdist_wheel is not | ||||
|     # specified on the command line. If it _is_ specified, parsing the command | ||||
|     # line will fail before wheel_concatenate is needed | ||||
|     wheel_concatenate = None | ||||
| else: | ||||
|     # Need to create the proper LICENSE.txt for the wheel | ||||
|     class wheel_concatenate(bdist_wheel): | ||||
|         """Create the proper LICENSE.txt for the wheel""" | ||||
|         def run(self): | ||||
|             with concat_license_files(): | ||||
|                 super().run() | ||||
|  | ||||
|  | ||||
| class install(setuptools.command.install.install): | ||||
|     def run(self): | ||||
| @ -724,6 +768,7 @@ def configure_extension_build(): | ||||
|         'build_ext': build_ext, | ||||
|         'clean': clean, | ||||
|         'install': install, | ||||
|         'bdist_wheel': wheel_concatenate, | ||||
|     } | ||||
|  | ||||
|     entry_points = { | ||||
|  | ||||
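A small usage sketch of the context manager added above; `build_wheel()` is a hypothetical stand-in for whatever runs inside the block, in the same way `wheel_concatenate.run` wraps `bdist_wheel.run`:

```python
# While inside the block, ./LICENSE temporarily has third_party/LICENSES_BUNDLED.txt
# appended, so any packaging step run here picks up the merged license text.
with concat_license_files():
    build_wheel()  # hypothetical stand-in for super().run() in wheel_concatenate
# On exit, LICENSE is rewritten back to its original content.
```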
| @ -3,9 +3,11 @@ | ||||
| #include <test/cpp/jit/test_utils.h> | ||||
|  | ||||
| #include <ATen/core/qualified_name.h> | ||||
| #include <torch/csrc/jit/api/module.h> | ||||
| #include <torch/csrc/jit/frontend/resolver.h> | ||||
| #include <torch/csrc/jit/serialization/import.h> | ||||
| #include <torch/csrc/jit/serialization/import_source.h> | ||||
| #include <torch/csrc/jit/testing/file_check.h> | ||||
| #include <torch/torch.h> | ||||
|  | ||||
| namespace torch { | ||||
| @ -341,6 +343,20 @@ TEST(ModuleAPITest, Define) { | ||||
|   AT_ASSERT(result.toTensor().item<float>() == 6); | ||||
| } | ||||
|  | ||||
| TEST(ModuleAPITest, Freezing) { | ||||
|   Module m("m"); | ||||
|   m.register_parameter("foo", torch::ones({}), false); | ||||
|   m.define(R"( | ||||
|     def forward(self, x, b : int = 4): | ||||
|       return self.foo + x + b | ||||
|   )"); | ||||
|   m.eval(); | ||||
|   auto frozen_mod = torch::jit::freeze(m); | ||||
|   auto forward_g = frozen_mod.get_method("forward").graph(); | ||||
|   testing::FileCheck().check_not("GetAttr")->run(*forward_g); | ||||
|   ; | ||||
| } | ||||
|  | ||||
| TEST(ModuleAPITest, To_CUDA) { | ||||
|   Module m("test"); | ||||
|   { | ||||
|  | ||||
							
								
								
									
test/distributed/test_c10d.py (0 lines changed; mode changed from Executable file to Normal file)
							| @ -1508,7 +1508,7 @@ class TestFrozenOptimizations(JitTestCase): | ||||
|         bn = torch.nn.BatchNorm2d(out_channels, eps=.001) | ||||
|         mod = torch.nn.Sequential(conv, bn) | ||||
|         # set optimize to False here, by default freezing runs optimize_frozen_module | ||||
|         frozen_mod = torch.jit.freeze(torch.jit.script(mod.eval()), optimize=False) | ||||
|         frozen_mod = torch.jit.freeze(torch.jit.script(mod.eval()), optimize_numerics=False) | ||||
|         # inspect frozen mod | ||||
|         FileCheck().check("batch_norm").run(frozen_mod.graph) | ||||
|         torch.jit.optimize_frozen_module(frozen_mod) | ||||
|  | ||||
| @ -182,7 +182,7 @@ class TestModels(TestCase): | ||||
|         self.exportTest(toC(FakeQuantNet()), toC(x)) | ||||
|  | ||||
|     @skipIfUnsupportedMinOpsetVersion(10) | ||||
|     def test_qat_resnet(self): | ||||
|     def test_qat_resnet_pertensor(self): | ||||
|         # Quantize ResNet50 model | ||||
|         x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) | ||||
|         qat_resnet50 = resnet50() | ||||
| @ -202,6 +202,27 @@ class TestModels(TestCase): | ||||
|  | ||||
|         self.exportTest(toC(qat_resnet50), toC(x)) | ||||
|  | ||||
|     @skipIfUnsupportedMinOpsetVersion(13) | ||||
|     def test_qat_resnet_per_channel(self): | ||||
|         # Quantize ResNet50 model | ||||
|         x = torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0) | ||||
|         qat_resnet50 = resnet50() | ||||
|  | ||||
|         qat_resnet50.qconfig = quantization.QConfig( | ||||
|             activation=quantization.default_fake_quant, | ||||
|             weight=quantization.default_per_channel_weight_fake_quant) | ||||
|         quantization.prepare_qat(qat_resnet50, inplace=True) | ||||
|         qat_resnet50.apply(torch.quantization.enable_observer) | ||||
|         qat_resnet50.apply(torch.quantization.enable_fake_quant) | ||||
|  | ||||
|         _ = qat_resnet50(x) | ||||
|         for module in qat_resnet50.modules(): | ||||
|             if isinstance(module, quantization.FakeQuantize): | ||||
|                 module.calculate_qparams() | ||||
|         qat_resnet50.apply(torch.quantization.disable_observer) | ||||
|  | ||||
|         self.exportTest(toC(qat_resnet50), toC(x)) | ||||
|  | ||||
|     @disableScriptTest()  # None type in outputs | ||||
|     def test_googlenet(self): | ||||
|         x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) | ||||
|  | ||||
| @ -5998,6 +5998,20 @@ class TestONNXRuntime(unittest.TestCase): | ||||
|         x = torch.randn(6, 4, 3, 3) | ||||
|         self.run_test(FakeQuantizePerTensorModel(), (x)) | ||||
|  | ||||
|     @skipIfUnsupportedMinOpsetVersion(13) | ||||
|     def test_fake_quantize_per_channel(self): | ||||
|         class FakeQuantizePerChannelModel(torch.nn.Module): | ||||
|             def forward(self, input): | ||||
|                 amax = torch.ones(4) | ||||
|                 scale = amax / 127. | ||||
|                 zero_point = torch.zeros_like(amax, dtype=torch.long) | ||||
|                 # Quantize twice to test different branches | ||||
|                 y = torch.fake_quantize_per_channel_affine(input, scale, zero_point, 1, 0, 255) | ||||
|                 return torch.fake_quantize_per_channel_affine(y, scale, zero_point, 1, -128, 127) | ||||
|  | ||||
|         x = torch.randn(6, 4, 3, 3) | ||||
|         self.run_test(FakeQuantizePerChannelModel(), (x)) | ||||
|  | ||||
|     def test_batchnorm_training(self): | ||||
|         class MyModule(torch.nn.Module): | ||||
|             def __init__(self): | ||||
|  | ||||
| @ -2,6 +2,8 @@ import unittest | ||||
| import onnxruntime  # noqa | ||||
| import torch | ||||
|  | ||||
| from torch.cuda.amp import autocast | ||||
|  | ||||
| from test_pytorch_common import skipIfUnsupportedMinOpsetVersion | ||||
| from test_pytorch_common import skipIfNoCuda | ||||
|  | ||||
| @ -24,6 +26,43 @@ class TestONNXRuntime_cuda(unittest.TestCase): | ||||
|         x = torch.randn(2, 4, 5, 6, requires_grad=True, dtype=torch.float16, device=torch.device('cuda')) | ||||
|         self.run_test(GeluModel(), x, rtol=1e-3, atol=1e-5) | ||||
|  | ||||
|     @skipIfUnsupportedMinOpsetVersion(9) | ||||
|     @skipIfNoCuda | ||||
|     def test_layer_norm_fp16(self): | ||||
|         class LayerNormModel(torch.nn.Module): | ||||
|             def __init__(self): | ||||
|                 super(LayerNormModel, self).__init__() | ||||
|                 self.layer_norm = torch.nn.LayerNorm([10, 10]) | ||||
|  | ||||
|             def forward(self, x): | ||||
|                 return self.layer_norm(x) | ||||
|  | ||||
|         x = torch.randn(20, 5, 10, 10, requires_grad=True, dtype=torch.float16, device=torch.device('cuda')) | ||||
|         self.run_test(LayerNormModel(), x, rtol=1e-3, atol=1e-5) | ||||
|  | ||||
|  | ||||
|     @skipIfUnsupportedMinOpsetVersion(12) | ||||
|     @skipIfNoCuda | ||||
|     def test_softmaxCrossEntropy_fusion_fp16(self): | ||||
|         class FusionModel(torch.nn.Module): | ||||
|             def __init__(self): | ||||
|                 super(FusionModel, self).__init__() | ||||
|                 self.loss = torch.nn.NLLLoss(reduction='none') | ||||
|                 self.m = torch.nn.LogSoftmax(dim=1) | ||||
|  | ||||
|             @autocast() | ||||
|             def forward(self, input, target): | ||||
|                 output = self.loss(self.m(2 * input), target) | ||||
|                 return output | ||||
|  | ||||
|         N, C = 5, 4 | ||||
|         input = torch.randn(N, 16, dtype=torch.float16, device=torch.device('cuda')) | ||||
|         target = torch.empty(N, dtype=torch.long, device=torch.device('cuda')).random_(0, C) | ||||
|  | ||||
|         # using test data containing default ignore_index=-100 | ||||
|         target[target == 1] = -100 | ||||
|         self.run_test(FusionModel(), (input, target)) | ||||
|  | ||||
| TestONNXRuntime_cuda.setUp = TestONNXRuntime.setUp | ||||
| TestONNXRuntime_cuda.run_test = TestONNXRuntime.run_test | ||||
|  | ||||
|  | ||||
| @ -872,7 +872,7 @@ class TestFakeQuantize(TestCase): | ||||
|             scale, zero_point = float(scale), int(zero_point) | ||||
|             quant_min, quant_max = obs._calculate_qmin_qmax() | ||||
|  | ||||
|             Y_test, _mask = torch.fake_quantize_per_tensor_affine_cachemask( | ||||
|             Y_test = torch.fake_quantize_per_tensor_affine( | ||||
|                 X, scale, zero_point, quant_min, quant_max) | ||||
|             Y_ref = _fake_quantize_per_tensor_affine_reference( | ||||
|                 X.cpu(), scale, zero_point, quant_min, quant_max).to(device) | ||||
| @ -899,7 +899,7 @@ class TestFakeQuantize(TestCase): | ||||
|             quant_min, quant_max = obs._calculate_qmin_qmax() | ||||
|  | ||||
|             # forward pass | ||||
|             Y_test, mask = torch.fake_quantize_per_tensor_affine_cachemask( | ||||
|             Y_test = torch.fake_quantize_per_tensor_affine( | ||||
|                 X, scale, zero_point, quant_min, quant_max) | ||||
|             Y_ref = _fake_quantize_per_tensor_affine_reference( | ||||
|                 X.cpu(), scale, zero_point, quant_min, quant_max).to(device) | ||||
| @ -1246,7 +1246,7 @@ class TestFakeQuantize(TestCase): | ||||
|  | ||||
|             Y = _fake_quantize_per_channel_affine_reference( | ||||
|                 X.cpu(), scale.cpu(), zero_point.cpu(), axis, quant_min, quant_max) | ||||
|             Y_prime, _mask = torch.fake_quantize_per_channel_affine_cachemask( | ||||
|             Y_prime = torch.fake_quantize_per_channel_affine( | ||||
|                 X, scale, zero_point, axis, quant_min, quant_max) | ||||
|             np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance) | ||||
|  | ||||
| @ -1339,7 +1339,7 @@ class TestFakeQuantize(TestCase): | ||||
|             zero_point = zero_point.to(torch.int64) | ||||
|             quant_min, quant_max = obs._calculate_qmin_qmax() | ||||
|             X.requires_grad_() | ||||
|             Y_prime, _mask = torch.fake_quantize_per_channel_affine_cachemask( | ||||
|             Y_prime = torch.fake_quantize_per_channel_affine( | ||||
|                 X, scale, zero_point, axis, quant_min, quant_max) | ||||
|             dout = torch.rand(X.shape, dtype=torch.float).to(device) | ||||
|             dX = _fake_quantize_per_channel_affine_grad_reference( | ||||
|  | ||||
| @ -108,6 +108,7 @@ TESTS = [ | ||||
|     'test_fx_experimental', | ||||
|     'test_functional_autograd_benchmark', | ||||
|     'test_package', | ||||
|     'test_license', | ||||
|     'distributed/pipeline/sync/skip/test_api', | ||||
|     'distributed/pipeline/sync/skip/test_gpipe', | ||||
|     'distributed/pipeline/sync/skip/test_inspect_skip_layout', | ||||
|  | ||||
| @ -14,7 +14,7 @@ from math import sqrt | ||||
| from pathlib import Path | ||||
| from torch.multiprocessing import Process | ||||
| from torch.fx import symbolic_trace, Proxy, Node, GraphModule, Interpreter, Tracer, Transformer, Graph, wrap | ||||
| from torch.fx.node import Target | ||||
| from torch.fx.node import Target, Argument | ||||
| from torch.fx.passes import shape_prop | ||||
| from torch.fx.immutable_collections import immutable_dict, immutable_list | ||||
| from copy import deepcopy | ||||
| @ -187,7 +187,7 @@ class TestFX(JitTestCase): | ||||
|         # Custom delegate to disallow in-place tensor operations | ||||
|         class NoMutableCallTracer(Tracer): | ||||
|             def create_node(self, kind : str, target : Union[str, Callable], | ||||
|                             args : Tuple[Any], kwargs : Dict[str, Any], name : Optional[str] = None, | ||||
|                             args : Tuple[Argument, ...], kwargs : Dict[str, Any], name : Optional[str] = None, | ||||
|                             type_expr : Optional[Any] = None) -> Node: | ||||
|                 name = target if isinstance(target, str) else torch.typename(target) | ||||
|                 if name[-1] == '_': | ||||
| @ -539,7 +539,7 @@ class TestFX(JitTestCase): | ||||
|     def test_node_tagging(self): | ||||
|         class TaggingTracer(Tracer): | ||||
|             def create_node(self, kind : str, target : Union[str, Callable], | ||||
|                             args : Tuple[Any], kwargs : Dict[str, Any], name : Optional[str] = None, | ||||
|                             args : Tuple[Argument, ...], kwargs : Dict[str, Any], name : Optional[str] = None, | ||||
|                             type_expr : Optional[Any] = None) -> Node: | ||||
|                 n = super().create_node(kind, target, args, kwargs, name) | ||||
|                 n.tag = 'foo' | ||||
| @ -1057,6 +1057,13 @@ class TestFX(JitTestCase): | ||||
|         result = interp.run(torch.ones(3, 4), torch.ones(3, 4), torch.rand(3, 4)) | ||||
|         self.assertEqual(result, torch.ones(3, 4) * 2.0) | ||||
|  | ||||
|     @skipIfNoTorchVision | ||||
|     def test_interpreter_noop_resnet18(self): | ||||
|         rn18 = resnet18() | ||||
|         transformed = torch.fx.Transformer(symbolic_trace(rn18)).transform() | ||||
|         inp = torch.randn(5, 3, 224, 224) | ||||
|         self.assertEqual(transformed(inp), rn18(inp)) | ||||
|  | ||||
|     def test_transformer_noop(self): | ||||
|         class MyModule(torch.nn.Module): | ||||
|             def __init__(self): | ||||
| @ -1377,6 +1384,45 @@ class TestFX(JitTestCase): | ||||
|         x, y = torch.randn(3, 4), torch.randn(3, 4) | ||||
|         self.checkGraphModule(foo, (x, y)) | ||||
|  | ||||
|     def test_trace_dict_int_keys(self): | ||||
|         class ModWithDictArg(torch.nn.Module): | ||||
|             def forward(self, d : Dict[int, torch.Tensor]): | ||||
|                 return d[42] | ||||
|  | ||||
|         class CallsModWithDict(torch.nn.Module): | ||||
|             def __init__(self): | ||||
|                 super().__init__() | ||||
|                 self.m = ModWithDictArg() | ||||
|  | ||||
|             def forward(self, x): | ||||
|                 return self.m({42: x}) | ||||
|  | ||||
|         class MyTracer(torch.fx.Tracer): | ||||
|             def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool: | ||||
|                 return isinstance(m, ModWithDictArg) | ||||
|  | ||||
|         traced_graph = MyTracer().trace(CallsModWithDict()) | ||||
|  | ||||
|     def test_trace_dict_proxy_keys(self): | ||||
|         class ModWithDictArg(torch.nn.Module): | ||||
|             def forward(self, d : Dict[torch.Tensor, torch.Tensor]): | ||||
|                 return d[42] | ||||
|  | ||||
|         class CallsModWithDict(torch.nn.Module): | ||||
|             def __init__(self): | ||||
|                 super().__init__() | ||||
|                 self.m = ModWithDictArg() | ||||
|  | ||||
|             def forward(self, x): | ||||
|                 return self.m({x: x}) | ||||
|  | ||||
|         class MyTracer(torch.fx.Tracer): | ||||
|             def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool: | ||||
|                 return isinstance(m, ModWithDictArg) | ||||
|  | ||||
|         with self.assertRaisesRegex(RuntimeError, 'cannot contain a Node'): | ||||
|             traced_graph = MyTracer().trace(CallsModWithDict()) | ||||
|  | ||||
|     def test_direct_param_use(self): | ||||
|         class TransposeTest(torch.nn.Module): | ||||
|             def __init__(self): | ||||
|  | ||||
| @ -5,14 +5,14 @@ from typing import Callable, Dict, Union, List | ||||
| from torch.fx.symbolic_trace import symbolic_trace | ||||
| from torch.fx.graph_module import GraphModule | ||||
| from torch.fx.node import Node | ||||
| from torch.fx.experimental import graph_manipulation | ||||
| from torch.fx.experimental.accelerator_partitioner import Partitioner | ||||
| from torch.fx.experimental.rewriter import RewritingTracer | ||||
| from torch.fx.experimental.param_fetch import lift_lowering_attrs_to_nodes | ||||
| from torch.fx._experimental import graph_manipulation | ||||
| from torch.fx._experimental.accelerator_partitioner import Partitioner | ||||
| from torch.fx._experimental.rewriter import RewritingTracer | ||||
| from torch.fx._experimental.param_fetch import lift_lowering_attrs_to_nodes | ||||
| from torch.testing._internal.common_utils import run_tests | ||||
| from torch.testing._internal.jit_utils import JitTestCase | ||||
| from torch.fx.passes.split_module import split_module | ||||
| from torch.fx.experimental.partitioner_utils import ( | ||||
| from torch.fx._experimental.partitioner_utils import ( | ||||
|     NodeLatency, | ||||
|     get_partition_to_latency_mapping, | ||||
|     get_latency_of_partitioned_graph, | ||||
| @ -20,8 +20,8 @@ from torch.fx.experimental.partitioner_utils import ( | ||||
|     PartitionerConfig, | ||||
|     PartitionMode | ||||
| ) | ||||
| from torch.fx.experimental.fuser import fuse | ||||
| from torch.fx.experimental import merge_matmul | ||||
| from torch.fx._experimental.fuser import fuse | ||||
| from torch.fx._experimental import merge_matmul | ||||
|  | ||||
| try: | ||||
|     from torchvision.models import resnet18 | ||||
| @ -849,7 +849,7 @@ terrible spacing | ||||
|  | ||||
|     def test_merge_matmuls(self): | ||||
|         """ | ||||
|         A collection of test cases for torch.fx.experimental.merge_matmul, | ||||
|         A collection of test cases for torch.fx._experimental.merge_matmul, | ||||
|         a graph transformation that merges matrix multiplication operations. | ||||
|         """ | ||||
|         # Utility function for counting matmuls for test assertions. | ||||
|  | ||||
| @ -6503,6 +6503,38 @@ a") | ||||
|             self.checkModule(module().train(), ()) | ||||
|             self.checkModule(module().eval(), ()) | ||||
|  | ||||
|     def test_ternary_static_if(self): | ||||
|         # Test for True branch when condition variable | ||||
|         # is annotated as Final | ||||
|         class M1(torch.nn.Module): | ||||
|             flag: torch.jit.Final[bool] | ||||
|  | ||||
|             def __init__(self): | ||||
|                 super().__init__() | ||||
|                 self.flag = True | ||||
|  | ||||
|             def forward(self) -> torch.Tensor: | ||||
|                 return torch.ones(3) if self.flag else {} | ||||
|  | ||||
|         # Test for True branch when condition variable | ||||
|         # is annotated as Final | ||||
|         class M2(torch.nn.Module): | ||||
|             flag: torch.jit.Final[bool] | ||||
|  | ||||
|             def __init__(self): | ||||
|                 super().__init__() | ||||
|                 self.flag = False | ||||
|  | ||||
|             def forward(self) -> torch.Tensor: | ||||
|                 return {} if self.flag else torch.ones(3) | ||||
|  | ||||
|         model1 = M1() | ||||
|         model2 = M2() | ||||
|         script_model_1 = torch.jit.script(model1) | ||||
|         script_model_2 = torch.jit.script(model2) | ||||
|         self.assertEqual(model1.forward(), script_model_1.forward()) | ||||
|         self.assertEqual(model2.forward(), script_model_2.forward()) | ||||
|  | ||||
|     def test_print(self): | ||||
|         def func(x, y): | ||||
|             q = (x + y).sigmoid() | ||||
|  | ||||
| @ -1,6 +1,9 @@ | ||||
| import glob | ||||
| import io | ||||
| import os | ||||
| import unittest | ||||
|  | ||||
| import torch | ||||
| from torch.testing._internal.common_utils import TestCase, run_tests | ||||
|  | ||||
|  | ||||
| @ -10,11 +13,14 @@ except ImportError: | ||||
|     create_bundled = None | ||||
|  | ||||
| license_file = 'third_party/LICENSES_BUNDLED.txt' | ||||
| starting_txt = 'The Pytorch repository and source distributions bundle' | ||||
| site_packages = os.path.dirname(os.path.dirname(torch.__file__)) | ||||
| distinfo = glob.glob(os.path.join(site_packages, 'torch-*dist-info')) | ||||
|  | ||||
| class TestLicense(TestCase): | ||||
|  | ||||
|     @unittest.skipIf(not create_bundled, "can only be run in a source tree") | ||||
|     def test_license_in_wheel(self): | ||||
|     def test_license_for_wheel(self): | ||||
|         current = io.StringIO() | ||||
|         create_bundled('third_party', current) | ||||
|         with open(license_file) as fid: | ||||
| @ -25,6 +31,18 @@ class TestLicense(TestCase): | ||||
|                 'match the current state of the third_party files. Use ' | ||||
|                 '"python third_party/build_bundled.py" to regenerate it') | ||||
|  | ||||
|     @unittest.skipIf(len(distinfo) == 0, "no installation in site-package to test") | ||||
|     def test_distinfo_license(self): | ||||
|         """If run when pytorch is installed via a wheel, the license will be in | ||||
|         site-package/torch-*dist-info/LICENSE. Make sure it contains the third | ||||
|         party bundle of licenses""" | ||||
|  | ||||
|         if len(distinfo) > 1: | ||||
|             raise AssertionError('Found too many "torch-*dist-info" directories ' | ||||
|                                  f'in "{site_packages}", expected only one') | ||||
|         with open(os.path.join(os.path.join(distinfo[0], 'LICENSE'))) as fid: | ||||
|             txt = fid.read() | ||||
|             self.assertTrue(starting_txt in txt) | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     run_tests() | ||||
|  | ||||
| @ -4276,6 +4276,37 @@ class TestNN(NNTestCase): | ||||
|             with torch.backends.mkldnn.flags(enabled=enabled): | ||||
|                 gradcheck(F.conv2d, (input, mod.weight)) | ||||
|  | ||||
|     def test_Conv2d_OneDNN(self): | ||||
|         def run_once(): | ||||
|             group_val = 24 | ||||
|             ifm = torch.ones([1, group_val, 6, 6], dtype=torch.float32) | ||||
|             weights = torch.ones([group_val, 1, 3, 3], dtype=torch.float32) | ||||
|             op = torch.nn.Conv2d( | ||||
|                 in_channels=group_val, | ||||
|                 out_channels=group_val, | ||||
|                 kernel_size=[3, 3], | ||||
|                 stride=[2, 2], | ||||
|                 padding=[1, 1], | ||||
|                 dilation=[1, 1], | ||||
|                 groups=group_val, | ||||
|                 bias=False, | ||||
|                 padding_mode='zeros' | ||||
|             ) | ||||
|  | ||||
|             op.weight.data = weights | ||||
|             res = op(ifm) | ||||
|             grad_in = torch.ones(res.shape, dtype=torch.float32) | ||||
|             res.backward(grad_in) | ||||
|             return op.weight.grad | ||||
|  | ||||
|         with torch.backends.mkldnn.flags(enabled=False): | ||||
|             without_onednn = run_once() | ||||
|  | ||||
|         with torch.backends.mkldnn.flags(enabled=True): | ||||
|             with_onednn = run_once() | ||||
|  | ||||
|         self.assertEqual(without_onednn, with_onednn) | ||||
|  | ||||
|     @unittest.skipIf(not TEST_CUDA, 'CUDA not available') | ||||
|     @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') | ||||
|     def test_cudnn_non_contiguous(self): | ||||
| @ -8643,7 +8674,7 @@ class TestNN(NNTestCase): | ||||
|             kwargs = dict(mode='bicubic', align_corners=align_corners) | ||||
|             # test float scale factor up & downsampling | ||||
|             for device in device_list: | ||||
|                 for scale_factor in [0.5, 1.5, 2]: | ||||
|                 for scale_factor in [0.5, 1, 1.5, 2]: | ||||
|                     in_t = torch.ones(2, 2, 2, 2).to(device) | ||||
|                     out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs) | ||||
|                     out_size = int(math.floor(in_t.shape[-1] * scale_factor)) | ||||
|  | ||||
| @ -1,7 +1,8 @@ | ||||
| from torch.testing._internal.common_utils import TestCase, run_tests | ||||
| import torch | ||||
| import torch.nn.functional as F | ||||
| from torch import Tensor, vmap | ||||
| from torch import Tensor | ||||
| from torch._vmap_internals import vmap | ||||
| import functools | ||||
| import itertools | ||||
| import warnings | ||||
|  | ||||
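Editorial note: the import change above reflects that `vmap` is no longer re-exported from the top-level `torch` namespace and instead lives in `torch._vmap_internals`. A minimal sketch of the prototype API under that assumption (the `torch.dot` example mirrors the style of the vmap docstring; shapes are illustrative):

    import torch
    from torch._vmap_internals import vmap

    x, y = torch.randn(2, 5), torch.randn(2, 5)
    batched_dot = vmap(torch.dot)   # maps [D], [D] -> [] over the leading dim
    out = batched_dot(x, y)         # shape: (2,)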
Submodule third_party/XNNPACK updated: e1ffe15459...383b0752fe
Submodule third_party/tensorpipe updated: a814dda3ef...05467ba9bc
							| @ -82,7 +82,8 @@ SKIP_PYTHON_BINDINGS = [ | ||||
|     'set_data', | ||||
|     '.*_overrideable',  # overrideable functions for backend extension | ||||
|     'data', 'is_leaf', 'output_nr', '_version', 'requires_grad_', 'retain_grad', 'set_', | ||||
|     '_fw_primal' | ||||
|     '_fw_primal', 'fake_quantize_per_tensor_affine_cachemask', | ||||
|     'fake_quantize_per_channel_affine_cachemask', | ||||
| ] | ||||
|  | ||||
| # These function signatures are not exposed to Python. Note that this signature | ||||
|  | ||||
| @ -350,8 +350,8 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) - | ||||
|         'saddmm': ['def saddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Number=1, ' | ||||
|                    'alpha: Number=1, out: Optional[Tensor]=None) -> Tensor: ...'], | ||||
|         'spmm': ['def spmm(input: Tensor, mat2: Tensor) -> Tensor: ...'], | ||||
|         'div': ['def div(input: Union[Tensor, Number], other: Union[Tensor, Number], ' | ||||
|                 'rounding_mode: str = "true", *, out: Optional[Tensor]=None) -> Tensor: ...'], | ||||
|         'div': ['def div(input: Union[Tensor, Number], other: Union[Tensor, Number], *, ' | ||||
|                 'rounding_mode: Optional[str]=None, out: Optional[Tensor]=None) -> Tensor: ...'], | ||||
|     }) | ||||
|     for binop in ['mul', 'true_divide', 'floor_divide']: | ||||
|         unsorted_function_hints[binop].append( | ||||
| @ -462,9 +462,9 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) - | ||||
|                  'def set_(self, storage: Storage) -> Tensor: ...'], | ||||
|         'split': ['def split(self, split_size: _int, dim: _int=0) -> Sequence[Tensor]: ...', | ||||
|                   'def split(self, split_size: Tuple[_int, ...], dim: _int=0) -> Sequence[Tensor]: ...'], | ||||
|         'div': ['def div(self, other: Union[Tensor, Number], ' | ||||
|                 'rounding_mode: str = "true", *, out: Optional[Tensor]=None) -> Tensor: ...'], | ||||
|         'div_': ['def div_(self, other: Union[Tensor, Number], rounding_mode: str = "true") -> Tensor: ...'], | ||||
|         'div': ['def div(self, other: Union[Tensor, Number], *, ' | ||||
|                 'rounding_mode: Optional[str]=None, out: Optional[Tensor]=None) -> Tensor: ...'], | ||||
|         'div_': ['def div_(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str]=None) -> Tensor: ...'], | ||||
|     }) | ||||
|     for binop in ['mul', 'true_divide', 'floor_divide']: | ||||
|         for inplace in [False, True]: | ||||
|  | ||||
| @ -162,7 +162,7 @@ endif() | ||||
|  | ||||
| # In the most recent CMake versions, a new 'TRANSFORM' subcommand of 'list' allows much of the boilerplate of defining the lists | ||||
| # of type stub files to be omitted. | ||||
| # For comptability with older CMake versions, we omit it for now, but leave it as a comment in case comptability with the older | ||||
| # For compatibility with older CMake versions, we omit it for now, but leave it as a comment in case compatibility with the older | ||||
| # CMake versions is eventually dropped. | ||||
| # set(Modules | ||||
| #     __init__ | ||||
|  | ||||
| @ -174,6 +174,11 @@ def _freeze_module(module: ScriptModule, | ||||
|                    freeze_interfaces: _bool = True, | ||||
|                    preserveParameters: _bool = True) -> ScriptModule: ... | ||||
| def _jit_pass_optimize_frozen_graph(Graph) -> None: ... | ||||
| def _jit_pass_fold_frozen_conv_bn(graph: Graph): ... | ||||
| def _jit_pass_fold_frozen_conv_add_or_sub(graph: Graph): ... | ||||
| def _jit_pass_fold_frozen_conv_mul_or_div(graph: Graph): ... | ||||
| def _jit_pass_remove_dropout(module: 'torch.jit.ScriptModule'): ... | ||||
|  | ||||
| def _is_tracing() -> _bool: ... | ||||
| def _jit_init() -> _bool: ... | ||||
| def _jit_flatten(arg: Any) -> Tuple[List[Tensor], IODescriptor]: ... | ||||
|  | ||||
| @ -662,8 +662,6 @@ del register_after_fork | ||||
| # torch.jit.script as a decorator, for instance): | ||||
| from ._lobpcg import lobpcg | ||||
|  | ||||
| from ._vmap_internals import vmap | ||||
|  | ||||
| # These were previously defined in native_functions.yaml and appeared on the | ||||
| # `torch` namespace, but we moved them to c10 dispatch to facilitate custom | ||||
| # class usage. We add these lines here to preserve backward compatibility. | ||||
|  | ||||
| @ -1194,25 +1194,25 @@ See :func:`torch.dist` | ||||
| """) | ||||
|  | ||||
| add_docstr_all('div', r""" | ||||
| div(value, *, rounding_mode='true') -> Tensor | ||||
| div(value, *, rounding_mode=None) -> Tensor | ||||
|  | ||||
| See :func:`torch.div` | ||||
| """) | ||||
|  | ||||
| add_docstr_all('div_', r""" | ||||
| div_(value, *, rounding_mode='true') -> Tensor | ||||
| div_(value, *, rounding_mode=None) -> Tensor | ||||
|  | ||||
| In-place version of :meth:`~Tensor.div` | ||||
| """) | ||||
|  | ||||
| add_docstr_all('divide', r""" | ||||
| divide(value, *, rounding_mode='true') -> Tensor | ||||
| divide(value, *, rounding_mode=None) -> Tensor | ||||
|  | ||||
| See :func:`torch.divide` | ||||
| """) | ||||
|  | ||||
| add_docstr_all('divide_', r""" | ||||
| divide_(value, *, rounding_mode='true') -> Tensor | ||||
| divide_(value, *, rounding_mode=None) -> Tensor | ||||
|  | ||||
| In-place version of :meth:`~Tensor.divide` | ||||
| """) | ||||
|  | ||||
| @ -2741,7 +2741,7 @@ Example:: | ||||
| """.format(**common_args)) | ||||
|  | ||||
| add_docstr(torch.div, r""" | ||||
| div(input, other, *, rounding_mode='true' out=None) -> Tensor | ||||
| div(input, other, *, rounding_mode=None, out=None) -> Tensor | ||||
|  | ||||
| Divides each element of the input ``input`` by the corresponding element of | ||||
| :attr:`other`. | ||||
| @ -2764,7 +2764,7 @@ Args: | ||||
| Keyword args: | ||||
|     rounding_mode (str, optional): Type of rounding applied to the result: | ||||
|  | ||||
|         * ``"true"`` - default behavior. Performs no rounding and, if both :attr:`input` and | ||||
|         * None - default behavior. Performs no rounding and, if both :attr:`input` and | ||||
|           :attr:`other` are integer types, promotes the inputs to the default scalar type. | ||||
|           Equivalent to true division in Python (the ``/`` operator) and NumPy's ``np.true_divide``. | ||||
|         * ``"trunc"`` - rounds the results of the division towards zero. | ||||
| @ -2806,7 +2806,7 @@ Examples:: | ||||
| """.format(**common_args)) | ||||
|  | ||||
| add_docstr(torch.divide, r""" | ||||
| divide(input, other, *, rounding_mode='true', out=None) -> Tensor | ||||
| divide(input, other, *, rounding_mode=None, out=None) -> Tensor | ||||
|  | ||||
| Alias for :func:`torch.div`. | ||||
| """) | ||||
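Editorial note: the docstring hunks above change the documented default of ``rounding_mode`` from the string ``'true'`` to ``None``. A small sketch of the resulting call patterns (values are illustrative):

    import torch

    a, b = torch.tensor([7., -7.]), torch.tensor([2., 2.])
    torch.div(a, b)                         # no rounding: tensor([ 3.5000, -3.5000])
    torch.div(a, b, rounding_mode='trunc')  # round toward zero: tensor([ 3., -3.])
    torch.div(a, b, rounding_mode='floor')  # round down: tensor([ 3., -4.])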
| @ -8515,9 +8515,9 @@ If :attr:`upper` is ``False``, then lower triangular portion is used. | ||||
| .. note:: Irrespective of the original strides, the returned matrix `V` will | ||||
|           be transposed, i.e. with strides `V.contiguous().transpose(-1, -2).stride()`. | ||||
|  | ||||
| .. note:: Extra care needs to be taken when backward through outputs. Such | ||||
|           operation is really only stable when all eigenvalues are distinct. | ||||
|           Otherwise, ``NaN`` can appear as the gradients are not properly defined. | ||||
| .. warning:: Extra care needs to be taken when backward through outputs. Such | ||||
|              operation is only stable when all eigenvalues are distinct and becomes | ||||
|              less stable the smaller :math:`\min_{i \neq j} |\lambda_i - \lambda_j|` is. | ||||
|  | ||||
| Args: | ||||
|     input (Tensor): the input tensor of size :math:`(*, n, n)` where `*` is zero or more | ||||
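Editorial note: the strengthened warning above concerns the backward pass through the eigendecomposition. For a symmetric input with decomposition A = V Λ V^T, the usual textbook form of the adjoint (a sketch of the standard result, up to symmetrization, not quoted from the PyTorch source) contains factors that diverge as two eigenvalues approach each other:

    \bar A \;=\; V\Bigl(\operatorname{diag}(\bar\lambda) + F \circ (V^\top \bar V)\Bigr)V^\top,
    \qquad F_{ij} \;=\; \frac{1}{\lambda_j - \lambda_i}\ (i \neq j),\quad F_{ii}=0 .

This is why the gradient is only stable when \(\min_{i \neq j} |\lambda_i - \lambda_j|\) is not small.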
| @ -9207,7 +9207,7 @@ Example:: | ||||
| add_docstr(torch.true_divide, r""" | ||||
| true_divide(dividend, divisor, *, out) -> Tensor | ||||
|  | ||||
| Alias for :func:`torch.div` with ``rounding_mode='true'``. | ||||
| Alias for :func:`torch.div` with ``rounding_mode=None``. | ||||
| """.format(**common_args)) | ||||
|  | ||||
| add_docstr(torch.trunc, | ||||
|  | ||||
| @ -8,6 +8,8 @@ | ||||
| #include <torch/csrc/jit/frontend/schema_matching.h> | ||||
| #include <torch/csrc/jit/jit_log.h> | ||||
| #include <torch/csrc/jit/passes/dead_code_elimination.h> | ||||
| #include <torch/csrc/jit/passes/freeze_module.h> | ||||
| #include <torch/csrc/jit/passes/frozen_graph_optimizations.h> | ||||
| #include <torch/csrc/jit/passes/inliner.h> | ||||
| #include <torch/csrc/jit/runtime/operator.h> | ||||
|  | ||||
| @ -336,6 +338,21 @@ IValue Module::create_class(const c10::QualifiedName& name, Stack stack) const { | ||||
|   return obj; | ||||
| } | ||||
|  | ||||
| Module freeze( | ||||
|     const Module& module, | ||||
|     c10::optional<std::vector<std::string>> preserved_attrs, | ||||
|     bool optimize_numerics) { | ||||
|   TORCH_CHECK( | ||||
|       !module.is_training(), | ||||
|       "Freezing is currently only implemented for modules in eval mode. Please call .eval() before freezing"); | ||||
|  | ||||
|   Module out_mod = freeze_module( | ||||
|       module, preserved_attrs.value_or(std::vector<std::string>({}))); | ||||
|   auto graph = module.get_method("forward").graph(); | ||||
|   OptimizeFrozenGraph(graph, optimize_numerics); | ||||
|   return out_mod; | ||||
| } | ||||
|  | ||||
| buffer_list Module::buffers(bool recurse) const { | ||||
|   return buffer_list(*this, recurse, /*return_module=*/false); | ||||
| } | ||||
|  | ||||
| @ -276,6 +276,13 @@ struct TORCH_API Module : public Object { | ||||
|       bool non_blocking); | ||||
| }; | ||||
|  | ||||
| // C++ equivalent api of `torch.jit.freeze`. See documentation there for | ||||
| // details. | ||||
| TORCH_API Module freeze( | ||||
|     const Module& module, | ||||
|     c10::optional<std::vector<std::string>> preserved_attrs = c10::nullopt, | ||||
|     bool optimize_numerics = true); | ||||
|  | ||||
| namespace detail { | ||||
|  | ||||
| struct TORCH_API SlotCursor { | ||||
|  | ||||
| @ -1,5 +1,6 @@ | ||||
| #include <torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h> | ||||
|  | ||||
| #include <ATen/DynamicLibrary.h> | ||||
| #include <c10/util/Exception.h> | ||||
| #include <c10/util/Optional.h> | ||||
| #include <torch/csrc/jit/codegen/fuser/compiler.h> | ||||
|  | ||||
| @ -9,13 +9,18 @@ | ||||
| #include <memory> | ||||
| #include <string> | ||||
|  | ||||
| // Forward declare DynamicLibrary | ||||
| namespace at { | ||||
| struct DynamicLibrary; | ||||
| } | ||||
|  | ||||
| namespace torch { | ||||
| namespace jit { | ||||
| namespace fuser { | ||||
| namespace cpu { | ||||
|  | ||||
| // Represents a compiled CPU kernel and the metadata necessary to run it | ||||
| struct TORCH_API FusedKernelCPU : public ::torch::jit::fuser::FusedKernel { | ||||
| struct TORCH_API FusedKernelCPU : public FusedKernel { | ||||
|   FusedKernelCPU( | ||||
|       std::string name, | ||||
|       std::string code, | ||||
|  | ||||
| @ -1258,6 +1258,15 @@ struct to_ir { | ||||
|       const TernaryIf& expr, | ||||
|       const TypePtr& type_hint = nullptr) { | ||||
|     CondValue cond_value = emitCondExpr(expr.cond()); | ||||
|     // If the cond expr is a static value, then we metacompile the `if` | ||||
|     // statemement and only emit true or false branch | ||||
|     if (cond_value.staticIf()) { | ||||
|         if (*cond_value.staticIf()) { | ||||
|             return emitExpr(expr.true_expr(), type_hint); | ||||
|         } else { | ||||
|             return emitExpr(expr.false_expr(), type_hint); | ||||
|         } | ||||
|     } | ||||
|     auto true_expr = [&] { return emitExpr(expr.true_expr(), type_hint); }; | ||||
|     auto false_expr = [&] { return emitExpr(expr.false_expr(), type_hint); }; | ||||
|     return emitIfExpr(expr.range(), cond_value, true_expr, false_expr); | ||||
|  | ||||
| @ -8,12 +8,16 @@ | ||||
| namespace torch { | ||||
| namespace jit { | ||||
|  | ||||
| void OptimizeFrozenGraph(std::shared_ptr<Graph>& graph) { | ||||
| void OptimizeFrozenGraph( | ||||
|     std::shared_ptr<Graph>& graph, | ||||
|     bool optimize_numerics) { | ||||
|   // run a couple times to capture Conv -> Mul -> Add etc | ||||
|   for (size_t i = 0; i < 2; i++) { | ||||
|     FoldFrozenConvBatchnorm(graph); | ||||
|     FoldFrozenConvAddOrSub(graph); | ||||
|     FoldFrozenConvMulOrDiv(graph); | ||||
|   if (optimize_numerics) { | ||||
|     for (size_t i = 0; i < 2; i++) { | ||||
|       FoldFrozenConvBatchnorm(graph); | ||||
|       FoldFrozenConvAddOrSub(graph); | ||||
|       FoldFrozenConvMulOrDiv(graph); | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -13,7 +13,9 @@ | ||||
| namespace torch { | ||||
| namespace jit { | ||||
|  | ||||
| TORCH_API void OptimizeFrozenGraph(std::shared_ptr<Graph>& graph); | ||||
| TORCH_API void OptimizeFrozenGraph( | ||||
|     std::shared_ptr<Graph>& graph, | ||||
|     bool optimize_numerics = true); | ||||
|  | ||||
| } // namespace jit | ||||
| } // namespace torch | ||||
|  | ||||
| @ -668,6 +668,24 @@ static void fuseLogSoftmaxNllLoss(Block* b) { | ||||
|       auto prev = it->input(0)->node(); | ||||
|       Node* origNllLossNode = *it; | ||||
|       Node* origLogSoftmaxNode; | ||||
|  | ||||
|       // Check for patterns especially in cases with autocasting enabled | ||||
|       // in which a cast node is inserted before the NegativeLogLikelihoodLoss | ||||
|       // node and this causes the patterns below not to be recognizable by the | ||||
|       // fuseLogSoftmaxNllLoss function | ||||
|       // For example if the input is 2D | ||||
|       // graph(%input : Half(3, 5), | ||||
|       // %target : Long(3)): | ||||
|       // %4 : Half(3, 5) = onnx::LogSoftmax[axis=1](%input) | ||||
|       // %8 : Float = onnx::Cast[to=1](%4) | ||||
|       // %9 : Float(3) = onnx::NegativeLogLikelihoodLoss[reduction="none"] | ||||
|       // return (%8) | ||||
|       Node* castNode = nullptr; | ||||
|       if (prev->kind() == onnx::Cast) { | ||||
|         castNode = prev; | ||||
|         prev = prev->input(0)->node(); | ||||
|       } | ||||
|  | ||||
|       if (prev->kind() == onnx::LogSoftmax) { | ||||
|         // if the input is 2D | ||||
|         // graph(%input : Float(3, 5), | ||||
| @ -675,7 +693,7 @@ static void fuseLogSoftmaxNllLoss(Block* b) { | ||||
|         // %4 : Float(3, 5) = onnx::LogSoftmax[axis=1](%input) | ||||
|         // %8 : Float(3) = onnx::NegativeLogLikelihoodLoss[reduction="none"] | ||||
|         // return (%8) | ||||
|         origLogSoftmaxNode = it->input(0)->node(); | ||||
|         origLogSoftmaxNode = prev; | ||||
|       } else if ( | ||||
|           prev->kind() == onnx::Transpose && | ||||
|           prev->input(0)->node()->kind() == onnx::LogSoftmax) { | ||||
| @ -751,6 +769,19 @@ static void fuseLogSoftmaxNllLoss(Block* b) { | ||||
|         continue; | ||||
|       } | ||||
|  | ||||
|       // If the pattern indeed consists of a cast node before the | ||||
|       // NegativeLogLikelihoodLoss node, place a cast node in the beginning | ||||
|       // of the pattern instead | ||||
|       if (castNode != nullptr) { | ||||
|         auto onnx_type = castNode->i(attr::to); | ||||
|         Node* cast_node = b->owningGraph()->create(onnx::Cast, 1); | ||||
|         cast_node->addInput(origLogSoftmaxNode->inputs().at(0)); | ||||
|         cast_node->i_(attr::to, onnx_type); | ||||
|         cast_node->insertBefore(origLogSoftmaxNode); | ||||
|         origLogSoftmaxNode->replaceInputWith( | ||||
|             origLogSoftmaxNode->inputs().at(0), cast_node->output()); | ||||
|       } | ||||
|  | ||||
|       Node* softmaxCrossEntropyNode = b->owningGraph()->create( | ||||
|           onnx::SoftmaxCrossEntropyLoss, it->outputs().size()); | ||||
|       for (size_t i = 0; i < softmaxCrossEntropyNode->outputs().size(); ++i) { | ||||
|  | ||||
| @ -33,39 +33,38 @@ def _orthogonalize(matrix, epsilon=1e-8): | ||||
|  | ||||
|  | ||||
| class PowerSGDState(object): | ||||
|     """ | ||||
|     Stores both the gradient compression configs and the internal states for all the gradients during the training. | ||||
|     Particularly, `matrix_approximation_rank` and `start_powerSGD_iter` are the main configs that need to be tuned by the user. | ||||
|     Although `use_error_feedback` and `warm_start` can also be tuned by the user, | ||||
|     they are typically turned on for performance. | ||||
|     r""" | ||||
|     Stores both the algorithm's hyperparameters and the internal state for all the gradients during the training. | ||||
|     Particularly, ``matrix_approximation_rank`` and ``start_powerSGD_iter`` are the main hyperparameters that should be tuned by the user. | ||||
|     For performance, we suggest to keep binary hyperparameters ``use_error_feedback`` and ``warm_start`` on. | ||||
|  | ||||
|     Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`] | ||||
|     ~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
|     1) To tune `matrix_approximation_rank`, the user can increase it from 1 by factors of 2, | ||||
|     until a satisfying accuracy can be reached. | ||||
|     The increase of `matrix_approximation_rank` can substantially increase the computation costs of the compression. | ||||
|     However, the accuracy may not be futher improved beyond a certain `matrix_approximation_rank` value. | ||||
|     2) To tune `start_powerSGD_iter`, the user can typically start with 10% of total training steps, | ||||
|     and increase it until a satisfying accuracy can be reached. | ||||
|     Deferrring PowerSGD can effectively improve the accuracy, | ||||
|     even a relatively small `matrix_approximation_rank` is used. | ||||
|     This is because that, the beginning of training phase is usually very sensitive to inaccurate gradients, | ||||
|     and compressing gradients too early may make the training quickly take a suboptimal trajectory, | ||||
|     which can result in an irrecoverable impact on the accuracy. | ||||
|     The minimum value allowed in DDP is 2, if error feedback or warm-up is enabled. | ||||
|     This is because there is another internal optimization that rebuilds buckets at iteration 1 in DDP, | ||||
|     and this can conflict with any tensor memorized before the rebuild process. | ||||
|     """ | ||||
|     1. ``matrix_approximation_rank`` controls the size of compressed low-rank tensors, which determines the compression rate. The lower the rank, the stronger the compression. | ||||
|  | ||||
|         1.1. If ``matrix_approximation_rank`` is too low, the full model quality will need more training steps to be reached, or may never be reached, yielding a loss in accuracy. | ||||
|  | ||||
|         1.2. The increase of ``matrix_approximation_rank`` can substantially increase the computation costs of the compression, and the accuracy may not be further improved beyond a certain ``matrix_approximation_rank`` threshold. | ||||
|  | ||||
|     To tune ``matrix_approximation_rank``, we suggest to start from 1 and increase by factors of 2 (like an exponential grid search, 1, 2, 4, ...), until a satisfactory accuracy is reached. Typically only a small value 1-4 is used. For some NLP tasks (as shown in Appendix D of the original paper), this value has been increased to 32. | ||||
|  | ||||
|     2. ``start_powerSGD_iter`` defers PowerSGD compression until step ``start_powerSGD_iter``, and vanilla allreduce runs prior to step ``start_powerSGD_iter``. This hybrid scheme of **vanilla allreduce + PowerSGD** can effectively improve the accuracy, even when a relatively small ``matrix_approximation_rank`` is used. This is because the beginning of the training phase is usually very sensitive to inaccurate gradients, and compressing gradients too early may make the training quickly take a suboptimal trajectory, which can result in an irrecoverable impact on the accuracy. | ||||
|  | ||||
|     To tune ``start_powerSGD_iter``, we suggest to start with 10% of total training steps, and increase it until a satisfactory accuracy is reached. | ||||
|  | ||||
|     .. warning :: | ||||
|         If error feedback or warm-up is enabled, the minimum value of ``start_powerSGD_iter`` allowed in DDP is 2. | ||||
|         This is because there is another internal optimization that rebuilds buckets at iteration 1 in DDP, | ||||
|         and this can conflict with any tensor memorized before the rebuild process. | ||||
|     """  # noqa | ||||
|  | ||||
|     __slots__ = [ | ||||
|         "process_group", | ||||
|         # The two fields below are the configs that usually need to be tuned by the user. | ||||
|         # The two fields below are the hyperparameters that should be tuned by the user. | ||||
|         "matrix_approximation_rank", | ||||
|         "start_powerSGD_iter", | ||||
|         # The two fields below are the configs that usually need to be turned on for performance. | ||||
|         # The two fields below are the binary hyperparameters recommended to be turned on for performance. | ||||
|         "use_error_feedback", | ||||
|         "warm_start", | ||||
|         # The fields below are not configs. | ||||
|         # The fields below are internal state. | ||||
|         "rng", | ||||
|         "error_dict", | ||||
|         "p_memory_dict", | ||||
| @ -93,21 +92,12 @@ class PowerSGDState(object): | ||||
|         ) | ||||
|  | ||||
|         self.process_group = process_group | ||||
|         # The low rank for matrix approximation controls the size of compressed low-rank tensors, | ||||
|         # which determines the computation ratio. | ||||
|         # Typically only a small value 1-4 is used. | ||||
|         # For some NLP tasks (as shown in Appendix D of the original paper | ||||
|         # https://arxiv.org/pdf/1905.13727.pdf, the rank value has been increased to 32. | ||||
|         # A high rank value will increase the computation costs of compression exponentially. | ||||
|         # A good choice depends on how much extra computation can be hidden by the dominating communication costs. | ||||
|         self.matrix_approximation_rank = matrix_approximation_rank | ||||
|         # This defers PowerSGD compression util step 'start_powerSGD_iter', | ||||
|         # and vanilla allreduce runs before step 'start_powerSGD_iter'. | ||||
|         # This hybrid scheme of vanilla allreduce + PowerSGD can have two advantages: | ||||
|         # Deferring PowerSGD compression until step 'start_powerSGD_iter' can have two advantages: | ||||
|         # 1) It turns out that PowerSGD may lead to a non-trivial accuracy loss, | ||||
|         # even if the matrix approximation rank is increased to a large value. | ||||
|         # To mitigate the accuracy loss, a simple yet effective way is mixing vanilla allreduce | ||||
|         # (or a more convervative compression such as FP16 compression) with PowerSGD. | ||||
|         # (or a more conservative compression such as FP16 compression) with PowerSGD. | ||||
|         # 2) There is an internal optimization of rebuilding buckets process in DDP, | ||||
|         # in order to save the memory space. | ||||
|         # This step takes place after the first iteration. | ||||
| @ -162,38 +152,44 @@ class PowerSGDState(object): | ||||
|  | ||||
|  | ||||
| def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future: | ||||
|     """ | ||||
|     This DDP communication hook implements the original PowerSGD gradient compression | ||||
|     algorithm described in https://arxiv.org/abs/1905.13727. | ||||
|     r""" | ||||
|     This DDP communication hook implements PowerSGD gradient compression | ||||
|     algorithm described in the `paper <https://arxiv.org/abs/1905.13727>`_. | ||||
|     Once gradient tensors are aggregated across all workers, this hook applies | ||||
|     compression as follows: | ||||
|     1) Views the input flattened 1D gradient tensor as two groups of per-parameter tensors: | ||||
|     high-rank tensors and vector-like rank-1 tensors (for biases). | ||||
|     2) Handles rank-1 tensors by allreducing them without compression: | ||||
|         2.1) Allocate contiguous memory for those rank-1 tensors, | ||||
|         and allreduces all the rank-1 tensors as a batch, without compression; | ||||
|         2.2) Copies the individual rank-1 tensors from the contiguous memory back to the input tensor. | ||||
|     3) Handles high-rank tensors by PowerSGD compression: | ||||
|         3.1) For each high-rank tensor M, creates two low-rank tensors P and Q for decomposing M, | ||||
|  | ||||
|     1. Views the input flattened 1D gradient tensor as two groups of per-parameter tensors: high-rank tensors and vector-like rank-1 tensors (for biases). | ||||
|  | ||||
|     2. Handles rank-1 tensors by allreducing them without compression: | ||||
|  | ||||
|         2.1. Allocate contiguous memory for those rank-1 tensors, and allreduces all the rank-1 tensors as a batch, without compression; | ||||
|  | ||||
|         2.2. Copies the individual rank-1 tensors from the contiguous memory back to the input tensor. | ||||
|  | ||||
|     3. Handles high-rank tensors by PowerSGD compression: | ||||
|  | ||||
|         3.1. For each high-rank tensor M, creates two low-rank tensors P and Q for decomposing M, | ||||
|         such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized; | ||||
|         3.2) Computes each P in Ps, which is equal to MQ; | ||||
|         3.3) Allreduces Ps as a batch; | ||||
|         3.4) Orthogonalizes each P in Ps; | ||||
|         3.5) Computes each Q in Qs, which is approximately equal to M^TP; | ||||
|         3.6) Allreduces Qs as a batch; | ||||
|         3.7) Computes each M among all the high-rank tensors, which is approximately equal to PQ^T. | ||||
|  | ||||
|     Note that this communication hook enforces vanilla allreduce for the first `state.start_powerSGD_iter` iterations. | ||||
|     This can not only allow the user to have a finer tuning over the tradeoff between speedup and accuracy, | ||||
|     but also help abstract away some complexity of the internal optimization of DDP for future communication hook developers. | ||||
|         3.2. Computes each P in Ps, which is equal to MQ; | ||||
|  | ||||
|     TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration -- | ||||
|     one left multiplication and one right multiplication. | ||||
|     For warm-start, can take one such step at a time, and alternate between them. | ||||
|         3.3. Allreduces Ps as a batch; | ||||
|  | ||||
|         3.4. Orthogonalizes each P in Ps; | ||||
|  | ||||
|         3.5. Computes each Q in Qs, which is approximately equal to M^TP; | ||||
|  | ||||
|         3.6. Allreduces Qs as a batch; | ||||
|  | ||||
|         3.7. Computes each M among all the high-rank tensors, which is approximately equal to PQ^T. | ||||
|  | ||||
|     Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations. | ||||
|     This not only gives the user more control over the tradeoff between speedup and accuracy, | ||||
|     but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers. | ||||
|  | ||||
|     Args: | ||||
|         state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc. | ||||
|             To tune the compression configs, see Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`]. | ||||
|             To tune the compression configs, mainly need to tune `matrix_approximation_rank`` and ``start_powerSGD_iter``. | ||||
|         bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors. | ||||
|             Note that since DDP comm hook only supports single process single device mode at this time, | ||||
|             only exactly one tensor is stored in this bucket. | ||||
| @ -202,9 +198,9 @@ def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future: | ||||
|         Future handler of the communication, which updates the gradients in place. | ||||
|  | ||||
|     Example:: | ||||
|         state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10) | ||||
|         >>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10) | ||||
|         >>> ddp_model.register_comm_hook(state, powerSGD_hook) | ||||
|     """ | ||||
|     """  # noqa | ||||
|     process_group = state.process_group | ||||
|     group_to_use = process_group if process_group is not None else dist.group.WORLD | ||||
|     world_size = group_to_use.size() | ||||
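Editorial note: to make the per-tensor compression in steps 3.1-3.7 of the docstring above concrete, here is a minimal single-process sketch of one PowerSGD round for one high-rank gradient. The allreduce steps are omitted, and the shapes, rank value, and local Gram-Schmidt helper are illustrative assumptions, not the hook's actual internals.

    import torch

    def orthogonalize(m, eps=1e-8):
        # Column-wise Gram-Schmidt, in the spirit of the hook's helper.
        for i in range(m.shape[1]):
            col = m[:, i : i + 1]
            col /= col.norm() + eps
            if i + 1 < m.shape[1]:
                rest = m[:, i + 1 :]
                rest -= (col.t() @ rest) * col
        return m

    M = torch.randn(64, 32)                   # one high-rank per-parameter gradient
    rank = 4                                  # matrix_approximation_rank
    Q = orthogonalize(torch.randn(32, rank))  # 3.1: random, orthogonalized Q
    P = M @ Q                                 # 3.2: P = MQ   (3.3 would allreduce P)
    P = orthogonalize(P)                      # 3.4: orthogonalize P
    Q = M.t() @ P                             # 3.5: Q is roughly M^T P (3.6 would allreduce Q)
    M_approx = P @ Q.t()                      # 3.7: low-rank reconstruction of M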
| @ -374,6 +370,10 @@ def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future: | ||||
|         for tensor, p, q in zip(high_rank_tensors, ps, qs): | ||||
|             torch.matmul(tensor.t(), p, out=q) | ||||
|  | ||||
|         # TODO: The above procedure does two matmul+allreduce steps per iteration -- | ||||
|         # one left multiplication and one right multiplication. | ||||
|         # For warm-start, can take one such step at a time, and alternate between them. | ||||
|  | ||||
|         # Allreduce Qs. | ||||
|         return [ | ||||
|             dist.all_reduce( | ||||
| @ -412,40 +412,48 @@ def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future: | ||||
|  | ||||
|  | ||||
| def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future: | ||||
|     """ | ||||
|     r""" | ||||
|     This DDP communication hook implements a simplified PowerSGD gradient compression | ||||
|     algorithm described in https://arxiv.org/abs/1905.13727. | ||||
|     algorithm described in the `paper <https://arxiv.org/abs/1905.13727>`_. | ||||
|     This variant does not compress the gradients layer by layer, | ||||
|     but instead compresses the flattened input tensor that batches all the gradients. | ||||
|     Therefore, it is **faster** than :meth:`powerSGD_hook`, | ||||
|     but usually results in a **much lower accuracy**, unless ``matrix_approximation_rank`` is 1. | ||||
|  | ||||
|     .. warning :: | ||||
|         Increasing ``matrix_approximation_rank`` here may not necessarily increase the accuracy, | ||||
|         because batching per-parameter tensors without column/row alignment can destroy low-rank structure. | ||||
|         Therefore, the user should always consider :meth:`powerSGD_hook` first, | ||||
|         and only consider this variant when a satisfactory accuracy can be achieved when ``matrix_approximation_rank`` is 1. | ||||
|  | ||||
|     Once gradient tensors are aggregated across all workers, this hook applies | ||||
|     compression to the flattened input tensor that batches per-parameter tensors as follows: | ||||
|     1) Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings; | ||||
|     2) Creates two low-rank tensors P and Q for decomposing M, | ||||
|     such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized; | ||||
|     2) Computes P, which is equal to MQ; | ||||
|     3) Allreduces P; | ||||
|     4) Orthogonalizes P; | ||||
|     5) Computes Q, which is approximately equal to M^TP; | ||||
|     6) Allreduces Q; | ||||
|     7) Computes M, which is approximately equal to PQ^T. | ||||
|     8) Truncates the input tensor to the original length. | ||||
|     compression as follows: | ||||
|  | ||||
|     This variant is faster than `powerSGD_hook` that runs layer-wise gradient compression, | ||||
|     but it usually results in a much lower accuracy, unless `matrix_approximation_rank` in the state is 1. | ||||
|     Increasing `matrix_approximation_rank` may not necessarily increase the accuracy, | ||||
|     because batching per-parameter tensors without column/row alignment can destroy low-rank structure. | ||||
|     Therefore, the user shoud always consider `powerSGD_hook` first, | ||||
|     and only consider this variant when a satisfying accuracy can be achieved when `matrix_approximation_rank` is 1. | ||||
|     1. Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings; | ||||
|  | ||||
|     Note that this communication hook enforces vanilla allreduce for the first `state.start_powerSGD_iter` iterations. | ||||
|     This can not only allow the user to have a finer tuning over the tradeoff between speedup and accuracy, | ||||
|     but also help abstract away some complexity of the internal optimization of DDP for future communication hook developers. | ||||
|     2. Creates two low-rank tensors P and Q for decomposing M, such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized; | ||||
|  | ||||
|     TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration -- | ||||
|     one left multiplication and one right multiplication. | ||||
|     For warm-start, can take one such step at a time, and alternate between them. | ||||
|     3. Computes P, which is equal to MQ; | ||||
|  | ||||
|     4. Allreduces P; | ||||
|  | ||||
|     5. Orthogonalizes P; | ||||
|  | ||||
|     6. Computes Q, which is approximately equal to M^TP; | ||||
|  | ||||
|     7. Allreduces Q; | ||||
|  | ||||
|     8. Computes M, which is approximately equal to PQ^T. | ||||
|  | ||||
|     9. Truncates the input tensor to the original length. | ||||
|  | ||||
|     Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations. | ||||
|     This not only gives the user more control over the tradeoff between speedup and accuracy, | ||||
|     but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers. | ||||
|  | ||||
|     Args: | ||||
|         state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc. | ||||
|             To tune the compression configs, see Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`]. | ||||
|             To tune the compression configs, mainly need to tune ``matrix_approximation_rank`` and ``start_powerSGD_iter``. | ||||
|         bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors. | ||||
|             Note that since DDP comm hook only supports single process single device mode at this time, | ||||
|             only exactly one tensor is stored in this bucket. | ||||
| @ -454,9 +462,9 @@ def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future: | ||||
|         Future handler of the communication, which updates the gradients in place. | ||||
|  | ||||
|     Example:: | ||||
|         state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1) | ||||
|         >>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1) | ||||
|         >>> ddp_model.register_comm_hook(state, batched_powerSGD_hook) | ||||
|     """ | ||||
|     """  # noqa | ||||
|     process_group = state.process_group | ||||
|     group_to_use = process_group if process_group is not None else dist.group.WORLD | ||||
|     world_size = group_to_use.size() | ||||
| @ -563,6 +571,10 @@ def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future: | ||||
|             out=state.q_memory_dict[bucket_index], | ||||
|         ) | ||||
|  | ||||
|         # TODO: The above procedure does two matmul+allreduce steps per iteration -- | ||||
|         # one left multiplication and one right multiplication. | ||||
|         # For warm-start, can take one such step at a time, and alternate between them. | ||||
|  | ||||
|         return [ | ||||
|             dist.all_reduce( | ||||
|                 state.q_memory_dict[bucket_index], group=group_to_use, async_op=True | ||||
|  | ||||
| @ -4,7 +4,7 @@ from typing import Dict, List, Set, NamedTuple, Tuple | ||||
| import torch | ||||
| from torch.fx.passes.split_module import split_module | ||||
| import operator | ||||
| from torch.fx.experimental.partitioner_utils import Partition, \ | ||||
| from torch.fx._experimental.partitioner_utils import Partition, \ | ||||
|     Device, PartitionerConfig, get_partition_to_latency_mapping,\ | ||||
|     get_latency_of_partitioned_graph, NodeLatency, get_extra_size_of, \ | ||||
|     PartitionMode | ||||
| @ -2,7 +2,7 @@ from typing import Dict, List, NamedTuple, Any | ||||
|  | ||||
| import torch | ||||
| from torch.fx.passes.shape_prop import ShapeProp | ||||
| from torch.fx.experimental.param_fetch import lift_lowering_attrs_to_nodes | ||||
| from torch.fx._experimental.param_fetch import lift_lowering_attrs_to_nodes | ||||
| from torch.fx.graph import Graph, get_qualified_name | ||||
| from torch.fx.graph_module import GraphModule | ||||
| from torch.fx.node import Node, Target, map_arg | ||||
| @ -116,7 +116,7 @@ class Interpreter: | ||||
|  | ||||
|     # Main Node running APIs | ||||
|  | ||||
|     def placeholder(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any: | ||||
|     def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: | ||||
|         """ | ||||
|         Execute a ``placeholder`` node. Note that this is stateful: | ||||
|         ``Interpreter`` maintains an internal iterator over | ||||
| @ -141,7 +141,7 @@ class Interpreter: | ||||
|         else: | ||||
|             return next(self.args_iter) | ||||
|  | ||||
|     def get_attr(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any: | ||||
|     def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: | ||||
|         """ | ||||
|         Execute a ``get_attr`` node. Will retrieve an attribute | ||||
|         value from the ``Module`` hierarchy of ``self.module``. | ||||
| @ -159,7 +159,7 @@ class Interpreter: | ||||
|         assert isinstance(target, str) | ||||
|         return self.fetch_attr(target) | ||||
|  | ||||
|     def call_function(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any: | ||||
|     def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: | ||||
|         """ | ||||
|         Execute a ``call_function`` node and return the result. | ||||
|  | ||||
| @ -178,7 +178,7 @@ class Interpreter: | ||||
|         # Execute the function and return the result | ||||
|         return target(*args, **kwargs) | ||||
|  | ||||
|     def call_method(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any: | ||||
|     def call_method(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: | ||||
|         """ | ||||
|         Execute a ``call_method`` node and return the result. | ||||
|  | ||||
| @ -199,7 +199,7 @@ class Interpreter: | ||||
|         assert isinstance(target, str) | ||||
|         return getattr(self_obj, target)(*args_tail, **kwargs) | ||||
|  | ||||
|     def call_module(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any: | ||||
|     def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: | ||||
|         """ | ||||
|         Execute a ``call_module`` node and return the result. | ||||
|  | ||||
| @ -221,7 +221,7 @@ class Interpreter: | ||||
|  | ||||
|         return submod(*args, **kwargs) | ||||
|  | ||||
|     def output(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any: | ||||
|     def output(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: | ||||
|         """ | ||||
|         Execute an ``output`` node. This really just retrieves | ||||
|         the value referenced by the ``output`` node and returns it. | ||||
| @ -307,12 +307,12 @@ class Transformer(Interpreter): | ||||
|         method equivalents). We could subclass ``Transformer`` like so:: | ||||
|  | ||||
|             class NegSigmSwapXformer(Transformer): | ||||
|                 def call_function(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any: | ||||
|                 def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: | ||||
|                     if target == torch.sigmoid: | ||||
|                         return torch.neg(*args, **kwargs) | ||||
|                     return super().call_function(n) | ||||
|  | ||||
|                 def call_method(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any: | ||||
|                 def call_method(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: | ||||
|                     if target == 'neg': | ||||
|                         call_self, *args_tail = args | ||||
|                         return call_self.sigmoid(*args_tail, **kwargs) | ||||
| @ -344,7 +344,7 @@ class Transformer(Interpreter): | ||||
|         self.tracer = TransformerTracer(self.new_graph) | ||||
|         self.tracer.root = module | ||||
|  | ||||
|     def placeholder(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Proxy: | ||||
|     def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Proxy: | ||||
|         """ | ||||
|         Execute a ``placeholder`` node. In ``Transformer``, this is | ||||
|         overridden to insert a new ``placeholder`` into the output | ||||
| @ -360,7 +360,7 @@ class Transformer(Interpreter): | ||||
|         assert isinstance(target, str) | ||||
|         return Proxy(self.new_graph.placeholder(target), self.tracer) | ||||
|  | ||||
|     def get_attr(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Proxy: | ||||
|     def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Proxy: | ||||
|         """ | ||||
|         Execute a ``get_attr`` node. In ``Transformer``, this is | ||||
|         overridden to insert a new ``get_attr`` node into the output | ||||
| @ -376,6 +376,12 @@ class Transformer(Interpreter): | ||||
|         assert isinstance(target, str) | ||||
|         return Proxy(self.new_graph.get_attr(target), self.tracer) | ||||
|  | ||||
|     def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: | ||||
|         # Override so that the leaf module policy from `self.tracer` is respected. | ||||
|         assert isinstance(target, str) | ||||
|         submod = self.fetch_attr(target) | ||||
|         return self.tracer.call_module(submod, submod.forward, args, kwargs) | ||||
|  | ||||
|     def transform(self) -> GraphModule: | ||||
|         """ | ||||
|         Transform ``self.module`` and return the transformed | ||||
|  | ||||
| @ -5,7 +5,7 @@ import operator | ||||
|  | ||||
| from .graph import magic_methods, reflectable_magic_methods, Graph | ||||
| from typing import Tuple, Dict, Optional, Iterable, Any, Iterator | ||||
| from .node import Target, Node, Argument, base_types | ||||
| from .node import Target, Node, Argument, base_types, map_aggregate | ||||
|  | ||||
| class TracerBase: | ||||
|     graph: Graph | ||||
| @ -61,8 +61,17 @@ class TracerBase: | ||||
|         elif isinstance(a, dict): | ||||
|             r = {} | ||||
|             for k, v in a.items(): | ||||
|                 if not isinstance(k, str): | ||||
|                     raise NotImplementedError(f"dictionaries with non-string keys: {a}") | ||||
|                 # Check for invalid dict keys. We do not want a Proxy to appear | ||||
|                 # anywhere within the key. Since keys can be collection types, | ||||
|                 # we iterate through the key with map_aggregate | ||||
|                 k = self.create_arg(k) | ||||
|  | ||||
|                 def no_node(arg): | ||||
|                     if isinstance(arg, Node): | ||||
|                         raise RuntimeError("Keys for dictionaries used as an argument cannot contain a " | ||||
|                                            f"Node. Got key: {k}") | ||||
|                 map_aggregate(k, no_node) | ||||
|  | ||||
|                 r[k] = self.create_arg(v) | ||||
|             return r | ||||
|         elif isinstance(a, slice): | ||||
|  | ||||
| @ -10,7 +10,7 @@ import torch | ||||
| from torch.jit._script import RecursiveScriptModule, ScriptModule | ||||
|  | ||||
|  | ||||
| def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize: bool = True): | ||||
| def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize_numerics: bool = True): | ||||
|     r""" | ||||
|     Freezing a :class:`ScriptModule` will clone it and attempt to inline the cloned | ||||
|     module's submodules, parameters, and attributes as constants in the TorchScript IR Graph. | ||||
| @ -26,10 +26,8 @@ def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize: bool = Tr | ||||
|         preserved_attrs (Optional[List[str]]): a list of attributes to preserve in addition to the forward method. | ||||
|         Attributes modified in preserved methods will also be preserved. | ||||
|  | ||||
|         optimize (bool): If ``True``, a set of optimization passes will be run to prepare the graph for inference, | ||||
|         in addition to the graph cleanup that already occurs. The details of the optimizations can be found in | ||||
|         `torch.jit.optimize_frozen_module.` | ||||
|  | ||||
|         optimize_numerics (bool): If ``True``, a set of optimization passes will be run that do not strictly | ||||
|         preserve numerics. Full details of the optimizations can be found in `torch.jit.optimize_frozen_module`. | ||||
|  | ||||
|     Returns: | ||||
|         Frozen :class:`ScriptModule`. | ||||
| @ -102,16 +100,16 @@ def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize: bool = Tr | ||||
|  | ||||
|     out = RecursiveScriptModule(torch._C._freeze_module(mod._c, preserved_attrs)) | ||||
|     RecursiveScriptModule._finalize_scriptmodule(out) | ||||
|     if optimize: | ||||
|         optimize_frozen_module(out) | ||||
|     optimize_frozen_module(out, optimize_numerics) | ||||
|  | ||||
|     return out | ||||
|  | ||||
|  | ||||
| def optimize_frozen_module(mod): | ||||
| def optimize_frozen_module(mod, optimize_numerics: bool = True): | ||||
|     r""" | ||||
|     Runs a series of optimizations looking for patterns that occur in frozen graphs. | ||||
|     The current set of optimizations is: | ||||
|         - Dropout Removal | ||||
|         - Conv -> Batchnorm folding | ||||
|         - Conv -> Add/Sub folding | ||||
|         - Conv -> Mul/Div folding | ||||
| @ -119,6 +117,12 @@ def optimize_frozen_module(mod): | ||||
|     Args: | ||||
|         mod (:class:`ScriptModule`): a frozen module to be optimized | ||||
|  | ||||
|         optimize_numerics (bool): If ``True``, a set of optimization passes will be run that do not strictly | ||||
|         preserve numerics. Each of these optimizations preserves the default rtol and atol of `torch.testing.assert_allclose` | ||||
|         when applied to a single transformation; however, in a module where many transformations are applied, | ||||
|         the result may no longer fall within the default `assert_allclose` tolerance. Conv -> Batchnorm folding, | ||||
|         Conv -> Add/Sub folding, and Conv -> Mul/Div folding may all alter numerics. | ||||
|  | ||||
|     Returns: | ||||
|         None | ||||
|  | ||||
| @ -140,4 +144,12 @@ def optimize_frozen_module(mod): | ||||
|         assert "batch_norm" not in str(frozen_mod.graph) | ||||
|  | ||||
|     """ | ||||
|     torch._C._jit_pass_optimize_frozen_graph(mod.graph) | ||||
|     # xxx: keep in sync with frozen_graph_optimization.cpp | ||||
|     # intentionally duplicated to make it easier to create custom optimization sequences | ||||
|     torch._C._jit_pass_remove_dropout(mod._c) | ||||
|     if optimize_numerics: | ||||
|         # run a couple times to capture Conv -> Mul -> Add etc | ||||
|         for _ in range(2): | ||||
|             torch._C._jit_pass_fold_frozen_conv_bn(mod.graph) | ||||
|             torch._C._jit_pass_fold_frozen_conv_add_or_sub(mod.graph) | ||||
|             torch._C._jit_pass_fold_frozen_conv_mul_or_div(mod.graph) | ||||
|  | ||||
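A minimal usage sketch of the renamed ``optimize_numerics`` flag; the module architecture here is illustrative:

```python
import torch

class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, kernel_size=3)
        self.bn = torch.nn.BatchNorm2d(8)
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x):
        return self.dropout(self.bn(self.conv(x)))

scripted = torch.jit.script(Net().eval())  # freezing expects an eval-mode module
# With optimize_numerics=True (the default), dropout is removed and
# Conv -> BatchNorm is folded, which may slightly alter numerics.
frozen = torch.jit.freeze(scripted, optimize_numerics=True)
assert "batch_norm" not in str(frozen.graph)
```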
| @ -24,6 +24,7 @@ from .replicate import replicate | ||||
| from .scatter_gather import scatter_kwargs, gather, is_namedtuple | ||||
| from .parallel_apply import parallel_apply | ||||
| from torch._utils import _get_device_index, _get_all_device_indices | ||||
| from ._functions import _get_stream | ||||
|  | ||||
|  | ||||
| def _find_tensors(obj): | ||||
| @ -438,6 +439,8 @@ class DistributedDataParallel(Module): | ||||
|  | ||||
|         # reduction bucket size | ||||
|         self.bucket_bytes_cap = int(bucket_cap_mb * 1024 * 1024) | ||||
|         # Whether to perform input tensor CPU to GPU copies on a side-stream | ||||
|         self.use_side_stream_for_tensor_copies = os.environ.get("PYTORCH_DDP_USE_SIDE_STREAM", "1") == "1" | ||||
|  | ||||
|         # Sync params and buffers | ||||
|         self._sync_params_and_buffers(authoritative_rank=0) | ||||
| @ -732,7 +735,23 @@ class DistributedDataParallel(Module): | ||||
|         """ | ||||
|         def to_map(obj): | ||||
|             if isinstance(obj, torch.Tensor): | ||||
|                 return (obj.to(target_gpu), ) | ||||
|                 if not self.use_side_stream_for_tensor_copies: | ||||
|                     return (obj.to(target_gpu), ) | ||||
|                 else: | ||||
|                     # Perform CPU -> GPU copies in a background stream. This code is | ||||
|                     # motivated by similar logic in torch/nn/parallel/_functions.py | ||||
|                     stream = _get_stream(target_gpu) | ||||
|                     with torch.cuda.stream(stream): | ||||
|                         output = obj.to(target_gpu) | ||||
|                     # synchronize with the copy stream | ||||
|                     with torch.cuda.device(target_gpu): | ||||
|                         current_stream = torch.cuda.current_stream() | ||||
|                         # Sync the current stream with the copy stream | ||||
|                         current_stream.wait_stream(stream) | ||||
|                         # Ensure tensor memory is not reused until work on | ||||
|                         # main stream is complete | ||||
|                         output.record_stream(current_stream) | ||||
|                     return (output, ) | ||||
|             if is_namedtuple(obj): | ||||
|                 return [type(obj)(*args) for args in zip(*map(to_map, obj))] | ||||
|             if isinstance(obj, tuple) and len(obj) > 0: | ||||
| @ -1021,13 +1040,14 @@ class DistributedDataParallel(Module): | ||||
|         parameter syncs while running Distributed DataParallel training. | ||||
|  | ||||
|         Args: | ||||
|             state (object): state is passed to the hook and can be used to maintain | ||||
|                             and update any state information that users would like to | ||||
|                             maintain as part of the training process. Examples: error | ||||
|                             feedback in gradient compression, peers to communicate with | ||||
|                             next in GossipGrad etc. | ||||
|             hook (callable): is defined as: | ||||
|                              hook(state: object, bucket: dist._GradBucket) -> torch.futures.Future: | ||||
|             state (object): Passed to the hook to maintain any state information during the training process. | ||||
|                             Examples include error feedback in gradient compression, | ||||
|                             peers to communicate with next in GossipGrad, etc. | ||||
|  | ||||
|                             It is locally stored by each worker | ||||
|                             and shared by all the gradient tensors on the worker. | ||||
|             hook (callable): Averages gradient tensors across workers and is defined as: | ||||
|                              ``hook(state: object, bucket: dist._GradBucket) -> torch.futures.Future``: | ||||
|  | ||||
|                              This function is called once the bucket is ready. The | ||||
|                              hook can perform whatever processing is needed and return | ||||
| @ -1067,7 +1087,7 @@ class DistributedDataParallel(Module): | ||||
|             DDP communication hook is experimental and subject to change. | ||||
|  | ||||
|         Example:: | ||||
|             Below is an example of a noop hook that returns back the same tensors: | ||||
|             Below is an example of a noop hook that returns the same tensors. | ||||
|  | ||||
|             >>> def noop(state: object, bucket: dist._GradBucket) -> torch.futures.Future: | ||||
|             >>>     fut = torch.futures.Future() | ||||
| @ -1091,7 +1111,6 @@ class DistributedDataParallel(Module): | ||||
|             >>>     return fut.then(decode) | ||||
|  | ||||
|             >>> ddp.register_comm_hook(state = None, hook = encode_and_decode) | ||||
|  | ||||
|         """ | ||||
|         self._check_comm_hook(hook) | ||||
|         dist._register_comm_hook(self.reducer, state, hook) | ||||
|  | ||||
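The side-stream copy added to ``to_map`` above can be read in isolation as the following sketch; the helper import matches the one added at the top of the file, while the device index and input tensor are hypothetical and a CUDA build is required. The new ``PYTORCH_DDP_USE_SIDE_STREAM`` environment variable toggles whether DDP uses this path.

```python
import torch
from torch.nn.parallel._functions import _get_stream  # same helper imported above

def copy_to_gpu_on_side_stream(tensor, target_gpu=0):
    stream = _get_stream(target_gpu)
    with torch.cuda.stream(stream):
        output = tensor.to(target_gpu)   # copy runs on the side stream
    with torch.cuda.device(target_gpu):
        current = torch.cuda.current_stream()
        current.wait_stream(stream)      # main stream waits for the copy
        output.record_stream(current)    # don't reuse the memory until the main stream is done
    return output

if torch.cuda.is_available():
    x = torch.randn(4, 4).pin_memory()
    print(copy_to_gpu_on_side_stream(x).device)  # cuda:0
```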
| @ -296,6 +296,22 @@ def _is_fp(value): | ||||
|             return (type == 'Float') or (type == 'Double') or (type == 'Half') | ||||
|     return False | ||||
|  | ||||
| def _generate_wrapped_number(g, scalar): | ||||
|     """ | ||||
|     Create a wrapped number based on https://github.com/pytorch/pytorch/issues/9515 | ||||
|     A Tensor is considered a "wrapped number" if it is | ||||
|     auto-wrapped from a C++ or Python number type. Integer types are | ||||
|     wrapped as 0-dim int64 tensors and floating-point types are | ||||
|     wrapped as 0-dim double tensors. | ||||
|  | ||||
|     The input to this function is a constant value. If the data type | ||||
|     is a floating-point type, it is converted to a 0-dim double | ||||
|     tensor; otherwise, it is converted to a 0-dim tensor of its original type. | ||||
|     """ | ||||
|     assert not isinstance(scalar, torch.Tensor) | ||||
|     if isinstance(scalar, float): | ||||
|         return g.op("Constant", value_t=torch.tensor(scalar, dtype=torch.double)) | ||||
|     return g.op("Constant", value_t=torch.tensor(scalar)) | ||||
|  | ||||
| def _sort_helper(g, input, dim, decending=True, out=None): | ||||
|     if out is not None: | ||||
|  | ||||
| @ -121,6 +121,21 @@ def where(g, condition, self=None, other=None, _outputs=None): | ||||
|         return sym_help._unbind_helper(g, condition, g.op("Constant", value_t=torch.tensor(1)), _outputs) | ||||
|     return g.op("Where", condition, self, other) | ||||
|  | ||||
| @parse_args('v', 'v', 'v', 'i', 'i', 'i') | ||||
| def fake_quantize_per_channel_affine(g, inputs, scale, zero_point, axis, quant_min=-128, quant_max=127): | ||||
|     if quant_min not in [0, -128] or quant_max not in [127, 255]: | ||||
|         raise RuntimeError( | ||||
|             "ONNX defines [0, 255] for quint8 and [-128, 127] for qint8, got [{}, {}]".format(quant_min, quant_max)) | ||||
|  | ||||
|     # ONNX defines zero_point to be int8 or uint8 | ||||
|     if quant_min == 0: | ||||
|         zero_point = g.op("Cast", zero_point, to_i=sym_help.cast_pytorch_to_onnx['Byte']) | ||||
|     else: | ||||
|         zero_point = g.op("Cast", zero_point, to_i=sym_help.cast_pytorch_to_onnx['Char']) | ||||
|     return g.op( | ||||
|         "DequantizeLinear", | ||||
|         g.op("QuantizeLinear", inputs, scale, zero_point, axis_i=axis), | ||||
|         scale, zero_point, axis_i=axis) | ||||
|  | ||||
| def _reduce_op_symbolic(onnx_op_name): | ||||
|     def symbolic(g, self, dim=None, keepdim=None): | ||||
|  | ||||
| @ -1319,8 +1319,8 @@ def layer_norm(g, input, normalized_shape, weight, bias, eps, cudnn_enable): | ||||
|  | ||||
|     axes = [-i for i in range(len(normalized_shape), 0, -1)] | ||||
|  | ||||
|     two_cst = g.op("Constant", value_t=torch.tensor(2.)) | ||||
|     eps_cst = g.op("Constant", value_t=torch.tensor(eps)) | ||||
|     two_cst = sym_help._generate_wrapped_number(g, 2.) | ||||
|     eps_cst = sym_help._generate_wrapped_number(g, eps) | ||||
|  | ||||
|     mean = g.op("ReduceMean", input, axes_i=axes) | ||||
|     numerator = sub(g, input, mean) | ||||
|  | ||||
| @ -391,9 +391,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: | ||||
|         torch.exp2: lambda input, out=None: -1, | ||||
|         torch.expm1: lambda input, out=None: -1, | ||||
|         torch.fake_quantize_per_channel_affine: lambda input, scale, zero_point, axis, quant_min, quant_max: -1, | ||||
|         torch.fake_quantize_per_channel_affine_cachemask: lambda input, scale, zero_point, axis, quant_min, quant_max: -1, | ||||
|         torch.fake_quantize_per_tensor_affine: lambda input, scale, zero_point, quant_min, quant_max: -1, | ||||
|         torch.fake_quantize_per_tensor_affine_cachemask: lambda input, scale, zero_point, quant_min, quant_max: -1, | ||||
|         torch.fbgemm_linear_fp16_weight: lambda input, packed_weight, bias: -1, | ||||
|         torch.fbgemm_linear_fp16_weight_fp32_activation: lambda input, packed_weight, bias: -1, | ||||
|         torch.fbgemm_linear_int8_weight: lambda input, weight, packed, col_offsets, weight_scale, weight_zero_point, bias: -1, | ||||
|  | ||||
| @ -22,16 +22,21 @@ from typing import List, Optional, Union | ||||
| from setuptools.command.build_ext import build_ext | ||||
| from pkg_resources import packaging  # type: ignore | ||||
|  | ||||
| BUILD_SPLIT_CUDA = os.getenv('BUILD_SPLIT_CUDA') | ||||
| IS_WINDOWS = sys.platform == 'win32' | ||||
| LIB_EXT = '.pyd' if IS_WINDOWS else '.so' | ||||
| EXEC_EXT = '.exe' if IS_WINDOWS else '' | ||||
| CLIB_PREFIX = '' if IS_WINDOWS else 'lib' | ||||
| CLIB_EXT = '.dll' if IS_WINDOWS else '.so' | ||||
| SHARED_FLAG = '/DLL' if IS_WINDOWS else '-shared' | ||||
|  | ||||
| _HERE = os.path.abspath(__file__) | ||||
| _TORCH_PATH = os.path.dirname(os.path.dirname(_HERE)) | ||||
| TORCH_LIB_PATH = os.path.join(_TORCH_PATH, 'lib') | ||||
|  | ||||
|  | ||||
| BUILD_SPLIT_CUDA = os.getenv('BUILD_SPLIT_CUDA') or (os.path.exists(os.path.join( | ||||
|     TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cu{CLIB_EXT}')) and os.path.exists(os.path.join(TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cpp{CLIB_EXT}'))) | ||||
|  | ||||
| # Taken directly from python stdlib < 3.9 | ||||
| # See https://github.com/pytorch/pytorch/issues/48617 | ||||
| def _nt_quote_args(args: Optional[List[str]]) -> List[str]: | ||||
|  | ||||
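A quick way to check, outside of an extension build, whether an install ships the split CUDA libraries that the detection above looks for. The module-level names used here are the ones defined in this file; reading them this way is only an assumption for illustration.

```python
import os
import torch.utils.cpp_extension as cpp_ext

split_libs_present = all(
    os.path.exists(os.path.join(
        cpp_ext.TORCH_LIB_PATH,
        f"{cpp_ext.CLIB_PREFIX}torch_cuda_{part}{cpp_ext.CLIB_EXT}"))
    for part in ("cu", "cpp")
)
print("split CUDA build:", bool(os.getenv("BUILD_SPLIT_CUDA")) or split_libs_present)
```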
Mode-only changes (Executable file → Normal file, 0 lines changed):
    torch/utils/data/datapipes/__init__.py
    torch/utils/data/datapipes/iter/__init__.py
    torch/utils/data/datapipes/iter/listdirfiles.py
    torch/utils/data/datapipes/iter/loadfilesfromdisk.py
    torch/utils/data/datapipes/iter/readfilesfromtar.py
    torch/utils/data/datapipes/iter/readfilesfromzip.py
    torch/utils/data/datapipes/utils/__init__.py
    torch/utils/data/datapipes/utils/common.py

torch/utils/hipify/hipify_python.py (17 changes, Executable file → Normal file):
							| @ -174,7 +174,7 @@ def preprocess_file_and_save_result( | ||||
|     result = preprocessor(output_directory, filepath, all_files, includes, stats, | ||||
|                           hip_clang_launch, is_pytorch_extension, clean_ctx, show_progress) | ||||
|  | ||||
|     fin_path = os.path.join(output_directory, filepath) | ||||
|     fin_path = os.path.abspath(os.path.join(output_directory, filepath)) | ||||
|     # Show what happened | ||||
|     if show_progress: | ||||
|         print( | ||||
| @ -711,7 +711,7 @@ def preprocessor( | ||||
|         clean_ctx: GeneratedFileCleaner, | ||||
|         show_progress: bool) -> HipifyResult: | ||||
|     """ Executes the CUDA -> HIP conversion on the specified file. """ | ||||
|     fin_path = os.path.join(output_directory, filepath) | ||||
|     fin_path = os.path.abspath(os.path.join(output_directory, filepath)) | ||||
|  | ||||
|     with open(fin_path, 'r', encoding='utf-8') as fin: | ||||
|         if fin.readline() == HIPIFY_C_BREADCRUMB: | ||||
| @ -721,7 +721,7 @@ def preprocessor( | ||||
|  | ||||
|     orig_output_source = output_source | ||||
|  | ||||
|     fout_path = os.path.join(output_directory, get_hip_file_path(filepath, is_pytorch_extension)) | ||||
|     fout_path = os.path.abspath(os.path.join(output_directory, get_hip_file_path(filepath, is_pytorch_extension))) | ||||
|     if not os.path.exists(os.path.dirname(fout_path)): | ||||
|         clean_ctx.makedirs(os.path.dirname(fout_path)) | ||||
|  | ||||
| @ -829,9 +829,14 @@ def preprocessor( | ||||
|         with open(fout_path, 'r', encoding='utf-8') as fout_old: | ||||
|             do_write = fout_old.read() != output_source | ||||
|     if do_write: | ||||
|         with clean_ctx.open(fout_path, 'w', encoding='utf-8') as fout: | ||||
|             fout.write(output_source) | ||||
|         return {"hipified_path": fout_path, "status": "ok"} | ||||
|         try: | ||||
|             with clean_ctx.open(fout_path, 'w', encoding='utf-8') as fout: | ||||
|                 fout.write(output_source) | ||||
|             return {"hipified_path": fout_path, "status": "ok"} | ||||
|         except PermissionError as e: | ||||
|             print(f"{bcolors.WARNING}Failed to save {fout_path} with \"{e.strerror}\", leaving {fin_path} unchanged.{bcolors.ENDC}", | ||||
|                   file=sys.stderr) | ||||
|             return {"hipified_path": fin_path, "status": "skipped"} | ||||
|     else: | ||||
|         return {"hipified_path": fout_path, "status": "skipped"} | ||||
|  | ||||
|  | ||||
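The permission-error fallback added above can be read in isolation as the sketch below; the paths and content are hypothetical, and the ``bcolors`` escape codes are omitted.

```python
import sys

def write_or_skip(fout_path, fin_path, output_source, open_fn=open):
    """Write the hipified source, or warn and fall back to the input path."""
    try:
        with open_fn(fout_path, 'w', encoding='utf-8') as fout:
            fout.write(output_source)
        return {"hipified_path": fout_path, "status": "ok"}
    except PermissionError as e:
        print(f'Failed to save {fout_path} with "{e.strerror}", leaving {fin_path} unchanged.',
              file=sys.stderr)
        return {"hipified_path": fin_path, "status": "skipped"}
```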