Compare commits

...

29 Commits

Author SHA1 Message Date
d6943ea58d apply diff 52351 (#52649) 2021-02-23 07:51:38 -08:00
02b61b49ea [1.8] Update XNNPACK (#52647)
Cherry-pick 55d53a4e70 into release/1.8 branch
2021-02-23 05:31:57 -08:00
d553478c98 [v1.8] Make TensorPipe work around bug in old versions of libibverbs (#52615)
The bug affects PyTorch users who meet two conditions:
- they have an old version of libibverbs installed (the userspace library), namely older than v25, which dates from Jul 29, 2019;
- but they do _not_ have an InfiniBand kernel module loaded.

In those cases they will experience a crash (an uncaught exception) when initializing RPC, which mentions an "unknown error -38".

The workaround is for those users to activate a (private and undocumented) killswitch that disables the `ibv` backend of TensorPipe.
2021-02-22 16:55:12 -08:00
63333e2a25 [1.8] Update api doc for enabling TcpStore on Windows (#52601)
Summary:
Fixes #{issue number}

Pull Request resolved: https://github.com/pytorch/pytorch/pull/51847

Reviewed By: albanD

Differential Revision: D26405678

Pulled By: malfet

fbshipit-source-id: 073b675225b48d1732771583f8f2473e0fdcf35c

Co-authored-by: Joe Zhu <jozh@microsoft.com>
2021-02-22 10:14:09 -08:00
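For context, a minimal sketch of what the updated docs describe (illustrative address and port; any backend that supports env:// initialization works): TcpStore-based initialization is driven by the MASTER_ADDR/MASTER_PORT environment variables.

```python
import os
import torch.distributed as dist

# Enable the TCP store the same way as on Linux (illustrative values).
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "29500"

# env:// initialization reads MASTER_ADDR/MASTER_PORT and uses a TcpStore.
dist.init_process_group("gloo", init_method="env://", rank=0, world_size=1)
dist.destroy_process_group()
```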
8e7eebfc9a [1.8] Fix onnx mixed precision export for layernorm & fuseLogSoftmaxNllLoss (#52510)
Co-authored-by: Shubham Bhokare <32080845+shubhambhokare1@users.noreply.github.com>
2021-02-19 14:40:53 -08:00
f8afb8bdd0 [v1.8.0] Various CUDA 11.1 with BUILD_SPLIT_CUDA_FIXES (#52518)
Co-authored-by: Nikita Shulga <nshulga@fb.com>
Co-authored-by: peterjc123 <peterghost86@gmail.com>
Co-authored-by: Jane Xu <janeyx@fb.com>
2021-02-19 12:41:21 -08:00
0851cc42b0 Update freezing API - changes from 52337 (#52392)
Co-authored-by: eellison <eellison@fb.com>
2021-02-18 15:36:51 -08:00
804f7b6018 Add arm64 binary build (#52443) (#52469)
Summary:
This is getting tested by https://github.com/pytorch/pytorch/issues/52441.

Adds new config for macos arm64 to our binary builds.
Now stores artifacts for mac builds.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/52443

Reviewed By: walterddr

Differential Revision: D26517330

Pulled By: janeyx99

fbshipit-source-id: 02774937a827bdd4c08486dc9f8fe63446917f1e
2021-02-18 15:17:27 -08:00
32758d30b3 onnx export of per channel fake quantize functions (#42835) (#52430)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/39502

This PR adds support for exporting **fake_quantize_per_channel_affine** to a pair of QuantizeLinear and DequantizeLinear. Per tensor support was added by PR https://github.com/pytorch/pytorch/pull/39738.

The `axis` attribute of QuantizeLinear and DequantizeLinear, which is required for per-channel support, was added in opset 13 by https://github.com/onnx/onnx/pull/2772.

[update 1/20/2021]: opset 13 is now supported on master, so the added function is properly tested. The code has also been rebased onto the new master.

The function is also tested offline with the following code
```python
import torch
from torch import quantization

from torchvision import models
qat_resnet18 = models.resnet18(pretrained=True).eval().cuda()

qat_resnet18.qconfig = quantization.QConfig(
    activation=quantization.default_fake_quant, weight=quantization.default_per_channel_weight_fake_quant)
quantization.prepare_qat(qat_resnet18, inplace=True)
qat_resnet18.apply(quantization.enable_observer)
qat_resnet18.apply(quantization.enable_fake_quant)

dummy_input = torch.randn(16, 3, 224, 224).cuda()
_ = qat_resnet18(dummy_input)
for module in qat_resnet18.modules():
    if isinstance(module, quantization.FakeQuantize):
        module.calculate_qparams()
qat_resnet18.apply(quantization.disable_observer)

qat_resnet18.cuda()

input_names = [ "actual_input_1" ]
output_names = [ "output1" ]

torch.onnx.export(qat_resnet18, dummy_input, "quant_model.onnx", verbose=True, opset_version=13)
```
It can generate the desired graph.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/42835

Reviewed By: houseroad

Differential Revision: D26293823

Pulled By: SplitInfinity

fbshipit-source-id: 300498a2e24b7731b12fa2fbdea4e73dde80e7ea

Co-authored-by: Hao Wu <skyw@users.noreply.github.com>
2021-02-18 12:50:40 -08:00
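For reference, a minimal sketch (not taken from the PR) of exporting the per-channel op at opset 13 directly, mirroring the unit test added further down in this diff; the module name, file name, and values are illustrative.

```python
import torch

class PerChannelFakeQuant(torch.nn.Module):
    def forward(self, x):
        # One scale/zero_point per channel along axis 1.
        scale = torch.ones(4) / 127.0
        zero_point = torch.zeros(4, dtype=torch.long)
        return torch.fake_quantize_per_channel_affine(x, scale, zero_point, 1, 0, 255)

x = torch.randn(6, 4, 3, 3)
torch.onnx.export(PerChannelFakeQuant(), x, "per_channel_fq.onnx", opset_version=13)
```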
bcb64a8084 Fix upsample bicubic2d batching handling on CPU. (#52389) (#52445)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/52389

Fixes: https://github.com/pytorch/pytorch/issues/49159

Test Plan: Imported from OSS

Reviewed By: albanD

Differential Revision: D26496319

Pulled By: gchanan

fbshipit-source-id: d385cd683ef09e0596a9875ce84d03e6e77acc93
2021-02-18 12:46:39 -08:00
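A minimal sketch (illustrative, not taken from the PR) of the behavior this fix restores: batched bicubic upsampling on CPU should match upsampling each sample individually.

```python
import torch
import torch.nn.functional as F

x = torch.randn(4, 3, 8, 8)  # a batch of 4 images
batched = F.interpolate(x, scale_factor=2, mode="bicubic", align_corners=False)
per_sample = torch.cat([
    F.interpolate(x[i:i + 1], scale_factor=2, mode="bicubic", align_corners=False)
    for i in range(x.size(0))
])
# With the fix, the batched CPU path agrees with the per-sample results.
assert torch.allclose(batched, per_sample)
```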
f07991d396 update symeig backward note about similar eigenvalues (#52311) (#52446)
Summary:
First part of https://github.com/pytorch/pytorch/issues/49886 to at least properly warn users of the current state

Pull Request resolved: https://github.com/pytorch/pytorch/pull/52311

Reviewed By: soulitzer

Differential Revision: D26495644

Pulled By: albanD

fbshipit-source-id: 72abdfe41cdbcc1ac739a536eb85d1aa4ba90897
2021-02-18 12:45:47 -08:00
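A small illustrative sketch of what the updated note warns about: the symeig backward formula divides by eigenvalue differences, so gradients become ill-conditioned when eigenvalues are close or repeated.

```python
import torch

a = torch.randn(4, 4, dtype=torch.float64)
a = ((a + a.t()) / 2).requires_grad_(True)  # symmetric input

evals, evecs = torch.symeig(a, eigenvectors=True)
evecs.sum().backward()
# a.grad contains terms proportional to 1 / (eval_i - eval_j); for inputs with
# nearly equal eigenvalues these terms blow up and the gradient is unreliable.
```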
c458cd4852 [v1.8.0] .circleci: Downgrade CUDA 11.2 -> 11.1 for binaries (#52151) (#52406)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/52151

CUDA 11.2 might not be as performant as we thought, so let's downgrade to something we think is more performant.

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D26408314

Pulled By: seemethere

fbshipit-source-id: e2446aa0115e2c2a79718b1fdfd9fccf2072822d
(cherry picked from commit a11650b069729997b002032d70e9793477147851)
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
2021-02-18 10:59:03 -08:00
f7c4afc0f4 [cmake] Add explicit cublas->cudart dependency (#52243) (#52404)
Summary:
Necessary to ensure correct link order, especially if libraries are
linked statically. Otherwise, one might run into:
```
/usr/bin/ld: /usr/local/cuda/lib64/libcublasLt_static.a(libcublasLt_static.a.o): undefined reference to symbol 'cudaStreamWaitEvent@libcudart.so.11.0'
/usr/local/cuda/lib64/libcudart.so: error adding symbols: DSO missing from command line
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/52243

Reviewed By: seemethere, ngimel

Differential Revision: D26437159

Pulled By: malfet

fbshipit-source-id: 33b8bb5040bda10537833f3ad737f535488452ea
2021-02-17 16:07:41 -08:00
20554c00b6 [1.8] Remove torch.vmap (#52397)
torch.vmap is a prototype feature and should not be in the stable
binary. This PR:

- Removes the `torch.vmap` API
- Removes the documentation entry for torch.vmap
- Changes the vmap tests to use an internal API instead of torch.vmap.

Test Plan:
- Tested locally (test_torch, test_autograd, test_type_hints, test_vmap); also waiting for CI.
2021-02-17 16:05:34 -08:00
3464d64f08 [1.8] Fix libnvrtc discoverability in package patched by auditwheel (#52365) 2021-02-17 16:05:05 -08:00
c6972eb3ac Skip OneDNN Convolution in case of groups = 24 #50042 (#52313)
Co-authored-by: Vitaly Fedyunin <vitaly.fedyunin@gmail.com>
2021-02-17 16:04:26 -08:00
25562d3d41 Use side-stream in CPU to GPU copies in DDP (#50180) (#52270)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/50180

Resolves the regression in https://github.com/pytorch/pytorch/issues/49819 by adding a copy over a background stream, similar to scatter. For internal use cases, this is gated behind an env var that maintains the previous behavior when it is off.

Test Plan: CI

Reviewed By: mrshenli, ngimel

Differential Revision: D25818170

fbshipit-source-id: e50c76c035504b2a44e2be084701cee45c90df75
2021-02-17 09:49:30 -08:00
cd63c37bc6 ports fix (#52242)
Co-authored-by: Mike Ruberry <mruberry@devfair044.maas>
2021-02-13 17:59:51 -08:00
c79decdbba [v1.8 patch] [Resubmission] Add a documentation page for DDP communication hooks (#52215)
Co-authored-by: wayi <wayi@devgpu238.prn2.facebook.com>
2021-02-12 16:37:23 -08:00
c307a3f336 [1.8] Do not print warning if CUDA driver not found (#51806) (#52050)
Summary:
This frequently happens when PyTorch compiled with CUDA support is installed on a machine that does not have NVIDIA GPUs.

Fixes https://github.com/pytorch/pytorch/issues/47038

Pull Request resolved: https://github.com/pytorch/pytorch/pull/51806

Reviewed By: ezyang

Differential Revision: D26285827

Pulled By: malfet

fbshipit-source-id: 9fd5e690d0135a2b219c1afa803fb69de9729f5e
2021-02-12 12:20:46 -08:00
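For context, a minimal sketch of the user-visible behavior after this change (assuming a CUDA-enabled PyTorch build running on a machine without an NVIDIA driver):

```python
import torch

# With no NVIDIA driver installed, these now return quietly
# instead of emitting the "Found no NVIDIA driver" warning.
print(torch.cuda.is_available())   # False
print(torch.cuda.device_count())   # 0
```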
f071020756 Workaround arm64 gcc error in std::copysign (#51900) (#52049)
Summary:
Move the definition of the copysign template and its specializations for the bfloat16/half types before the first use of copysign in that file.

Add a comment explaining why this is necessary.

Fixes https://github.com/pytorch/pytorch/issues/51889

Pull Request resolved: https://github.com/pytorch/pytorch/pull/51900

Reviewed By: walterddr

Differential Revision: D26321741

Pulled By: malfet

fbshipit-source-id: 888858b11d9708fa140fe9c0570cc5a24599205b
2021-02-12 08:00:46 -08:00
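The underlying workaround (visible in the BinaryOpsKernel diff further down) specializes copysign for Half/BFloat16 using plain bit operations, since the sign is the most significant bit of both formats. A rough NumPy sketch of the same bit trick, for illustration only:

```python
import numpy as np

def copysign_fp16(a, b):
    # Sign is the most significant bit of the 16-bit representation, so
    # copysign reduces to integer masking: magnitude bits of a, sign bit of b.
    a_bits = np.asarray(a, dtype=np.float16).view(np.uint16)
    b_bits = np.asarray(b, dtype=np.float16).view(np.uint16)
    out_bits = (a_bits & np.uint16(0x7FFF)) | (b_bits & np.uint16(0x8000))
    return out_bits.view(np.float16)

print(copysign_fp16([3.0, -2.5], [-0.0, 1.0]))  # -> [-3.0, 2.5]
```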
4f436f8570 fake_quant cachemask: remove Python bindings (#51878) (#52160)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51878

`fake_quantize_per_tensor_affine_cachemask` and
`fake_quantize_per_channel_affine_cachemask` are implementation
details of `fake_quantize_per_tensor_affine` and
`fake_quantize_per_channel_affine`, removing the
Python bindings for them since there is no need to
expose them.

Test Plan:
```
python test/test_quantization.py TestFakeQuantize
```

Imported from OSS

Reviewed By: albanD, bugra

Differential Revision: D26314173

fbshipit-source-id: 733c93a3951453e739b6ed46b72fbad2244f6e97
(cherry picked from commit 33afb5f19f4e427f099653139ae45b661b8bc596)
2021-02-12 07:37:00 -08:00
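For reference, a minimal sketch of the public API that user code should keep calling; only the private `*_cachemask` Python bindings were removed (values are illustrative):

```python
import torch

x = torch.randn(4, 8)
scale, zero_point = 0.1, 0
quant_min, quant_max = 0, 255
y = torch.fake_quantize_per_tensor_affine(x, scale, zero_point, quant_min, quant_max)
```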
ae11589710 [FX][1.8] Cherrypick three FX fixes to 1.8 (#52021)
* Fix leaf modules in Transformer

[ghstack-poisoned]

* Fix tuple type annotations

[ghstack-poisoned]

* Generalize dict key check in `create_arg` (#51927)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51927

Test Plan: Imported from OSS

Reviewed By: pbelevich

Differential Revision: D26329655

Pulled By: jamesr66a

fbshipit-source-id: a15e7d9564551521af12a8fde1c7524856f0cbc2
2021-02-12 07:35:34 -08:00
9e5bcc1020 1.8 cherrypick: Add metacompile of Ternary if (#51789) (#51913)
Summary:
Fixes issue: https://github.com/pytorch/pytorch/issues/49728

The ternary if operation fails in TorchScript when the condition variable is annotated as Final.

Tests:
pytest -k test_ternary_static_if test/test_jit.py

Pull Request resolved: https://github.com/pytorch/pytorch/pull/51789

Reviewed By: gmagogsfm

Differential Revision: D26278969

Pulled By: nikithamalgifb

fbshipit-source-id: 27d1383290211503188428fb2e8b7749f59ba16e

Co-authored-by: nikithamalgi <nikithamalgi@devvm146.prn0.facebook.com>
2021-02-09 21:34:26 -08:00
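A minimal sketch of the pattern this cherry-pick fixes, mirroring the test added further down in this diff: when the condition is a Final bool, TorchScript only compiles the taken branch, so the mismatched branch types no longer cause a script error.

```python
import torch

class M(torch.nn.Module):
    flag: torch.jit.Final[bool]

    def __init__(self):
        super().__init__()
        self.flag = True

    def forward(self) -> torch.Tensor:
        # The else branch is pruned at compile time because `flag` is Final.
        return torch.ones(3) if self.flag else {}

scripted = torch.jit.script(M())
print(scripted())  # tensor([1., 1., 1.])
```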
fa8578241d .jenkins: Release branch specific updates (#51982) 2021-02-09 21:33:29 -08:00
1368809532 [v1.8.0] [wip] doc_fix (#52006)
Summary:
tries to fix doc_test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/51825

Reviewed By: bertmaher

Differential Revision: D26295583

Pulled By: ngimel

fbshipit-source-id: 13f6e7f1675d810adfd4abd2d579e2812fe54c80
(cherry picked from commit 6c0bf28da651eb8ff1d2d0dcfe807ea757fb61e5)
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

Co-authored-by: Natalia Gimelshein <ngimel@fb.com>
2021-02-09 21:32:32 -08:00
4073248fc2 [FX] Hide experimental folder (#51987) 2021-02-09 15:44:33 -08:00
75153cb730 Disable unaliged-access test from TestVectorizedMemoryAccess.CopyKernel (#51864) (#51890)
Summary:
The test began to fail after the driver update.

See https://github.com/pytorch/pytorch/issues/51863

Pull Request resolved: https://github.com/pytorch/pytorch/pull/51864

Reviewed By: bertmaher

Differential Revision: D26304018

Pulled By: malfet

fbshipit-source-id: bb7ade2f28d8cf8f847159d4ce92391f0794c258

Co-authored-by: Nikita Shulga <nshulga@fb.com>
2021-02-09 10:17:18 -08:00
5bb69b080c concantenate LICENSE files when building a wheel (#51634) (#51882)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/50695

I checked locally that the concatenated license file appears at `torch-<version>.dist-info/LICENSE` in the wheel.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/51634

Reviewed By: zhangguanheng66

Differential Revision: D26225550

Pulled By: walterddr

fbshipit-source-id: 830c59fb7aea0eb50b99e295edddad9edab6ba3a

Co-authored-by: mattip <matti.picus@gmail.com>
2021-02-09 10:16:12 -08:00
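A rough sketch of verifying the result in an installed wheel, mirroring the test added further down in this diff (assumes PyTorch was installed from a wheel; the substring check is illustrative):

```python
import glob
import os
import torch

site_packages = os.path.dirname(os.path.dirname(torch.__file__))
matches = glob.glob(os.path.join(site_packages, "torch-*dist-info"))
if matches:
    with open(os.path.join(matches[0], "LICENSE")) as f:
        text = f.read()
    # The third-party license bundle should now be appended to the main LICENSE.
    print("bundle" in text)
```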
79 changed files with 1348 additions and 619 deletions

View File

@ -52,6 +52,14 @@ CONFIG_TREE_DATA = OrderedDict(
"3.7",
],
)),
macos_arm64=([None], OrderedDict(
wheel=[
"3.8",
],
conda=[
"3.8",
],
)),
# Skip CUDA-9.2 builds on Windows
windows=(
[v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS],

View File

@ -164,7 +164,7 @@ def gen_build_env_list(smoke):
c.find_prop("gpu"),
c.find_prop("package_format"),
[c.find_prop("pyver")],
c.find_prop("smoke"),
c.find_prop("smoke") and not (c.find_prop("os_name") == "macos_arm64"), # don't test arm64
c.find_prop("libtorch_variant"),
c.find_prop("gcc_config_variant"),
c.find_prop("libtorch_config_variant"),
@ -216,7 +216,9 @@ def get_jobs(toplevel_key, smoke):
configs = gen_build_env_list(smoke)
phase = "build" if toplevel_key == "binarybuilds" else "test"
for build_config in configs:
jobs_list.append(build_config.gen_workflow_job(phase, nightly=True))
# don't test for macos_arm64 as it's cross compiled
if phase != "test" or build_config.os != "macos_arm64":
jobs_list.append(build_config.gen_workflow_job(phase, nightly=True))
return jobs_list

View File

@ -3,7 +3,7 @@ PHASES = ["build", "test"]
CUDA_VERSIONS = [
"101",
"102",
"112",
"111",
]
ROCM_VERSIONS = [

File diff suppressed because it is too large.

View File

@ -7,6 +7,10 @@ source /env
# Defaults here so they can be changed in one place
export MAX_JOBS=${MAX_JOBS:-$(( $(nproc) - 2 ))}
if [[ "${DESIRED_CUDA}" == "cu111" ]]; then
export BUILD_SPLIT_CUDA="ON"
fi
# Parse the parameters
if [[ "$PACKAGE_TYPE" == 'conda' ]]; then
build_script='conda/build_pytorch.sh'

View File

@ -15,6 +15,10 @@ else
export VC_YEAR=2019
fi
if [[ "${DESIRED_CUDA}" == "cu111" ]]; then
export BUILD_SPLIT_CUDA="ON"
fi
set +x
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}

View File

@ -111,11 +111,11 @@ commands:
git config --global user.email "circleci.ossci@gmail.com"
git config --global user.name "CircleCI"
git config remote.origin.url https://github.com/pytorch/pytorch.git
git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master
git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet
git config --add remote.origin.fetch +refs/heads/release/1.8:refs/remotes/origin/release/1.8
git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/release/1.8:refs/remotes/origin/release/1.8 --depth=100 --quiet
# PRs generated from ghstack has format CIRCLE_PR_BASE_BRANCH=gh/xxx/1234/base
if [[ "${CIRCLE_PR_BASE_BRANCH}" == "gh/"* ]]; then
CIRCLE_PR_BASE_BRANCH=master
CIRCLE_PR_BASE_BRANCH=release/1.8
fi
export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/$CIRCLE_PR_BASE_BRANCH`
echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET}

View File

@ -198,6 +198,44 @@
root: /Users/distiller/project
paths: final_pkgs
- store_artifacts:
path: /Users/distiller/project/final_pkgs
binary_macos_arm64_build:
<<: *binary_mac_params
macos:
xcode: "12.3.0"
steps:
# See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
- checkout
- run:
<<: *binary_checkout
- run:
<<: *binary_populate_env
- brew_update
- run:
<<: *binary_install_miniconda
- run:
name: Build
no_output_timeout: "90m"
command: |
# Do not set -u here; there is some problem with CircleCI
# variable expansion with PROMPT_COMMAND
set -ex -o pipefail
export CROSS_COMPILE_ARM64=1
script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_build.sh"
cat "$script"
source "$script"
- persist_to_workspace:
root: /Users/distiller/project
paths: final_pkgs
- store_artifacts:
path: /Users/distiller/project/final_pkgs
binary_ios_build:
<<: *pytorch_ios_params
macos:

.gitmodules (vendored)
View File

@ -121,7 +121,7 @@
[submodule "third_party/XNNPACK"]
ignore = dirty
path = third_party/XNNPACK
url = https://github.com/google/XNNPACK.git
url = https://github.com/malfet/XNNPACK.git
[submodule "third_party/fmt"]
ignore = dirty
path = third_party/fmt

View File

@ -182,7 +182,7 @@ fi
# Patch required to build xla
if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
git clone --recursive https://github.com/pytorch/xla.git
git clone --recursive -b r1.8 https://github.com/pytorch/xla.git
./xla/scripts/apply_patches.sh
fi

View File

@ -54,7 +54,7 @@ function file_diff_from_base() {
set +e
git fetch origin master --quiet
set -e
git diff --name-only "$(git merge-base origin/master HEAD)" > "$1"
git diff --name-only "$(git merge-base origin/release/1.8 HEAD)" > "$1"
}
function get_bazel() {

View File

@ -300,7 +300,7 @@ test_backward_compatibility() {
pushd test/backward_compatibility
python -m venv venv
. venv/bin/activate
pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
pip_install --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_test.html
pip show torch
python dump_all_function_schemas.py --filename nightly_schemas.txt
deactivate

View File

@ -11,7 +11,6 @@
#include <ATen/DeviceGuard.h>
#include <ATen/DimVector.h>
#include <ATen/Dispatch.h>
#include <ATen/DynamicLibrary.h>
#include <ATen/Formatting.h>
#include <ATen/Functions.h>
#include <ATen/NamedTensor.h>

View File

@ -25,9 +25,16 @@ static void* checkDL(void* x) {
return x;
}
DynamicLibrary::DynamicLibrary(const char* name) {
DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) {
// NOLINTNEXTLINE(hicpp-signed-bitwise)
handle = checkDL(dlopen(name, RTLD_LOCAL | RTLD_NOW));
handle = dlopen(name, RTLD_LOCAL | RTLD_NOW);
if (!handle) {
if (alt_name) {
handle = checkDL(dlopen(alt_name, RTLD_LOCAL | RTLD_NOW));
} else {
AT_ERROR("Error in dlopen or dlsym: ", dlerror());
}
}
}
void* DynamicLibrary::sym(const char* name) {
@ -45,7 +52,7 @@ DynamicLibrary::~DynamicLibrary() {
// Windows
DynamicLibrary::DynamicLibrary(const char* name) {
DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) {
// NOLINTNEXTLINE(hicpp-signed-bitwise)
HMODULE theModule;
bool reload = true;

View File

@ -8,7 +8,7 @@ namespace at {
struct DynamicLibrary {
AT_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary);
TORCH_API DynamicLibrary(const char* name);
TORCH_API DynamicLibrary(const char* name, const char* alt_name = nullptr);
TORCH_API void* sym(const char* name);

View File

@ -23,10 +23,17 @@ at::DynamicLibrary& getNVRTCLibrary() {
constexpr auto minor = ( CUDA_VERSION / 10 ) % 10;
#if defined(_WIN32)
auto libname = std::string("nvrtc64_") + std::to_string(major) + std::to_string(minor) + "_0.dll";
std::string alt_libname;
#else
static auto libname = std::string("libnvrtc.so.") + std::to_string(major) + "." + std::to_string(minor);
static auto lib_version = std::to_string(major) + "." + std::to_string(minor);
static auto libname = std::string("libnvrtc.so.") + lib_version;
#ifdef NVRTC_SHORTHASH
static auto alt_libname = std::string("libnvrtc-") + C10_STRINGIZE(NVRTC_SHORTHASH) + ".so." + lib_version;
#else
std::string alt_libname;
#endif
static at::DynamicLibrary lib(libname.c_str());
#endif
static at::DynamicLibrary lib(libname.c_str(), alt_libname.empty() ? nullptr : alt_libname.c_str());
return lib;
}

View File

@ -238,7 +238,12 @@ auto ConvParams::use_mkldnn(const at::Tensor& input, const at::Tensor& weight) c
(groups > 1
|| (weight.size(-1) > 3 && weight.size(-2) > 3)
|| input.size(0) > 1
|| input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480)); // for some case, native is faster
|| input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480) // for some case, native is faster
// OneDNN < 1.8.1 produce incorrect results in this case (see #50042)
// TODO(VitalyFedyunin): Remove this patch after OneDNN 1.8.1 merged in
&& !(groups == 24 && weight.size(0) == 24 && weight.size(1) == 1)
);
#endif
return false;
}

View File

@ -26,7 +26,7 @@ static void upsample_bicubic2d_out_frame(
const scalar_t* in = &idata[output_y * input_width + output_x];
scalar_t* out = &odata[output_y * output_width + output_x];
for (int64_t c = 0; c < channels; ++c) {
for (int64_t c = 0; c < channels * nbatch; ++c) {
out[0] = in[0];
in += input_width * input_height;
out += output_width * output_height;

View File

@ -19,6 +19,27 @@ namespace {
using namespace vec256;
// Note: Explicit implementation of copysign for Half and BFloat16
// is needed to workaround g++-7/8 crash on aarch64, but also makes
// copysign faster for the half-precision types
template<typename T>
T copysign(T a, T b) {
return std::copysign(a, b);
}
// Implement copysign for half precision floats using bit ops
// Sign is the most significant bit for both half and bfloat16 types
template<>
c10::Half copysign(c10::Half a, c10::Half b) {
return c10::Half((a.x&0x7fff) | (b.x&0x8000), c10::Half::from_bits());
}
template<>
c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) {
return c10::BFloat16((a.x&0x7fff) | (b.x&0x8000), c10::BFloat16::from_bits());
}
// Note: Undefined behavior when performing addition is intentionally
// ignored.
void add_kernel(TensorIteratorBase& iter, Scalar alpha_scalar) {
@ -180,7 +201,7 @@ void div_floor_kernel(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
floordiv = std::copysign(scalar_t(0), a / b);
floordiv = copysign(scalar_t(0), a / b);
}
return floordiv;
});
@ -889,23 +910,6 @@ void heaviside_kernel(TensorIterator& iter) {
});
}
template<typename T>
T copysign(T a, T b) {
return std::copysign(a, b);
}
// Implement copysign for half precision floats using bit ops
// Sign is the most significant bit for both half and bfloat16 types
template<>
c10::Half copysign(c10::Half a, c10::Half b) {
return c10::Half((a.x&0x7fff) | (b.x&0x8000), c10::Half::from_bits());
}
template<>
c10::BFloat16 copysign(c10::BFloat16 a, c10::BFloat16 b) {
return c10::BFloat16((a.x&0x7fff) | (b.x&0x8000), c10::BFloat16::from_bits());
}
void copysign_kernel(TensorIterator& iter) {
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cpu", [&]() {
cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t {

View File

@ -113,31 +113,46 @@ __global__ void upsample_trilinear3d_out_frame(
template <typename scalar_t, typename accscalar_t>
C10_LAUNCH_BOUNDS_1(1024)
__global__ void upsample_trilinear3d_backward_out_frame(
const size_t nc_,
const int depth1,
const int height1,
const int width1,
const int depth2,
const int height2,
const int width2,
const int num_kernels,
const accscalar_t rdepth,
const accscalar_t rheight,
const accscalar_t rwidth,
const bool align_corners,
scalar_t* __restrict__ idata,
const scalar_t* __restrict__ odata) {
const size_t i_numel = nc_ * depth1 * height1 * width1;
const size_t o_numel = nc_ * depth2 * height2 * width2;
PackedTensorAccessor64<scalar_t, 5> idata,
const PackedTensorAccessor64<scalar_t, 5> odata,
scalar_t* idata_ptr) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < o_numel; index += blockDim.x * gridDim.x) {
size_t index_temp = index;
const int w2 = index_temp % width2; // 0:width2-1
index_temp /= width2;
const int h2 = index_temp % height2; // 0:height2-1
index_temp /= height2;
const int t2 = index_temp % depth2; // 0:depth2-1
const int nc = index_temp / depth2;
const int batchsize = idata.size(0);
const int channels = idata.size(1);
const int depth1 = idata.size(2);
const int height1 = idata.size(3);
const int width1 = idata.size(4);
const int depth2 = odata.size(2);
const int height2 = odata.size(3);
const int width2 = odata.size(4);
const size_t i_numel = batchsize * channels * depth1 * height1 * width1;
if (index < num_kernels) {
const int w2 = (index % (height2 * width2)) % width2; // 0:width2-1
const int h2 = (index % (height2 * width2)) / width2; // 0:height2-1
const int t2 = index / (height2 * width2); // 0:depth2-1
// special case: just copy
if (depth1 == depth2 && height1 == height2 && width1 == width2) {
const int t1 = t2;
const int h1 = h2;
const int w1 = w2;
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; ++c) {
const scalar_t val = odata[n][c][t1][h1][w1];
idata[n][c][t2][h2][w2] = val;
}
}
return;
}
//
const accscalar_t t1r = area_pixel_compute_source_index<accscalar_t>(
rdepth, t2, align_corners, /*cubic=*/false);
const int t1 = t1r;
@ -159,55 +174,60 @@ __global__ void upsample_trilinear3d_backward_out_frame(
const accscalar_t w1lambda = w1r - w1;
const accscalar_t w0lambda = static_cast<accscalar_t>(1) - w1lambda;
//
const scalar_t d2val = odata[index];
fastAtomicAdd(
idata,
idx_3d(nc, depth1, height1, width1, t1, h1, w1),
i_numel,
static_cast<scalar_t>(t0lambda * h0lambda * w0lambda * d2val),
true);
fastAtomicAdd(
idata,
idx_3d(nc, depth1, height1, width1, t1, h1, w1 + w1p),
i_numel,
static_cast<scalar_t>(t0lambda * h0lambda * w1lambda * d2val),
true);
fastAtomicAdd(
idata,
idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1),
i_numel,
static_cast<scalar_t>(t0lambda * h1lambda * w0lambda * d2val),
true);
fastAtomicAdd(
idata,
idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1 + w1p),
i_numel,
static_cast<scalar_t>(t0lambda * h1lambda * w1lambda * d2val),
true);
fastAtomicAdd(
idata,
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1),
i_numel,
static_cast<scalar_t>(t1lambda * h0lambda * w0lambda * d2val),
true);
fastAtomicAdd(
idata,
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1 + w1p),
i_numel,
static_cast<scalar_t>(t1lambda * h0lambda * w1lambda * d2val),
true);
fastAtomicAdd(
idata,
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1),
i_numel,
static_cast<scalar_t>(t1lambda * h1lambda * w0lambda * d2val),
true);
fastAtomicAdd(
idata,
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1 + w1p),
i_numel,
static_cast<scalar_t>(t1lambda * h1lambda * w1lambda * d2val),
true);
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; ++c) {
const scalar_t d2val = odata[n][c][t2][h2][w2];
const size_t nc = n * channels + c;
fastAtomicAdd(
idata_ptr,
idx_3d(nc, depth1, height1, width1, t1, h1, w1),
i_numel,
static_cast<scalar_t>(t0lambda * h0lambda * w0lambda * d2val),
true);
fastAtomicAdd(
idata_ptr,
idx_3d(nc, depth1, height1, width1, t1, h1, w1 + w1p),
i_numel,
static_cast<scalar_t>(t0lambda * h0lambda * w1lambda * d2val),
true);
fastAtomicAdd(
idata_ptr,
idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1),
i_numel,
static_cast<scalar_t>(t0lambda * h1lambda * w0lambda * d2val),
true);
fastAtomicAdd(
idata_ptr,
idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1 + w1p),
i_numel,
static_cast<scalar_t>(t0lambda * h1lambda * w1lambda * d2val),
true);
fastAtomicAdd(
idata_ptr,
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1),
i_numel,
static_cast<scalar_t>(t1lambda * h0lambda * w0lambda * d2val),
true);
fastAtomicAdd(
idata_ptr,
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1 + w1p),
i_numel,
static_cast<scalar_t>(t1lambda * h0lambda * w1lambda * d2val),
true);
fastAtomicAdd(
idata_ptr,
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1),
i_numel,
static_cast<scalar_t>(t1lambda * h1lambda * w0lambda * d2val),
true);
fastAtomicAdd(
idata_ptr,
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1 + w1p),
i_numel,
static_cast<scalar_t>(t1lambda * h1lambda * w1lambda * d2val),
true);
}
}
}
}
@ -350,21 +370,20 @@ static void upsample_trilinear3d_backward_out_cuda_template(
// so it has to be initialized to zero.
grad_input.zero_();
// const size_t num_kernels = nbatch * channels * output_depth * output_height * output_width;
const size_t num_kernels = grad_output.numel();
const int num_kernels = output_depth * output_height * output_width;
const int num_threads = std::min(
at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (num_kernels > 0) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(),
"upsample_trilinear3d_backward_out_frame",
[&] {
using accscalar_t = at::acc_type<scalar_t, true>;
auto idata = grad_input.data_ptr<scalar_t>();
auto odata = grad_output.data_ptr<scalar_t>();
auto idata = grad_input.packed_accessor64<scalar_t, 5>();
auto odata = grad_output.packed_accessor64<scalar_t, 5>();
scalar_t* idata_ptr = grad_input.data_ptr<scalar_t>();
const accscalar_t rdepth = area_pixel_compute_scale<accscalar_t>(
input_depth, output_depth, align_corners, scales_d);
@ -374,26 +393,20 @@ static void upsample_trilinear3d_backward_out_cuda_template(
input_width, output_width, align_corners, scales_w);
upsample_trilinear3d_backward_out_frame<scalar_t, accscalar_t>
<<<cuda::ATenCeilDiv(num_kernels, static_cast<size_t>(num_threads)),
<<<cuda::ATenCeilDiv(num_kernels, num_threads),
num_threads,
0,
stream>>>(
nbatch * channels,
input_depth,
input_height,
input_width,
output_depth,
output_height,
output_width,
num_kernels,
rdepth,
rheight,
rwidth,
align_corners,
idata,
odata);
odata,
idata_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
}
}
} // namespace

View File

@ -133,7 +133,9 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) {
ASSERT_EQ(buffer1[i].z, buffer2[i].z);
ASSERT_EQ(buffer1[i].w, buffer2[i].w);
}
// Skipping this part until https://github.com/pytorch/pytorch/issues/51863 is resolved
#if 0
// unaligned
for (int i = 0; i < 16; i++) {
for (int j = 0; j < 16; j++) {
@ -151,4 +153,5 @@ TEST(TestVectorizedMemoryAccess, CopyKernel) {
}
}
}
#endif
}

View File

@ -16,7 +16,7 @@ int32_t driver_version() {
return driver_version;
}
int device_count_impl() {
int device_count_impl(bool fail_if_no_driver) {
int count;
auto err = cudaGetDeviceCount(&count);
if (err == cudaSuccess) {
@ -34,6 +34,11 @@ int device_count_impl() {
case cudaErrorInsufficientDriver: {
auto version = driver_version();
if (version <= 0) {
if (!fail_if_no_driver) {
// No CUDA driver means no devices
count = 0;
break;
}
TORCH_CHECK(
false,
"Found no NVIDIA driver on your system. Please check that you "
@ -95,9 +100,9 @@ DeviceIndex device_count() noexcept {
// initialize number of devices only once
static int count = []() {
try {
auto result = device_count_impl();
auto result = device_count_impl(/*fail_if_no_driver=*/false);
TORCH_INTERNAL_ASSERT(result <= std::numeric_limits<DeviceIndex>::max(), "Too many CUDA devices, DeviceIndex overflowed");
return device_count_impl();
return result;
} catch (const c10::Error& ex) {
// We don't want to fail, but still log the warning
// msg() returns the message without the stack trace
@ -110,7 +115,7 @@ DeviceIndex device_count() noexcept {
DeviceIndex device_count_ensure_non_zero() {
// Call the implementation every time to throw the exception
int count = device_count_impl();
int count = device_count_impl(/*fail_if_no_driver=*/true);
// Zero gpus doesn't produce a warning in `device_count` but we fail here
TORCH_CHECK(count, "No CUDA GPUs are available");
return static_cast<DeviceIndex>(count);

View File

@ -590,6 +590,10 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
list(APPEND Caffe2_GPU_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
set_source_files_properties(
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}"
)
endif()
if(USE_ROCM)
@ -741,6 +745,10 @@ file(WRITE ${DUMMY_EMPTY_FILE} ${DUMMY_FILE_CONTENT})
# Wrapper library for people who link against torch and expect both CPU and CUDA support
# Contains "torch_cpu" and "torch_cuda"
add_library(torch ${DUMMY_EMPTY_FILE})
if(BUILD_SPLIT_CUDA)
# When we split torch_cuda, we want a dummy torch_cuda library that contains both parts
add_library(torch_cuda ${DUMMY_EMPTY_FILE})
endif()
if(HAVE_SOVERSION)
set_target_properties(torch PROPERTIES
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
@ -1233,11 +1241,12 @@ endif()
caffe2_interface_library(torch_cpu torch_cpu_library)
if(BUILD_SPLIT_CUDA)
caffe2_interface_library(torch_cuda_cu torch_cuda_cu_library)
caffe2_interface_library(torch_cuda_cpp torch_cuda_cpp_library)
elseif(USE_CUDA)
if(USE_CUDA)
caffe2_interface_library(torch_cuda torch_cuda_library)
if(BUILD_SPLIT_CUDA)
caffe2_interface_library(torch_cuda_cu torch_cuda_cu_library)
caffe2_interface_library(torch_cuda_cpp torch_cuda_cpp_library)
endif()
elseif(USE_ROCM)
caffe2_interface_library(torch_hip torch_hip_library)
endif()
@ -1245,22 +1254,26 @@ endif()
caffe2_interface_library(torch torch_library)
install(TARGETS torch_cpu torch_cpu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
if(BUILD_SPLIT_CUDA)
install(TARGETS torch_cuda_cu torch_cuda_cu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
install(TARGETS torch_cuda_cpp torch_cuda_cpp_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
elseif(USE_CUDA)
if(USE_CUDA)
install(TARGETS torch_cuda torch_cuda_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
if(BUILD_SPLIT_CUDA)
install(TARGETS torch_cuda_cu torch_cuda_cu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
install(TARGETS torch_cuda_cpp torch_cuda_cpp_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
endif()
elseif(USE_ROCM)
install(TARGETS torch_hip torch_hip_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
endif()
install(TARGETS torch torch_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
target_link_libraries(torch PUBLIC torch_cpu_library)
if(BUILD_SPLIT_CUDA)
target_link_libraries(torch PUBLIC torch_cuda_cu_library)
target_link_libraries(torch PUBLIC torch_cuda_cpp_library)
elseif(USE_CUDA)
if(USE_CUDA)
target_link_libraries(torch PUBLIC torch_cuda_library)
if(BUILD_SPLIT_CUDA)
target_link_libraries(torch_cuda PUBLIC torch_cuda_cu_library)
target_link_libraries(torch_cuda PUBLIC torch_cuda_cpp_library)
endif()
elseif(USE_ROCM)
target_link_libraries(torch PUBLIC torch_hip_library)
endif()

View File

@ -188,6 +188,20 @@ find_library(CUDA_CUDA_LIB cuda
find_library(CUDA_NVRTC_LIB nvrtc
PATHS ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES lib lib64 lib/x64)
if(CUDA_NVRTC_LIB AND NOT CUDA_NVRTC_SHORTHASH)
execute_process(
COMMAND "${PYTHON_EXECUTABLE}" -c
"import hashlib;hash=hashlib.sha256();hash.update(open('${CUDA_NVRTC_LIB}','rb').read());print(hash.hexdigest()[:8])"
RESULT_VARIABLE _retval
OUTPUT_VARIABLE CUDA_NVRTC_SHORTHASH)
if(NOT _retval EQUAL 0)
message(WARNING "Failed to compute shorthash for libnvrtc.so")
set(CUDA_NVRTC_SHORTHASH "XXXXXXXX")
else()
string(STRIP "${CUDA_NVRTC_SHORTHASH}" CUDA_NVRTC_SHORTHASH)
message(STATUS "${CUDA_NVRTC_LIB} shorthash is ${CUDA_NVRTC_SHORTHASH}")
endif()
endif()
# Create new style imported libraries.
# Several of these libraries have a hardcoded path if CAFFE2_STATIC_LINK_CUDA
@ -338,6 +352,12 @@ if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32)
set_property(
TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES
"${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublasLt_static.a")
# Add explicit dependency to cudart_static to fix
# libcublasLt_static.a.o): undefined reference to symbol 'cudaStreamWaitEvent'
# error adding symbols: DSO missing from command line
set_property(
TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES
"${CUDA_cudart_static_LIBRARY}" rt dl)
endif()
else()
set_property(

View File

@ -0,0 +1,74 @@
DDP Communication Hooks
=======================
DDP communication hook is a generic interface to control how to communicate
gradients across workers by overriding the vanilla allreduce in
`DistributedDataParallel <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.>`_.
A few built-in communication hooks are provided,
and users can easily apply any of these hooks to optimize communication.
In addition, the hook interface can also support user-defined communication
strategies for more advanced use cases.
.. warning ::
DDP communication hook is experimental and subject to change.
.. warning ::
DDP communication hooks can only support single process single device mode
on NCCL backend.
How to Use a Communication Hook?
--------------------------------
To use a communication hook, the user just needs to register the hook on the
DDP model before the training loop, as below.
:func:`torch.nn.parallel.DistributedDataParallel.register_comm_hook`.
:noindex:
Default Communication Hooks
---------------------------
Default communication hooks are simple **stateless** hooks, so the input state
in ``register_comm_hook`` is either a process group or ``None``.
.. automodule:: torch.distributed.algorithms.ddp_comm_hooks.default_hooks
:members:
PowerSGD Communication Hook
---------------------------
PowerSGD (`Vogels et al., NeurIPS 2019 <https://arxiv.org/abs/1905.13727>`_)
is a gradient compression algorithm, which can provide very high compression
rates and accelerate bandwidth-bound distributed training.
This algorithm needs to maintain both some hyperparameters and the internal
state. Therefore, PowerSGD communication hook is a **stateful** hook,
and the user needs to provide a state object defined as below.
PowerSGD State
^^^^^^^^^^^^^^^^
.. currentmodule:: torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook
.. autoclass:: PowerSGDState
PowerSGD Hooks
^^^^^^^^^^^^^^^^
.. warning ::
PowerSGD typically requires extra memory of the same size as the model's
gradients to enable error feedback, which can compensate for biased
compressed communication and improve accuracy.
.. warning ::
The current implementation may cause gradient overflow for FP16 input.
.. autofunction:: powerSGD_hook
.. autofunction:: batched_powerSGD_hook
Acknowledgements
----------------
Many thanks to PowerSGD paper author **Thijs Vogels** for the code review on
PowerSGD communication hook, as well as the
`comparison experiments <https://observablehq.com/@tvogels/powersgd-benchmark>`_,
which show that the performance of PowerSGD communication hook is on par with
the implementation in the original `paper <https://arxiv.org/abs/1905.13727>`_.
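For context, a minimal sketch of registering one of the built-in hooks described above (illustrative; assumes a CUDA build with NCCL and a single local GPU, with an illustrative address and port):

```python
import os
import torch
import torch.distributed as dist
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
from torch.nn.parallel import DistributedDataParallel as DDP

# Single-process NCCL group on one GPU, just to keep the sketch self-contained.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("nccl", rank=0, world_size=1)

ddp_model = DDP(torch.nn.Linear(10, 10).cuda(0), device_ids=[0])
# Default hooks are stateless: the state is a process group or None.
ddp_model.register_comm_hook(state=None, hook=default_hooks.fp16_compress_hook)
```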

View File

@ -58,16 +58,16 @@ distributed (NCCL only when building with CUDA). MPI is an optional backend that
included if you build PyTorch from source. (e.g.building PyTorch on a host that has MPI
installed.)
.. warning ::
As of PyTorch v1.7, Windows support for the distributed package only covers collective
communications with Gloo backend, `FileStore`, and `DistributedDataParallel`. Therefore,
the `init_method` argument in :func:`init_process_group` must point to a file. This works
for both local and shared file systems:
.. note ::
As of PyTorch v1.8, Windows supports all collective communications backends except NCCL.
If the `init_method` argument of :func:`init_process_group` points to a file, it must adhere
to the following schema:
- Local file system, ``init_method="file:///d:/tmp/some_file"``
- Shared file system, ``init_method="file://////{machine_name}/{share_folder_name}/some_file"``
Similarly, if you directly pass in a `store` argument, it must be a ``FileStore`` instance.
As on the Linux platform, you can enable TcpStore by setting the environment variables
MASTER_ADDR and MASTER_PORT.
Which backend to use?
^^^^^^^^^^^^^^^^^^^^^
@ -330,13 +330,13 @@ as they should never be created manually, but they are guaranteed to support two
Synchronous and asynchronous collective operations
--------------------------------------------------
Every collective operation function supports the following two kinds of operations,
depending on the setting of the ``async_op`` flag passed into the collective:
**Synchronous operation** - the default mode, when ``async_op`` is set to ``False``.
When the function returns, it is guaranteed that
the collective operation is performed. In the case of CUDA operations, it is not guaranteed
that the CUDA operation is completed, since CUDA operations are asynchronous. For CPU collectives, any
further function calls utilizing the output of the collective call will behave as expected. For CUDA collectives,
function calls utilizing the output on the same CUDA stream will behave as expected. Users must take care of
synchronization under the scenario of running under different streams. For details on CUDA semantics such as stream
@ -347,12 +347,12 @@ See the below script to see examples of differences in these semantics for CPU a
returns a distributed request object. In general, you don't need to create it manually and it
is guaranteed to support two methods:
* ``is_completed()`` - in the case of CPU collectives, returns ``True`` if completed. In the case of CUDA operations,
returns ``True`` if the operation has been successfully enqueued onto a CUDA stream and the output can be utilized on the
default stream without further synchronization.
* ``wait()`` - in the case of CPU collectives, will block the process until the operation is completed. In the case
of CUDA collectives, will block until the operation has been successfully enqueued onto a CUDA stream and the
output can be utilized on the default stream without further synchronization.
**Example**
@ -368,7 +368,7 @@ It shows the explicit need to synchronize when using collective outputs on diffe
handle = dist.all_reduce(output, async_op=True)
# Wait ensures the operation is enqueued, but not necessarily complete.
handle.wait()
# Using result on non-default stream.
with torch.cuda.stream(s):
s.wait_stream(torch.cuda.default_stream())
output.add_(100)
@ -382,7 +382,7 @@ It shows the explicit need to synchronize when using collective outputs on diffe
Collective functions
--------------------
.. autofunction:: broadcast
.. autofunction:: broadcast_object_list
@ -426,7 +426,7 @@ you can find an implementation of those in the `torch.distributed.nn.*` module.
Functions here are synchronous and will be inserted in the autograd graph, so
you need to ensure that all the processes that participated in the collective operation
will do the backward pass for the backward communication to effectively happen and
don't cause a deadlock.
Please notice that currently the only backend where all the functions are guaranteed to work is ``gloo``.
.. autofunction:: torch.distributed.nn.broadcast

View File

@ -71,6 +71,7 @@ Features described in this documentation are classified by release status:
onnx
optim
complex_numbers
ddp_comm_hooks
pipeline
quantization
rpc

View File

@ -484,6 +484,7 @@ Sparse tensor functions
+++++++++++++++++++++++
.. autofunction:: torch.sparse_coo_tensor
:noindex:
.. autofunction:: torch.sparse.sum
.. autofunction:: torch.sparse.addmm
.. autofunction:: torch.sparse.mm

View File

@ -563,5 +563,4 @@ Utilities
promote_types
use_deterministic_algorithms
are_deterministic_algorithms_enabled
vmap
_assert

View File

@ -552,6 +552,50 @@ class build_ext(setuptools.command.build_ext.build_ext):
with open('compile_commands.json', 'w') as f:
f.write(new_contents)
class concat_license_files():
"""Merge LICENSE and LICENSES_BUNDLED.txt as a context manager
LICENSE is the main PyTorch license, LICENSES_BUNDLED.txt is auto-generated
from all the licenses found in ./third_party/. We concatenate them so there
is a single license file in the sdist and wheels with all of the necessary
licensing info.
"""
def __init__(self):
self.f1 = 'LICENSE'
self.f2 = 'third_party/LICENSES_BUNDLED.txt'
def __enter__(self):
"""Concatenate files"""
with open(self.f1, 'r') as f1:
self.bsd_text = f1.read()
with open(self.f1, 'a') as f1:
with open(self.f2, 'r') as f2:
self.bundled_text = f2.read()
f1.write('\n\n')
f1.write(self.bundled_text)
def __exit__(self, exception_type, exception_value, traceback):
"""Restore content of f1"""
with open(self.f1, 'w') as f:
f.write(self.bsd_text)
try:
from wheel.bdist_wheel import bdist_wheel
except ImportError:
# This is useful when wheel is not installed and bdist_wheel is not
# specified on the command line. If it _is_ specified, parsing the command
# line will fail before wheel_concatenate is needed
wheel_concatenate = None
else:
# Need to create the proper LICENSE.txt for the wheel
class wheel_concatenate(bdist_wheel):
""" check submodules on sdist to prevent incomplete tarballs """
def run(self):
with concat_license_files():
super().run()
class install(setuptools.command.install.install):
def run(self):
@ -724,6 +768,7 @@ def configure_extension_build():
'build_ext': build_ext,
'clean': clean,
'install': install,
'bdist_wheel': wheel_concatenate,
}
entry_points = {

View File

@ -3,9 +3,11 @@
#include <test/cpp/jit/test_utils.h>
#include <ATen/core/qualified_name.h>
#include <torch/csrc/jit/api/module.h>
#include <torch/csrc/jit/frontend/resolver.h>
#include <torch/csrc/jit/serialization/import.h>
#include <torch/csrc/jit/serialization/import_source.h>
#include <torch/csrc/jit/testing/file_check.h>
#include <torch/torch.h>
namespace torch {
@ -341,6 +343,20 @@ TEST(ModuleAPITest, Define) {
AT_ASSERT(result.toTensor().item<float>() == 6);
}
TEST(ModuleAPITest, Freezing) {
Module m("m");
m.register_parameter("foo", torch::ones({}), false);
m.define(R"(
def forward(self, x, b : int = 4):
return self.foo + x + b
)");
m.eval();
auto frozen_mod = torch::jit::freeze(m);
auto forward_g = frozen_mod.get_method("forward").graph();
testing::FileCheck().check_not("GetAttr")->run(*forward_g);
;
}
TEST(ModuleAPITest, To_CUDA) {
Module m("test");
{

View File

@ -1508,7 +1508,7 @@ class TestFrozenOptimizations(JitTestCase):
bn = torch.nn.BatchNorm2d(out_channels, eps=.001)
mod = torch.nn.Sequential(conv, bn)
# set optimize to False here, by default freezing runs optimize_frozen_module
frozen_mod = torch.jit.freeze(torch.jit.script(mod.eval()), optimize=False)
frozen_mod = torch.jit.freeze(torch.jit.script(mod.eval()), optimize_numerics=False)
# inspect frozen mod
FileCheck().check("batch_norm").run(frozen_mod.graph)
torch.jit.optimize_frozen_module(frozen_mod)

View File

@ -182,7 +182,7 @@ class TestModels(TestCase):
self.exportTest(toC(FakeQuantNet()), toC(x))
@skipIfUnsupportedMinOpsetVersion(10)
def test_qat_resnet(self):
def test_qat_resnet_pertensor(self):
# Quantize ResNet50 model
x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0))
qat_resnet50 = resnet50()
@ -202,6 +202,27 @@ class TestModels(TestCase):
self.exportTest(toC(qat_resnet50), toC(x))
@skipIfUnsupportedMinOpsetVersion(13)
def test_qat_resnet_per_channel(self):
# Quantize ResNet50 model
x = torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)
qat_resnet50 = resnet50()
qat_resnet50.qconfig = quantization.QConfig(
activation=quantization.default_fake_quant,
weight=quantization.default_per_channel_weight_fake_quant)
quantization.prepare_qat(qat_resnet50, inplace=True)
qat_resnet50.apply(torch.quantization.enable_observer)
qat_resnet50.apply(torch.quantization.enable_fake_quant)
_ = qat_resnet50(x)
for module in qat_resnet50.modules():
if isinstance(module, quantization.FakeQuantize):
module.calculate_qparams()
qat_resnet50.apply(torch.quantization.disable_observer)
self.exportTest(toC(qat_resnet50), toC(x))
@disableScriptTest() # None type in outputs
def test_googlenet(self):
x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0))

View File

@ -5998,6 +5998,20 @@ class TestONNXRuntime(unittest.TestCase):
x = torch.randn(6, 4, 3, 3)
self.run_test(FakeQuantizePerTensorModel(), (x))
@skipIfUnsupportedMinOpsetVersion(13)
def test_fake_quantize_per_channel(self):
class FakeQuantizePerChannelModel(torch.nn.Module):
def forward(self, input):
amax = torch.ones(4)
scale = amax / 127.
zero_point = torch.zeros_like(amax, dtype=torch.long)
# Quantize twice to test different branches
y = torch.fake_quantize_per_channel_affine(input, scale, zero_point, 1, 0, 255)
return torch.fake_quantize_per_channel_affine(y, scale, zero_point, 1, -128, 127)
x = torch.randn(6, 4, 3, 3)
self.run_test(FakeQuantizePerChannelModel(), (x))
def test_batchnorm_training(self):
class MyModule(torch.nn.Module):
def __init__(self):

View File

@ -2,6 +2,8 @@ import unittest
import onnxruntime # noqa
import torch
from torch.cuda.amp import autocast
from test_pytorch_common import skipIfUnsupportedMinOpsetVersion
from test_pytorch_common import skipIfNoCuda
@ -24,6 +26,43 @@ class TestONNXRuntime_cuda(unittest.TestCase):
x = torch.randn(2, 4, 5, 6, requires_grad=True, dtype=torch.float16, device=torch.device('cuda'))
self.run_test(GeluModel(), x, rtol=1e-3, atol=1e-5)
@skipIfUnsupportedMinOpsetVersion(9)
@skipIfNoCuda
def test_layer_norm_fp16(self):
class LayerNormModel(torch.nn.Module):
def __init__(self):
super(LayerNormModel, self).__init__()
self.layer_norm = torch.nn.LayerNorm([10, 10])
def forward(self, x):
return self.layer_norm(x)
x = torch.randn(20, 5, 10, 10, requires_grad=True, dtype=torch.float16, device=torch.device('cuda'))
self.run_test(LayerNormModel(), x, rtol=1e-3, atol=1e-5)
@skipIfUnsupportedMinOpsetVersion(12)
@skipIfNoCuda
def test_softmaxCrossEntropy_fusion_fp16(self):
class FusionModel(torch.nn.Module):
def __init__(self):
super(FusionModel, self).__init__()
self.loss = torch.nn.NLLLoss(reduction='none')
self.m = torch.nn.LogSoftmax(dim=1)
@autocast()
def forward(self, input, target):
output = self.loss(self.m(2 * input), target)
return output
N, C = 5, 4
input = torch.randn(N, 16, dtype=torch.float16, device=torch.device('cuda'))
target = torch.empty(N, dtype=torch.long, device=torch.device('cuda')).random_(0, C)
# using test data containing default ignore_index=-100
target[target == 1] = -100
self.run_test(FusionModel(), (input, target))
TestONNXRuntime_cuda.setUp = TestONNXRuntime.setUp
TestONNXRuntime_cuda.run_test = TestONNXRuntime.run_test

View File

@ -872,7 +872,7 @@ class TestFakeQuantize(TestCase):
scale, zero_point = float(scale), int(zero_point)
quant_min, quant_max = obs._calculate_qmin_qmax()
Y_test, _mask = torch.fake_quantize_per_tensor_affine_cachemask(
Y_test = torch.fake_quantize_per_tensor_affine(
X, scale, zero_point, quant_min, quant_max)
Y_ref = _fake_quantize_per_tensor_affine_reference(
X.cpu(), scale, zero_point, quant_min, quant_max).to(device)
@ -899,7 +899,7 @@ class TestFakeQuantize(TestCase):
quant_min, quant_max = obs._calculate_qmin_qmax()
# forward pass
Y_test, mask = torch.fake_quantize_per_tensor_affine_cachemask(
Y_test = torch.fake_quantize_per_tensor_affine(
X, scale, zero_point, quant_min, quant_max)
Y_ref = _fake_quantize_per_tensor_affine_reference(
X.cpu(), scale, zero_point, quant_min, quant_max).to(device)
@ -1246,7 +1246,7 @@ class TestFakeQuantize(TestCase):
Y = _fake_quantize_per_channel_affine_reference(
X.cpu(), scale.cpu(), zero_point.cpu(), axis, quant_min, quant_max)
Y_prime, _mask = torch.fake_quantize_per_channel_affine_cachemask(
Y_prime = torch.fake_quantize_per_channel_affine(
X, scale, zero_point, axis, quant_min, quant_max)
np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance)
@ -1339,7 +1339,7 @@ class TestFakeQuantize(TestCase):
zero_point = zero_point.to(torch.int64)
quant_min, quant_max = obs._calculate_qmin_qmax()
X.requires_grad_()
Y_prime, _mask = torch.fake_quantize_per_channel_affine_cachemask(
Y_prime = torch.fake_quantize_per_channel_affine(
X, scale, zero_point, axis, quant_min, quant_max)
dout = torch.rand(X.shape, dtype=torch.float).to(device)
dX = _fake_quantize_per_channel_affine_grad_reference(

View File

@ -108,6 +108,7 @@ TESTS = [
'test_fx_experimental',
'test_functional_autograd_benchmark',
'test_package',
'test_license',
'distributed/pipeline/sync/skip/test_api',
'distributed/pipeline/sync/skip/test_gpipe',
'distributed/pipeline/sync/skip/test_inspect_skip_layout',

View File

@ -14,7 +14,7 @@ from math import sqrt
from pathlib import Path
from torch.multiprocessing import Process
from torch.fx import symbolic_trace, Proxy, Node, GraphModule, Interpreter, Tracer, Transformer, Graph, wrap
from torch.fx.node import Target
from torch.fx.node import Target, Argument
from torch.fx.passes import shape_prop
from torch.fx.immutable_collections import immutable_dict, immutable_list
from copy import deepcopy
@ -187,7 +187,7 @@ class TestFX(JitTestCase):
# Custom delegate to disallow in-place tensor operations
class NoMutableCallTracer(Tracer):
def create_node(self, kind : str, target : Union[str, Callable],
args : Tuple[Any], kwargs : Dict[str, Any], name : Optional[str] = None,
args : Tuple[Argument, ...], kwargs : Dict[str, Any], name : Optional[str] = None,
type_expr : Optional[Any] = None) -> Node:
name = target if isinstance(target, str) else torch.typename(target)
if name[-1] == '_':
@ -539,7 +539,7 @@ class TestFX(JitTestCase):
def test_node_tagging(self):
class TaggingTracer(Tracer):
def create_node(self, kind : str, target : Union[str, Callable],
args : Tuple[Any], kwargs : Dict[str, Any], name : Optional[str] = None,
args : Tuple[Argument, ...], kwargs : Dict[str, Any], name : Optional[str] = None,
type_expr : Optional[Any] = None) -> Node:
n = super().create_node(kind, target, args, kwargs, name)
n.tag = 'foo'
@ -1057,6 +1057,13 @@ class TestFX(JitTestCase):
result = interp.run(torch.ones(3, 4), torch.ones(3, 4), torch.rand(3, 4))
self.assertEqual(result, torch.ones(3, 4) * 2.0)
@skipIfNoTorchVision
def test_interpreter_noop_resnet18(self):
rn18 = resnet18()
transformed = torch.fx.Transformer(symbolic_trace(rn18)).transform()
inp = torch.randn(5, 3, 224, 224)
self.assertEqual(transformed(inp), rn18(inp))
def test_transformer_noop(self):
class MyModule(torch.nn.Module):
def __init__(self):
@ -1377,6 +1384,45 @@ class TestFX(JitTestCase):
x, y = torch.randn(3, 4), torch.randn(3, 4)
self.checkGraphModule(foo, (x, y))
def test_trace_dict_int_keys(self):
class ModWithDictArg(torch.nn.Module):
def forward(self, d : Dict[int, torch.Tensor]):
return d[42]
class CallsModWithDict(torch.nn.Module):
def __init__(self):
super().__init__()
self.m = ModWithDictArg()
def forward(self, x):
return self.m({42: x})
class MyTracer(torch.fx.Tracer):
def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool:
return isinstance(m, ModWithDictArg)
traced_graph = MyTracer().trace(CallsModWithDict())
def test_trace_dict_proxy_keys(self):
class ModWithDictArg(torch.nn.Module):
def forward(self, d : Dict[torch.Tensor, torch.Tensor]):
return d[42]
class CallsModWithDict(torch.nn.Module):
def __init__(self):
super().__init__()
self.m = ModWithDictArg()
def forward(self, x):
return self.m({x: x})
class MyTracer(torch.fx.Tracer):
def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool:
return isinstance(m, ModWithDictArg)
with self.assertRaisesRegex(RuntimeError, 'cannot contain a Node'):
traced_graph = MyTracer().trace(CallsModWithDict())
def test_direct_param_use(self):
class TransposeTest(torch.nn.Module):
def __init__(self):

View File

@ -5,14 +5,14 @@ from typing import Callable, Dict, Union, List
from torch.fx.symbolic_trace import symbolic_trace
from torch.fx.graph_module import GraphModule
from torch.fx.node import Node
from torch.fx.experimental import graph_manipulation
from torch.fx.experimental.accelerator_partitioner import Partitioner
from torch.fx.experimental.rewriter import RewritingTracer
from torch.fx.experimental.param_fetch import lift_lowering_attrs_to_nodes
from torch.fx._experimental import graph_manipulation
from torch.fx._experimental.accelerator_partitioner import Partitioner
from torch.fx._experimental.rewriter import RewritingTracer
from torch.fx._experimental.param_fetch import lift_lowering_attrs_to_nodes
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.jit_utils import JitTestCase
from torch.fx.passes.split_module import split_module
from torch.fx.experimental.partitioner_utils import (
from torch.fx._experimental.partitioner_utils import (
NodeLatency,
get_partition_to_latency_mapping,
get_latency_of_partitioned_graph,
@ -20,8 +20,8 @@ from torch.fx.experimental.partitioner_utils import (
PartitionerConfig,
PartitionMode
)
from torch.fx.experimental.fuser import fuse
from torch.fx.experimental import merge_matmul
from torch.fx._experimental.fuser import fuse
from torch.fx._experimental import merge_matmul
try:
from torchvision.models import resnet18
@ -849,7 +849,7 @@ terrible spacing
def test_merge_matmuls(self):
"""
A collection of test cases for torch.fx.experimental.merge_matmul,
A collection of test cases for torch.fx._experimental.merge_matmul,
a graph transformation that merges matrix multiplication operations.
"""
# Utility function for counting matmuls for test assertions.

View File

@ -6503,6 +6503,38 @@ a")
self.checkModule(module().train(), ())
self.checkModule(module().eval(), ())
def test_ternary_static_if(self):
# Test for True branch when condition variable
# is annotated as Final
class M1(torch.nn.Module):
flag: torch.jit.Final[bool]
def __init__(self):
super().__init__()
self.flag = True
def forward(self) -> torch.Tensor:
return torch.ones(3) if self.flag else {}
# Test for False branch when condition variable
# is annotated as Final
class M2(torch.nn.Module):
flag: torch.jit.Final[bool]
def __init__(self):
super().__init__()
self.flag = False
def forward(self) -> torch.Tensor:
return {} if self.flag else torch.ones(3)
model1 = M1()
model2 = M2()
script_model_1 = torch.jit.script(model1)
script_model_2 = torch.jit.script(model2)
self.assertEqual(model1.forward(), script_model_1.forward())
self.assertEqual(model2.forward(), script_model_2.forward())
def test_print(self):
def func(x, y):
q = (x + y).sigmoid()

View File

@ -1,6 +1,9 @@
import glob
import io
import os
import unittest
import torch
from torch.testing._internal.common_utils import TestCase, run_tests
@ -10,11 +13,14 @@ except ImportError:
create_bundled = None
license_file = 'third_party/LICENSES_BUNDLED.txt'
starting_txt = 'The Pytorch repository and source distributions bundle'
site_packages = os.path.dirname(os.path.dirname(torch.__file__))
distinfo = glob.glob(os.path.join(site_packages, 'torch-*dist-info'))
class TestLicense(TestCase):
@unittest.skipIf(not create_bundled, "can only be run in a source tree")
def test_license_in_wheel(self):
def test_license_for_wheel(self):
current = io.StringIO()
create_bundled('third_party', current)
with open(license_file) as fid:
@ -25,6 +31,18 @@ class TestLicense(TestCase):
'match the current state of the third_party files. Use '
'"python third_party/build_bundled.py" to regenerate it')
@unittest.skipIf(len(distinfo) == 0, "no installation in site-package to test")
def test_distinfo_license(self):
"""If run when pytorch is installed via a wheel, the license will be in
site-package/torch-*dist-info/LICENSE. Make sure it contains the third
party bundle of licenses"""
if len(distinfo) > 1:
raise AssertionError('Found too many "torch-*dist-info" directories '
f'in "{site_packages}, expected only one')
with open(os.path.join(os.path.join(distinfo[0], 'LICENSE'))) as fid:
txt = fid.read()
self.assertTrue(starting_txt in txt)
if __name__ == '__main__':
run_tests()

View File

@ -4276,6 +4276,37 @@ class TestNN(NNTestCase):
with torch.backends.mkldnn.flags(enabled=enabled):
gradcheck(F.conv2d, (input, mod.weight))
def test_Conv2d_OneDNN(self):
def run_once():
group_val = 24
ifm = torch.ones([1, group_val, 6, 6], dtype=torch.float32)
weights = torch.ones([group_val, 1, 3, 3], dtype=torch.float32)
op = torch.nn.Conv2d(
in_channels=group_val,
out_channels=group_val,
kernel_size=[3, 3],
stride=[2, 2],
padding=[1, 1],
dilation=[1, 1],
groups=group_val,
bias=False,
padding_mode='zeros'
)
op.weight.data = weights
res = op(ifm)
grad_in = torch.ones(res.shape, dtype=torch.float32)
res.backward(grad_in)
return op.weight.grad
with torch.backends.mkldnn.flags(enabled=False):
without_onednn = run_once()
with torch.backends.mkldnn.flags(enabled=True):
with_onednn = run_once()
self.assertEqual(without_onednn, with_onednn)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
@unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
def test_cudnn_non_contiguous(self):
@ -8643,7 +8674,7 @@ class TestNN(NNTestCase):
kwargs = dict(mode='bicubic', align_corners=align_corners)
# test float scale factor up & downsampling
for device in device_list:
for scale_factor in [0.5, 1.5, 2]:
for scale_factor in [0.5, 1, 1.5, 2]:
in_t = torch.ones(2, 2, 2, 2).to(device)
out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs)
out_size = int(math.floor(in_t.shape[-1] * scale_factor))

View File

@ -1,7 +1,8 @@
from torch.testing._internal.common_utils import TestCase, run_tests
import torch
import torch.nn.functional as F
from torch import Tensor, vmap
from torch import Tensor
from torch._vmap_internals import vmap
import functools
import itertools
import warnings
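Since the test now imports ``vmap`` from ``torch._vmap_internals`` instead of the top-level ``torch`` namespace (see the ``torch/__init__.py`` change below), here is a minimal illustrative sketch of the new import path; it is not part of the diff, and the elementwise op is an arbitrary choice:

```python
import torch
from torch._vmap_internals import vmap  # the prototype vmap is no longer re-exported as torch.vmap

# Map an elementwise op over the leading batch dimension of both inputs.
batched_mul = vmap(torch.mul)
x, y = torch.randn(8, 5), torch.randn(8, 5)
print(batched_mul(x, y).shape)  # torch.Size([8, 5])
```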

View File

@ -82,7 +82,8 @@ SKIP_PYTHON_BINDINGS = [
'set_data',
'.*_overrideable', # overrideable functions for backend extension
'data', 'is_leaf', 'output_nr', '_version', 'requires_grad_', 'retain_grad', 'set_',
'_fw_primal'
'_fw_primal', 'fake_quantize_per_tensor_affine_cachemask',
'fake_quantize_per_channel_affine_cachemask',
]
# These function signatures are not exposed to Python. Note that this signature

View File

@ -350,8 +350,8 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) -
'saddmm': ['def saddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Number=1, '
'alpha: Number=1, out: Optional[Tensor]=None) -> Tensor: ...'],
'spmm': ['def spmm(input: Tensor, mat2: Tensor) -> Tensor: ...'],
'div': ['def div(input: Union[Tensor, Number], other: Union[Tensor, Number], '
'rounding_mode: str = "true", *, out: Optional[Tensor]=None) -> Tensor: ...'],
'div': ['def div(input: Union[Tensor, Number], other: Union[Tensor, Number], *, '
'rounding_mode: Optional[str]=None, out: Optional[Tensor]=None) -> Tensor: ...'],
})
for binop in ['mul', 'true_divide', 'floor_divide']:
unsorted_function_hints[binop].append(
@ -462,9 +462,9 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) -
'def set_(self, storage: Storage) -> Tensor: ...'],
'split': ['def split(self, split_size: _int, dim: _int=0) -> Sequence[Tensor]: ...',
'def split(self, split_size: Tuple[_int, ...], dim: _int=0) -> Sequence[Tensor]: ...'],
'div': ['def div(self, other: Union[Tensor, Number], '
'rounding_mode: str = "true", *, out: Optional[Tensor]=None) -> Tensor: ...'],
'div_': ['def div_(self, other: Union[Tensor, Number], rounding_mode: str = "true") -> Tensor: ...'],
'div': ['def div(self, other: Union[Tensor, Number], *, '
'rounding_mode: Optional[str]=None, out: Optional[Tensor]=None) -> Tensor: ...'],
'div_': ['def div_(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str]=None) -> Tensor: ...'],
})
for binop in ['mul', 'true_divide', 'floor_divide']:
for inplace in [False, True]:

View File

@ -162,7 +162,7 @@ endif()
# In the most recent CMake versions, a new 'TRANSFORM' subcommand of 'list' allows much of the boilerplate of defining the lists
# of type stub files to be omitted.
# For comptability with older CMake versions, we omit it for now, but leave it as a comment in case comptability with the older
# For compatibility with older CMake versions, we omit it for now, but leave it as a comment in case compatibility with the older
# CMake versions is eventually dropped.
# set(Modules
# __init__

View File

@ -174,6 +174,11 @@ def _freeze_module(module: ScriptModule,
freeze_interfaces: _bool = True,
preserveParameters: _bool = True) -> ScriptModule: ...
def _jit_pass_optimize_frozen_graph(Graph) -> None: ...
def _jit_pass_fold_frozen_conv_bn(graph: Graph): ...
def _jit_pass_fold_frozen_conv_add_or_sub(graph: Graph): ...
def _jit_pass_fold_frozen_conv_mul_or_div(graph: Graph): ...
def _jit_pass_remove_dropout(module: 'torch.jit.ScriptModule'): ...
def _is_tracing() -> _bool: ...
def _jit_init() -> _bool: ...
def _jit_flatten(arg: Any) -> Tuple[List[Tensor], IODescriptor]: ...

View File

@ -662,8 +662,6 @@ del register_after_fork
# torch.jit.script as a decorator, for instance):
from ._lobpcg import lobpcg
from ._vmap_internals import vmap
# These were previously defined in native_functions.yaml and appeared on the
# `torch` namespace, but we moved them to c10 dispatch to facilitate custom
# class usage. We add these lines here to preserve backward compatibility.

View File

@ -1194,25 +1194,25 @@ See :func:`torch.dist`
""")
add_docstr_all('div', r"""
div(value, *, rounding_mode='true') -> Tensor
div(value, *, rounding_mode=None) -> Tensor
See :func:`torch.div`
""")
add_docstr_all('div_', r"""
div_(value, *, rounding_mode='true') -> Tensor
div_(value, *, rounding_mode=None) -> Tensor
In-place version of :meth:`~Tensor.div`
""")
add_docstr_all('divide', r"""
divide(value, *, rounding_mode='true') -> Tensor
divide(value, *, rounding_mode=None) -> Tensor
See :func:`torch.divide`
""")
add_docstr_all('divide_', r"""
divide_(value, *, rounding_mode='true') -> Tensor
divide_(value, *, rounding_mode=None) -> Tensor
In-place version of :meth:`~Tensor.divide`
""")

View File

@ -2741,7 +2741,7 @@ Example::
""".format(**common_args))
add_docstr(torch.div, r"""
div(input, other, *, rounding_mode='true' out=None) -> Tensor
div(input, other, *, rounding_mode=None, out=None) -> Tensor
Divides each element of the input ``input`` by the corresponding element of
:attr:`other`.
@ -2764,7 +2764,7 @@ Args:
Keyword args:
rounding_mode (str, optional): Type of rounding applied to the result:
* ``"true"`` - default behavior. Performs no rounding and, if both :attr:`input` and
* None - default behavior. Performs no rounding and, if both :attr:`input` and
:attr:`other` are integer types, promotes the inputs to the default scalar type.
Equivalent to true division in Python (the ``/`` operator) and NumPy's ``np.true_divide``.
* ``"trunc"`` - rounds the results of the division towards zero.
@ -2806,7 +2806,7 @@ Examples::
""".format(**common_args))
add_docstr(torch.divide, r"""
divide(input, other, *, rounding_mode='true', out=None) -> Tensor
divide(input, other, *, rounding_mode=None, out=None) -> Tensor
Alias for :func:`torch.div`.
""")
@ -8515,9 +8515,9 @@ If :attr:`upper` is ``False``, then lower triangular portion is used.
.. note:: Irrespective of the original strides, the returned matrix `V` will
be transposed, i.e. with strides `V.contiguous().transpose(-1, -2).stride()`.
.. note:: Extra care needs to be taken when backward through outputs. Such
operation is really only stable when all eigenvalues are distinct.
Otherwise, ``NaN`` can appear as the gradients are not properly defined.
.. warning:: Extra care needs to be taken when backward through outputs. Such
operation is only stable when all eigenvalues are distinct and becomes
less stable the smaller :math:`\min_{i \neq j} |\lambda_i - \lambda_j|` is.
Args:
input (Tensor): the input tensor of size :math:`(*, n, n)` where `*` is zero or more
@ -9207,7 +9207,7 @@ Example::
add_docstr(torch.true_divide, r"""
true_divide(dividend, divisor, *, out) -> Tensor
Alias for :func:`torch.div` with ``rounding_mode='true'``.
Alias for :func:`torch.div` with ``rounding_mode=None``.
""".format(**common_args))
add_docstr(torch.trunc,

View File

@ -8,6 +8,8 @@
#include <torch/csrc/jit/frontend/schema_matching.h>
#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/jit/passes/dead_code_elimination.h>
#include <torch/csrc/jit/passes/freeze_module.h>
#include <torch/csrc/jit/passes/frozen_graph_optimizations.h>
#include <torch/csrc/jit/passes/inliner.h>
#include <torch/csrc/jit/runtime/operator.h>
@ -336,6 +338,21 @@ IValue Module::create_class(const c10::QualifiedName& name, Stack stack) const {
return obj;
}
Module freeze(
const Module& module,
c10::optional<std::vector<std::string>> preserved_attrs,
bool optimize_numerics) {
TORCH_CHECK(
!module.is_training(),
"Freezing is currently only implemented for modules in eval mode. Please call .eval() before freezing");
Module out_mod = freeze_module(
module, preserved_attrs.value_or(std::vector<std::string>({})));
auto graph = module.get_method("forward").graph();
OptimizeFrozenGraph(graph, optimize_numerics);
return out_mod;
}
buffer_list Module::buffers(bool recurse) const {
return buffer_list(*this, recurse, /*return_module=*/false);
}

View File

@ -276,6 +276,13 @@ struct TORCH_API Module : public Object {
bool non_blocking);
};
// C++ equivalent api of `torch.jit.freeze`. See documentation there for
// details.
TORCH_API Module freeze(
const Module& module,
c10::optional<std::vector<std::string>> preserved_attrs = c10::nullopt,
bool optimize_numerics = true);
namespace detail {
struct TORCH_API SlotCursor {

View File

@ -1,5 +1,6 @@
#include <torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h>
#include <ATen/DynamicLibrary.h>
#include <c10/util/Exception.h>
#include <c10/util/Optional.h>
#include <torch/csrc/jit/codegen/fuser/compiler.h>

View File

@ -9,13 +9,18 @@
#include <memory>
#include <string>
// Forward declare DynamicLibrary
namespace at {
struct DynamicLibrary;
}
namespace torch {
namespace jit {
namespace fuser {
namespace cpu {
// Represents a compiled CPU kernel and the metadata necessary to run it
struct TORCH_API FusedKernelCPU : public ::torch::jit::fuser::FusedKernel {
struct TORCH_API FusedKernelCPU : public FusedKernel {
FusedKernelCPU(
std::string name,
std::string code,

View File

@ -1258,6 +1258,15 @@ struct to_ir {
const TernaryIf& expr,
const TypePtr& type_hint = nullptr) {
CondValue cond_value = emitCondExpr(expr.cond());
// If the cond expr is a static value, then we metacompile the `if`
// statement and only emit the true or false branch
if (cond_value.staticIf()) {
if (*cond_value.staticIf()) {
return emitExpr(expr.true_expr(), type_hint);
} else {
return emitExpr(expr.false_expr(), type_hint);
}
}
auto true_expr = [&] { return emitExpr(expr.true_expr(), type_hint); };
auto false_expr = [&] { return emitExpr(expr.false_expr(), type_hint); };
return emitIfExpr(expr.range(), cond_value, true_expr, false_expr);

View File

@ -8,12 +8,16 @@
namespace torch {
namespace jit {
void OptimizeFrozenGraph(std::shared_ptr<Graph>& graph) {
void OptimizeFrozenGraph(
std::shared_ptr<Graph>& graph,
bool optimize_numerics) {
// run a couple times to capture Conv -> Mul -> Add etc
for (size_t i = 0; i < 2; i++) {
FoldFrozenConvBatchnorm(graph);
FoldFrozenConvAddOrSub(graph);
FoldFrozenConvMulOrDiv(graph);
if (optimize_numerics) {
for (size_t i = 0; i < 2; i++) {
FoldFrozenConvBatchnorm(graph);
FoldFrozenConvAddOrSub(graph);
FoldFrozenConvMulOrDiv(graph);
}
}
}

View File

@ -13,7 +13,9 @@
namespace torch {
namespace jit {
TORCH_API void OptimizeFrozenGraph(std::shared_ptr<Graph>& graph);
TORCH_API void OptimizeFrozenGraph(
std::shared_ptr<Graph>& graph,
bool optimize_numerics = true);
} // namespace jit
} // namespace torch

View File

@ -668,6 +668,24 @@ static void fuseLogSoftmaxNllLoss(Block* b) {
auto prev = it->input(0)->node();
Node* origNllLossNode = *it;
Node* origLogSoftmaxNode;
// Check for patterns especially in cases with autocasting enabled
// in which a cast node is inserted before the NegativeLogLikelihoodLoss
// node and this causes the patterns below not to be recognizable by the
// fuseLogSoftmaxNllLoss function
// For example if the input is 2D
// graph(%input : Half(3, 5),
// %target : Long(3)):
// %4 : Half(3, 5) = onnx::LogSoftmax[axis=1](%input)
// %8 : Float = onnx::Cast[to=1](%4)
// %9 : Float(3) = onnx::NegativeLogLikelihoodLoss[reduction="none"]
// return (%8)
Node* castNode = nullptr;
if (prev->kind() == onnx::Cast) {
castNode = prev;
prev = prev->input(0)->node();
}
if (prev->kind() == onnx::LogSoftmax) {
// if the input is 2D
// graph(%input : Float(3, 5),
@ -675,7 +693,7 @@ static void fuseLogSoftmaxNllLoss(Block* b) {
// %4 : Float(3, 5) = onnx::LogSoftmax[axis=1](%input)
// %8 : Float(3) = onnx::NegativeLogLikelihoodLoss[reduction="none"]
// return (%8)
origLogSoftmaxNode = it->input(0)->node();
origLogSoftmaxNode = prev;
} else if (
prev->kind() == onnx::Transpose &&
prev->input(0)->node()->kind() == onnx::LogSoftmax) {
@ -751,6 +769,19 @@ static void fuseLogSoftmaxNllLoss(Block* b) {
continue;
}
// If the pattern indeed consists of a cast node before the
// NegativeLogLikelihoodLoss node, place a cast node in the beginning
// of the pattern instead
if (castNode != nullptr) {
auto onnx_type = castNode->i(attr::to);
Node* cast_node = b->owningGraph()->create(onnx::Cast, 1);
cast_node->addInput(origLogSoftmaxNode->inputs().at(0));
cast_node->i_(attr::to, onnx_type);
cast_node->insertBefore(origLogSoftmaxNode);
origLogSoftmaxNode->replaceInputWith(
origLogSoftmaxNode->inputs().at(0), cast_node->output());
}
Node* softmaxCrossEntropyNode = b->owningGraph()->create(
onnx::SoftmaxCrossEntropyLoss, it->outputs().size());
for (size_t i = 0; i < softmaxCrossEntropyNode->outputs().size(); ++i) {

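For orientation, a minimal Python sketch (not part of the diff) of the eager-mode pattern that this pass fuses into ``onnx::SoftmaxCrossEntropyLoss`` during export; under autocast, the extra ``Cast`` handled above shows up between the log-softmax and the NLL loss:

```python
import io
import torch
import torch.nn.functional as F

class LogSoftmaxNll(torch.nn.Module):
    def forward(self, logits, target):
        # LogSoftmax followed by NegativeLogLikelihoodLoss is the pattern the pass recognizes.
        return F.nll_loss(F.log_softmax(logits, dim=1), target, reduction="none")

buf = io.BytesIO()
torch.onnx.export(
    LogSoftmaxNll(),
    (torch.randn(3, 5), torch.randint(0, 5, (3,))),
    buf,
    opset_version=12,  # SoftmaxCrossEntropyLoss requires opset >= 12
)
```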
View File

@ -33,39 +33,38 @@ def _orthogonalize(matrix, epsilon=1e-8):
class PowerSGDState(object):
"""
Stores both the gradient compression configs and the internal states for all the gradients during the training.
Particularly, `matrix_approximation_rank` and `start_powerSGD_iter` are the main configs that need to be tuned by the user.
Although `use_error_feedback` and `warm_start` can also be tuned by the user,
they are typically turned on for performance.
r"""
Stores both the algorithm's hyperparameters and the internal state for all the gradients during the training.
Particularly, ``matrix_approximation_rank`` and ``start_powerSGD_iter`` are the main hyperparameters that should be tuned by the user.
For performance, we suggest keeping the binary hyperparameters ``use_error_feedback`` and ``warm_start`` on.
Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`]
~~~~~~~~~~~~~~~~~~~~~~~~~~
1) To tune `matrix_approximation_rank`, the user can increase it from 1 by factors of 2,
until a satisfying accuracy can be reached.
The increase of `matrix_approximation_rank` can substantially increase the computation costs of the compression.
However, the accuracy may not be futher improved beyond a certain `matrix_approximation_rank` value.
2) To tune `start_powerSGD_iter`, the user can typically start with 10% of total training steps,
and increase it until a satisfying accuracy can be reached.
Deferrring PowerSGD can effectively improve the accuracy,
even a relatively small `matrix_approximation_rank` is used.
This is because that, the beginning of training phase is usually very sensitive to inaccurate gradients,
and compressing gradients too early may make the training quickly take a suboptimal trajectory,
which can result in an irrecoverable impact on the accuracy.
The minimum value allowed in DDP is 2, if error feedback or warm-up is enabled.
This is because there is another internal optimization that rebuilds buckets at iteration 1 in DDP,
and this can conflict with any tensor memorized before the rebuild process.
"""
1. ``matrix_approximation_rank`` controls the size of compressed low-rank tensors, which determines the compression rate. The lower the rank, the stronger the compression.
1.1. If ``matrix_approximation_rank`` is too low, the model may need more training steps to reach full quality, or may never reach it, which yields a loss in accuracy.
1.2. Increasing ``matrix_approximation_rank`` substantially increases the computation cost of the compression, and the accuracy may not improve further beyond a certain ``matrix_approximation_rank`` threshold.
To tune ``matrix_approximation_rank``, we suggest starting from 1 and increasing by factors of 2 (like an exponential grid search: 1, 2, 4, ...) until a satisfactory accuracy is reached. Typically only a small value 1-4 is used. For some NLP tasks (as shown in Appendix D of the original paper), this value has been increased to 32.
2. ``start_powerSGD_iter`` defers PowerSGD compression until step ``start_powerSGD_iter``, and vanilla allreduce runs prior to step ``start_powerSGD_iter``. This hybrid scheme of **vanilla allreduce + PowerSGD** can effectively improve the accuracy, even when a relatively small ``matrix_approximation_rank`` is used. This is because the beginning of the training phase is usually very sensitive to inaccurate gradients, and compressing gradients too early may make the training quickly take a suboptimal trajectory, which can have an irrecoverable impact on the accuracy.
To tune ``start_powerSGD_iter``, we suggest starting with 10% of total training steps and increasing it until a satisfactory accuracy is reached.
.. warning ::
If error feedback or warm-up is enabled, the minimum value of ``start_powerSGD_iter`` allowed in DDP is 2.
This is because there is another internal optimization that rebuilds buckets at iteration 1 in DDP,
and this can conflict with any tensor memorized before the rebuild process.
""" # noqa
__slots__ = [
"process_group",
# The two fields below are the configs that usually need to be tuned by the user.
# The two fields below are the hyperparameters that should be tuned by the user.
"matrix_approximation_rank",
"start_powerSGD_iter",
# The two fields below are the configs that usually need to be turned on for performance.
# The two fields below are the binary hyperparameters recommended to be turned on for performance.
"use_error_feedback",
"warm_start",
# The fields below are not configs.
# The fields below are internal state.
"rng",
"error_dict",
"p_memory_dict",
@ -93,21 +92,12 @@ class PowerSGDState(object):
)
self.process_group = process_group
# The low rank for matrix approximation controls the size of compressed low-rank tensors,
# which determines the computation ratio.
# Typically only a small value 1-4 is used.
# For some NLP tasks (as shown in Appendix D of the original paper
# https://arxiv.org/pdf/1905.13727.pdf, the rank value has been increased to 32.
# A high rank value will increase the computation costs of compression exponentially.
# A good choice depends on how much extra computation can be hidden by the dominating communication costs.
self.matrix_approximation_rank = matrix_approximation_rank
# This defers PowerSGD compression util step 'start_powerSGD_iter',
# and vanilla allreduce runs before step 'start_powerSGD_iter'.
# This hybrid scheme of vanilla allreduce + PowerSGD can have two advantages:
# Deferring PowerSGD compression util step 'start_powerSGD_iter' can have two advantages:
# 1) It turns out that PowerSGD may lead to a non-trivial accuracy loss,
# even if the matrix approximation rank is increased to a large value.
# To mitigate the accuracy loss, a simple yet effective way is mixing vanilla allreduce
# (or a more convervative compression such as FP16 compression) with PowerSGD.
# (or a more conservative compression such as FP16 compression) with PowerSGD.
# 2) There is an internal optimization of rebuilding buckets process in DDP,
# in order to save the memory space.
# This step takes place after the first iteration.
@ -162,38 +152,44 @@ class PowerSGDState(object):
def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
"""
This DDP communication hook implements the original PowerSGD gradient compression
algorithm described in https://arxiv.org/abs/1905.13727.
r"""
This DDP communication hook implements PowerSGD gradient compression
algorithm described in the `paper <https://arxiv.org/abs/1905.13727>`_.
Once gradient tensors are aggregated across all workers, this hook applies
compression as follows:
1) Views the input flattened 1D gradient tensor as two groups of per-parameter tensors:
high-rank tensors and vector-like rank-1 tensors (for biases).
2) Handles rank-1 tensors by allreducing them without compression:
2.1) Allocate contiguous memory for those rank-1 tensors,
and allreduces all the rank-1 tensors as a batch, without compression;
2.2) Copies the individual rank-1 tensors from the contiguous memory back to the input tensor.
3) Handles high-rank tensors by PowerSGD compression:
3.1) For each high-rank tensor M, creates two low-rank tensors P and Q for decomposing M,
1. Views the input flattened 1D gradient tensor as two groups of per-parameter tensors: high-rank tensors and vector-like rank-1 tensors (for biases).
2. Handles rank-1 tensors by allreducing them without compression:
2.1. Allocate contiguous memory for those rank-1 tensors, and allreduces all the rank-1 tensors as a batch, without compression;
2.2. Copies the individual rank-1 tensors from the contiguous memory back to the input tensor.
3. Handles high-rank tensors by PowerSGD compression:
3.1. For each high-rank tensor M, creates two low-rank tensors P and Q for decomposing M,
such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
3.2) Computes each P in Ps, which is equal to MQ;
3.3) Allreduces Ps as a batch;
3.4) Orthogonalizes each P in Ps;
3.5) Computes each Q in Qs, which is approximately equal to M^TP;
3.6) Allreduces Qs as a batch;
3.7) Computes each M among all the high-rank tensors, which is approximately equal to PQ^T.
Note that this communication hook enforces vanilla allreduce for the first `state.start_powerSGD_iter` iterations.
This can not only allow the user to have a finer tuning over the tradeoff between speedup and accuracy,
but also help abstract away some complexity of the internal optimization of DDP for future communication hook developers.
3.2. Computes each P in Ps, which is equal to MQ;
TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration --
one left multiplication and one right multiplication.
For warm-start, can take one such step at a time, and alternate between them.
3.3. Allreduces Ps as a batch;
3.4. Orthogonalizes each P in Ps;
3.5. Computes each Q in Qs, which is approximately equal to M^TP;
3.6. Allreduces Qs as a batch;
3.7. Computes each M among all the high-rank tensors, which is approximately equal to PQ^T.
Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations.
This not only gives the user more control over the tradeoff between speedup and accuracy,
but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers.
Args:
state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
To tune the compression configs, see Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`].
To tune the compression configs, the main hyperparameters to tune are ``matrix_approximation_rank`` and ``start_powerSGD_iter``.
bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
Note that since DDP comm hook only supports single process single device mode at this time,
only exactly one tensor is stored in this bucket.
@ -202,9 +198,9 @@ def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
Future handler of the communication, which updates the gradients in place.
Example::
state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
>>> ddp_model.register_comm_hook(state, powerSGD_hook)
"""
""" # noqa
process_group = state.process_group
group_to_use = process_group if process_group is not None else dist.group.WORLD
world_size = group_to_use.size()
@ -374,6 +370,10 @@ def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
for tensor, p, q in zip(high_rank_tensors, ps, qs):
torch.matmul(tensor.t(), p, out=q)
# TODO: The above procedure does two matmul+allreduce steps per iteration --
# one left multiplication and one right multiplication.
# For warm-start, can take one such step at a time, and alternate between them.
# Allreduce Qs.
return [
dist.all_reduce(
@ -412,40 +412,48 @@ def powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
"""
r"""
This DDP communication hook implements a simplified PowerSGD gradient compression
algorithm described in https://arxiv.org/abs/1905.13727.
algorithm described in the `paper <https://arxiv.org/abs/1905.13727>`_.
This variant does not compress the gradients layer by layer,
but instead compresses the flattened input tensor that batches all the gradients.
Therefore, it is **faster** than :meth:`powerSGD_hook`,
but usually results in a **much lower accuracy**, unless ``matrix_approximation_rank`` is 1.
.. warning ::
Increasing ``matrix_approximation_rank`` here may not necessarily increase the accuracy,
because batching per-parameter tensors without column/row alignment can destroy low-rank structure.
Therefore, the user should always consider :meth:`powerSGD_hook` first,
and only consider this variant when a satisfactory accuracy can be achieved when ``matrix_approximation_rank`` is 1.
Once gradient tensors are aggregated across all workers, this hook applies
compression to the flattened input tensor that batches per-parameter tensors as follows:
1) Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
2) Creates two low-rank tensors P and Q for decomposing M,
such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
2) Computes P, which is equal to MQ;
3) Allreduces P;
4) Orthogonalizes P;
5) Computes Q, which is approximately equal to M^TP;
6) Allreduces Q;
7) Computes M, which is approximately equal to PQ^T.
8) Truncates the input tensor to the original length.
compression as follows:
This variant is faster than `powerSGD_hook` that runs layer-wise gradient compression,
but it usually results in a much lower accuracy, unless `matrix_approximation_rank` in the state is 1.
Increasing `matrix_approximation_rank` may not necessarily increase the accuracy,
because batching per-parameter tensors without column/row alignment can destroy low-rank structure.
Therefore, the user shoud always consider `powerSGD_hook` first,
and only consider this variant when a satisfying accuracy can be achieved when `matrix_approximation_rank` is 1.
1. Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
Note that this communication hook enforces vanilla allreduce for the first `state.start_powerSGD_iter` iterations.
This can not only allow the user to have a finer tuning over the tradeoff between speedup and accuracy,
but also help abstract away some complexity of the internal optimization of DDP for future communication hook developers.
2. Creates two low-rank tensors P and Q for decomposing M, such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
TODO(wayi@): The above procedure does two matmul+allreduce steps per iteration --
one left multiplication and one right multiplication.
For warm-start, can take one such step at a time, and alternate between them.
3. Computes P, which is equal to MQ;
4. Allreduces P;
5. Orthogonalizes P;
6. Computes Q, which is approximately equal to M^TP;
7. Allreduces Q;
8. Computes M, which is approximately equal to PQ^T.
9. Truncates the input tensor to the original length.
Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations.
This not only gives the user more control over the tradeoff between speedup and accuracy,
but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers.
Args:
state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
To tune the compression configs, see Note [Guidance to Tune `matrix_approximation_rank` And `start_powerSGD_iter`].
To tune the compression configs, the main hyperparameters to tune are ``matrix_approximation_rank`` and ``start_powerSGD_iter``.
bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
Note that since DDP comm hook only supports single process single device mode at this time,
only exactly one tensor is stored in this bucket.
@ -454,9 +462,9 @@ def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
Future handler of the communication, which updates the gradients in place.
Example::
state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
>>> ddp_model.register_comm_hook(state, batched_powerSGD_hook)
"""
""" # noqa
process_group = state.process_group
group_to_use = process_group if process_group is not None else dist.group.WORLD
world_size = group_to_use.size()
@ -563,6 +571,10 @@ def batched_powerSGD_hook(state: PowerSGDState, bucket) -> torch.futures.Future:
out=state.q_memory_dict[bucket_index],
)
# TODO: The above procedure does two matmul+allreduce steps per iteration --
# one left multiplication and one right multiplication.
# For warm-start, can take one such step at a time, and alternate between them.
return [
dist.all_reduce(
state.q_memory_dict[bucket_index], group=group_to_use, async_op=True

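As a complement to the numbered steps in the docstrings above, here is a minimal single-process sketch (not part of the diff) of the per-tensor compression math, with the allreduce steps omitted; the ``orthogonalize`` helper below only loosely mirrors the module's ``_orthogonalize``:

```python
import torch

def orthogonalize(mat, eps=1e-8):
    # Gram-Schmidt, column by column, in place.
    for i in range(mat.shape[1]):
        col = mat[:, i : i + 1]
        col /= col.norm() + eps
        if i + 1 < mat.shape[1]:
            rest = mat[:, i + 1 :]
            rest -= (col.t() @ rest) * col

torch.manual_seed(0)
rank = 2                           # plays the role of matrix_approximation_rank
M = torch.randn(64, 32)            # one "high-rank" gradient tensor

Q = torch.randn(M.shape[1], rank)  # 3.1: Q from a standard normal, then orthogonalized
orthogonalize(Q)
P = M @ Q                          # 3.2: P = MQ       (3.3 would allreduce the Ps)
orthogonalize(P)                   # 3.4: orthogonalize P
Q = M.t() @ P                      # 3.5: Q ~= M^T P   (3.6 would allreduce the Qs)
M_approx = P @ Q.t()               # 3.7: M ~= P Q^T

print((M - M_approx).norm() / M.norm())  # relative error of the rank-2 approximation
```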
View File

@ -4,7 +4,7 @@ from typing import Dict, List, Set, NamedTuple, Tuple
import torch
from torch.fx.passes.split_module import split_module
import operator
from torch.fx.experimental.partitioner_utils import Partition, \
from torch.fx._experimental.partitioner_utils import Partition, \
Device, PartitionerConfig, get_partition_to_latency_mapping,\
get_latency_of_partitioned_graph, NodeLatency, get_extra_size_of, \
PartitionMode

View File

@ -2,7 +2,7 @@ from typing import Dict, List, NamedTuple, Any
import torch
from torch.fx.passes.shape_prop import ShapeProp
from torch.fx.experimental.param_fetch import lift_lowering_attrs_to_nodes
from torch.fx._experimental.param_fetch import lift_lowering_attrs_to_nodes
from torch.fx.graph import Graph, get_qualified_name
from torch.fx.graph_module import GraphModule
from torch.fx.node import Node, Target, map_arg

View File

@ -116,7 +116,7 @@ class Interpreter:
# Main Node running APIs
def placeholder(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
"""
Execute a ``placeholder`` node. Note that this is stateful:
``Interpreter`` maintains an internal iterator over
@ -141,7 +141,7 @@ class Interpreter:
else:
return next(self.args_iter)
def get_attr(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
"""
Execute a ``get_attr`` node. Will retrieve an attribute
value from the ``Module`` hierarchy of ``self.module``.
@ -159,7 +159,7 @@ class Interpreter:
assert isinstance(target, str)
return self.fetch_attr(target)
def call_function(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
"""
Execute a ``call_function`` node and return the result.
@ -178,7 +178,7 @@ class Interpreter:
# Execute the function and return the result
return target(*args, **kwargs)
def call_method(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
def call_method(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
"""
Execute a ``call_method`` node and return the result.
@ -199,7 +199,7 @@ class Interpreter:
assert isinstance(target, str)
return getattr(self_obj, target)(*args_tail, **kwargs)
def call_module(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
"""
Execute a ``call_module`` node and return the result.
@ -221,7 +221,7 @@ class Interpreter:
return submod(*args, **kwargs)
def output(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
def output(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
"""
Execute an ``output`` node. This really just retrieves
the value referenced by the ``output`` node and returns it.
@ -307,12 +307,12 @@ class Transformer(Interpreter):
method equivalents). We could subclass ``Transformer`` like so::
class NegSigmSwapXformer(Transformer):
def call_function(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
if target == torch.sigmoid:
return torch.neg(*args, **kwargs)
return super().call_function(n)
def call_method(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Any:
def call_method(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
if target == 'neg':
call_self, *args_tail = args
return call_self.sigmoid(*args_tail, **kwargs)
@ -344,7 +344,7 @@ class Transformer(Interpreter):
self.tracer = TransformerTracer(self.new_graph)
self.tracer.root = module
def placeholder(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Proxy:
def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Proxy:
"""
Execute a ``placeholder`` node. In ``Transformer``, this is
overridden to insert a new ``placeholder`` into the output
@ -360,7 +360,7 @@ class Transformer(Interpreter):
assert isinstance(target, str)
return Proxy(self.new_graph.placeholder(target), self.tracer)
def get_attr(self, target : 'Target', args : Tuple[Any], kwargs : Dict[str, Any]) -> Proxy:
def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Proxy:
"""
Execute a ``get_attr`` node. In ``Transformer``, this is
overridden to insert a new ``get_attr`` node into the output
@ -376,6 +376,12 @@ class Transformer(Interpreter):
assert isinstance(target, str)
return Proxy(self.new_graph.get_attr(target), self.tracer)
def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
# Override so that the leaf module policy from `self.tracer` is respected.
assert isinstance(target, str)
submod = self.fetch_attr(target)
return self.tracer.call_module(submod, submod.forward, args, kwargs)
def transform(self) -> GraphModule:
"""
Transform ``self.module`` and return the transformed

View File

@ -5,7 +5,7 @@ import operator
from .graph import magic_methods, reflectable_magic_methods, Graph
from typing import Tuple, Dict, Optional, Iterable, Any, Iterator
from .node import Target, Node, Argument, base_types
from .node import Target, Node, Argument, base_types, map_aggregate
class TracerBase:
graph: Graph
@ -61,8 +61,17 @@ class TracerBase:
elif isinstance(a, dict):
r = {}
for k, v in a.items():
if not isinstance(k, str):
raise NotImplementedError(f"dictionaries with non-string keys: {a}")
# Check for invalid dict keys. We do not want a Proxy to appear
# anywhere within the key. Since keys can be collection types,
# we iterate through the key with map_aggregate
k = self.create_arg(k)
def no_node(arg):
if isinstance(arg, Node):
raise RuntimeError("Keys for dictionaries used as an argument cannot contain a "
f"Node. Got key: {k}")
map_aggregate(k, no_node)
r[k] = self.create_arg(v)
return r
elif isinstance(a, slice):

View File

@ -10,7 +10,7 @@ import torch
from torch.jit._script import RecursiveScriptModule, ScriptModule
def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize: bool = True):
def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize_numerics: bool = True):
r"""
Freezing a :class:`ScriptModule` will clone it and attempt to inline the cloned
module's submodules, parameters, and attributes as constants in the TorchScript IR Graph.
@ -26,10 +26,8 @@ def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize: bool = Tr
preserved_attrs (Optional[List[str]]): a list of attributes to preserve in addition to the forward method.
Attributes modified in preserved methods will also be preserved.
optimize (bool): If ``True``, a set of optimization passes will be run to prepare the graph for inference,
in addition to the graph cleanup that already occurs. The details of the optimizations can be found in
`torch.jit.optimize_frozen_module.`
optimize_numerics (bool): If ``True``, a set of optimization passes will be run that do not strictly
preserve numerics. Full details of the optimizations can be found at `torch.jit.optimize_frozen_module`.
Returns:
Frozen :class:`ScriptModule`.
@ -102,16 +100,16 @@ def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize: bool = Tr
out = RecursiveScriptModule(torch._C._freeze_module(mod._c, preserved_attrs))
RecursiveScriptModule._finalize_scriptmodule(out)
if optimize:
optimize_frozen_module(out)
optimize_frozen_module(out, optimize_numerics)
return out
def optimize_frozen_module(mod):
def optimize_frozen_module(mod, optimize_numerics: bool = True):
r"""
Runs a series of optimizations looking for patterns that occur in frozen graphs.
The current set of optimizations is:
- Dropout Removal
- Conv -> Batchnorm folding
- Conv -> Add/Sub folding
- Conv -> Mul/Div folding
@ -119,6 +117,12 @@ def optimize_frozen_module(mod):
Args:
mod (:class:`ScriptModule`): a frozen module to be optimized
optimize_numerics (bool): If ``True``, a set of optimization passes will be run that do not strictly
preserve numerics. These optimizations preserve the default rtol and atol of `torch.testing.assert_allclose`
when applied to a single transformation; however, in a module where many transformations are applied,
the rtol or atol may no longer fall within the default `assert_allclose` tolerance. Conv -> Batchnorm folding,
Conv -> Add/Sub folding, and Conv -> Mul/Div folding may all alter numerics.
Returns:
None
@ -140,4 +144,12 @@ def optimize_frozen_module(mod):
assert "batch_norm" not in str(frozen_mod.graph)
"""
torch._C._jit_pass_optimize_frozen_graph(mod.graph)
# xxx: keep in sync with frozen_graph_optimizations.cpp
# intentionally duplicated to make it easier to create custom optimization sequences
torch._C._jit_pass_remove_dropout(mod._c)
if optimize_numerics:
# run a couple times to capture Conv -> Mul -> Add etc
for _ in range(2):
torch._C._jit_pass_fold_frozen_conv_bn(mod.graph)
torch._C._jit_pass_fold_frozen_conv_add_or_sub(mod.graph)
torch._C._jit_pass_fold_frozen_conv_mul_or_div(mod.graph)
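A short illustrative usage sketch of the renamed ``optimize_numerics`` flag (not part of the diff; the module is just a placeholder):

```python
import torch

class ConvBN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, kernel_size=3)
        self.bn = torch.nn.BatchNorm2d(8)

    def forward(self, x):
        return self.bn(self.conv(x))

scripted = torch.jit.script(ConvBN().eval())  # freezing requires eval mode

# Default: numerics-affecting folds (Conv -> BN, Conv -> Add/Sub, Conv -> Mul/Div) are applied.
frozen = torch.jit.freeze(scripted)

# Opt out of the numerics-affecting folds; dropout removal still runs.
frozen_conservative = torch.jit.freeze(scripted, optimize_numerics=False)
```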

View File

@ -24,6 +24,7 @@ from .replicate import replicate
from .scatter_gather import scatter_kwargs, gather, is_namedtuple
from .parallel_apply import parallel_apply
from torch._utils import _get_device_index, _get_all_device_indices
from ._functions import _get_stream
def _find_tensors(obj):
@ -438,6 +439,8 @@ class DistributedDataParallel(Module):
# reduction bucket size
self.bucket_bytes_cap = int(bucket_cap_mb * 1024 * 1024)
# Whether to perform input tensor CPU to GPU copies on a side-stream
self.use_side_stream_for_tensor_copies = os.environ.get("PYTORCH_DDP_USE_SIDE_STREAM", "1") == "1"
# Sync params and buffers
self._sync_params_and_buffers(authoritative_rank=0)
@ -732,7 +735,23 @@ class DistributedDataParallel(Module):
"""
def to_map(obj):
if isinstance(obj, torch.Tensor):
return (obj.to(target_gpu), )
if not self.use_side_stream_for_tensor_copies:
return (obj.to(target_gpu), )
else:
# Perform CPU -> GPU copies in a background stream. This code is
# motivated by similar logic in torch/nn/parallel/_functions.py
stream = _get_stream(target_gpu)
with torch.cuda.stream(stream):
output = obj.to(target_gpu)
# synchronize with the copy stream
with torch.cuda.device(target_gpu):
current_stream = torch.cuda.current_stream()
# Sync the current stream with the copy stream
current_stream.wait_stream(stream)
# Ensure tensor memory is not reused until work on
# main stream is complete
output.record_stream(current_stream)
return (output, )
if is_namedtuple(obj):
return [type(obj)(*args) for args in zip(*map(to_map, obj))]
if isinstance(obj, tuple) and len(obj) > 0:
@ -1021,13 +1040,14 @@ class DistributedDataParallel(Module):
parameter syncs while running Distributed DataParallel training.
Args:
state (object): state is passed to the hook and can be used to maintain
and update any state information that users would like to
maintain as part of the training process. Examples: error
feedback in gradient compression, peers to communicate with
next in GossipGrad etc.
hook (callable): is defined as:
hook(state: object, bucket: dist._GradBucket) -> torch.futures.Future:
state (object): Passed to the hook to maintain any state information during the training process.
Examples include error feedback in gradient compression,
peers to communicate with next in GossipGrad, etc.
It is locally stored by each worker
and shared by all the gradient tensors on the worker.
hook (callable): Averages gradient tensors across workers and is defined as:
``hook(state: object, bucket: dist._GradBucket) -> torch.futures.Future``:
This function is called once the bucket is ready. The
hook can perform whatever processing is needed and return
@ -1067,7 +1087,7 @@ class DistributedDataParallel(Module):
DDP communication hook is experimental and subject to change.
Example::
Below is an example of a noop hook that returns back the same tensors:
Below is an example of a noop hook that returns the same tensors.
>>> def noop(state: object, bucket: dist._GradBucket) -> torch.futures.Future:
>>> fut = torch.futures.Future()
@ -1091,7 +1111,6 @@ class DistributedDataParallel(Module):
>>> return fut.then(decode)
>>> ddp.register_comm_hook(state = None, hook = encode_and_decode)
"""
self._check_comm_hook(hook)
dist._register_comm_hook(self.reducer, state, hook)
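Relatedly, the side-stream input-copy behaviour added in ``__init__`` above is controlled by the environment variable it reads; a tiny illustrative sketch (not part of the diff):

```python
import os

# Must be set before DistributedDataParallel is constructed, since the flag is
# read in __init__. "1" (the default) copies CPU inputs to GPU on a background
# CUDA stream; "0" falls back to a plain .to(device) copy.
os.environ["PYTORCH_DDP_USE_SIDE_STREAM"] = "0"
```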

View File

@ -296,6 +296,22 @@ def _is_fp(value):
return (type == 'Float') or (type == 'Double') or (type == 'Half')
return False
def _generate_wrapped_number(g, scalar):
"""
Create a wrapped number based on https://github.com/pytorch/pytorch/issues/9515
A Tensor is considered a "wrapped number" if it is
auto-wrapped from a C++ or Python number type. Integer types are
wrapped as 0-dim int64 tensors and floating-point types are
wrapped as 0-dim double tensors.
The input to this function is a constant value. If the data type
is a floating point type, it is converted to a 0-dim double
tensor, else it is converted to a 0-dim tensor of its original type.
"""
assert not isinstance(scalar, torch.Tensor)
if isinstance(scalar, float):
return g.op("Constant", value_t=torch.tensor(scalar, dtype=torch.double))
return g.op("Constant", value_t=torch.tensor(scalar))
def _sort_helper(g, input, dim, decending=True, out=None):
if out is not None:

View File

@ -121,6 +121,21 @@ def where(g, condition, self=None, other=None, _outputs=None):
return sym_help._unbind_helper(g, condition, g.op("Constant", value_t=torch.tensor(1)), _outputs)
return g.op("Where", condition, self, other)
@parse_args('v', 'v', 'v', 'i', 'i', 'i')
def fake_quantize_per_channel_affine(g, inputs, scale, zero_point, axis, quant_min=-128, quant_max=127):
if quant_min not in [0, -128] or quant_max not in [127, 255]:
raise RuntimeError(
"ONNX defines [0, 255] for quint8 and [-128, 127] for qint8, got [{}, {}]".format(quant_min, quant_max))
# ONNX defines zero_point to be int8 or uint8
if quant_min == 0:
zero_point = g.op("Cast", zero_point, to_i=sym_help.cast_pytorch_to_onnx['Byte'])
else:
zero_point = g.op("Cast", zero_point, to_i=sym_help.cast_pytorch_to_onnx['Char'])
return g.op(
"DequantizeLinear",
g.op("QuantizeLinear", inputs, scale, zero_point, axis_i=axis),
scale, zero_point, axis_i=axis)
def _reduce_op_symbolic(onnx_op_name):
def symbolic(g, self, dim=None, keepdim=None):

View File

@ -1319,8 +1319,8 @@ def layer_norm(g, input, normalized_shape, weight, bias, eps, cudnn_enable):
axes = [-i for i in range(len(normalized_shape), 0, -1)]
two_cst = g.op("Constant", value_t=torch.tensor(2.))
eps_cst = g.op("Constant", value_t=torch.tensor(eps))
two_cst = sym_help._generate_wrapped_number(g, 2.)
eps_cst = sym_help._generate_wrapped_number(g, eps)
mean = g.op("ReduceMean", input, axes_i=axes)
numerator = sub(g, input, mean)

View File

@ -391,9 +391,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
torch.exp2: lambda input, out=None: -1,
torch.expm1: lambda input, out=None: -1,
torch.fake_quantize_per_channel_affine: lambda input, scale, zero_point, axis, quant_min, quant_max: -1,
torch.fake_quantize_per_channel_affine_cachemask: lambda input, scale, zero_point, axis, quant_min, quant_max: -1,
torch.fake_quantize_per_tensor_affine: lambda input, scale, zero_point, quant_min, quant_max: -1,
torch.fake_quantize_per_tensor_affine_cachemask: lambda input, scale, zero_point, quant_min, quant_max: -1,
torch.fbgemm_linear_fp16_weight: lambda input, packed_weight, bias: -1,
torch.fbgemm_linear_fp16_weight_fp32_activation: lambda input, packed_weight, bias: -1,
torch.fbgemm_linear_int8_weight: lambda input, weight, packed, col_offsets, weight_scale, weight_zero_point, bias: -1,

View File

@ -22,16 +22,21 @@ from typing import List, Optional, Union
from setuptools.command.build_ext import build_ext
from pkg_resources import packaging # type: ignore
BUILD_SPLIT_CUDA = os.getenv('BUILD_SPLIT_CUDA')
IS_WINDOWS = sys.platform == 'win32'
LIB_EXT = '.pyd' if IS_WINDOWS else '.so'
EXEC_EXT = '.exe' if IS_WINDOWS else ''
CLIB_PREFIX = '' if IS_WINDOWS else 'lib'
CLIB_EXT = '.dll' if IS_WINDOWS else '.so'
SHARED_FLAG = '/DLL' if IS_WINDOWS else '-shared'
_HERE = os.path.abspath(__file__)
_TORCH_PATH = os.path.dirname(os.path.dirname(_HERE))
TORCH_LIB_PATH = os.path.join(_TORCH_PATH, 'lib')
BUILD_SPLIT_CUDA = os.getenv('BUILD_SPLIT_CUDA') or (os.path.exists(os.path.join(
TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cu{CLIB_EXT}')) and os.path.exists(os.path.join(TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cpp{CLIB_EXT}')))
# Taken directly from python stdlib < 3.9
# See https://github.com/pytorch/pytorch/issues/48617
def _nt_quote_args(args: Optional[List[str]]) -> List[str]: